In [21]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import surprise as sp
import lightgbm
import xgboost
from random import randint

import src.surprise_model
import src.lightgbm_model

In [22]:
# Raw competition tables, all read from ../data/01_raw.
# Ratings: the labelled split (it carries the 'puntuacion' target) and the split to predict.
df_train = pd.read_csv('../data/01_raw/opiniones_train.csv')
df_test = pd.read_csv('../data/01_raw/opiniones_test.csv')
# Side tables (users and books, going by the file names).
df_users = pd.read_csv('../data/01_raw/usuarios.csv')
df_books = pd.read_csv('../data/01_raw/libros.csv')

In [23]:
# Clean the books table.
# Some rows carry the literal 'Español' in `isbn` while `idioma` is missing
# (presumably a column shift in the raw CSV -- TODO confirm); move the value over.
df_books.loc[df_books.idioma.isna() & (df_books.isbn == 'Español'), 'idioma'] = 'Español'

# Drop rows without a year. `.copy()` makes the filtered frame independent, so the
# `.loc` assignments below do not write into a slice of the previous frame
# (this is what triggers pandas' SettingWithCopyWarning).
df_books = df_books[df_books.anio.notna()].copy()

# Repair the one known truncated year.
df_books.loc[df_books.anio == '(200', 'anio'] = '2002'

# Every remaining value that is not exactly four characters long is not treated as a year.
not_years = [year for year in df_books.anio.unique().tolist() if len(year) != 4]
df_books = df_books[~df_books.anio.isin(not_years)].copy()

# The 'Español' marker is not an ISBN: blank it out now that `idioma` has been filled.
df_books.loc[df_books.isbn == 'Español', 'isbn'] = np.nan

## Predictions

In [24]:
# Split the labelled ratings into train and hold-out parts (30% held out).
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(columns=['puntuacion']),
    df_train['puntuacion'],
    test_size=0.3,
    random_state=0,
)

### Surprise

### SVD

In [30]:
# Rating bounds handed to the surprise helpers.
scale = (1.0, 10.0)
# Algorithm class to tune and train.
algorithm = sp.prediction_algorithms.SVD
# Candidate hyper-parameter values handed to the tuning helper.
param_grid = dict(
    n_epochs=[300, 700, 1000],
    lr_all=[0.001, 0.0015, 0.002, 0.003, 0.005, 0.007],
    reg_all=[0.2, 0.4, 0.42, 0.45, 0.49, 0.6],
    n_factors=range(40, 130, 5),  # 40, 45, ..., 125
)

In [31]:
# Hyper-parameter search on the training split only: (user, book) pairs vs. their scores.
best_params = src.surprise_model._tune_train(
    X_train[['usuario', 'libro']],
    y_train,
    algorithm,
    param_grid,
    scale,
    n_jobs=-1,
)
# To save time, skip the search above and reuse a previously found configuration:
#best_params = {'n_epochs': 300, 'lr_all': 0.001, 'reg_all': 0.2, 'n_factors': 55}

# Train on every labelled rating with the selected parameters.
model = src.surprise_model._train(
    df_train[['usuario', 'libro', 'puntuacion']],
    algorithm,
    best_params,
    scale,
)

# `_fit` receives the (user, book) pairs of the test set; its return value is used
# as the predicted scores of the submission.
predictions = src.surprise_model._fit(zip(df_test['usuario'], df_test['libro']), model)
submission = pd.DataFrame({'id': df_test['id'], 'puntuacion': predictions})
src.surprise_model._generate_submission(submission, './submission_surprise_svd.csv')

2020-11-18 19:58:31,528  surprise: INFO: Tunning SVD ...
2020-11-18 20:04:50,424  surprise: INFO: Training SVD ...
RMSE: 1.5841
2020-11-18 20:05:29,875  surprise: INFO: Fitting test ...
2020-11-18 20:05:29,953  surprise: INFO: Saving submission to ./submission_surprise_svd.csv ...


Resultado a mejorar (RMSE en test):

Algoritmo | RMSE | Surprise lib
- | - | -
SVD | 1.5841 | 1.5841499502324747

In [32]:
# Upload the SVD submission file to the Kaggle competition through the kaggle CLI
# (shell escape; assumes the CLI is installed and authenticated -- TODO confirm).
!kaggle competitions submit -c prediccion-de-opiniones-de-libros-fcen-2020 -f submission_surprise_svd.csv -m "Con libreria surprise SVD with tune"

100%|████████████████████████████████████████| 237k/237k [00:04<00:00, 48.8kB/s]
Successfully submitted to Predicción de opiniones de libros - FCEN 2020

In [16]:
# Build a small ensemble: train the tuned SVD ten times with different seeds and store
# each model's predictions as extra columns (`model_0` ... `model_9`) on both the
# labelled frame and the test frame.
best_params = src.surprise_model._tune_train(X_train[['usuario', 'libro']], y_train, algorithm, param_grid, scale, n_jobs=-1)

# Draw the per-model seeds from a seeded generator so that re-running this cell
# reproduces the same ten models (an unseeded randint made every run different).
seed_rng = np.random.RandomState(0)
for iteration in tqdm_notebook(range(10)):
    # Same inclusive 0..1000 range as randint(0, 1000): the upper bound here is exclusive.
    best_params['random_state'] = int(seed_rng.randint(0, 1001))
    model = src.surprise_model._train(df_train[['usuario', 'libro', 'puntuacion']], algorithm, best_params, scale)

    # NOTE(review): these are in-sample predictions (the model was just trained on
    # df_train), so a downstream model stacked on these columns may see features that
    # look better on train than on test -- consider out-of-fold predictions.
    predictions_train = src.surprise_model._fit(zip(df_train.usuario, df_train.libro), model)
    df_train[f'model_{iteration}'] = predictions_train

    predictions = src.surprise_model._fit(zip(df_test.usuario, df_test.libro), model)
    df_test[f'model_{iteration}'] = predictions

2020-11-18 16:14:01,195  surprise: INFO: Tunning SVD ...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))

2020-11-18 16:20:56,490  surprise: INFO: Training SVD ...
RMSE: 1.5997
2020-11-18 16:22:45,389  surprise: INFO: Training SVD ...
RMSE: 1.6002
2020-11-18 16:24:31,549  surprise: INFO: Training SVD ...
RMSE: 1.5977
2020-11-18 16:26:11,227  surprise: INFO: Training SVD ...
RMSE: 1.5997
2020-11-18 16:27:50,513  surprise: INFO: Training SVD ...
RMSE: 1.5988
2020-11-18 16:29:30,089  surprise: INFO: Training SVD ...
RMSE: 1.6001
2020-11-18 16:31:07,742  surprise: INFO: Training SVD ...
RMSE: 1.5983
2020-11-18 16:32:46,180  surprise: INFO: Training SVD ...
RMSE: 1.6009
2020-11-18 16:34:23,883  surprise: INFO: Training SVD ...
RMSE: 1.6010
2020-11-18 16:36:01,578  surprise: INFO: Training SVD ...
RMSE: 1.5994



In [19]:
# Persist both frames so that later cells can reload them from disk.
# The row index is not written to the files.
for frame, destination in (
    (df_test, '../data/03_primary/opiniones_test_01.csv'),
    (df_train, '../data/03_primary/opiniones_train_01.csv'),
):
    frame.to_csv(destination, index=False)

### KNN

In [6]:
# KNN baseline with the surprise library.
scale = (1.0, 10.0)
algorithm = sp.prediction_algorithms.knns.KNNBasic
# Candidate settings; `user_based: False` asks surprise for item-item similarities.
param_grid = {'k': [15, 40, 70], 'min_k': [1, 3, 9],
              'sim_options': {'name': ['msd', 'cosine'],
                              'user_based': [False]}}

# The helpers live in src.surprise_model and only that module is imported, so they must
# be called through the module (bare `_tune_train` etc. raise NameError on a fresh kernel).
best_params = src.surprise_model._tune_train(X_train[['usuario', 'libro']], y_train, algorithm, param_grid, scale, n_jobs=4)
# Train on every labelled rating with the selected parameters.
model = src.surprise_model._train(df_train[['usuario', 'libro', 'puntuacion']], algorithm, best_params, scale)
# `_fit` receives the (user, book) pairs of the test set; its result is used as the predictions.
predictions = src.surprise_model._fit(zip(df_test.usuario, df_test.libro), model)
submission = pd.DataFrame({'id': df_test.id, 'puntuacion': predictions})

2020-11-17 17:05:51,030  surprise: INFO: Tunning KNNBasic ...
2020-11-17 17:06:53,205  surprise: INFO: Training KNNBasic ...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.6860
Computing the msd similarity matrix...
Done computing similarity matrix.


In [None]:
# Write the KNN submission file. The helper is called through its module because only
# `src.surprise_model` is imported (a bare `_generate_submission` is undefined).
src.surprise_model._generate_submission(submission, './submission_surprise_knn.csv')

In [None]:
# Upload the KNN submission file to the Kaggle competition through the kaggle CLI
# (shell escape; assumes the CLI is installed and authenticated -- TODO confirm).
!kaggle competitions submit -c prediccion-de-opiniones-de-libros-fcen-2020 -f submission_surprise_knn.csv -m "Con libreria surprise KNN with tune"

### LightGBM con las predicciones de los modelos de Surprise

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import surprise as sp
import lightgbm
import xgboost
from random import randint

import src.surprise_model
import src.lightgbm_model

In [2]:
# Reload the frames saved under ../data/03_primary.
# Test frame first, then the labelled frame; the two reads are independent.
df_test = pd.read_csv('../data/03_primary/opiniones_test_01.csv')
df_train = pd.read_csv('../data/03_primary/opiniones_train_01.csv')

In [3]:
# Store the book identifier as a pandas categorical in both frames.
for frame in (df_train, df_test):
    frame['libro'] = frame['libro'].astype('category')

In [4]:
# Split the labelled ratings into train and hold-out parts (30% held out).
# The fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(columns=['puntuacion']),
    df_train['puntuacion'],
    test_size=0.3,
    random_state=0,
)

In [17]:
# Search space for the LightGBM hyper-parameter search (the original comment calls it a
# random search). It is only consumed by the `_tune_train` call that is disabled below.
grid_params = {
    'learning_rate': [0.001, 0.01, 0.05],
    'num_leaves': [4, 15],
    'boosting_type': ['gbdt', 'goss'],
    'max_depth': [-1, 8, 15],
    'random_state': [42],
    'n_estimators': [100, 500, 1000],
    'colsample_bytree': [0.5, 0.9],
    'subsample': [0.7, 0.9],
    # LightGBM requires max_bin > 1, so the invalid candidate 1 was removed.
    'max_bin': [5, 14],
    'metric': ['rmse'],
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    'min_data_in_leaf': [10, 20, 40],
    'min_sum_hessian_in_leaf': [0.0001, 0.0004, 0.001, 0.1],
}
algorithm = lightgbm.LGBMRegressor()
#best_params = src.lightgbm_model._tune_train(X_train, y_train, algorithm, grid_params)

# To save time, reuse a previously found configuration instead of searching again.
# Some of these values (num_leaves=31, max_bin=31) are not in grid_params above, so the
# configuration was found with a different search space.
# The former 'regression_l1' and 'regression' entries were dropped: those are objective
# values, not LightGBM parameter names, so they could not configure anything.
best_params = {
    'subsample': 0.7,
    'reg_lambda': 5,
    'reg_alpha': 0.1,
    'random_state': 42,
    'num_leaves': 31,
    'n_estimators': 500,
    'metric': 'rmse',
    'max_depth': -1,
    'max_bin': 31,
    'learning_rate': 0.05,
    'colsample_bytree': 0.9,
    'boosting_type': 'gbdt',
}
algorithm.set_params(**best_params)

# NOTE(review): the model is trained on the 70% training split only, and X_test / y_test
# are not used in this cell -- confirm whether a hold-out evaluation or a refit on all of
# df_train was intended.
model = src.lightgbm_model._train(X_train, y_train, algorithm)

2020-11-18 19:37:15,251  lightgbm: INFO: Tunning lgbm ...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    9.7s finished
2020-11-18 19:37:26,607  lightgbm: INFO: Training lgbm ...


In [19]:
# Score the test rows with the LightGBM model. `id` and `puntuacion` are dropped so that
# they are not passed to the model as features.
prediction = src.lightgbm_model._fit(df_test.drop(['id', 'puntuacion'], axis=1), model)

# Round to whole scores and clamp them into [1, 10]. After rounding, every value below 1
# is <= 0, so this vectorised clip gives the same result as the per-element rule
# "10 if p >= 10, 1 if p <= 0, else p".
submission = pd.DataFrame({
    'id': df_test.id,
    'puntuacion': np.clip(np.round(np.asarray(prediction), 0), 1.0, 10.0),
})
src.lightgbm_model._generate_submission(submission, './submission_lgbm.csv')

2020-11-18 19:40:00,635  lightgbm: INFO: Fitting test ...
2020-11-18 19:40:00,765  lightgbm: INFO: Saving submission to ./submission_lgbm.csv ...


In [20]:
# Upload the LightGBM submission file to the Kaggle competition through the kaggle CLI
# (shell escape; assumes the CLI is installed and authenticated -- TODO confirm).
!kaggle competitions submit -c prediccion-de-opiniones-de-libros-fcen-2020 -f submission_lgbm.csv -m "Lgbm con modelos de surprise"

100%|██████████████████████████████████████| 92.6k/92.6k [00:04<00:00, 22.8kB/s]
Successfully submitted to Predicción de opiniones de libros - FCEN 2020