In [1]:
import os
import sys
# Put the notebook's parent directory on the import path so the project-local
# `src` package (imported further down) can be resolved from here.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import surprise as sp
import lightgbm
import xgboost

from src.surprise_model import _tune_train, _train, _fit, _generate_submission

In [3]:
# Read the four raw CSV inputs in one pass. Order matters: the paths are
# unpacked positionally into train ratings, users, books and test rows.
df_train, df_users, df_books, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/01_raw/opiniones_train.csv',
        '../data/01_raw/usuarios.csv',
        '../data/01_raw/libros.csv',
        '../data/01_raw/opiniones_test.csv',
    )
)

In [4]:
# Repair and filter the book catalogue (`df_books`).

# Some rows hold the literal 'Español' in `isbn` while `idioma` is missing
# (presumably the raw columns were shifted -- TODO confirm against the CSV).
# Move the value into `idioma`; the bogus ISBN itself is blanked at the end.
df_books.loc[df_books.idioma.isna() & (df_books.isbn == 'Español'), 'idioma'] = 'Español'

# Keep only rows that have a year. `.copy()` makes the filtered result an
# independent frame, so the `.loc` assignment below writes to it directly
# instead of triggering pandas' SettingWithCopyWarning on a slice.
df_books = df_books[df_books.anio.notna()].copy()

# Patch the one known truncated year string with the author's chosen value.
df_books.loc[df_books.anio == '(200', 'anio'] = '2002'

# Any remaining `anio` value that is not exactly four characters long is
# treated as "not a year" and its rows are dropped.
# NOTE(review): `len()` assumes every `anio` value is a string -- confirm dtype.
not_years = [year for year in df_books.anio.unique() if len(year) != 4]
df_books = df_books[~df_books.anio.isin(not_years)].copy()

# Finally clear the placeholder ISBN on the rows repaired in the first step.
df_books.loc[df_books.isbn == 'Español', 'isbn'] = np.nan

## Predictions

In [5]:
# Hold out 30% of the labelled ratings for validation. Features are every
# column except the target `puntuacion`; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(columns=['puntuacion']),
    df_train['puntuacion'],
    test_size=0.3,
    random_state=0,
)

In [None]:
# Inspect the target values (`puntuacion`) of the training split.
y_train

### Surprise

### SVD

In [7]:
# SVD model from the `surprise` library.
# `scale` is forwarded to every helper below; presumably it is the rating
# range (1-10) that surprise needs -- confirm in src.surprise_model.
scale = (1.0, 10.0)
algorithm = sp.prediction_algorithms.SVD

# Search grid: one epoch count, six learning rates, six regularisation terms.
param_grid = dict(
    n_epochs=[90],
    lr_all=[0.001, 0.0015, 0.002, 0.003, 0.005, 0.007],
    reg_all=[0.2, 0.4, 0.42, 0.45, 0.49, 0.6],
)

# Tuning sees only the 70% training split; n_jobs=-1 is passed through to the
# helper (presumably "use all cores" -- confirm in src.surprise_model).
best_params = _tune_train(
    X_train[['usuario', 'libro']],
    y_train,
    algorithm,
    param_grid,
    scale,
    n_jobs=-1,
)

# The final model is built from the *full* labelled set, not just the split.
model = _train(
    df_train[['usuario', 'libro', 'puntuacion']],
    algorithm,
    best_params,
    scale,
)

# Despite its name, `_fit` is what produces `predictions` for the
# (usuario, libro) pairs of the test file.
predictions = _fit(zip(df_test['usuario'], df_test['libro']), model)
submission = pd.DataFrame(
    {
        'id': df_test['id'],
        'puntuacion': predictions,
    }
)

2020-11-17 17:08:30,042  surprise: INFO: Tunning SVD ...
2020-11-17 17:09:21,546  surprise: INFO: Training SVD ...
RMSE: 1.5814


In [None]:
# Hand the current `submission` frame to the project helper together with the
# output path. Run this right after the SVD cell: the KNN cell rebinds
# `submission`, so whichever model cell ran last is what gets written here.
_generate_submission(
    submission,
    './submission_surprise_svd.csv',
)

Resultado a mejorar

Algoritmo | RMSE | Surprise lib
- | - | -
SVD | 1.5925 | 1.5925182858830331

In [None]:
# Upload the SVD submission through the Kaggle CLI (shell escape). The file
# name matches the path passed to `_generate_submission` for the SVD model;
# requires the `kaggle` executable and its credentials to be set up.
!kaggle competitions submit -c prediccion-de-opiniones-de-libros-fcen-2020 -f submission_surprise_svd.csv -m "Con libreria surprise SVD with tune"

### KNN

In [6]:
# Basic k-nearest-neighbours model from the `surprise` library.
# `scale` is forwarded to every helper below; presumably it is the rating
# range (1-10) that surprise needs -- confirm in src.surprise_model.
scale = (1.0, 10.0)
algorithm = sp.prediction_algorithms.knns.KNNBasic

# Search grid: neighbourhood size, minimum neighbours and similarity measure.
# `user_based` is pinned to False (item-item similarities per surprise's
# `sim_options` documentation).
param_grid = {
    'k': [15, 40, 70],
    'min_k': [1, 3, 9],
    'sim_options': {
        'name': ['msd', 'cosine'],
        'user_based': [False],
    },
}

# Tuning sees only the 70% training split and passes n_jobs=4 to the helper.
best_params = _tune_train(
    X_train[['usuario', 'libro']],
    y_train,
    algorithm,
    param_grid,
    scale,
    n_jobs=4,
)

# The final model is built from the *full* labelled set, not just the split.
model = _train(
    df_train[['usuario', 'libro', 'puntuacion']],
    algorithm,
    best_params,
    scale,
)

# Despite its name, `_fit` is what produces `predictions` for the
# (usuario, libro) pairs of the test file.
predictions = _fit(zip(df_test['usuario'], df_test['libro']), model)
submission = pd.DataFrame(
    {
        'id': df_test['id'],
        'puntuacion': predictions,
    }
)

2020-11-17 17:05:51,030  surprise: INFO: Tunning KNNBasic ...
2020-11-17 17:06:53,205  surprise: INFO: Training KNNBasic ...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.6860
Computing the msd similarity matrix...
Done computing similarity matrix.


In [None]:
# Hand the current `submission` frame to the project helper together with the
# output path. Run this right after the KNN cell: the SVD cell rebinds
# `submission`, so whichever model cell ran last is what gets written here.
_generate_submission(
    submission,
    './submission_surprise_knn.csv',
)

In [None]:
# Upload the KNN submission through the Kaggle CLI (shell escape). The file
# name matches the path passed to `_generate_submission` for the KNN model;
# requires the `kaggle` executable and its credentials to be set up.
!kaggle competitions submit -c prediccion-de-opiniones-de-libros-fcen-2020 -f submission_surprise_knn.csv -m "Con libreria surprise KNN with tune"