In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import surprise as sp
import lightgbm
import xgboost

In [84]:
# Load the four raw CSVs (training opinions, users, books, test opinions)
# in one pass; the order of the paths matches the order of the target names.
df_train, df_users, df_books, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/01_raw/opiniones_train.csv',
        '../data/01_raw/usuarios.csv',
        '../data/01_raw/libros.csv',
        '../data/01_raw/opiniones_test.csv',
    )
)

In [14]:
# Clean the book catalogue.
# NOTE(review): rows with isbn == 'Español' look like records whose columns were
# shifted (language landed in the isbn column) — confirm against the raw file.

# Recover the language for rows where it is missing but 'Español' sits in `isbn`.
df_books.loc[df_books.idioma.isna() & (df_books.isbn == 'Español'), 'idioma'] = 'Español'

# Drop rows without a year. `.copy()` makes the filtered frame independent so the
# following .loc assignment does not raise SettingWithCopyWarning.
df_books = df_books[df_books.anio.notna()].copy()

# Patch the one known truncated year value.
df_books.loc[df_books.anio == '(200', 'anio'] = '2002'

# Any remaining year whose text is not exactly four characters long is discarded.
not_years = [year for year in df_books.anio.unique() if len(year) != 4]
df_books = df_books[~df_books.anio.isin(not_years)].copy()

# 'Español' is not an ISBN: blank it out now that the language has been recovered.
df_books.loc[df_books.isbn == 'Español', 'isbn'] = np.nan

## Predictions

In [75]:
# Split the labelled opinions into train and test parts: 30% is held out,
# with a fixed seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.drop(columns=['puntuacion']),
    df_train['puntuacion'],
    test_size=0.3,
    random_state=0,
)

### Surprise

In [106]:
# Randomized hyper-parameter search for SVD, run on the training split only.
reader = sp.reader.Reader(rating_scale=(1.0, 10.0))

# Re-attach the target to the training features; surprise reads the three
# columns in (user, item, rating) order.
X_train_surprise = pd.concat([X_train, y_train], axis=1)
data_train = sp.dataset.Dataset.load_from_df(X_train_surprise[['usuario', 'libro', 'puntuacion']], reader)

param_grid = {'n_epochs': [90], 'lr_all': [0.001, 0.0015, 0.002, 0.003, 0.005, 0.007],
              'reg_all': [0.2, 0.4, 0.42, 0.45, 0.49, 0.6]}

# Seed both the parameter sampling and the fold assignment so that re-running
# the cell evaluates the same candidates on the same folds.
# NOTE(review): SVD's own initialisation is still unseeded inside the search;
# add 'random_state': [0] to param_grid if bit-for-bit repeatability is needed.
gs = sp.model_selection.search.RandomizedSearchCV(
    sp.prediction_algorithms.SVD,
    param_grid,
    measures=['rmse'],
    cv=sp.model_selection.KFold(n_splits=5, random_state=0, shuffle=True),
    n_jobs=-1,
    random_state=0,
)
gs.fit(data_train)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

KeyboardInterrupt: 

In [100]:
# Evaluate the tuned SVD on a 30% hold-out of the full labelled data.

# Build a new dict instead of writing into gs.best_params['rmse'], so the search
# results are left exactly as the search produced them.
best_params = {**gs.best_params['rmse'], 'random_state': 0}
reader = sp.reader.Reader(rating_scale=(1.0, 10.0))
data = sp.dataset.Dataset.load_from_df(df_train[['usuario', 'libro', 'puntuacion']], reader)
# NOTE(review): this split is drawn by surprise and is not the sklearn split the
# search was fitted on, so the hold-out may include rows seen during tuning —
# confirm whether the reported RMSE should be treated as optimistic.
trainset, testset = sp.model_selection.train_test_split(data, test_size=0.30, random_state=0)

algo = sp.prediction_algorithms.SVD(**best_params)
algo.fit(trainset)
y_predictions = algo.test(testset)
# sp.accuracy.rmse prints "RMSE: ..." itself and also returns the value.
print("Surprise lib: ", sp.accuracy.rmse(y_predictions))

RMSE: 1.5925
Surprise lib:  1.5925182858830331


In [103]:
# Refit the tuned model on every labelled opinion, then score the test file.
algo.fit(data.build_full_trainset())

# Take an independent copy: adding a column to a bare column-selection of
# df_test raises SettingWithCopyWarning.
prediction = df_test[['id', 'usuario', 'libro']].copy()
prediction['puntuacion'] = [
    algo.predict(user_id, book_id).est
    for user_id, book_id in zip(prediction.usuario, prediction.libro)
]

# The submission file holds only the row id and the predicted rating.
submission = prediction[['id', 'puntuacion']]
submission.to_csv('./submission_surprise_svd.csv', index=False)

Resultado a mejorar

Algoritmo | RMSE | Surprise lib
- | - | -
SVD | 1.5925 | 1.5925182858830331

In [104]:
# Upload the SVD submission file written by the previous cell through the Kaggle CLI.
!kaggle competitions submit -c prediccion-de-opiniones-de-libros-fcen-2020 -f submission_surprise_svd.csv -m "Con libreria surprise SVD with tune"

100%|████████████████████████████████████████| 237k/237k [00:02<00:00, 82.3kB/s]
Successfully submitted to Predicción de opiniones de libros - FCEN 2020

### KNN

In [None]:
# Randomized hyper-parameter search for item-based KNNBasic, run on the training split only.
reader = sp.reader.Reader(rating_scale=(1.0, 10.0))

# Re-attach the target to the training features; surprise reads the three
# columns in (user, item, rating) order.
X_train_surprise = pd.concat([X_train, y_train], axis=1)
data_train = sp.dataset.Dataset.load_from_df(X_train_surprise[['usuario', 'libro', 'puntuacion']], reader)

param_grid = {'k': [15, 40, 70], 'min_k': [1, 3, 9],
              'sim_options': {'name': ['msd', 'cosine'],
                              'user_based': [False]}}

# Seed both the parameter sampling and the fold assignment so that re-running
# the cell evaluates the same candidates on the same folds.
gs = sp.model_selection.search.RandomizedSearchCV(
    sp.prediction_algorithms.knns.KNNBasic,
    param_grid,
    measures=['rmse'],
    cv=sp.model_selection.KFold(n_splits=5, random_state=0, shuffle=True),
    n_jobs=-1,
    random_state=0,
)
gs.fit(data_train)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

In [None]:
# Evaluate the tuned KNNBasic on a 30% hold-out of the full labelled data.

# Copy the tuned parameters rather than aliasing gs.best_params['rmse'].
# No 'random_state' is added here: KNNBasic does not take a seed (the extra
# keyword was silently ignored), so passing one only suggested randomness
# that is not there.
best_params = dict(gs.best_params['rmse'])
reader = sp.reader.Reader(rating_scale=(1.0, 10.0))
data = sp.dataset.Dataset.load_from_df(df_train[['usuario', 'libro', 'puntuacion']], reader)
# NOTE(review): this split is drawn by surprise and is not the sklearn split the
# search was fitted on, so the hold-out may include rows seen during tuning —
# confirm whether the reported RMSE should be treated as optimistic.
trainset, testset = sp.model_selection.train_test_split(data, test_size=0.30, random_state=0)

algo = sp.prediction_algorithms.KNNBasic(**best_params)
algo.fit(trainset)
y_predictions = algo.test(testset)
# sp.accuracy.rmse prints "RMSE: ..." itself and also returns the value.
print("Surprise lib: ", sp.accuracy.rmse(y_predictions))

In [None]:
# Refit the tuned model on every labelled opinion, then score the test file.
algo.fit(data.build_full_trainset())

# Take an independent copy: adding a column to a bare column-selection of
# df_test raises SettingWithCopyWarning.
prediction = df_test[['id', 'usuario', 'libro']].copy()
prediction['puntuacion'] = [
    algo.predict(user_id, book_id).est
    for user_id, book_id in zip(prediction.usuario, prediction.libro)
]

# The submission file holds only the row id and the predicted rating.
submission = prediction[['id', 'puntuacion']]
submission.to_csv('./submission_surprise_knn.csv', index=False)

In [None]:
# Upload the KNN submission file written by the previous cell through the Kaggle CLI.
!kaggle competitions submit -c prediccion-de-opiniones-de-libros-fcen-2020 -f submission_surprise_knn.csv -m "Con libreria surprise KNN with tune"