In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import surprise as sp
import lightgbm
import xgboost
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

In [2]:
# Additional metadata tables referenced under ./data. They are left unloaded;
# uncomment the ones that are needed.
#df_actors = pd.read_csv('./data/movie_actors.csv')
#df_countries = pd.read_csv('./data/movie_countries.csv')
#df_directors = pd.read_csv('./data/movie_directors.csv')
#df_genres = pd.read_csv('./data/movie_genres.csv')
#df_imdb = pd.read_csv('./data/movie_imdb.csv')
#df_locations = pd.read_csv('./data/movie_locations.csv')
#df_tags = pd.read_csv('./data/movie_tags.csv')
#df_rottem = pd.read_csv('./data/movies_rt.csv')
#df_movies = pd.read_csv('./data/movies.csv')

# Labelled ratings (used for fitting) and the test rows to predict for the
# submission files, read in that order.
df_ratings, df_test = map(
    pd.read_csv,
    ('./data/ratings_train.csv', './data/ratings_test.csv'),
)

In [3]:
# Split the labelled ratings into a training part and a 30% hold-out part.
# The target is the `rating` column; `ID` is dropped from the features.
(X_train, X_test, y_train, y_test) = train_test_split(
    df_ratings.drop(columns=['rating', 'ID']),
    df_ratings['rating'],
    test_size=0.3,
    random_state=0,
)

### Regresión lineal

In [28]:
# Baseline: fit a linear regression on the training split and report the
# mean squared error on the hold-out split.
model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator
y_predicted = model.predict(X_test)
print("Linear regression: ", mean_squared_error(y_true=y_test, y_pred=y_predicted))

Linear regression:  0.755803270622583


In [None]:
# Refit the current `model` on every labelled rating, then predict the test
# rows and write the submission file.
train_features = df_ratings.drop(columns=['rating', 'ID'])
model.fit(train_features, df_ratings.rating)

# Select exactly the columns (and column order) the model was fitted on.
# This works whether or not the test file carries a `rating` column and
# guards against a silently misaligned feature matrix.
prediction = model.predict(df_test[train_features.columns])

# One row per test ID, with the prediction rounded to two decimals.
submission = pd.DataFrame(
        {'ID': df_test.ID, 'rating': np.around(prediction, 2)})
submission.to_csv('./submission_linear.csv', index=False)

In [None]:
# Shell command: submit submission_linear.csv to the Kaggle competition with
# the given message (presumably requires a configured Kaggle CLI; verify).
!kaggle competitions submit -c recomendacion-de-peliculas-fcen-2020 -f submission_linear.csv -m "Con similaridad"

### Light GBM

In [None]:
# Gradient-boosted trees (LightGBM, default hyper-parameters): fit on the
# training split and report the mean squared error on the hold-out split.
lgbm = lightgbm.LGBMRegressor()
lgbm.fit(X_train, y_train)
model = lgbm  # same object that fit() returns
y_predicted = model.predict(X_test)
print("Light GBM: ", mean_squared_error(y_true=y_test, y_pred=y_predicted))

In [None]:
# Refit the current `model` on every labelled rating, then predict the test
# rows and write the submission file.
train_features = df_ratings.drop(columns=['rating', 'ID'])
model.fit(train_features, df_ratings.rating)

# Select exactly the columns (and column order) the model was fitted on.
# This works whether or not the test file carries a `rating` column and
# guards against a silently misaligned feature matrix.
prediction = model.predict(df_test[train_features.columns])

# One row per test ID, with the prediction rounded to two decimals.
submission = pd.DataFrame(
        {'ID': df_test.ID, 'rating': np.around(prediction, 2)})
submission.to_csv('./submission_lgbm.csv', index=False)

In [None]:
# Shell command: submit submission_lgbm.csv to the Kaggle competition with
# the given message (presumably requires a configured Kaggle CLI; verify).
!kaggle competitions submit -c recomendacion-de-peliculas-fcen-2020 -f submission_lgbm.csv -m "Con similaridad lgbm"

### XGBoost

In [None]:
# Gradient-boosted trees (XGBoost, default hyper-parameters): fit on the
# training split and report the mean squared error on the hold-out split.
xgb = xgboost.XGBRegressor()
xgb.fit(X_train, y_train)
model = xgb  # same object that fit() returns
y_predicted = model.predict(X_test)
print("XGBoost: ", mean_squared_error(y_true=y_test, y_pred=y_predicted))

In [None]:
# Refit the current `model` on every labelled rating, then predict the test
# rows and write the submission file.
train_features = df_ratings.drop(columns=['rating', 'ID'])
model.fit(train_features, df_ratings.rating)

# Select exactly the columns (and column order) the model was fitted on.
# This works whether or not the test file carries a `rating` column and
# guards against a silently misaligned feature matrix.
prediction = model.predict(df_test[train_features.columns])

# One row per test ID, with the prediction rounded to two decimals.
submission = pd.DataFrame(
        {'ID': df_test.ID, 'rating': np.around(prediction, 2)})
submission.to_csv('./submission_xgb.csv', index=False)

In [None]:
# Shell command: submit submission_xgb.csv to the Kaggle competition with
# the given message (presumably requires a configured Kaggle CLI; verify).
!kaggle competitions submit -c recomendacion-de-peliculas-fcen-2020 -f submission_xgb.csv -m "Con similaridad xgb"

### Light FM

In [6]:
# Load the MovieLens example dataset through LightFM's dataset helper.
# NOTE(review): min_rating=5.0 presumably keeps only interactions rated 5 —
# confirm against the LightFM documentation.
from lightfm.datasets import fetch_movielens
data = fetch_movielens(min_rating=5.0)

In [13]:
# List the entries available in the fetched dataset.
data.keys()

dict_keys(['train', 'test', 'item_features', 'item_feature_labels', 'item_labels'])

In [16]:
# Inspect the item-feature matrix that comes with the dataset.
data['item_features']

<1682x1682 sparse matrix of type '<class 'numpy.float32'>'
	with 1682 stored elements in Compressed Sparse Row format>

In [None]:
# Instantiate a LightFM model with the WARP loss and train it for 30 epochs
# on the training interactions.
model = LightFM(loss='warp')
model.fit(data['train'], epochs=30, num_threads=2)

# Evaluate the trained model: precision@5 on the test interactions,
# averaged over the per-user values returned by precision_at_k.
test_precision = precision_at_k(model, data['test'], k=5).mean()

# Show the metric as the cell output so the evaluation result is visible.
test_precision

### Surprise lib

#### SVD

In [25]:
# Build a Surprise dataset from the training split only, so the hold-out
# split plays no part in hyper-parameter selection.
reader = sp.reader.Reader(rating_scale=(0.5, 5))
X_train_surprise = pd.concat([X_train, pd.DataFrame(y_train)], axis=1)
data_train = sp.dataset.Dataset.load_from_df(X_train_surprise[['userID', 'movieID', 'rating']], reader)

# Candidate hyper-parameter values for SVD; the randomized search samples
# combinations from these lists and scores each with 5-fold cross-validation.
param_grid = {'n_epochs': [5, 10, 50, 70], 'lr_all': [0.001, 0.002, 0.005, 0.007],
              'reg_all': [0.1, 0.2, 0.4, 0.6]}
# random_state pins which combinations get sampled so a re-run explores the
# same candidates (fold assignment and SVD initialisation may still vary).
gs = sp.model_selection.search.RandomizedSearchCV(
    sp.prediction_algorithms.SVD, param_grid, measures=['rmse', 'mse'], cv=5,
    n_jobs=-1, random_state=0)
gs.fit(data_train)
# best cross-validated MSE score
print(gs.best_score['mse'])
# combination of parameters that gave the best MSE score
print(gs.best_params['mse'])

0.6147880689256694
{'n_epochs': 50, 'lr_all': 0.007, 'reg_all': 0.1}


In [28]:
# Hold-out evaluation of SVD with the hyper-parameters selected by the
# randomized search (n_epochs=50, lr_all=0.007, reg_all=0.1).
reader = sp.reader.Reader(rating_scale=(0.5, 5))
data = sp.dataset.Dataset.load_from_df(df_ratings[['userID', 'movieID', 'rating']], reader)
trainset, testset = sp.model_selection.train_test_split(data, test_size=0.30, random_state=0)

algo = sp.prediction_algorithms.SVD(n_epochs=50, lr_all=0.007, reg_all=0.1, random_state=0)
algo.fit(trainset)
y_predictions = algo.test(testset)
# verbose=False: accuracy.mse would otherwise print the score itself, so the
# same number would appear twice in the cell output.
print("Surprise lib: ", sp.accuracy.mse(y_predictions, verbose=False))

MSE: 0.6089
Surprise lib:  0.6089434856685295


In [29]:
# Refit the current `algo` on every labelled rating before predicting the
# test rows for the submission file.
algo.fit(data.build_full_trainset())

# .copy() yields an independent frame, so adding the `rating` column below
# does not write into a slice of df_test (no SettingWithCopyWarning).
prediction = df_test[['ID', 'userID', 'movieID']].copy()
# algo.predict returns a Prediction object; `.est` is the estimated rating.
prediction['rating'] = [
    algo.predict(user_id, movie_id).est
    for user_id, movie_id in zip(prediction.userID, prediction.movieID)
]
submission = prediction[['ID', 'rating']]
submission.to_csv('./submission_surprise_svd.csv', index=False)

In [30]:
# Shell command: submit submission_surprise_svd.csv to the Kaggle competition
# with the given message (presumably requires a configured Kaggle CLI; verify).
!kaggle competitions submit -c recomendacion-de-peliculas-fcen-2020 -f submission_surprise_svd.csv -m "Con libreria surprise SVD with tune"

100%|███████████████████████████████████████| 2.06M/2.06M [00:10<00:00, 211kB/s]
Successfully submitted to Recomendación de Películas - FCEN 2020

In [20]:
# Peek at the complete Prediction objects (not just `.est`) for the first
# rows of the test frame.
[
    algo.predict(user_id, movie_id)
    for user_id, movie_id in zip(prediction.head().userID, prediction.head().movieID)
]

[Prediction(uid=29202, iid=1608, r_ui=None, est=2.456659316208836, details={'was_impossible': False}),
 Prediction(uid=32283, iid=1485, r_ui=None, est=3.3188930224552307, details={'was_impossible': False}),
 Prediction(uid=32283, iid=30, r_ui=None, est=3.940630196148756, details={'was_impossible': False}),
 Prediction(uid=32283, iid=1545, r_ui=None, est=4.031069120138712, details={'was_impossible': False}),
 Prediction(uid=51198, iid=1258, r_ui=None, est=4.096084760390009, details={'was_impossible': False})]

#### KNN

In [61]:
# Hold-out evaluation of a basic KNN collaborative filter from Surprise.
reader = sp.reader.Reader(rating_scale=(0.5, 5))
data = sp.dataset.Dataset.load_from_df(df_ratings[['userID', 'movieID', 'rating']], reader)
trainset, testset = sp.model_selection.train_test_split(data, test_size=0.30, random_state=0)

# Cosine similarity computed between users; set 'user_based' to False to
# compute it between items instead.
sim_options = {
    'name': 'cosine',
    'user_based': True,
}
algo = sp.prediction_algorithms.knns.KNNBasic(k=40, min_k=1, sim_options=sim_options)
algo.fit(trainset)
y_predictions = algo.test(testset)
# verbose=False: accuracy.mse would otherwise print the score itself, so the
# same number would appear twice in the cell output.
print("Surprise lib: ", sp.accuracy.mse(y_predictions, verbose=False))

Computing the cosine similarity matrix...
Done computing similarity matrix.
MSE: 0.7759
Surprise lib:  0.7759103178921984


In [62]:
# Refit the current `algo` on every labelled rating before predicting the
# test rows for the submission file.
algo.fit(data.build_full_trainset())

# .copy() yields an independent frame, so adding the `rating` column below
# does not write into a slice of df_test (no SettingWithCopyWarning).
prediction = df_test[['ID', 'userID', 'movieID']].copy()
# algo.predict returns a Prediction object; `.est` is the estimated rating.
prediction['rating'] = [
    algo.predict(user_id, movie_id).est
    for user_id, movie_id in zip(prediction.userID, prediction.movieID)
]
submission = prediction[['ID', 'rating']]
submission.to_csv('./submission_surprise_knn.csv', index=False)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [63]:
# Shell command: submit submission_surprise_knn.csv to the Kaggle competition
# with the given message (presumably requires a configured Kaggle CLI; verify).
!kaggle competitions submit -c recomendacion-de-peliculas-fcen-2020 -f submission_surprise_knn.csv -m "Con libreria surprise KNN"

100%|███████████████████████████████████████| 2.06M/2.06M [00:05<00:00, 408kB/s]
Successfully submitted to Recomendación de Películas - FCEN 2020