# Recomendaciones con librería de Python Surprise (Baseline, Item-Item, MF)

## Importar librerías
### Importar librerías de Surprise

In [10]:
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import accuracy
from surprise.accuracy import rmse as surp_rmse
from surprise.accuracy import mae as surp_mae
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV

### Importar librerías de Python

In [12]:
import numpy as np
import pandas as pd
import pyarrow
import random
from collections import defaultdict


## Lectura de datos

In [14]:
# Load the input table from its parquet file using the pyarrow engine.
df = pd.read_parquet("/data/data_modelo_deep_learning.parquet", engine="pyarrow")

In [17]:
# Keep only the three columns handed to Surprise below, in this order.
df = df.loc[:, ['CLIENTE_ID', 'PRODUCTO_ID', 'PURCHASE_PRED']]

Unnamed: 0,CLIENTE_ID,PRODUCTO_ID,PURCHASE_PRED
0,120263164,99939,0.161865
1,120263164,99055,0.160651
2,120263164,98746,0.195037
3,120263164,97153,0.251862
4,120263164,2909,0.271077


## Uso de Surprise
### Transformación set de datos

In [18]:
# Reader configured with a rating scale going from 0 to 1.
reader = Reader(rating_scale=(0, 1))

# Wrap the pandas frame in a Surprise dataset.
data = Dataset.load_from_df(df=df, reader=reader)

In [19]:
# Sanity check: show the raw ratings stored in the Surprise dataset as a frame.
pd.DataFrame(
    data.raw_ratings,
    columns=['user_id', 'item_id', 'rating', 'timestamp'],
)

Unnamed: 0,user_id,item_id,rating,timestamp
0,0120263164,000000000000099939,0.161865,
1,0120263164,000000000000099055,0.160651,
2,0120263164,000000000000098746,0.195037,
3,0120263164,000000000000097153,0.251862,
4,0120263164,000000000000002909,0.271077,
...,...,...,...,...
548889,0110234933,000000000000001119,0.693583,
548890,0110234655,000000000000084725,0.110394,
548891,0110234933,000000000000001740,0.649988,
548892,0110234933,000000000000001885,0.885219,


### Predicción y evaluación

In [29]:
# Computando predicciones
def compute_predictions(predictions,k=8):
  """Compute top-k ranking metrics from a collection of predictions.

  Each prediction is unpacked into the columns ``uid``, ``iid``, ``rui``,
  ``est`` and ``details``. Inside every ``uid`` the rows are ranked twice in
  descending order, by ``rui`` and by ``est``, with ties broken by order of
  appearance. Only uids owning a row whose ``rui`` rank equals ``2 * k`` are
  evaluated, and for those only the ``k`` best rows by ``rui`` are kept.

  Args:
    predictions: iterable of 5-field records (uid, iid, rui, est, details).
    k: cutoff used for the metrics.

  Returns:
    A 4-tuple ``(precision, accuracy, ndcg, mrr_ratio)``:
      * precision: share of kept rows whose ``est`` rank is <= k.
      * accuracy: share of kept rows whose two ranks coincide.
      * ndcg: mean over uids of discounted gain / ideal discounted gain.
      * mrr_ratio: mean per-uid sum of 1/rank over exact rank matches,
        divided by the mean per-uid sum of 1/rank over all kept rows.
  """
  scored = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
  scored['rank_by_client'] = scored.groupby('uid')['rui'].rank(method='first', ascending=False)
  scored['rank_by_model'] = scored.groupby('uid')['est'].rank(method='first', ascending=False)
  scored = scored.sort_values(['uid', 'rank_by_client'], ascending=[True, True])

  # Keep uids that reach rank 2*k, then restrict them to their top-k rows.
  eligible_uids = scored.loc[scored['rank_by_client'] == k * 2, 'uid'].unique()
  keep_rows = scored['uid'].isin(eligible_uids) & (scored['rank_by_client'] <= k)
  top = scored[keep_rows].copy()

  client_rank = top['rank_by_client']
  model_rank = top['rank_by_model']
  in_model_top_k = model_rank <= k
  same_position = model_rank == client_rank
  # Linear gain: 1 for the best-ranked row, decreasing down to 1/k for rank k.
  gain = (k + 1 - client_rank) / k
  reciprocal_rank = 1 / client_rank

  top['Precions_k'] = in_model_top_k.astype(int)
  top['Accuracy'] = same_position.astype(int)
  top['Discounted_Gain'] = (gain / np.log2(model_rank + 1)).where(in_model_top_k, 0)
  top['Ideal_Discounted_Gain'] = gain / np.log2(client_rank + 1)
  top['MRR'] = reciprocal_rank.where(same_position, 0)
  top['IMRR'] = reciprocal_rank

  # Per-uid totals feeding NDCG@k and the MRR ratio.
  per_uid = top.groupby('uid')[['MRR', 'IMRR', 'Ideal_Discounted_Gain', 'Discounted_Gain']].sum()
  per_uid['ndcgk'] = per_uid['Discounted_Gain'] / per_uid['Ideal_Discounted_Gain']

  ndcgk_mean = per_uid['ndcgk'].mean()
  mrr_ratio = per_uid['MRR'].mean() / per_uid['IMRR'].mean()

  return top['Precions_k'].mean(), top['Accuracy'].mean(), ndcgk_mean, mrr_ratio

In [30]:
# Evaluation setup: number of cross-validation folds and ranking cutoff.
folds = 4
k = 6

# Seed the global RNGs before building and fitting the algorithms, as the
# Surprise FAQ recommends for reproducible experiments (none of the
# algorithms below is given an explicit random_state).
random.seed(42)
np.random.seed(42)

algorithms = [
    SVD(n_epochs=10, lr_all=0.002),
    SVDpp(),
    NMF(n_factors=15, n_epochs=80, biased=False),
    NormalPredictor(),
    KNNBasic(k=6, min_k=1, sim_options={'name': 'cosine', 'user_based': False}),
    BaselineOnly(bsl_options={"method": "als", "n_epochs": 12}),
]
# One list of per-fold metric rows per algorithm, keyed by the class name.
results = {type(algo).__name__: [] for algo in algorithms}

kf = KFold(n_splits=folds, random_state=42)

for trainset, testset in kf.split(data):
    for algo in algorithms:
        algo_name = type(algo).__name__
        print ("\n Attempting: ", algo_name, '\n')
        algo.fit(trainset)
        predictions = algo.test(testset)
        # Bound to `accuracy_k` so the imported `surprise.accuracy` module
        # is not overwritten by a float.
        precision_k, accuracy_k, ndcg, mrr = compute_predictions(predictions, k)
        # Store this fold's metrics in the order:
        # precision@k, accuracy@k, NDCG@k, MRR@k, RMSE, MAE.
        results[algo_name].append([
            precision_k,
            accuracy_k,
            ndcg,
            mrr,
            surp_rmse(predictions),
            surp_mae(predictions),
        ])
        


 Attempting:  SVD 

RMSE: 0.1699
MAE:  0.1328

 Attempting:  SVDpp 

RMSE: 0.1654
MAE:  0.1284

 Attempting:  NMF 

RMSE: 0.1570
MAE:  0.1193

 Attempting:  NormalPredictor 

RMSE: 0.3308
MAE:  0.2671

 Attempting:  KNNBasic 

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.2748
MAE:  0.2288

 Attempting:  BaselineOnly 

Estimating biases using als...
RMSE: 0.1485
MAE:  0.1172

 Attempting:  SVD 

RMSE: 0.1707
MAE:  0.1334

 Attempting:  SVDpp 

RMSE: 0.1671
MAE:  0.1299

 Attempting:  NMF 

RMSE: 0.1571
MAE:  0.1193

 Attempting:  NormalPredictor 

RMSE: 0.3308
MAE:  0.2670

 Attempting:  KNNBasic 

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.2735
MAE:  0.2275

 Attempting:  BaselineOnly 

Estimating biases using als...
RMSE: 0.1486
MAE:  0.1173

 Attempting:  SVD 

RMSE: 0.1711
MAE:  0.1337

 Attempting:  SVDpp 

RMSE: 0.1672
MAE:  0.1302

 Attempting:  NMF 

RMSE: 0.1571
MAE:  0.1196

 Attempting:  NormalPr

In [31]:
# Names of the metric columns, in the order the per-fold results are stored.
column_names = [
    'Precision@k',
    'Accuracy@k',
    'NDCG@k',
    'MRR@k',
    'RMSE',
    'MAE',
]

In [32]:
# Flatten the per-algorithm results into three parallel lists
# (model label, fold label, metric values) ready to become a DataFrame.
results_dictionary =  {"Model":[],"Fold":[],"Results":[]}
for algo in algorithms:
  algo_name = type(algo).__name__
  algo_results = results[algo_name]
  # Display label only; the lookup above still uses the class name.
  if algo_name=="SVDpp": algo_name="SVD++"
  # Walk over the folds actually recorded instead of assuming exactly four,
  # so this stays correct when the number of folds changes.
  for fold_number, fold_metrics in enumerate(algo_results, start=1):
    results_dictionary["Fold"].append("Fold_" + str(fold_number))
    results_dictionary["Model"].append(algo_name)
    # Copy the row so later edits to `results` do not leak into this dict.
    results_dictionary["Results"].append(list(fold_metrics))
print(results_dictionary)

{'Model': ['SVD', 'SVD', 'SVD', 'SVD', 'SVD++', 'SVD++', 'SVD++', 'SVD++', 'NMF', 'NMF', 'NMF', 'NMF', 'NormalPredictor', 'NormalPredictor', 'NormalPredictor', 'NormalPredictor', 'KNNBasic', 'KNNBasic', 'KNNBasic', 'KNNBasic', 'BaselineOnly', 'BaselineOnly', 'BaselineOnly', 'BaselineOnly'], 'Fold': ['Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4'], 'Results': [[0.7353141205474794, 0.2531242030094364, 0.8138817418777987, 0.3702512374498384, 0.16985011913388715, 0.1328265749744149], [0.7291081012011245, 0.2498935173353778, 0.8119280140604053, 0.3657370947356709, 0.17069864768920429, 0.13342691192848555], [0.7373425854536109, 0.25674633770239014, 0.8140768476056812, 0.37235238479868954, 0.1710823431775814, 0.13365285696581375], [0.7321991984309713, 0.24938176856826128, 0.8102693260907058, 0.3646564791

In [33]:
# Turn the results dictionary into a DataFrame with one row per (model, fold)
# and one column per metric.
df_results = pd.DataFrame.from_dict(results_dictionary, orient='index').T
df_results[column_names] = pd.DataFrame(df_results['Results'].tolist(), index=df_results.index)
df_results = df_results.drop(columns=['Results'])

# Long-format copy: one row per (model, fold, metric).
df_results_transpose = (
    df_results
    .set_index(["Model", "Fold"])
    .stack()
    .reset_index()
    .rename(columns={"level_2": "Metric", 0: "Values"})
)
df_results

Unnamed: 0,Model,Fold,Precision@k,Accuracy@k,NDCG@k,MRR@k,RMSE,MAE
0,SVD,Fold_1,0.735314,0.253124,0.813882,0.370251,0.16985,0.132827
1,SVD,Fold_2,0.729108,0.249894,0.811928,0.365737,0.170699,0.133427
2,SVD,Fold_3,0.737343,0.256746,0.814077,0.372352,0.171082,0.133653
3,SVD,Fold_4,0.732199,0.249382,0.810269,0.364656,0.170918,0.133536
4,SVD++,Fold_1,0.746196,0.263538,0.823133,0.377998,0.165374,0.128418
5,SVD++,Fold_2,0.741417,0.255942,0.820307,0.370288,0.167126,0.129875
6,SVD++,Fold_3,0.74818,0.265099,0.82538,0.381265,0.16722,0.130181
7,SVD++,Fold_4,0.741451,0.257227,0.819935,0.374317,0.167611,0.130188
8,NMF,Fold_1,0.760223,0.282241,0.841153,0.401361,0.157016,0.11928
9,NMF,Fold_2,0.754792,0.271872,0.838902,0.390097,0.157052,0.11931


In [35]:
# Final summary: mean and standard deviation of every metric across folds,
# with one column per model.
(
    df_results[['Model', 'Precision@k', 'Accuracy@k', 'NDCG@k', 'MRR@k', 'RMSE', 'MAE']]
    .groupby('Model')
    .describe()
    .loc[:, (slice(None), ['mean', 'std'])]
    .T[['NormalPredictor', 'KNNBasic', 'BaselineOnly', 'SVD', 'SVD++', 'NMF']]
)

Unnamed: 0,Model,NormalPredictor,KNNBasic,BaselineOnly,SVD,SVD++,NMF
Precision@k,mean,0.39755,0.563977,0.757987,0.733491,0.744311,0.758083
Precision@k,std,0.005082,0.006035,0.002751,0.003607,0.003419,0.002867
Accuracy@k,mean,0.064599,0.104463,0.278405,0.252286,0.260451,0.278522
Accuracy@k,std,0.001653,0.002132,0.004953,0.003404,0.004541,0.005712
NDCG@k,mean,0.338188,0.549021,0.839606,0.812539,0.822189,0.839895
NDCG@k,std,0.004823,0.007521,0.00141,0.001797,0.002562,0.001503
MRR@k,mean,0.065118,0.121375,0.397473,0.368249,0.375967,0.397183
MRR@k,std,0.001727,0.004658,0.004682,0.003654,0.004732,0.005415
RMSE,mean,0.330636,0.273492,0.14843,0.170637,0.166833,0.156963
RMSE,std,0.000178,0.001018,0.000183,0.000548,0.000995,0.000187
