# Recommendation Systems 3

### Surprise Model

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [21]:
import sys
sys.path.append('../../')

from A_Model_Recommendation.ETL import ETL_class

In [22]:
# Dataset locations passed to the project's ETL helper.
path_titles = './../dataset/titles/'
path_rating = './../dataset/ratings/'
etl = ETL_class(path_titles, path_rating)

# Keep only the three columns used below and expose the movie key as `id`.
df_ratings = (
    etl.get_ratings()
    .loc[:, ['userId', 'rating', 'movieId']]
    .rename(columns={'movieId': 'id'})
)
print(df_ratings.shape)
df_ratings.head()

(11024165, 3)


Unnamed: 0,userId,rating,id
0,1,1.0,as680
1,1,4.5,ns2186
2,1,5.0,hs2381
3,1,5.0,ns3663
4,1,5.0,as9500


In [24]:
# Count how many ratings each movie received (one row per movie id).
min_ratings = 500
ratings_per_movie = df_ratings.groupby('id')['userId'].count()
df_count = ratings_per_movie.reset_index(name='count')

# Keep only the movies that have at least `min_ratings` ratings.
df_count = df_count[df_count['count'] >= min_ratings]
print(df_count.shape)
df_count.head()

(4221, 2)


Unnamed: 0,id,count
0,as1,502
1,as10,513
16,as1012,508
25,as1020,529
28,as1023,502


In [25]:
# this is the final data we shall deal with
df = pd.merge(df_ratings, df_count, on='id', how='left')
df.dropna(inplace=True)
print(df.shape)
df.head()

(2158026, 4)


Unnamed: 0,userId,rating,id,count
1,1,4.5,ns2186,501.0
5,1,4.0,as3004,506.0
13,1,3.5,ns3830,527.0
16,1,5.0,hs2056,507.0
17,1,5.0,ns5413,521.0


In [26]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Surprise clips every prediction to the Reader's rating scale, so take the
# bounds from the ratings actually present instead of hard-coding (1, 5).
# NOTE(review): if the source data uses half-star ratings starting at 0.5,
# a fixed lower bound of 1 would clip predictions too high — confirm the range.
rating_min = float(df['rating'].min())
rating_max = float(df['rating'].max())
reader = Reader(rating_scale=(rating_min, rating_max))

# load_from_df expects the columns in the order: user id, item id, rating.
data = Dataset.load_from_df(df[['userId', 'id', 'rating']], reader)

In [27]:
# Train an SVD algorithm on the dataset.
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0040  1.0055  1.0085  1.0056  1.0071  1.0062  0.0015  
MAE (testset)     0.7838  0.7840  0.7867  0.7840  0.7858  0.7849  0.0012  
Fit time          22.86   19.99   19.46   23.29   22.97   21.72   1.64    
Test time         4.56    4.10    4.17    5.22    4.21    4.45    0.41    


{'test_rmse': array([1.00403425, 1.00551995, 1.00852877, 1.00560784, 1.00712571]),
 'test_mae': array([0.7837924 , 0.78403552, 0.78667504, 0.78400773, 0.78580946]),
 'fit_time': (22.857370853424072,
  19.99215292930603,
  19.460212230682373,
  23.294392347335815,
  22.973990201950073),
 'test_time': (4.558228969573975,
  4.104132652282715,
  4.169611692428589,
  5.218188047409058,
  4.212496995925903)}

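Here we can see that the RMSE and MAE are uniform across the folds (their standard deviations are only about 0.001), so the scores do not depend on which fold is held out. We can therefore conclude that the model's performance is stable, with a moderate error: an RMSE of roughly 1.0 and an MAE of roughly 0.78 rating points.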

In [43]:
# To make a prediction, pick a user and a movie at random. They are drawn from
# two independent rows, so the chosen user has not necessarily rated the movie.
u = np.random.choice(df.shape[0])
l = np.random.choice(df.shape[0])
# Select by column name rather than by position so that a change in the
# column order of `df` cannot silently pick the wrong field.
user = df['userId'].iloc[u]
movie = df['id'].iloc[l]

# Minimum estimated rating for a movie to be recommended.
threshold = 3.5

# Predict once and reuse the result; `est` is the estimated rating
# (the field the original read through the positional index 3).
pred = algo.predict(user, movie)
prediction = pred.est
print('For the movie {} the user {} would grade it at {}'.format(movie, user, prediction))
if prediction > threshold:
    print('Then the movie is recommended')
else:
    print('Then the movie is not recommended')
# Show the full Prediction object as the cell output.
pred

For the movie ds14 the user 27833 would grade it at 3.4929522908688604
Then the movie is not recommended


Prediction(uid=27833, iid='ds14', r_ui=None, est=3.4929522908688604, details={'was_impossible': False})