<a href="https://colab.research.google.com/github/lalesafarzade/Recommendation_system_Project/blob/lale/Notebooks/3.Recommender_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
movie_df=pd.read_csv('/content/drive/MyDrive/ml-latest-small (2)/ml-latest-small/movies.csv')
rating_df=pd.read_csv('/content/drive/MyDrive/ml-latest-small (2)/ml-latest-small/ratings.csv')

In [13]:
df = pd.merge(rating_df,movie_df,on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [55]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.2)

## Memory-Based Collaborative Filtering"

In [57]:
user_movie_train = train_data.pivot('userId','movieId','rating').fillna(0).to_numpy()
user_movie_test = test_data.pivot('userId','movieId','rating').fillna(0).to_numpy()

In [58]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

user_similarity = cosine_similarity(user_movie_train)
item_similarity = cosine_similarity(user_movie_train.T)

In [59]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [60]:
item_prediction = predict(user_movie_train, item_similarity, type='item')
user_prediction = predict(user_movie_train, user_similarity, type='user')

In [61]:
### Evaluation

In [62]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [63]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, user_movie_test)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, user_movie_test)))

User-based CF RMSE: 3.4565950667796423
Item-based CF RMSE: 3.4034303919655624


## Model-based Collaborative Filtering

In [66]:
n_users = df.userId.nunique()
n_items = df.movieId.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Movies: '+str(n_items))
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print('The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%')

Num. of Users: 610
Num of Movies: 9724
The sparsity level of MovieLens100K is 98.3%


### SVD

In [67]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(user_movie_train, k = 4)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, user_movie_test)))

User-based CF MSE: 3.452770037018036


In [68]:
## tuning

In [72]:
#! pip install surprise

In [74]:
from surprise import SVD,Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

In [78]:
reader = Reader()
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8749  0.8751  0.8720  0.8722  0.8751  0.8738  0.0014  
MAE (testset)     0.6727  0.6719  0.6704  0.6706  0.6717  0.6715  0.0009  
Fit time          6.37    5.29    5.29    5.31    5.26    5.51    0.44    
Test time         0.19    0.23    0.15    0.16    0.23    0.19    0.04    


{'fit_time': (6.3749213218688965,
  5.294899940490723,
  5.291177272796631,
  5.3099095821380615,
  5.257055282592773),
 'test_mae': array([0.67273166, 0.67194358, 0.67040916, 0.67057792, 0.67169805]),
 'test_rmse': array([0.8748705 , 0.87505521, 0.87196345, 0.87219543, 0.87508716]),
 'test_time': (0.19024300575256348,
  0.23436641693115234,
  0.1490797996520996,
  0.1562786102294922,
  0.2329702377319336)}

In [79]:
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe578329c90>

In [82]:
df[df['userId'] == 8]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
372,8,47,4.0,839463546,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
576,8,50,5.0,839463644,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
857,8,110,3.0,839463527,Braveheart (1995),Action|Drama|War
1367,8,231,4.0,839463470,Dumb & Dumber (Dumb and Dumber) (1994),Adventure|Comedy
1500,8,235,3.0,839464076,Ed Wood (1994),Comedy|Drama
1823,8,296,4.0,839463422,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2429,8,356,3.0,839463527,Forrest Gump (1994),Comedy|Drama|Romance|War
2792,8,367,3.0,839463564,"Mask, The (1994)",Action|Comedy|Crime|Fantasy
3002,8,457,3.0,839463489,"Fugitive, The (1993)",Thriller
3191,8,480,4.0,839463527,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller


In [83]:
algo.predict(8,47,4)

Prediction(uid=8, iid=47, r_ui=4, est=4.090455217725302, details={'was_impossible': False})

In [85]:
algo.predict(8,441,4)

Prediction(uid=8, iid=441, r_ui=4, est=3.756563112397142, details={'was_impossible': False})

In [87]:
## SAVING TRAINED MODEL
from surprise import dump
import os
path = F"/content/drive/MyDrive/ml_model"
print (">> Starting dump")
# Dump algorithm and reload it.
file_name = os.path.expanduser(path)
dump.dump(file_name, algo=algo)
print (">> Dump done")
print(path)

>> Starting dump
>> Dump done
/content/drive/MyDrive/ml_model


In [88]:
def load_model(model_filename):
    print (">> Loading dump")
    from surprise import dump
    import os
    file_name = os.path.expanduser(model_filename)
    _, loaded_model = dump.load(file_name)
    print (">> Loaded dump")
    return loaded_model

In [89]:
# predicitng
from pprint import pprint as pp
model_filename = path
def itemRating(user, item):
    uid = str(user)
    iid = str(item) 
    loaded_model = load_model(model_filename)
    prediction = loaded_model.predict(user, item, verbose=True)
    rating = prediction.est
    details = prediction.details
    uid = prediction.uid
    iid = prediction.iid
    true = prediction.r_ui
    ret = {
        'user': user, 
        'item': item, 
        'rating': rating, 
        'details': details,
        'uid': uid,
        'iid': iid,
        'true': true
        }
    pp (ret)
    print ('\n\n')
    return ret
print(itemRating(user = 8, item = 441))

>> Loading dump
>> Loaded dump
user: 8          item: 441        r_ui = None   est = 3.76   {'was_impossible': False}
{'details': {'was_impossible': False},
 'iid': 441,
 'item': 441,
 'rating': 3.756563112397142,
 'true': None,
 'uid': 8,
 'user': 8}



{'user': 8, 'item': 441, 'rating': 3.756563112397142, 'details': {'was_impossible': False}, 'uid': 8, 'iid': 441, 'true': None}
