In [33]:
import pandas as pd
import numpy as np
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

In [34]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [35]:
ratings = ratings.drop(columns = 'timestamp')
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [36]:
tags = tags.drop(columns = 'timestamp')
tags

Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,Highly quotable
2,2,60756,will ferrell
3,2,89774,Boxing story
4,2,89774,MMA
...,...,...,...
3678,606,7382,for katie
3679,606,7936,austere
3680,610,3265,gun fu
3681,610,3265,heroic bloodshed


In [37]:
tagsRatings = pd.merge(tags, ratings, on = ['userId', 'movieId'])
tagsRatings

Unnamed: 0,userId,movieId,tag,rating
0,2,60756,funny,5.0
1,2,60756,Highly quotable,5.0
2,2,60756,will ferrell,5.0
3,2,89774,Boxing story,5.0
4,2,89774,MMA,5.0
...,...,...,...,...
3471,606,6107,World War II,4.0
3472,606,7382,for katie,4.5
3473,610,3265,gun fu,5.0
3474,610,3265,heroic bloodshed,5.0


In [38]:
df = pd.merge(movies, tagsRatings, on = ('movieId'))

In [39]:
df

Unnamed: 0,movieId,title,genres,userId,tag,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,3.5
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,4.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,4.0
...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,star wars,4.0
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,anime,3.5
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,comedy,3.5
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,gintama,3.5


# Data Cleaning

In [40]:
df.isna().sum()

movieId    0
title      0
genres     0
userId     0
tag        0
rating     0
dtype: int64

In [41]:
df['rating'].value_counts()

4.0    999
5.0    883
3.5    577
4.5    496
3.0    274
2.0    102
2.5     80
1.0     31
1.5     26
0.5      8
Name: rating, dtype: int64

In [42]:
ratingList = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

def makeNotFloat(oldList, columnName):
    binaryList = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    newVals = dict(zip(oldList, binaryList))
    return df[columnName].replace(newVals, inplace = True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3476 entries, 0 to 3475
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  3476 non-null   int64  
 1   title    3476 non-null   object 
 2   genres   3476 non-null   object 
 3   userId   3476 non-null   int64  
 4   tag      3476 non-null   object 
 5   rating   3476 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 190.1+ KB


In [43]:
makeNotFloat(ratingList, 'rating')

In [44]:
df['rating'].value_counts()

40.0    999
50.0    883
35.0    577
45.0    496
30.0    274
20.0    102
25.0     80
10.0     31
15.0     26
5.0       8
Name: rating, dtype: int64

In [45]:
df['rating'] = df['rating'].astype(int)
df['rating'].value_counts()

40    999
50    883
35    577
45    496
30    274
20    102
25     80
10     31
15     26
5       8
Name: rating, dtype: int64

# Model data prep

In [71]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split

reader = Reader()
data = Dataset.load_from_df(ratings,reader)
train, test = train_test_split(data, test_size=0.2)

In [72]:
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy

sim_cos = {'name':'cosine', 'user_based':False}
basic = knns.KNNBasic(sim_options=sim_cos)
basic.fit(train)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x202fdc6dfd0>

In [73]:
basic.sim

array([[1., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 1., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 1.]])

In [74]:
predictions = basic.test(test)
print(accuracy.rmse(predictions))

RMSE: 0.9815
0.9815131909603673


In [95]:
ratings_rmse = []

In [96]:
sim_pearson = {'name':'pearson', 'user_based':False}
basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
basic_pearson.fit(train)
predictions = basic_pearson.test(test)
thePrediction = f'RMSE: {accuracy.rmse(predictions)}'
print(thePrediction)ratings_rmse.append(thePrediction)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9745


NameError: name 'thePreediction' is not defined

In [104]:
ratings_rmse.append(thePrediction)
ratings_rmse

['RMSE: 0.9745038027105931']

In [76]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(train)
predictions = knn_means.test(test)
thePrediction = f'RMSE: {accuracy.rmse(predictions)}'
print(thePrediction)ratings_rmse.append(thePrediction)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9069
0.9069002207425205


In [77]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
knn_baseline.fit(train)
predictions = knn_baseline.test(test)
thePrediction = f'RMSE: {accuracy.rmse(predictions)}'
print(thePrediction)ratings_rmse.append(thePrediction)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8832
0.8831574605696311


In [78]:
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(train)
predictions = svd.test(test)
thePrediction = f'RMSE: {accuracy.rmse(predictions)}'
print(thePrediction)ratings_rmse.append(thePrediction)

RMSE: 0.8929
0.8929105836023069


In [79]:
user_34_prediction = knn_means.predict('100', '222')
user_34_prediction

Prediction(uid='100', iid='222', r_ui=None, est=3.499894629840829, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [81]:
predictions[:10]

[Prediction(uid=45, iid=1240, r_ui=3.0, est=3.965581811845958, details={'was_impossible': False}),
 Prediction(uid=541, iid=357, r_ui=4.0, est=3.517092789109819, details={'was_impossible': False}),
 Prediction(uid=323, iid=161, r_ui=4.0, est=3.2821729167853357, details={'was_impossible': False}),
 Prediction(uid=127, iid=2915, r_ui=2.0, est=3.2067922519457928, details={'was_impossible': False}),
 Prediction(uid=84, iid=1258, r_ui=4.0, est=4.011474958876073, details={'was_impossible': False}),
 Prediction(uid=186, iid=2528, r_ui=4.0, est=3.7498993238513334, details={'was_impossible': False}),
 Prediction(uid=393, iid=3354, r_ui=3.0, est=3.1963533535765, details={'was_impossible': False}),
 Prediction(uid=181, iid=172, r_ui=1.0, est=2.777290299486622, details={'was_impossible': False}),
 Prediction(uid=50, iid=7327, r_ui=4.0, est=2.973325676641492, details={'was_impossible': False}),
 Prediction(uid=274, iid=61323, r_ui=3.5, est=3.23067222838688, details={'was_impossible': False})]

In [83]:
user = 222
item = 222
knn_baseline.predict(user, item)

Prediction(uid=222, iid=222, r_ui=None, est=2.8941137989866266, details={'actual_k': 10, 'was_impossible': False})

In [89]:
from surprise import BaselineOnly
from surprise.model_selection import cross_validate

In [90]:
cross_validate(BaselineOnly(), data, verbose = True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8704  0.8711  0.8757  0.8725  0.8724  0.8724  0.0018  
MAE (testset)     0.6715  0.6715  0.6757  0.6729  0.6710  0.6725  0.0017  
Fit time          0.21    0.23    0.24    0.26    0.27    0.24    0.02    
Test time         0.15    0.07    0.15    0.07    0.16    0.12    0.04    


{'test_rmse': array([0.87044776, 0.87110203, 0.8757216 , 0.8724972 , 0.87235542]),
 'test_mae': array([0.67146287, 0.67151158, 0.67565992, 0.67287296, 0.67097639]),
 'fit_time': (0.21442651748657227,
  0.2343738079071045,
  0.24135446548461914,
  0.2610480785369873,
  0.2702770233154297),
 'test_time': (0.14815568923950195,
  0.07081031799316406,
  0.14561057090759277,
  0.07480049133300781,
  0.15658140182495117)}

In [92]:
from surprise import KNNBasic
KNN = KNNBasic().fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [106]:
KNN.get_neighbors(iid=item, k=1)

[53]