In [34]:
import pandas as pd
import numpy as np
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

In [35]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [36]:
ratings = ratings.drop(columns = 'timestamp')
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [37]:
tags = tags.drop(columns = 'timestamp')
tags

Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,Highly quotable
2,2,60756,will ferrell
3,2,89774,Boxing story
4,2,89774,MMA
...,...,...,...
3678,606,7382,for katie
3679,606,7936,austere
3680,610,3265,gun fu
3681,610,3265,heroic bloodshed


In [38]:
tagsRatings = pd.merge(tags, ratings, on = ['userId', 'movieId'])
tagsRatings

Unnamed: 0,userId,movieId,tag,rating
0,2,60756,funny,5.0
1,2,60756,Highly quotable,5.0
2,2,60756,will ferrell,5.0
3,2,89774,Boxing story,5.0
4,2,89774,MMA,5.0
...,...,...,...,...
3471,606,6107,World War II,4.0
3472,606,7382,for katie,4.5
3473,610,3265,gun fu,5.0
3474,610,3265,heroic bloodshed,5.0


In [39]:
df = pd.merge(movies, tagsRatings, on = ('movieId'))

In [40]:
df

Unnamed: 0,movieId,title,genres,userId,tag,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,3.5
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,4.0
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,4.0
...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,star wars,4.0
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,anime,3.5
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,comedy,3.5
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,gintama,3.5


# Data Cleaning

In [41]:
df.isna().sum()

movieId    0
title      0
genres     0
userId     0
tag        0
rating     0
dtype: int64

In [42]:
df['rating'].value_counts()

4.0    999
5.0    883
3.5    577
4.5    496
3.0    274
2.0    102
2.5     80
1.0     31
1.5     26
0.5      8
Name: rating, dtype: int64

In [43]:
ratingList = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

def makeNotFloat(oldList, columnName):
    binaryList = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    newVals = dict(zip(oldList, binaryList))
    return df[columnName].replace(newVals, inplace = True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3476 entries, 0 to 3475
Data columns (total 6 columns):
movieId    3476 non-null int64
title      3476 non-null object
genres     3476 non-null object
userId     3476 non-null int64
tag        3476 non-null object
rating     3476 non-null float64
dtypes: float64(1), int64(2), object(3)
memory usage: 190.1+ KB


In [44]:
makeNotFloat(ratingList, 'rating')

In [45]:
df['rating'].value_counts()

40.0    999
50.0    883
35.0    577
45.0    496
30.0    274
20.0    102
25.0     80
10.0     31
15.0     26
5.0       8
Name: rating, dtype: int64

In [46]:
df['rating'] = df['rating'].astype(int)
df['rating'].value_counts()

40    999
50    883
35    577
45    496
30    274
20    102
25     80
10     31
15     26
5       8
Name: rating, dtype: int64

# Model data prep

In [47]:
!pip install surprise

Collecting surprise
  Using cached surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Installing collected packages: surprise
Successfully installed surprise-0.1


In [48]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split

reader = Reader()
data = Dataset.load_from_df(ratings,reader)
train, test = train_test_split(data, test_size=0.2)

In [49]:
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy

sim_cos = {'name':'cosine', 'user_based':False}
basic = knns.KNNBasic(sim_options=sim_cos)
basic.fit(train)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x25053256470>

In [50]:
basic.sim

array([[1.        , 0.        , 0.98053741, ..., 0.        , 1.        ,
        1.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.98053741, 0.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.        , 1.        , ..., 0.        , 1.        ,
        0.        ],
       [1.        , 0.        , 1.        , ..., 0.        , 0.        ,
        1.        ]])

In [51]:
predictions = basic.test(test)
print(accuracy.rmse(predictions))

RMSE: 0.9750
0.9750456694068246


In [52]:
ratings_rmse = []

In [53]:
sim_pearson = {'name':'pearson', 'user_based':False}
basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
basic_pearson.fit(train)
predictions = basic_pearson.test(test)
thePrediction = f'KNNBasics: {accuracy.rmse(predictions)}'
print(thePrediction)
ratings_rmse.append(thePrediction)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9688
KNNBasics: 0.9687931385225901


In [54]:
#ratings_rmse.append(thePrediction)
ratings_rmse

['KNNBasics: 0.9687931385225901', 'KNNBasics: 0.9687931385225901']

In [55]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(train)
predictions = knn_means.test(test)
thePrediction = f'KNNWithMeans: {accuracy.rmse(predictions)}'
print(thePrediction)
ratings_rmse.append(thePrediction)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9038
KNNWithMeans: 0.9037840052580681


In [56]:
sim_pearson = {'name':'pearson', 'user_based':False}
knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
knn_baseline.fit(train)
predictions = knn_baseline.test(test)
thePrediction = f'KNNBaseline: {accuracy.rmse(predictions)}'
print(thePrediction)
ratings_rmse.append(thePrediction)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8796
KNNBaseline: 0.8795846236951746


In [57]:
from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(train)
predictions = svd.test(test)
thePrediction = f'svd: {accuracy.rmse(predictions)}'
print(thePrediction)
ratings_rmse.append(thePrediction)

RMSE: 0.8905
svd: 0.8904603701749568


In [58]:
user_34_prediction = knn_means.predict('100', '222')
user_34_prediction

Prediction(uid='100', iid='222', r_ui=None, est=3.500619824465711, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [59]:
predictions[:10]

[Prediction(uid=86, iid=2805, r_ui=4.0, est=3.4331537543536186, details={'was_impossible': False}),
 Prediction(uid=52, iid=55118, r_ui=5.0, est=4.335446065504453, details={'was_impossible': False}),
 Prediction(uid=169, iid=1265, r_ui=4.0, est=4.355766441826906, details={'was_impossible': False}),
 Prediction(uid=100, iid=3244, r_ui=4.0, est=3.834060203572167, details={'was_impossible': False}),
 Prediction(uid=367, iid=4053, r_ui=3.0, est=3.695207602135862, details={'was_impossible': False}),
 Prediction(uid=288, iid=3708, r_ui=2.0, est=3.2572990062102454, details={'was_impossible': False}),
 Prediction(uid=182, iid=5313, r_ui=0.5, est=2.9793043968403867, details={'was_impossible': False}),
 Prediction(uid=153, iid=106782, r_ui=1.0, est=2.900387103389297, details={'was_impossible': False}),
 Prediction(uid=554, iid=1196, r_ui=5.0, est=4.183992172072658, details={'was_impossible': False}),
 Prediction(uid=111, iid=8917, r_ui=3.5, est=3.3460784020907806, details={'was_impossible': Fals

In [60]:
user = 222
item = 222
knn_baseline.predict(user, item)

Prediction(uid=222, iid=222, r_ui=None, est=2.951955355239523, details={'actual_k': 13, 'was_impossible': False})

In [61]:
from surprise import BaselineOnly
from surprise.model_selection import cross_validate

In [62]:
cross_validate(BaselineOnly(), data, verbose = True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8654  0.8768  0.8708  0.8730  0.8771  0.8726  0.0043  
MAE (testset)     0.6688  0.6768  0.6730  0.6715  0.6735  0.6727  0.0026  
Fit time          0.28    0.22    0.36    0.39    0.42    0.33    0.07    
Test time         0.25    0.21    0.22    0.14    0.26    0.22    0.04    


{'test_rmse': array([0.86537671, 0.87684599, 0.87077798, 0.87302457, 0.87711921]),
 'test_mae': array([0.66880073, 0.67676444, 0.6730238 , 0.67152592, 0.67352352]),
 'fit_time': (0.28006553649902344,
  0.21677374839782715,
  0.35820794105529785,
  0.3935811519622803,
  0.41872143745422363),
 'test_time': (0.24705028533935547,
  0.2108297348022461,
  0.21867966651916504,
  0.14172005653381348,
  0.2637972831726074)}

In [63]:
from surprise import KNNBasic
KNN = KNNBasic().fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [64]:
KNN.get_neighbors(iid=item, k=1)

[288]

In [65]:
ratings_rmse

['KNNBasics: 0.9687931385225901',
 'KNNBasics: 0.9687931385225901',
 'KNNWithMeans: 0.9037840052580681',
 'KNNBaseline: 0.8795846236951746',
 'svd: 0.8904603701749568']

In [67]:
data = Dataset.load_from_df(ratings,reader)
train, test = train_test_split(data, test_size=0.2)

ratings_rmse = []

sim_pearson = {'name':'pearson', 'user_based':False}

basic_pearson = knns.KNNBasic(sim_options=sim_pearson)
basic_pearson.fit(train)
predictions = basic_pearson.test(test)
thePrediction = f'KNNBasics: {accuracy.rmse(predictions)}'
ratings_rmse.append(thePrediction)

knn_means = knns.KNNWithMeans(sim_options=sim_pearson)
knn_means.fit(train)
predictions = knn_means.test(test)
thePrediction = f'KNNWithMeans: {accuracy.rmse(predictions)}'
ratings_rmse.append(thePrediction)

knn_baseline = knns.KNNBaseline(sim_options=sim_pearson)
knn_baseline.fit(train)
predictions = knn_baseline.test(test)
thePrediction = f'KNNBaseline: {accuracy.rmse(predictions)}'
ratings_rmse.append(thePrediction)

from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV
svd = SVD(n_factors=100, n_epochs=10, lr_all=0.005, reg_all=0.4)
svd.fit(train)
predictions = svd.test(test)
thePrediction = f'svd: {accuracy.rmse(predictions)}'
ratings_rmse.append(thePrediction)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9710
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9021
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8820
RMSE: 0.8923


In [69]:
ratings_rmse

['KNNBasics: 0.9710235145259094',
 'KNNWithMeans: 0.9021246523459076',
 'KNNBaseline: 0.8820032441151862',
 'svd: 0.8922936982597599']

In [70]:
from surprise import NormalPredictor

In [None]:
normPred = knns.KNNBaseline(sim_options=sim_pearson)
knn_baseline.fit(train)
predictions = knn_baseline.test(test)
thePrediction = f'KNNBaseline: {accuracy.rmse(predictions)}'
ratings_rmse.append(thePrediction)