In [1]:
#Dataframe manipulation library
import pandas as pd
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
!wget -O moviedataset.zip http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
print('unziping ...')
!unzip -o -j moviedataset.zip 

--2020-12-06 08:37:28--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘moviedataset.zip’


2020-12-06 08:37:29 (950 KB/s) - ‘moviedataset.zip’ saved [978202/978202]

unziping ...
Archive:  moviedataset.zip
  inflating: links.csv               
  inflating: tags.csv                
  inflating: ratings.csv             
  inflating: README.txt              
  inflating: movies.csv              


In [3]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [4]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
#Using regular expressions to find a year stored between parentheses
#We specify the parantheses so we don't conflict with movies that have years in their titles
movies['year'] = movies.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies['year'] = movies.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies['title'] = movies.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies['title'] = movies['title'].apply(lambda x: x.strip())

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [8]:
movies_df = pd.merge(movies, ratings, on='movieId', how='inner')

In [9]:
movies_df

Unnamed: 0,movieId,title,genres,year,userId,rating,timestamp
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,1,4.0,964982703
1,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,5,4.0,847434962
2,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,7,4.5,1106635946
3,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,15,2.5,1510577970
4,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,17,4.5,1305696483
...,...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,2017,184,4.0,1537109082
100832,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,2017,184,3.5,1537109545
100833,193585,Flint,Drama,2017,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,2018,184,3.5,1537110021


In [10]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 365kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1670913 sha256=e3d7c89712a90fe3aae471cb17542498e34d068983ddee7f9aeddc2e2d030e7d
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [11]:
import itertools
import surprise
from collections import defaultdict
import pandas as pd
import numpy as np

In [12]:
movies_df.rating.unique()

array([4. , 4.5, 2.5, 3.5, 3. , 5. , 0.5, 2. , 1.5, 1. ])

In [13]:
reader = surprise.Reader(rating_scale=(0.5, 5))

In [14]:
data = surprise.Dataset.load_from_df(movies_df[['userId', 'movieId', 'rating']], reader)

In [15]:
data

<surprise.dataset.DatasetAutoFolds at 0x7f94de7e2978>

In [16]:
from surprise.model_selection import train_test_split

In [17]:
trainSet, testSet = train_test_split(data, test_size=0.3)

In [18]:
from surprise import SVD

algo = SVD()

In [19]:
algo.fit(trainSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f94e0963e48>

In [20]:
predictions = algo.test(testSet)

In [21]:
predictions

[Prediction(uid=409, iid=2772, r_ui=1.0, est=3.311751749477972, details={'was_impossible': False}),
 Prediction(uid=365, iid=156609, r_ui=4.5, est=2.271600701073555, details={'was_impossible': False}),
 Prediction(uid=599, iid=4963, r_ui=4.0, est=2.9727050871639853, details={'was_impossible': False}),
 Prediction(uid=603, iid=1273, r_ui=4.0, est=3.912259375684905, details={'was_impossible': False}),
 Prediction(uid=603, iid=3448, r_ui=2.0, est=3.14670723498811, details={'was_impossible': False}),
 Prediction(uid=83, iid=586, r_ui=1.5, est=2.896146727748897, details={'was_impossible': False}),
 Prediction(uid=227, iid=8873, r_ui=4.5, est=4.269717096963842, details={'was_impossible': False}),
 Prediction(uid=429, iid=279, r_ui=3.0, est=4.031298552379895, details={'was_impossible': False}),
 Prediction(uid=474, iid=1348, r_ui=2.5, est=3.290691373558882, details={'was_impossible': False}),
 Prediction(uid=249, iid=2717, r_ui=3.5, est=3.339878702480591, details={'was_impossible': False}),
 

In [22]:
from surprise import accuracy

accuracy.rmse(predictions)

RMSE: 0.8848


0.8848147533572075

In [23]:
accuracy.mse(predictions)

MSE: 0.7829


0.782897147758576

In [24]:
from surprise.model_selection import cross_validate

In [25]:
alg = SVD()

In [26]:
cross_validate(alg, data, measures=['RMSE', 'MSE'], cv=5, verbose=True)

Evaluating RMSE, MSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8692  0.8732  0.8780  0.8776  0.8734  0.8743  0.0033  
MSE (testset)     0.7554  0.7625  0.7709  0.7701  0.7628  0.7644  0.0057  
Fit time          5.00    5.06    5.12    5.39    5.01    5.11    0.14    
Test time         0.23    0.14    0.15    0.24    0.14    0.18    0.04    


{'fit_time': (5.001370191574097,
  5.060319185256958,
  5.116904973983765,
  5.386591196060181,
  5.009317874908447),
 'test_mse': array([0.75544114, 0.76249851, 0.77092468, 0.77012463, 0.76280204]),
 'test_rmse': array([0.86916117, 0.87321161, 0.87802317, 0.87756745, 0.87338539]),
 'test_time': (0.22502636909484863,
  0.14333271980285645,
  0.1472792625427246,
  0.23684215545654297,
  0.1404869556427002)}

In [27]:
def GetTopN(predictions, n=10, minimumRating=4.0):
    topN = defaultdict(list)


    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        if (estimatedRating >= minimumRating):
            topN[int(userID)].append((int(movieID), estimatedRating))

    for userID, ratings in topN.items():
        ratings.sort(key=lambda x: x[1], reverse=True)
        topN[int(userID)] = ratings[:n]

    return topN

In [28]:
GetTopN(predictions)

defaultdict(list,
            {1: [(2959, 5),
              (1196, 5),
              (2329, 5),
              (1136, 5),
              (2028, 5),
              (2571, 5),
              (50, 5),
              (1206, 4.958437831411675),
              (1291, 4.934969936198636),
              (296, 4.922105130313302)],
             2: [(79132, 4.277902935659061),
              (48516, 4.1855501729762015),
              (1704, 4.118815071508424),
              (99114, 4.076491871525864)],
             4: [(1197, 4.503571467167415),
              (1719, 4.3666815470922575),
              (4226, 4.3189480853966105),
              (1084, 4.30572754307201),
              (1198, 4.303571739957482),
              (260, 4.184875544616285),
              (593, 4.116059073549946),
              (2959, 4.082979574137445)],
             5: [(608, 4.405395320591073),
              (527, 4.352409204069003),
              (318, 4.321815132920043),
              (265, 4.298360096430488),
              (24

In [29]:
from surprise.model_selection import LeaveOneOut

In [30]:
loo = LeaveOneOut(n_splits=1, random_state=1)

In [36]:
algos = SVD(random_state=11)

In [37]:
for myTrain, myTest in loo.split(data):

    # Train model without left-out ratings
    algos.fit(myTrain)

    # Predicts ratings for left-out ratings only
    print("Predict ratings for left-out set...")
    leftOutPredictions = algos.test(myTest)

    # Build predictions for all ratings not in the training set
    print("Predict all missing ratings...")
    bigTestSet = myTrain.build_anti_testset()
    allPredictions = algos.test(bigTestSet)

    print("Compute top 10 recs per user...")
    topNPredicted = GetTopN(allPredictions, n=10)

Predict ratings for left-out set...
Predict all missing ratings...
Compute top 10 recs per user...


In [38]:
def HitRate(topNPredicted, leftOutPredictions):
    hits = 0
    total = 0

    # For each left-out rating
    for leftOut in leftOutPredictions:
        userID = leftOut[0]
        leftOutMovieID = leftOut[1]
        # Is it in the predicted top 10 for this user?
        hit = False
        for movieID, predictedRating in topNPredicted[int(userID)]:
            if (int(leftOutMovieID) == int(movieID)):
                hit = True
                break
        if (hit) :
            hits += 1

        total += 1

    # Compute overall precision
    return hits/total

In [39]:
HitRate(topNPredicted, leftOutPredictions)

0.027868852459016394