In [77]:
import warnings
warnings.filterwarnings("ignore")

In [78]:
import csv
import os
import random
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

sns.set_style('whitegrid')

random.seed(0)
np.random.seed(0)

## Analyse Dataset


### MovieLens Dataset
Reading dataset (MovieLens "latest small" movie ratings dataset, downloaded from https://www.kaggle.com/datasets/shubhammehta21/movie-lens-small-latest-dataset)


In [79]:
# Location of the ratings CSV; getPopularityMoviesBasedOnRatingCount opens this path.
rating_path="ratings.csv"

In [80]:
# Read the ratings through the shared rating_path setting instead of repeating
# the file name, so every reader of this file points at the same location.
rating_df=pd.read_csv(rating_path)
rating_df.head(1)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703


In [81]:
# Build two lookup tables from movies.csv: movie id -> title and title -> movie id.
# Requires the csv module (imported with the other imports at the top of the notebook).
# NOTE(review): the file is decoded as ISO-8859-1; if movies.csv is actually
# UTF-8, accented titles will be mis-decoded -- confirm the file's encoding.
movieID_to_name = {}
name_to_movieID = {}
with open("movies.csv", newline='', encoding='ISO-8859-1') as csvfile:
    movieReader = csv.reader(csvfile)
    next(movieReader)  # Skip header line
    for row in movieReader:
        movieID = int(row[0])
        movieName = row[1]
        movieID_to_name[movieID] = movieName
        # A title that occurs on several rows keeps the id of the last row read.
        name_to_movieID[movieName] = movieID

In [82]:
def getMovieName(movieIDs):
    """Translate movie ids into titles, skipping ids that are not known.

    Args:
        movieIDs: iterable of movie ids, used as keys of ``movieID_to_name``.

    Returns:
        list of titles, in input order, for the ids present in the mapping.
    """
    # Test each id itself for membership. The previous guard looked at the
    # module-level ``movieID`` left over from loading movies.csv, so unknown
    # ids were never filtered out and raised KeyError instead.
    return [movieID_to_name[i] for i in movieIDs if i in movieID_to_name]
def getMovieID(movieName):
    """Look up the movie id for a title; 0 is returned when the title is unknown."""
    return name_to_movieID.get(movieName, 0)

In [83]:
def getMovie(movieID):
    """Return the title stored for ``movieID``, or an empty string if the id is unknown."""
    # The unused ``result`` list of the earlier version is gone; a single
    # dictionary lookup with a default covers both branches.
    return movieID_to_name.get(movieID, "")

## Prepare dataset

In [84]:

# Keep only the userId, movieId and rating columns (the timestamp column is dropped).
rating_df=rating_df[['userId','movieId','rating']]

In [85]:

import csv
from surprise import Dataset,Reader
from surprise.model_selection import cross_validate
# Describe the ratings file for Surprise: comma-separated, header line skipped.
# NOTE(review): rating_scale is (0, 5); confirm it matches the smallest rating
# that actually occurs in the file.
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0,5),skip_lines=1)
# The columns must correspond to user id, item id and ratings (in that order).
# Load through the shared rating_path setting rather than repeating the file name.
data = Dataset.load_from_file(rating_path, reader)

In [86]:
# Training set built from all of `data`; used to fit simsAlgo and in Evaluate.otherMetrics.
fullTrainSet = data.build_full_trainset()


# Define recommender metrics

### Rating Prediction

In [87]:
from surprise import accuracy
from collections import defaultdict
def MAE(predictions):
    """Return the mean absolute error of ``predictions`` via ``accuracy.mae``, with printing disabled."""
    return accuracy.mae(predictions, verbose=False)
def RMSE(predictions):
    """Return the root mean squared error of ``predictions`` via ``accuracy.rmse``, with printing disabled."""
    return accuracy.rmse(predictions, verbose=False)

### Popular movies based on rating count

In [88]:

from collections import defaultdict
def getPopularityMoviesBasedOnRatingCount(top_n=None):
    """Rank movies by how many rating rows they have in the file at ``rating_path``.

    Rank 1 is the movie with the most ratings; movies with equal counts keep
    the order in which they first appear in the file.

    Args:
        top_n: None to get the full ranking, otherwise the number of
            best-ranked movie ids to return.

    Returns:
        A ``defaultdict(int)`` mapping movie id -> rank when ``top_n`` is None,
        otherwise a list holding the ``top_n`` best-ranked movie ids.
    """
    counts = defaultdict(int)
    with open(rating_path, newline='') as handle:
        rows = csv.reader(handle)
        next(rows)  # header line
        for row in rows:
            # The second column holds the movie id.
            counts[int(row[1])] += 1

    most_rated_first = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    rankings = defaultdict(int)
    for position, (movie, _count) in enumerate(most_rated_first, start=1):
        rankings[movie] = position

    if top_n is None:
        return rankings
    return sorted(rankings, key=rankings.get, reverse=False)[:top_n]
    

### Top N predictions

In [89]:
def Get_Top_Predictions(predictions, n=10, minimumRating=3.0):
    """Collect, per user, the ``n`` highest estimated ratings that reach ``minimumRating``.

    Args:
        predictions: iterable of 5-field records
            (user id, movie id, actual rating, estimated rating, details).
        n: maximum number of entries kept per user.
        minimumRating: estimates below this value are ignored.

    Returns:
        ``defaultdict(list)`` mapping int user id -> list of
        (int movie id, estimated rating), best estimate first. Users with no
        qualifying estimate do not appear.
    """
    per_user = defaultdict(list)
    for record in predictions:
        raw_user, raw_movie, _actual, estimate, _details = record
        if not estimate >= minimumRating:
            continue
        per_user[int(raw_user)].append((int(raw_movie), estimate))

    # Replacing values of existing keys does not change the dict's size,
    # so it is safe while iterating over the keys.
    for user in per_user:
        best_first = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = best_first[:n]
    return per_user

###  See how often we recommended a movie the user actually rated

In [90]:
def HitRate(topNPredicted, leftOutPredictions):
    """Fraction of left-out ratings whose movie appears in that user's top-N list.

    Args:
        topNPredicted: mapping of int user id -> list of
            (movie id, estimated rating) pairs.
        leftOutPredictions: iterable of records whose first two fields are the
            user id and the left-out movie id.

    Returns:
        hits / total as a float; 0.0 when ``leftOutPredictions`` is empty.
    """
    hits = 0
    total = 0
    for leftOut in leftOutPredictions:
        userID = leftOut[0]
        leftOutMovieID = int(leftOut[1])
        # .get() means a user without any recommendation neither raises
        # KeyError on a plain dict nor adds an empty entry to a defaultdict.
        recommended = topNPredicted.get(int(userID), ())
        if any(int(movieID) == leftOutMovieID for movieID, _ in recommended):
            hits += 1
        total += 1
    # An empty left-out set yields 0.0 instead of ZeroDivisionError.
    return hits / total if total else 0.0


### How often we recommended a movie the user actually liked 


In [91]:
def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=4.0):
    """Hit rate restricted to left-out ratings the user rated at least ``ratingCutoff``.

    Args:
        topNPredicted: mapping of int user id -> list of
            (movie id, estimated rating) pairs.
        leftOutPredictions: iterable of 5-field records
            (user id, movie id, actual rating, estimated rating, details).
        ratingCutoff: left-out ratings below this actual rating are ignored.

    Returns:
        hits / total over the qualifying left-out ratings; 0.0 when none
        qualifies.
    """
    hits = 0
    total = 0
    for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
        # Only look at the ability to recommend things the user actually liked.
        if not actualRating >= ratingCutoff:
            continue
        target = int(leftOutMovieID)
        # .get() tolerates users that have no top-N entry at all.
        recommended = topNPredicted.get(int(userID), ())
        # Both ids are normalised with int(), matching HitRate.
        if any(int(movieID) == target for movieID, _score in recommended):
            hits += 1
        total += 1
    # No qualifying left-out rating yields 0.0 instead of ZeroDivisionError.
    return hits / total if total else 0.0

### Percentage of users who have at least one "good" recommendation

In [92]:

def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
    """Share of ``numUsers`` that have at least one recommendation rated >= ``ratingThreshold``.

    Args:
        topNPredicted: mapping of user id -> list of
            (movie id, estimated rating) pairs.
        numUsers: denominator of the returned ratio.
        ratingThreshold: minimum estimated rating for a recommendation to count.

    Returns:
        number of covered users divided by ``numUsers``.
    """
    covered = sum(
        1
        for recs in topNPredicted.values()
        if any(score >= ratingThreshold for _movie, score in recs)
    )
    return covered / numUsers

### Diversity is a measure of how different your recommendations are from each other.

In [93]:
def Diversity(topNPredicted, simsAlgo):
    """Return 1 minus the mean pairwise similarity of each user's recommended movies.

    Args:
        topNPredicted: mapping of user id -> list of
            (movie id, estimated rating) pairs.
        simsAlgo: object providing ``compute_similarities()`` and
            ``trainset.to_inner_iid(raw_id)``; raw ids are passed as strings.

    Returns:
        ``1 - S`` where ``S`` averages the similarity over every pair of
        movies recommended to the same user.
    """
    similarities = simsAlgo.compute_similarities()
    pair_count = 0
    similarity_sum = 0
    for recs in topNPredicted.values():
        for left, right in itertools.combinations(recs, 2):
            # The first field of each recommendation is the raw movie id.
            row = simsAlgo.trainset.to_inner_iid(str(left[0]))
            col = simsAlgo.trainset.to_inner_iid(str(right[0]))
            similarity_sum += similarities[row][col]
            pair_count += 1
    mean_similarity = similarity_sum / pair_count
    return 1 - mean_similarity


### Novelty is a measure of how popular the items the system recommends are.

In [94]:
def Novelty(topNPredicted, rankings):
    """Average popularity rank of every recommended movie across all users.

    Args:
        topNPredicted: mapping of user id -> list of recommendations whose
            first field is the movie id.
        rankings: mapping of movie id -> popularity rank.

    Returns:
        sum of the looked-up ranks divided by the number of recommendations.
    """
    ranks = [
        rankings[rec[0]]
        for recs in topNPredicted.values()
        for rec in recs
    ]
    return sum(ranks) / len(ranks)

In [95]:
# Titles of the 10 movies with the most ratings.
getMovieName(getPopularityMoviesBasedOnRatingCount(10))

['Forrest Gump (1994)',
 'Shawshank Redemption, The (1994)',
 'Pulp Fiction (1994)',
 'Silence of the Lambs, The (1991)',
 'Matrix, The (1999)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Jurassic Park (1993)',
 'Braveheart (1995)',
 'Terminator 2: Judgment Day (1991)',
 "Schindler's List (1993)"]

### Similarity algorithm - KNNBaseline

In [96]:

from surprise import KNNBaseline


print("\nComputing item similarities so we can measure diversity later...")
# Item-item similarities (user_based=False) with the pearson_baseline measure.
options = {'name': 'pearson_baseline', 'user_based': False}
simsAlgo = KNNBaseline(sim_options=options)
# Fit on the full training set; Evaluate.otherMetrics passes this model to Diversity.
simsAlgo.fit(fullTrainSet)



Computing item similarities so we can measure diversity later...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1898f947d68>

In [97]:
from surprise.model_selection import LeaveOneOut
import itertools

  

In [98]:
from surprise.model_selection.split import train_test_split
class Evaluate:
    """Run this notebook's evaluation steps for one recommender algorithm.

    The methods read module-level objects created in earlier cells (``data``,
    ``fullTrainSet``, ``simsAlgo``) and the metric helpers (``RMSE``, ``MAE``,
    ``HitRate``, ``CumulativeHitRate``, ``Diversity``, ``Novelty``,
    ``Get_Top_Predictions``, ``getPopularityMoviesBasedOnRatingCount``).
    The algorithm under test is always ``self.algo``; earlier versions called a
    global named ``algo`` and only worked when that global was the same object.
    """

    def __init__(self, algo):
        # Algorithm to evaluate; it must provide fit(trainset) and test(testset).
        self.algo = algo

    def start(self, n=10, verbose=True):
        """Fit on a 75/25 split of ``data`` and report rating-prediction accuracy.

        ``n`` and ``verbose`` are accepted but not used by this method.

        Returns:
            (duration, rmse, mae); ``duration`` is the number of seconds spent
            on the split plus the fit.
        """
        before = datetime.now()
        trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)
        self.algo.fit(trainSet)
        after = datetime.now()
        duration = (after - before).total_seconds()
        print('time taken '+str(duration)+'s')
        predictions = self.algo.test(testSet)
        rmse = RMSE(predictions)
        mae = MAE(predictions)
        print("Computing RMSE "+str(rmse))
        print("Computing MAE "+str(mae))
        return duration, rmse, mae

    def topNresult(self):
        """Leave-one-out evaluation of the top-10 recommendations.

        Returns:
            (hit_rate, cumm_hit_rate) computed on the first (and, with
            ``n_splits=1``, only) leave-one-out split.
        """
        loo = LeaveOneOut(n_splits=1, random_state=1)
        hit_rate = 0
        cumm_hit_rate = 0
        for trainSet, testSet in loo.split(data):
            print("Computing recommendations with leave-one-out...")

            # Train model without left-out ratings
            self.algo.fit(trainSet)

            # Predicts ratings for left-out ratings only
            print("Predict ratings for left-out set...")
            leftOutPredictions = self.algo.test(testSet)

            # Build predictions for all ratings not in the training set
            print("Predict all missing ratings...")
            bigTestSet = trainSet.build_anti_testset()
            allPredictions = self.algo.test(bigTestSet)

            # Compute top 10 recs for each user
            print("Compute top 10 recs per user")
            topNPredicted = Get_Top_Predictions(allPredictions, n=10)

            # See how often we recommended a movie the user actually rated
            hit_rate = HitRate(topNPredicted, leftOutPredictions)
            print("\nHit Rate: ", hit_rate)

            # See how often we recommended a movie the user actually liked
            cumm_hit_rate = CumulativeHitRate(topNPredicted, leftOutPredictions, 4.0)
            print("\ncHR (Cumulative Hit Rate, rating >= 4): ", cumm_hit_rate)

            # Returning inside the loop: only the first split is evaluated.
            return hit_rate, cumm_hit_rate

    def getModel(self):
        """Return the algorithm object handed to the constructor."""
        return self.algo

    def otherMetrics(self):
        """Fit on ``fullTrainSet`` and measure novelty and diversity of the top-10 lists.

        Returns:
            (novelty, diversity), in that order.
        """
        self.algo.fit(fullTrainSet)
        bigTestSet = fullTrainSet.build_anti_testset()
        all_pred = self.algo.test(bigTestSet)
        topN = Get_Top_Predictions(all_pred, n=10)
        ranking = getPopularityMoviesBasedOnRatingCount()
        # Measure diversity of recommendations:
        diversity = Diversity(topN, simsAlgo)
        print("\nDiversity: ", diversity)
        # Measure novelty (average popularity rank of recommendations):
        novelty = Novelty(topN, ranking)
        print("\nNovelty (average popularity rank): ", novelty)
        return novelty, diversity

# Apply SVD Algorithm

In [99]:

from surprise import SVD
from datetime import datetime
# Evaluate an SVD model created with its default arguments.
algo = SVD()
evaluate=Evaluate(algo)
svd_duration,svd_rmse,svd_mae=evaluate.start()
svd_hit_rate,svd_cumm_hit_rate=evaluate.topNresult()
svd_novelty,svd_diversity=evaluate.otherMetrics()
# otherMetrics() is the last call that fits the model, and it fits on fullTrainSet.
algosvd=evaluate.getModel()

time taken 4.494346s
Computing RMSE 0.8800950930093621
Computing MAE 0.675733544076352
Computing recommendations with leave-one-out...
Predict ratings for left-out set...
Predict all missing ratings...
Compute top 10 recs per user

Hit Rate:  0.027868852459016394

cHR (Cumulative Hit Rate, rating >= 4):  0.042134831460674156
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Diversity:  0.9647219593750859

Novelty (average popularity rank):  511.51864629538363


### Make rating predictions

In [100]:
# First five rows whose rating equals 1.
rating_df[rating_df['rating']==1 ].head(5)

Unnamed: 0,userId,movieId,rating
205,1,3176,1.0
308,4,126,1.0
314,4,222,1.0
320,4,296,1.0
329,4,441,1.0


In [101]:
# Predict the rating of user 4 for movie 441; ids are passed as strings and the
# known rating (1) is supplied so it is printed next to the estimate.
userid=str(4)
movieid=str(441)
actual=1 
pred = algosvd.predict(userid,movieid ,actual,  verbose=True)

user: 4          item: 441        r_ui = 1.00   est = 2.94   {'was_impossible': False}


In [102]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = fullTrainSet.build_anti_testset()
print("\nComputing recommendations...")
predictions = algosvd.test(testset)


Computing recommendations...
