# User and Item based Collaborative Filtering

## Importing Libraries

In [40]:
import os #  Module for using operating system dependent functionality
import csv # Reading csv files
import sys # Module contains functions to allow python to interact with system
import re # Reader module
from surprise import Dataset
from surprise import Reader
from collections import defaultdict # default dictionary in surprise library
import heapq
from surprise import SVD,SVDpp # Singular Value decomposition and Singular Value decomposition ++
from surprise.model_selection import cross_validate #cross validation module in surprise lib
from operator import itemgetter
from surprise import KNNWithMeans # KNN with means Algorithm
from surprise import accuracy # Accuracy function
from surprise.model_selection import train_test_split # Test and train data split lib from surprise

### Creating a Movielens class to fetch data and to define relative functions to it

In [43]:
class MovieLens:

    movieID_to_name = {}
    name_to_movieID = {}
    ratingsPath = 'C:/Users/Imran/Desktop/Thesis coding/New_Coding/ml-latest-small/ratings.csv'
    moviesPath = 'C:/Users/Imran/Desktop/Thesis coding/New_Coding/ml-latest-small/movies.csv'
    
    def loadMovieLensLatestSmall(self):

        os.chdir(os.path.dirname(sys.argv[0])) # Look for files relative to the directory we are running from

        ratingsDataset = 0
        self.movieID_to_name = {}
        self.name_to_movieID = {}

        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)

        ratingsDataset = Dataset.load_from_file(self.ratingsPath, reader=reader)

        with open(self.moviesPath, newline='', encoding='utf-8') as csvfile:
                movieReader = csv.reader(csvfile)
                next(movieReader)  #Skip header line
                for row in movieReader:
                    movieID = int(row[0])
                    movieName = row[1]
                    self.movieID_to_name[movieID] = movieName
                    self.name_to_movieID[movieName] = movieID

        return ratingsDataset
    
    # Fetching Movie name based on movie id
    def getMovieName(self, movieID):
        if movieID in self.movieID_to_name:
            return self.movieID_to_name[movieID]
        else:
            return ""

### Loading our dataset from above define Movielens class

In [44]:
ml = MovieLens()
data = ml.loadMovieLensLatestSmall()

# User based Collaborative Filtering

In [14]:
trainset, testset = train_test_split(data, test_size=.25)
sim_options = {'name': 'cosine','user_based': True}
sim_options_other = {'name': 'pearson','user_based': True}

### Initiating our KNNWithMeans Model and compute the similarities

In [15]:
model = KNNWithMeans(k=50, sim_options=sim_options)
model.fit(trainset)
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


### Get top N similar users to our test subject

In [16]:
User = '85' # User id defined here
k = 10 # Top-10 Neighbours
testUserInnerID = trainset.to_inner_uid(User)
similarityRow = simsMatrix[testUserInnerID]

similarUsers = []
for innerID, score in enumerate(similarityRow):
    if (innerID != testUserInnerID):
        similarUsers.append( (innerID, score) )
kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

### Get the stuff they rated, and add up ratings for each item, weighted by user similarity

In [17]:
candidates = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainset.ur[innerID]
    for rating in theirRatings:
        candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore

### Create a dictionary of stuff the user has already seen

In [18]:
watched = {}
for itemID, rating in trainset.ur[testUserInnerID]:
    watched[itemID] = 1

## Output

In [19]:
# Get top-rated items from similar users:
print("\nUser based Collaborative filtering results for user: {}\n".format(User))
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        movieID = trainset.to_raw_iid(itemID)
        
        print(ml.getMovieName(int(movieID)))
        pos += 1
        if (pos > 10):
            break


User based Collaborative filtering results for user: 85

Fargo (1996)
Shawshank Redemption, The (1994)
Silence of the Lambs, The (1991)
Star Wars: Episode IV - A New Hope (1977)
Godfather, The (1972)
Saving Private Ryan (1998)
Victor/Victoria (1982)
Usual Suspects, The (1995)
12 Angry Men (1957)
Apartment, The (1960)
Being John Malkovich (1999)


# Item based Collaborative Filtering

#### Switching user_based parameter in (sim_options_icf) to (False) to utilize item based collaborative filtering

In [20]:
sim_options_icf = {'name': 'cosine','user_based': False}
sim_options_icf_other = {'name': 'pearson','user_based': False}

### Intiating our KNNBasic Algorithm and fit our model in trainSet

In [21]:
model_icf = KNNWithMeans(sim_options=sim_options_icf)
model_icf.fit(trainset)
simsMatrix_icf = model_icf.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


## Top N Items

In [22]:
User = '85' # define user here
k = 10
testItemInnerID = trainset.to_inner_uid(Item)

# Get the top K items we rated
testItemRatings = trainset.ur[testItemInnerID]
kNeighbors_icf = heapq.nlargest(k, testItemRatings, key=lambda t: t[1])

## Candidates Generation

In [23]:
# Get similar items to stuff we liked (weighted by rating)
candidates_icf = defaultdict(float)
for itemID_icf, rating_icf in kNeighbors_icf:
    similarityRow_icf = simsMatrix_icf[itemID_icf]
    for innerID_icf, score_icf in enumerate(similarityRow_icf):
        candidates_icf[innerID_icf] += score_icf * (rating_icf / 5.0)

## Already watched item dictionary

In [24]:
watched_items = {}
for itemID_icf, rating_icf in trainset.ur[testItemInnerID]:
    watched_items[itemID_icf] = 1

## Item based Output

In [25]:
print("\nItem based Collaborative filtering results for user: {}\n".format(Item))
# Get top-rated items from similar users:
pos_icf = 0
for itemID_icf, ratingSum_icf in sorted(candidates_icf.items(), key=itemgetter(1), reverse=True):
    if not itemID_icf in watched_items:
        movieID = trainset.to_raw_iid(itemID_icf)
        print(ml.getMovieName(int(movieID)))
        pos_icf += 1
        if (pos_icf > 10):
            break


Item based Collaborative filtering results for user: 85

Formula 51 (2001)
Kafka (1991)
Nine Lives of Fritz the Cat, The (1974)
Duck, You Sucker (1971)
Dinner Rush (2000)
Life and Death of Peter Sellers, The (2004)
Down to You (2000)
Great Rock 'n' Roll Swindle, The (1980)
Big Red One, The (1980)
Last Seduction, The (1994)
Breakout (1975)


# Matrix Factorization

In [26]:
# using above defined trainedset
trainset = trainset
def GetAntiTestSetForUser(testSubject):
        fill = trainset.global_mean
        anti_testset = []
        u = trainset.to_inner_uid(str(testSubject))
        user_items = set([j for (j, _) in trainset.ur[u]])
        anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
                                 i in trainset.all_items() if
                                 i not in user_items]
        return anti_testset

In [27]:
# Defining svd model and fitting training set

model_svd = SVD()
model_svd.fit(trainset)
User = 85 # define user here
testset = GetAntiTestSetForUser(User)
predictions_svd = model_svd.test(testset)

In [28]:
recommendations_svd = []
print("\nMatrix factorization results using SVD for user: {}\n".format(User))
for userID, movieID, actualRating, estimatedRating, _ in predictions_svd:
    intMovieID = int(movieID)
    recommendations_svd.append((intMovieID, estimatedRating))
    recommendations_svd.sort(key=lambda x: x[1], reverse=True)
            
for ratings in recommendations_svd[:10]:
    print(ml.getMovieName(ratings[0]))


Matrix factorization results using SVD for user: 85

12 Angry Men (1957)
Shawshank Redemption, The (1994)
North by Northwest (1959)
Cinema Paradiso (Nuovo cinema Paradiso) (1989)
Catch Me If You Can (2002)
Inglourious Basterds (2009)
Departed, The (2006)
Matrix, The (1999)
Star Wars: Episode IV - A New Hope (1977)
Prestige, The (2006)


In [29]:
vali_svd = cross_validate(model_svd,data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8995  0.8907  0.8976  0.8939  0.8994  0.8962  0.0034  
MAE (testset)     0.6913  0.6857  0.6917  0.6904  0.6915  0.6901  0.0022  
Fit time          6.35    6.68    6.95    6.60    6.79    6.67    0.20    
Test time         0.44    0.29    0.28    0.34    0.25    0.32    0.07    


## SVD++ Algorithm

In [30]:
model_svdpp = SVDpp()
model_svdpp.fit(trainset)
testset = GetAntiTestSetForUser(85) # define user here
predictions_SVDpp = model_svdpp.test(testset)

In [31]:
recommendations_svdpp = []
print("\nMatrix factorization results using SVDpp for user: {}\n".format(User))
for userID, movieID, actualRating, estimatedRating, _ in predictions_SVDpp:
    intMovieID = int(movieID)
    recommendations_svdpp.append((intMovieID, estimatedRating))
    recommendations_svdpp.sort(key=lambda x: x[1], reverse=True)
            
for ratings in recommendations_svdpp[:10]:
    print(ml.getMovieName(ratings[0]), ratings[1])


Matrix factorization results using SVDpp for user: 85

Wallace & Gromit: A Close Shave (1995) 4.426455962385103
Cool Hand Luke (1967) 4.402788205697453
Shakespeare in Love (1998) 4.36263566431943
Cinema Paradiso (Nuovo cinema Paradiso) (1989) 4.352866880360092
Naked Gun: From the Files of Police Squad!, The (1988) 4.348641403590705
It Happened One Night (1934) 4.339278189455737
Ran (1985) 4.288524649341065
Three Colors: White (Trzy kolory: Bialy) (1994) 4.284801543767897
Departed, The (2006) 4.281489764215328
Amores Perros (Love's a Bitch) (2000) 4.278302449158791


In [32]:
vali_svdpp = cross_validate(model_svdpp,data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8803  0.8821  0.8896  0.8882  0.8872  0.8855  0.0036  
MAE (testset)     0.6745  0.6786  0.6843  0.6834  0.6770  0.6795  0.0037  
Fit time          561.21  589.16  607.05  597.94  657.02  602.48  31.30   
Test time         9.88    10.87   10.00   10.99   9.77    10.30   0.52    
