In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
a = 2
b = 3
a
b

2

3

## User & Item based Collaborative Filtering

### Load data

In [3]:
import pandas as pd
import numpy as np

In [4]:
ratings = pd.read_csv("3a - ratings_sub.csv",encoding = "ISO-8859-1")

In [5]:
ratings.shape

(9, 7)

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,1,1,4.0,1111549459,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",Action|Drama|Romance,2000
1,1,5,4.0,1111549392,Snatch (2000),Comedy|Crime|Thriller,2000
2,1,3,4.0,1111559503,"O Brother, Where Art Thou? (2000)",Adventure|Comedy|Crime,2000
3,2,3,4.5,1283448812,"O Brother, Where Art Thou? (2000)",Adventure|Comedy|Crime,2000
4,2,4,4.0,1283448602,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,2001


In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
userId       9 non-null int64
movieId      9 non-null int64
rating       9 non-null float64
timestamp    9 non-null int64
title        9 non-null object
genres       9 non-null object
year         9 non-null int64
dtypes: float64(1), int64(4), object(2)
memory usage: 584.0+ bytes


In [8]:
ratings.userId=ratings.userId.astype(str)
ratings.movieId=ratings.movieId.astype(str)

In [9]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres', 'year'], dtype='object')

### Data Exploration & Transformation

In [10]:
# Total unique users 
print("total unique users - ",len(ratings["userId"].unique()))

total unique users -  3


<b> Q: Who are the users with maximum no of movies watched? </b>

In [11]:
# Users with max no of movies watches
ratings["userId"].value_counts().head()

1    3
3    3
2    3
Name: userId, dtype: int64

### Transforming data to surprise format

In [12]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))

In [13]:
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [14]:
data

<surprise.dataset.DatasetAutoFolds at 0x1fca6e2a3c8>

In [15]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

# to build on full data
#trainset = data.build_full_trainset()

In [16]:
type(trainset)

surprise.trainset.Trainset

### Making sense of trainset 

Points to Note:
    
    1) Trainset is no longer a pandas dataframe. Rather, it's a specific datatypes defined by the Surprise library
    2) UserId and ItemId in the pandas dataframe can contain any value (either string/integer etc). However, Trainset convert these raw ids into numeric indexes called as "inner id"
    3) Methods are provided to convert rw id to inner id and vice verca

In [17]:
# user item rating data can be obtained as follows
user_records = trainset.ur
type(user_records)

collections.defaultdict

In [18]:
for keys in user_records.keys():
    print(keys)

0
1
2


In [20]:
user_records[1]

[(1, 4.0), (3, 4.0)]

In [27]:
# However the ids are the inner ids and not the raw ids
# raw ids can be obatined as follows

print(trainset.to_raw_uid(0))
print(trainset.to_raw_iid(1))

3
Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)


In [28]:
user_records[0]

[(0, 0.5), (3, 1.5)]

### Training the model

In [29]:
from surprise import KNNWithMeans
from surprise import accuracy
from surprise import Prediction

In [30]:
algo = KNNWithMeans(k=51, sim_options={'name': 'pearson', 'user_based': False})
algo.fit(trainset)


Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1fca7117c50>

### Find K most similiar items

### Evaluating Model Performance

In [31]:
len(testset)

3

In [34]:
testset[:]

[('3', 'Shrek (2001)', 3.0),
 ('2', 'Shrek (2001)', 4.0),
 ('1', 'O Brother, Where Art Thou? (2000)', 4.0)]

In [35]:
# Evalute on test set
test_pred = algo.test(testset)

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 0.9789


0.9789450103725609

In [41]:
# View all predictions
test_pred[:]

[Prediction(uid='3', iid='Shrek (2001)', r_ui=3.0, est=3.25, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'}),
 Prediction(uid='2', iid='Shrek (2001)', r_ui=4.0, est=3.25, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'}),
 Prediction(uid='1', iid='O Brother, Where Art Thou? (2000)', r_ui=4.0, est=2.5, details={'actual_k': 0, 'was_impossible': False})]

In [42]:
# convert results to dataframe
test_pred_df = pd.DataFrame(test_pred)
test_pred_df["was_impossible"] = [x["was_impossible"] for x in test_pred_df["details"]]

In [44]:
test_pred_df.loc[test_pred_df.was_impossible]

Unnamed: 0,uid,iid,r_ui,est,details,was_impossible
0,3,Shrek (2001),3.0,3.25,"{'was_impossible': True, 'reason': 'User and/o...",True
1,2,Shrek (2001),4.0,3.25,"{'was_impossible': True, 'reason': 'User and/o...",True


<b> In class assignment </b>

What does <i>"was impossible": True indicate?</i>  

For how many cases in Test set, the predictions are set to "was_impossible"? And what could be the reasons for it?

### Predictions

In [48]:
# Make prediction for a single user
algo.predict(uid="1",iid="O Brother, Where Art Thou? (2000)")

Prediction(uid='1', iid='O Brother, Where Art Thou? (2000)', r_ui=None, est=2.5, details={'actual_k': 0, 'was_impossible': False})

### Generating top n recommendations

In [49]:
testset_new = trainset.build_anti_testset()

In [52]:
len(testset_new)

6

In [53]:
testset_new[:]

[('3', 'Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)', 3.25),
 ('3', 'Donnie Darko (2001)', 3.25),
 ('1', 'O Brother, Where Art Thou? (2000)', 3.25),
 ('1', 'Donnie Darko (2001)', 3.25),
 ('2', 'Crouching Tiger, Hidden Dragon (Wo hu cang long) (2000)', 3.25),
 ('2', 'Snatch (2000)', 3.25)]

In [54]:
predictions = algo.test(testset_new[:])

In [55]:
predictions_df = pd.DataFrame([[x.uid,x.iid,x.est] for x in predictions])

In [56]:
predictions_df.columns = ["userId","movie_name","est_rating"]
predictions_df.sort_values(by = ["userId", "est_rating"],ascending=False,inplace=True)

In [60]:
predictions_df

Unnamed: 0,userId,movie_name,est_rating
1,3,Donnie Darko (2001),5.0
0,3,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",4.0
4,2,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",4.0
5,2,Snatch (2000),2.75
3,1,Donnie Darko (2001),5.0
2,1,"O Brother, Where Art Thou? (2000)",2.5


In [68]:
top_10_recos = predictions_df.groupby("userId").head(10).reset_index(drop=True)

In [69]:
top_10_recos

Unnamed: 0,userId,movie_name,est_rating
0,3,Donnie Darko (2001),5.0
1,3,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",4.0
2,2,"Crouching Tiger, Hidden Dragon (Wo hu cang lon...",4.0
3,2,Snatch (2000),2.75
4,1,Donnie Darko (2001),5.0
5,1,"O Brother, Where Art Thou? (2000)",2.5


## SVD Based Recommendation

In [70]:
ratings.shape

(9, 7)

In [71]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [72]:
ratings.shape

(9, 7)

In [73]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

# to build on full data
#trainset = data.build_full_trainset()

In [74]:
from surprise import SVD
from surprise import accuracy

In [84]:
svd_model = SVD(n_factors=4,biased=False)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1fca872fa58>

In [85]:
test_pred = svd_model.test(testset)

In [86]:
test_pred_df = pd.DataFrame([[x.uid,x.iid,x.est] for x in test_pred])

In [87]:
test_pred_df.head()

Unnamed: 0,0,1,2
0,3,Shrek (2001),3.25
1,2,Shrek (2001),3.25
2,1,"O Brother, Where Art Thou? (2000)",1.0


In [88]:
test_pred_df.columns = ["userId","movie_name","est_rating"]
test_pred_df.sort_values(by = ["userId", "est_rating"],ascending=False,inplace=True)

In [89]:
test_pred_df.head()

Unnamed: 0,userId,movie_name,est_rating
0,3,Shrek (2001),3.25
1,2,Shrek (2001),3.25
2,1,"O Brother, Where Art Thou? (2000)",1.0


In [90]:
top_10_recos = test_pred_df.groupby("userId").head(10).reset_index(drop=True)

In [91]:
top_10_recos.head(30)

Unnamed: 0,userId,movie_name,est_rating
0,3,Shrek (2001),3.25
1,2,Shrek (2001),3.25
2,1,"O Brother, Where Art Thou? (2000)",1.0


In [96]:

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 1.7912


1.7911821050170564