In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
a = 2
b = 3
a
b

2

3

## User & Item based Collaborative Filtering

### Load data

In [3]:
import pandas as pd
import numpy as np

In [4]:
ratings = pd.read_csv("3 - ratings_sub.csv",encoding = "ISO-8859-1")

In [5]:
ratings.shape

(487469, 7)

In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,3218,3889,1.0,1172532894,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
1,3663,3889,1.0,1044474348,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
2,3704,3889,3.0,971391538,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
3,8877,3889,1.0,1050744366,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0
4,9599,3889,0.5,1378056755,Highlander: Endgame (Highlander IV) (2000),Action|Adventure|Fantasy,2000.0


In [7]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 487469 entries, 0 to 487468
Data columns (total 7 columns):
userId       487469 non-null int64
movieId      487469 non-null int64
rating       487469 non-null float64
timestamp    487469 non-null int64
title        487469 non-null object
genres       487469 non-null object
year         487469 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 26.0+ MB


In [8]:
ratings.userId=ratings.userId.astype(str)
ratings.movieId=ratings.movieId.astype(str)

In [9]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres', 'year'], dtype='object')

### Data Exploration & Transformation

<b> Find the top 10 most popular movies watched </b>


In [10]:
# Total unique users 
print("total unique users - ",len(ratings["userId"].unique()))

total unique users -  2827


<b> Q: Who are the users with maximum no of movies watched? </b>

In [11]:
# Users with max no of movies watches
ratings["userId"].value_counts().head()

37629     200
105856    200
119152    200
122882    200
89138     200
Name: userId, dtype: int64

### Transforming data to surprise format

In [12]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))

In [13]:
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [14]:
data

<surprise.dataset.DatasetAutoFolds at 0x2aa33ae42e8>

In [15]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

# to build on full data
#trainset = data.build_full_trainset()

In [16]:
type(trainset)

surprise.trainset.Trainset

### Making sense of trainset 

Points to Note:
    
    1) Trainset is no longer a pandas dataframe. Rather, it's a specific datatypes defined by the Surprise library
    2) UserId and ItemId in the pandas dataframe can contain any value (either string/integer etc). However, Trainset convert these raw ids into numeric indexes called as "inner id"
    3) Methods are provided to convert rw id to inner id and vice verca

In [17]:
# user item rating data can be obtained as follows
user_records = trainset.ur
type(user_records)

collections.defaultdict

In [18]:
for keys in user_records.keys():
    print(keys)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [19]:
user_records[0]

[(0, 3.0),
 (195, 4.0),
 (1066, 3.5),
 (999, 3.5),
 (237, 3.0),
 (1577, 3.0),
 (932, 2.0),
 (247, 4.5),
 (2215, 3.0),
 (221, 4.0),
 (745, 3.0),
 (133, 3.0),
 (249, 3.0),
 (1065, 2.5),
 (255, 3.5),
 (167, 4.0),
 (586, 3.5),
 (1234, 4.0),
 (259, 4.5),
 (729, 2.5),
 (236, 3.5),
 (181, 3.5),
 (3245, 3.5),
 (1014, 3.0),
 (577, 5.0),
 (2789, 3.5),
 (91, 4.0),
 (10, 4.0),
 (19, 3.5),
 (274, 4.0),
 (2135, 3.0),
 (1419, 3.5),
 (695, 4.0),
 (1373, 3.5),
 (850, 3.0),
 (334, 4.0),
 (2759, 3.0),
 (222, 3.0),
 (37, 4.0),
 (380, 2.5),
 (544, 4.0),
 (542, 4.5),
 (1135, 5.0),
 (650, 5.0),
 (4625, 3.5),
 (341, 1.0),
 (780, 4.0),
 (2371, 3.0),
 (661, 4.0),
 (4742, 4.5),
 (1660, 3.5),
 (4189, 2.5),
 (110, 2.5),
 (2349, 3.0),
 (2285, 3.5),
 (2623, 3.0),
 (1001, 4.0),
 (1490, 3.0),
 (171, 4.0),
 (465, 4.0),
 (733, 5.0),
 (894, 3.0),
 (3771, 3.0),
 (933, 3.0),
 (1083, 3.0),
 (3003, 3.0),
 (11, 3.0),
 (756, 2.5),
 (604, 3.5),
 (258, 4.0),
 (725, 5.0),
 (320, 3.5),
 (1838, 5.0),
 (383, 3.0),
 (3977, 3.0),
 (19

In [20]:
# However the ids are the inner ids and not the raw ids
# raw ids can be obatined as follows

print(trainset.to_raw_uid(0))
print(trainset.to_raw_iid(1066))

248
Step Up 2 the Streets (2008)


In [21]:
user_records[0]

[(0, 3.0),
 (195, 4.0),
 (1066, 3.5),
 (999, 3.5),
 (237, 3.0),
 (1577, 3.0),
 (932, 2.0),
 (247, 4.5),
 (2215, 3.0),
 (221, 4.0),
 (745, 3.0),
 (133, 3.0),
 (249, 3.0),
 (1065, 2.5),
 (255, 3.5),
 (167, 4.0),
 (586, 3.5),
 (1234, 4.0),
 (259, 4.5),
 (729, 2.5),
 (236, 3.5),
 (181, 3.5),
 (3245, 3.5),
 (1014, 3.0),
 (577, 5.0),
 (2789, 3.5),
 (91, 4.0),
 (10, 4.0),
 (19, 3.5),
 (274, 4.0),
 (2135, 3.0),
 (1419, 3.5),
 (695, 4.0),
 (1373, 3.5),
 (850, 3.0),
 (334, 4.0),
 (2759, 3.0),
 (222, 3.0),
 (37, 4.0),
 (380, 2.5),
 (544, 4.0),
 (542, 4.5),
 (1135, 5.0),
 (650, 5.0),
 (4625, 3.5),
 (341, 1.0),
 (780, 4.0),
 (2371, 3.0),
 (661, 4.0),
 (4742, 4.5),
 (1660, 3.5),
 (4189, 2.5),
 (110, 2.5),
 (2349, 3.0),
 (2285, 3.5),
 (2623, 3.0),
 (1001, 4.0),
 (1490, 3.0),
 (171, 4.0),
 (465, 4.0),
 (733, 5.0),
 (894, 3.0),
 (3771, 3.0),
 (933, 3.0),
 (1083, 3.0),
 (3003, 3.0),
 (11, 3.0),
 (756, 2.5),
 (604, 3.5),
 (258, 4.0),
 (725, 5.0),
 (320, 3.5),
 (1838, 5.0),
 (383, 3.0),
 (3977, 3.0),
 (19

<b> In Class Assignment </b>

Confirm the raw to internal id mapping with original data, for a given user/item combination (uid - 0 & iid - 1066)


### Training the model

In [22]:
from surprise import KNNWithMeans
from surprise import accuracy
from surprise import Prediction

In [23]:
algo = KNNWithMeans(k=51, sim_options={'name': 'pearson', 'user_based': False})
algo.fit(trainset)


Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2aa33c3b5f8>

### Find K most similiar items

<b> In-class assignment </b>

Which movies are most similiar to Finding Nemo? (Hint: Use <b> get_neighbors </b> method of the algo object)

### Evaluating Model Performance

In [24]:
len(testset)

121868

In [25]:
testset[0:5]

[('107317', 'Signs (2002)', 2.5),
 ('103061', 'Inconvenient Truth, An (2006)', 4.5),
 ('84115', 'Battlefield Earth (2000)', 2.5),
 ('130756',
  'Fast and the Furious: Tokyo Drift, The (Fast and the Furious 3, The) (2006)',
  2.0),
 ('24878', 'Drive (2011)', 4.5)]

In [26]:
# Evalute on test set
test_pred = algo.test(testset)

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 0.8113


0.8113433713272009

In [49]:
# View a particular prediction
test_pred[12]

# To access a particular value, say estimate simply mention test_pred[12].est

Prediction(uid='7051', iid='Black Hawk Down (2001)', r_ui=5.0, est=3.867693529499944, details={'actual_k': 51, 'was_impossible': False})

In [50]:
test_pred[12].details["actual_k"]

51

In [51]:
# convert results to dataframe
test_pred_df = pd.DataFrame(test_pred)
test_pred_df["was_impossible"] = [x["was_impossible"] for x in test_pred_df["details"]]

In [62]:
test_pred_df.loc[test_pred_df.was_impossible].tail(5)

Unnamed: 0,uid,iid,r_ui,est,details,was_impossible
159,36730,Grill Point (Halbe Treppe) (2002),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
604,131040,Escape from Planet Earth (2013),2.0,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
827,116349,No Good Deed (2014),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
1865,124431,Films to Keep You Awake: The Christmas Tale (P...,0.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
2020,21811,Insanitarium (2008),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
2118,122288,Something Real and Good (2013),3.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
2125,2606,In the Land of Blood and Honey (2011),4.0,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
2365,37062,Suburban Gothic (2014),2.5,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
2991,54414,Spring Breakdown (2009),2.0,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True
3088,69225,"11th Hour, The (2007)",4.0,3.511396,"{'was_impossible': True, 'reason': 'User and/o...",True


<b> In class assignment </b>

What does <i>"was impossible": True indicate?</i>  

For how many cases in Test set, the predictions are set to "was_impossible"? And what could be the reasons for it?

### Predictions

In [63]:
# Mkae prediction for a single user
algo.predict(uid="41891",iid="Wrong Trousers, The (1993)")

Prediction(uid='41891', iid='Wrong Trousers, The (1993)', r_ui=None, est=3.511396303620614, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'})

### Generating top n recommendations

In [64]:
testset_new = trainset.build_anti_testset()

In [65]:
len(testset_new)

17308818

In [66]:
testset_new[0:5]

[('248', 'Disturbia (2007)', 3.511396303620614),
 ('248', 'Hamlet 2 (2008)', 3.511396303620614),
 ('248', 'Unbreakable (2000)', 3.511396303620614),
 ('248', 'Finding Neverland (2004)', 3.511396303620614),
 ('248', 'X2: X-Men United (2003)', 3.511396303620614)]

In [67]:
predictions = algo.test(testset_new[0:10000])

In [68]:
predictions_df = pd.DataFrame([[x.uid,x.iid,x.est] for x in predictions])

In [69]:
predictions_df.columns = ["userId","movie_name","est_rating"]
predictions_df.sort_values(by = ["userId", "est_rating"],ascending=False,inplace=True)

In [73]:
predictions_df.head(10)

Unnamed: 0,userId,movie_name,est_rating
8040,45844,Elizabeth I (2005),5.0
9039,45844,Star Wars Uncut: Director's Cut (2012),5.0
9147,45844,Lucky Break (2001),5.0
9413,45844,Dog Pound (2010),5.0
9497,45844,911 in Plane Site (2004),5.0
9507,45844,Wild Things: Diamonds in the Rough (2005),5.0
9539,45844,Serial (Bad) Weddings (Qu'est-ce Qu'on An Fit ...,5.0
9877,45844,Bag It (2010),5.0
9909,45844,Triad Election (Election 2) (Hak se wui yi wo ...,5.0
9978,45844,Stromberg - Der Film (2014),5.0


In [76]:
top_10_recos = predictions_df.groupby("userId").head(10).reset_index(drop=True)

In [77]:
top_10_recos

Unnamed: 0,userId,movie_name,est_rating
0,45844,Elizabeth I (2005),5.0
1,45844,Star Wars Uncut: Director's Cut (2012),5.0
2,45844,Lucky Break (2001),5.0
3,45844,Dog Pound (2010),5.0
4,45844,911 in Plane Site (2004),5.0
5,45844,Wild Things: Diamonds in the Rough (2005),5.0
6,45844,Serial (Bad) Weddings (Qu'est-ce Qu'on An Fit ...,5.0
7,45844,Bag It (2010),5.0
8,45844,Triad Election (Election 2) (Hak se wui yi wo ...,5.0
9,45844,Stromberg - Der Film (2014),5.0


## SVD Based Recommendation

In [78]:
# Lets exclude movies with very few ratings, say less than 5
movie_count = ratings["title"].value_counts(ascending=False)
pop_movie = movie_count.loc[movie_count.values > 200].index
len(pop_movie)


567

In [79]:
ratings = ratings.loc[ratings.title.isin(pop_movie)]
ratings.shape

(350710, 7)

In [80]:
from surprise import Dataset,Reader
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'title', 'rating']], reader)

In [81]:
ratings.shape

(350710, 7)

In [82]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25,random_state=123)

# to build on full data
#trainset = data.build_full_trainset()

In [83]:
from surprise import SVD
from surprise import accuracy

In [100]:
svd_model = SVD(n_factors=50,biased=False)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2aa375c94a8>

In [101]:
test_pred = svd_model.test(testset)

In [102]:
test_pred_df = pd.DataFrame([[x.uid,x.iid,x.est] for x in test_pred])

In [103]:
test_pred_df.head()

Unnamed: 0,0,1,2
0,7531,You Can Count on Me (2000),4.748084
1,45291,Eternal Sunshine of the Spotless Mind (2004),3.319848
2,92026,High Fidelity (2000),3.802283
3,122811,"Curious Case of Benjamin Button, The (2008)",3.363119
4,83105,Bruce Almighty (2003),3.161558


In [104]:
test_pred_df.columns = ["userId","movie_name","est_rating"]
test_pred_df.sort_values(by = ["userId", "est_rating"],ascending=False,inplace=True)

In [105]:
test_pred_df.head()

Unnamed: 0,userId,movie_name,est_rating
29185,99863,"Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le)...",3.620923
26419,99863,"Lord of the Rings: The Return of the King, The...",3.553263
23777,99863,"Lord of the Rings: The Two Towers, The (2002)",3.544782
58233,99863,"O Brother, Where Art Thou? (2000)",3.360659
40312,99863,Big Fish (2003),3.286475


In [106]:
top_10_recos = test_pred_df.groupby("userId").head(10).reset_index(drop=True)

In [107]:
top_10_recos.head(30)

Unnamed: 0,userId,movie_name,est_rating
0,99863,"Amelie (Fabuleux destin d'AmÃ©lie Poulain, Le)...",3.620923
1,99863,"Lord of the Rings: The Return of the King, The...",3.553263
2,99863,"Lord of the Rings: The Two Towers, The (2002)",3.544782
3,99863,"O Brother, Where Art Thou? (2000)",3.360659
4,99863,Big Fish (2003),3.286475
5,99863,Eternal Sunshine of the Spotless Mind (2004),3.238179
6,99863,Battle Royale (Batoru rowaiaru) (2000),3.197466
7,99863,"Bourne Ultimatum, The (2007)",3.189066
8,99863,Mystic River (2003),3.126122
9,99863,28 Days Later (2002),3.068587


In [108]:

# compute RMSE
accuracy.rmse(test_pred)

RMSE: 0.7692


0.7691777998767567