# License 
***
Copyright (C) 2017-2022 J. Patrick Hall, jphall@gwu.edu

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

***
# Kaggle Movie Lens and Basic Collaborative Filtering

1. General imports and inits

In [24]:
# basic packages for recommendation
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF

# to upload local files
import io
from google.colab import files  

SEED = 12345 # fixed seed, passed as random_state to NMF and KMeans below for better reproducibility

2. Import train data

In [29]:
# special Google Colab command to upload a file from your computer
# REQUIRES STUDENT INPUT
# upload: u_data.txt and u_item.txt (the MovieLens u.data and u.item files)
# NOTE(review): the result is presumably a dict of {filename: raw bytes} -- later cells index it by filename and call .decode()
uploaded = files.upload() 

Saving u_item.txt to u_item (1).txt


In [26]:
# 3
uploaded.keys() # what is stored in that Python object? (the keys are the names that cells 4 and 5 look up)

dict_keys(['u_data.txt'])

4. Load ratings data

In [27]:
# Column names for the ratings file; passed via names= so pandas labels the columns
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# Decode the uploaded bytes as latin-1 and parse them as tab-separated text
ratings = pd.read_csv(io.StringIO(uploaded['u_data.txt'].decode('latin-1')), sep='\t',
          names=r_cols) # name in quotes here must match name in 3 above

5. Load items data

In [30]:
# Column names for the pipe-delimited items file: descriptive fields first, then the genre columns
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items_text = uploaded['u_item.txt'].decode('latin-1') # name in quotes here must match name in 3 above
movies = pd.read_csv(io.StringIO(items_text), sep='|', names=i_cols)
movies.index = movies.index + 1 # shift row labels to start at 1; necessary for later join

6. View ratings data 
* Sparse! Stored here in coordinate (COO) form: one row per (user, movie, rating) entry
* Read as: user 196 rated movie 242 with a 3
* As a sparse matrix, rows would be all users, columns would be all movies, and data would be movie ratings


In [31]:
ratings # display the ratings table: one row per (user_id, movie_id, rating, timestamp) record

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


7. View movies data

In [32]:
movies.head() # preview the first few rows of the movies table

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


8. Summarize loaded data

In [33]:
# Report the number of rows in each loaded table
print('Total movies:', movies.shape[0])
print('Total ratings:', ratings.shape[0])

Total movies: 1682
Total ratings: 100000


9. Expand COO data into a large, mostly-zero user-by-movie matrix
* This is wasteful and is done only so that we can use scikit-learn!
* For actual big data, this would be extremely inefficient and would probably fail

In [34]:
# Reshape to one row per user and one column per movie; each cell holds that user's rating
df_ratings = ratings.pivot(index='user_id', columns='movie_id', values='rating')
df_ratings = df_ratings.fillna(0) # user/movie pairs with no rating become 0
df_ratings # NOTE: movies and users are indexed from 1, not 0!

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


10. Factorize ratings into:
* H matrix with one row per user
* W matrix with one column per movie

In [43]:
# Non-negative matrix factorization of the user-by-movie ratings into 15 components
mf_model = NMF(
    n_components=15, # REQUIRES STUDENT INPUT
    init='random',
    random_state=SEED,
    max_iter=1000,
)
# NOTE: scikit-learn's documentation calls the fit_transform output W and components_ H;
# this notebook uses the opposite letters, so keep that in mind when reading the docs
H = mf_model.fit_transform(df_ratings)
W = mf_model.components_
print(H.shape) # H has all users
print(W.shape) # W has all movies

(943, 15)
(15, 1682)


11. Fit many small clusters in H

In [44]:
# REQUIRES STUDENT INPUT: make many small clusters of users
kmeans_model = KMeans(n_clusters=50, random_state=SEED)
kmeans_model = kmeans_model.fit(H) # fit returns the estimator itself, so the name still refers to the fitted model

12. Add cluster labels

In [45]:
# Name one column per factor in H, then tag every row with its k-means cluster
n_archetypes = H.shape[1]
cols = ['archetype_' + str(k) for k in range(n_archetypes)]
H_df = pd.DataFrame(H, columns=cols)
H_df['cluster'] = kmeans_model.predict(H)
H_df = H_df.sort_values(by='cluster') # group rows of the same cluster together
H_df # scroll to right to see cluster labels

Unnamed: 0,archetype_0,archetype_1,archetype_2,archetype_3,archetype_4,archetype_5,archetype_6,archetype_7,archetype_8,archetype_9,archetype_10,archetype_11,archetype_12,archetype_13,archetype_14,cluster
632,0.091321,0.000000,0.658057,0.000000,0.083425,0.356989,0.070591,0.000000,0.000000,0.172974,0.000000,0.000000,0.014567,0.129230,0.000000,0
558,0.418866,0.000000,0.704610,0.000000,0.257810,0.000000,0.038389,0.000000,0.000000,0.000000,0.109949,0.301702,0.000000,0.000000,0.000000,0
134,0.000000,0.000000,0.701084,0.000000,0.047545,0.000000,0.062400,0.000000,0.039132,0.511325,0.000000,0.003513,0.000000,0.000000,0.000000,0
492,0.000000,0.000000,0.635536,0.000000,0.380017,0.245605,0.121922,0.021994,0.122890,0.000000,0.000000,0.000000,0.246077,0.235500,0.239904,0
136,0.000000,0.000000,0.845443,0.000000,0.003465,0.116649,0.031185,0.000000,0.000000,0.000000,0.000000,0.000000,0.276554,0.332489,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,0.432022,0.564235,0.003787,0.000000,0.292995,0.655286,0.000000,0.349768,0.040018,0.138157,0.000000,0.000000,0.039060,0.037786,0.161302,47
344,3.471219,1.926715,0.158288,0.000000,0.000000,0.371719,0.018700,0.150752,0.095083,0.140903,0.486481,0.000000,0.141020,0.374641,0.000000,48
173,2.885530,1.686758,0.000000,0.448505,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.369466,0.000000,0.003038,0.449738,0.000000,48
845,1.961135,0.000000,1.928399,1.125831,0.092590,0.108787,0.008671,0.243509,0.061219,0.551978,0.120025,2.190714,0.000000,0.000000,0.012144,49


13. Extract users from first cluster

In [46]:
# H_df carries the default 0-based row labels of H, so each label is a row position in df_ratings
cluster_0_rows = H_df[H_df['cluster'] == 0].index
# Look the user ids up by position rather than adding 1, so the result stays correct
# even if user ids are not exactly 1..N with no gaps
cluster_0_user_ids = sorted(df_ratings.index[cluster_0_rows.values])
cluster_0_user_ids # see users ids in cluster 0

[5,
 135,
 137,
 183,
 186,
 246,
 249,
 352,
 411,
 493,
 559,
 566,
 603,
 621,
 633,
 643,
 660,
 738,
 778,
 806,
 833,
 843,
 844,
 862,
 868]

14. Extract movie IDs for cluster 0, sorted by total rating

In [47]:
cluster_0_ratings = df_ratings.loc[cluster_0_user_ids, :] # rows of the ratings matrix for cluster 0 users only
movie_totals = cluster_0_ratings.sum(axis=0) # add up each movie's ratings across those users
cluster_0_movie_ids = movie_totals.to_frame(name='total_ratings') # one-column table indexed by movie_id
cluster_0_movie_ids = cluster_0_movie_ids.sort_values(by='total_ratings', ascending=False) # highest totals first
cluster_0_movie_ids = cluster_0_movie_ids[cluster_0_movie_ids['total_ratings'] > 0] # drop movies with 0 ratings (unwatched movies)
cluster_0_movie_ids

Unnamed: 0_level_0,total_ratings
movie_id,Unnamed: 1_level_1
50,89.0
172,84.0
174,81.0
181,79.0
210,76.0
...,...
1089,1.0
575,1.0
442,1.0
579,1.0


15. Join name information and see most popular movie titles in cluster 0

In [48]:
# Attach titles: on='movie_id' matches the totals table's movie_id against the row labels of movies
cluster_0_movie_ids.join(
    movies[['movie_id', 'title']],
    on='movie_id',
    how='left',
    lsuffix='_L',
    rsuffix='_R',
) # action movies!

Unnamed: 0_level_0,total_ratings,movie_id,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
50,89.0,50,Star Wars (1977)
172,84.0,172,"Empire Strikes Back, The (1980)"
174,81.0,174,Raiders of the Lost Ark (1981)
181,79.0,181,Return of the Jedi (1983)
210,76.0,210,Indiana Jones and the Last Crusade (1989)
...,...,...,...
1089,1.0,1089,Speed 2: Cruise Control (1997)
575,1.0,575,City Slickers II: The Legend of Curly's Gold (...
442,1.0,442,"Amityville Curse, The (1990)"
579,1.0,579,Fatal Instinct (1993)


16. Determine which movies user 5 has seen
* User 5 is in cluster 0

In [49]:
user_5_ratings = df_ratings.loc[5] # user 5's row of the ratings matrix
user_5_movie_ids = user_5_ratings[user_5_ratings > 0].index.tolist() # keep only movies with a rating above 0
print(user_5_movie_ids) # movies user 5 has watched and rated
print(len(user_5_movie_ids))

[1, 2, 17, 21, 24, 25, 29, 40, 42, 50, 62, 63, 66, 69, 70, 79, 80, 89, 90, 94, 95, 98, 99, 100, 101, 102, 105, 109, 110, 121, 135, 139, 143, 144, 145, 151, 153, 154, 162, 163, 167, 168, 169, 172, 173, 174, 176, 181, 183, 185, 186, 189, 194, 200, 204, 208, 209, 210, 211, 214, 216, 219, 222, 225, 226, 227, 228, 229, 230, 231, 233, 234, 235, 239, 241, 243, 250, 257, 259, 267, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457]
175


17. See recommended movie titles for user 5

In [50]:
# Drop the movies user 5 has seen from cluster_0_movie_ids; errors='ignore' skips any id that is
# not in the cluster table instead of raising KeyError (e.g. if a different user is substituted)
recs = cluster_0_movie_ids.drop(user_5_movie_ids, axis=0, errors='ignore')
recs.join(movies[['movie_id', 'title']], on='movie_id', how='left',  lsuffix='_L', rsuffix='_R').head() # join to title information and show best recommended titles

Unnamed: 0_level_0,total_ratings,movie_id,title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
56,74.0,56,Pulp Fiction (1994)
195,68.0,195,"Terminator, The (1984)"
96,67.0,96,Terminator 2: Judgment Day (1991)
12,64.0,12,"Usual Suspects, The (1995)"
117,61.0,117,"Rock, The (1996)"
