In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
ratings = pd.read_csv("ml-latest-small/ratings.csv")
movies = pd.read_csv("ml-latest-small/movies.csv")

## Implement a baseline recommender

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


**Merge/join the ratings and movies tables together**


In [5]:
movie_ratings = ratings.merge(movies)
movie_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


**Average rating for each movie in the dataset**

In [6]:
avg_rating = movie_ratings.groupby('title')['rating'].mean()
avg_rating

title
'71 (2014)                                   4.000000
'Hellboy': The Seeds of Creation (2004)      4.000000
'Round Midnight (1986)                       3.500000
'Salem's Lot (2004)                          5.000000
'Til There Was You (1997)                    4.000000
                                               ...   
eXistenZ (1999)                              3.863636
xXx (2002)                                   2.770833
xXx: State of the Union (2005)               2.000000
¡Three Amigos! (1986)                        3.134615
À nous la liberté (Freedom for Us) (1931)    1.000000
Name: rating, Length: 9719, dtype: float64

**Filter out movies that have been watched by less than 20 users**

In [7]:
ratings_per_title = movie_ratings.groupby('title')['userId'].count()
ratings_per_title.sort_values(ascending=False).head(10)


title
Forrest Gump (1994)                          329
Shawshank Redemption, The (1994)             317
Pulp Fiction (1994)                          307
Silence of the Lambs, The (1991)             279
Matrix, The (1999)                           278
Star Wars: Episode IV - A New Hope (1977)    251
Jurassic Park (1993)                         238
Braveheart (1995)                            237
Terminator 2: Judgment Day (1991)            224
Schindler's List (1993)                      220
Name: userId, dtype: int64

In [8]:
movie_ratings_full = avg_rating.reset_index().merge(ratings_per_title.reset_index())
movie_ratings_full.columns = ['title', 'avg_rating', 'nr_ratings']

movie_ratings_full = movie_ratings_full.merge(movies[['title']])


In [9]:
movie_ratings_full

Unnamed: 0,title,avg_rating,nr_ratings
0,'71 (2014),4.000000,1
1,'Hellboy': The Seeds of Creation (2004),4.000000,1
2,'Round Midnight (1986),3.500000,2
3,'Salem's Lot (2004),5.000000,1
4,'Til There Was You (1997),4.000000,2
...,...,...,...
9719,eXistenZ (1999),3.863636,22
9720,xXx (2002),2.770833,24
9721,xXx: State of the Union (2005),2.000000,5
9722,¡Three Amigos! (1986),3.134615,26


In [10]:
movie_ratings_full['year'] = movie_ratings_full['title'].str.extract(r'\((\d{4})\)')

In [11]:
movie_ratings_full

Unnamed: 0,title,avg_rating,nr_ratings,year
0,'71 (2014),4.000000,1,2014
1,'Hellboy': The Seeds of Creation (2004),4.000000,1,2004
2,'Round Midnight (1986),3.500000,2,1986
3,'Salem's Lot (2004),5.000000,1,2004
4,'Til There Was You (1997),4.000000,2,1997
...,...,...,...,...
9719,eXistenZ (1999),3.863636,22,1999
9720,xXx (2002),2.770833,24,2002
9721,xXx: State of the Union (2005),2.000000,5,2005
9722,¡Three Amigos! (1986),3.134615,26,1986


In [12]:
movies_20 = movie_ratings_full[movie_ratings_full['nr_ratings'] > 19]
movies_20

Unnamed: 0,title,avg_rating,nr_ratings,year
8,(500) Days of Summer (2009),3.666667,42,2009
18,10 Things I Hate About You (1999),3.527778,54,1999
23,101 Dalmatians (1996),3.074468,47,1996
24,101 Dalmatians (One Hundred and One Dalmatians...,3.431818,44,1961
34,12 Angry Men (1957),4.149123,57,1957
...,...,...,...,...
9708,Zoolander (2001),3.509259,54,2001
9712,Zootopia (2016),3.890625,32,2016
9719,eXistenZ (1999),3.863636,22,1999
9720,xXx (2002),2.770833,24,2002


In [13]:
movie_ratings

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [14]:
# Combining the movie table with movies that have been rated at least 20 times
df = pd.merge(movie_ratings, movies_20, how='right')
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,avg_rating,nr_ratings,year
0,15,69757,4.0,1299425345,(500) Days of Summer (2009),Comedy|Drama|Romance,3.666667,42,2009
1,18,69757,4.0,1455750175,(500) Days of Summer (2009),Comedy|Drama|Romance,3.666667,42,2009
2,22,69757,0.5,1268726748,(500) Days of Summer (2009),Comedy|Drama|Romance,3.666667,42,2009
3,41,69757,3.5,1458938869,(500) Days of Summer (2009),Comedy|Drama|Romance,3.666667,42,2009
4,62,69757,4.5,1521490345,(500) Days of Summer (2009),Comedy|Drama|Romance,3.666667,42,2009
...,...,...,...,...,...,...,...,...,...
67979,477,2478,3.5,1200944830,¡Three Amigos! (1986),Comedy|Western,3.134615,26,1986
67980,555,2478,3.0,978823434,¡Three Amigos! (1986),Comedy|Western,3.134615,26,1986
67981,561,2478,4.0,1491092281,¡Three Amigos! (1986),Comedy|Western,3.134615,26,1986
67982,597,2478,3.0,941641402,¡Three Amigos! (1986),Comedy|Western,3.134615,26,1986


In [24]:
df['title'].value_counts()

Forrest Gump (1994)                           329
Shawshank Redemption, The (1994)              317
Pulp Fiction (1994)                           307
Silence of the Lambs, The (1991)              279
Matrix, The (1999)                            278
                                             ... 
Perks of Being a Wallflower, The (2012)        20
Adventures in Babysitting (1987)               20
Solaris (2002)                                 20
Bill & Ted's Bogus Journey (1991)              20
Transformers: Revenge of the Fallen (2009)     20
Name: title, Length: 1297, dtype: int64

**User-item matrix**

In [25]:
user_item = pd.pivot_table(df, 
                           values='rating', 
                           index='userId', 
                           columns='movieId'
)
user_item

movieId,1,2,3,5,6,7,10,11,16,17,...,122920,122922,134130,134853,139385,148626,152081,164179,166528,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,4.0,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,2.5,,2.5,,4.0,...,,,,,,,,,,
607,4.0,,,,,,,3.0,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,4.0,,4.5,,...,,,,,,,,,,
609,3.0,,,,,,4.0,,,,...,,,,,,,,,,


In [26]:
# calculate the fraction of missing entries (=sparsity)
np.mean(user_item.isna().values)

0.9143086106588927

### User-item matrix as a sparse matrix

In [27]:
from scipy.sparse import csr_matrix

In [28]:
# Initialize a sparse matrix 
# (data, (row_ind, col_ind)
user_item = csr_matrix((df['rating'], (df['userId'], df['movieId'])))
user_item

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 67901 stored elements in Compressed Sparse Row format>

In [15]:
# collaborative filtering = look at ratings only!

def recommend_random(query, ratings, k=10):
    """
    Filters and recommends k random movies for any given input query.
    Returns a list of k movie ids    
    """
    # 1. candiate generation
    
    # filter out movies that the user has allready seen
    
    
    
    # 2. "scoring"
    
    # calculate a random sample of movies
    
    
    # 3. "ranking"
    
    # no ranking applied   
    

In [16]:
def recommend_popular(query, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query. 
    Returns a list of k movie ids.
    """
    # 1. candiate generation
    
    # filter out movies that the user has allready seen
    
    # filter out movies that have been watched by less than 20/50/100... users
    
   
    # 2. scoring
    
    # calculate the average rating for each movie
    
    
    # 3. ranking
    
    # return the top-k highst rated movie ids or titles
    
    return [364, 372, 43, 34, 243]

In [None]:
def recommend_popular(query, ratings, k=10):
    # create a new dataframe
    df_agg = ratings
    # add a column for the number of people who reviewed
    df_agg['nb_reviews'] = 1
    # aggregate columns
    df_agg = ratings.groupby(df['title'], as_index=False)['title', 'genres', 'rating', 'nb_reviews'].agg(
    {
        'title': 'first',
        'genres': 'first',
        'rating': 'mean',
        'nb_reviews': 'sum'})
        
    # rename the rating columns as it now corresponds to the average
    df_agg.rename(columns={'rating': 'average_rating'}, inplace=True)
    
    # make sure the best rated movies are shown on top
    df_agg.sort_values('average_rating', ascending=False, inplace=True)
    
    # only keep the movies that have been reviewed by 20 people or more
    df_agg_over_20 = df_agg[df_agg['nb_reviews'] > 20]
    
    # store the watched movies indexes in a list
    movies_I_rated = list(query.keys())
    
    final_df = df_agg_over_20[df_agg_over_20.index.isin(movies_I_rated) == False].head(k)
    
    return df_agg_over_20

recommend_popular(query, df, k=20)
    

In [17]:
query = {
        #movieId: rating
        162: 4,
        23: 1,
        1: 5
}

recommend_popular(query, ratings)

[364, 372, 43, 34, 243]

In [18]:
# recommender.py
# from recommender import recommend_popular

# NMF (Non Negative Matrix Factorization)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
import pickle

---
## 1. Model Development

### Preprocessing

- filter out movies rated by less than 20/ 50 / 100 ... users
- filter out movies with an average rating lower than 2
- create a sparse user item matrix

In [4]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')

In [46]:
ratings
ratings.groupby('userId')['movieId'].count().sort_values(ascending=False)

userId
414    1070
599     926
68      852
474     724
274     700
       ... 
320       9
324       9
397       9
578       8
175       4
Name: movieId, Length: 610, dtype: int64

In [5]:
# for calculating recommendations
query = {
    # movieId, rating
    12:4, 
    92:5,
    177:4,
    196:5,
    891:4,
    1128:5,
    1258:5,
    1320:4
}


# for testing the recommender after getting some recommendations
relevant_items = [
    1331, 1333, 1347, 1977, 
    2279, 2389, 2517, 2560, 2644, 2754, 
    2901, 3294, 3652, 3693   
]

In [6]:
# which movies are in the query?

movies.set_index('movieId').loc[query.keys()]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
12,Dracula: Dead and Loving It (1995),Comedy|Horror
92,Mary Reilly (1996),Drama|Horror|Thriller
177,Lord of Illusions (1995),Horror
196,Species (1995),Horror|Sci-Fi
891,Halloween: The Curse of Michael Myers (Hallowe...,Horror|Thriller
1128,"Fog, The (1980)",Horror
1258,"Shining, The (1980)",Horror
1320,Alien³ (a.k.a. Alien 3) (1992),Action|Horror|Sci-Fi|Thriller


In [7]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [8]:
# calculate the number of ratings per movie
rating_per_movie = ratings.groupby('movieId')['userId'].count()
rating_per_movie

movieId
1         215
2         110
3          52
4           7
5          49
         ... 
193581      1
193583      1
193585      1
193587      1
193609      1
Name: userId, Length: 9724, dtype: int64

In [9]:
# filter the ratings matrix and only keep the popular movies
popular_movies = rating_per_movie.loc[rating_per_movie > 20]
popular_movies



movieId
1         215
2         110
3          52
5          49
6         102
         ... 
148626     26
152081     32
164179     26
166528     27
168252     25
Name: userId, Length: 1235, dtype: int64

In [10]:
#filter the ratings matrix and only keep the popular movies
ratings = ratings.set_index('movieId').loc[popular_movies.index]
ratings = ratings.reset_index()
ratings

Unnamed: 0,movieId,userId,rating,timestamp
0,1,1,4.0,964982703
1,1,5,4.0,847434962
2,1,7,4.5,1106635946
3,1,15,2.5,1510577970
4,1,17,4.5,1305696483
...,...,...,...,...
66653,168252,567,4.0,1525283936
66654,168252,586,5.0,1529899336
66655,168252,596,5.0,1535627159
66656,168252,599,3.5,1498529615


In [11]:
# Initialize a sparse user-item rating matrix 
# (data, (row_ind, col_ind)
R = csr_matrix((ratings['rating'], (ratings['userId'], ratings['movieId'])))
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

### **Training**

In [70]:
# initialize the unsupervised model
# 55 hidden features, F=55
model = NMF(n_components=250, init='nndsvd', max_iter=10000, tol=0.01, verbose=2)

# fit it to the user-item rating matrix
model.fit(R)

# initialzed P, Q matrix with random values
# iterate and optimize the values stored in P and Q

violation: 1.0
violation: 0.11613451284031337
violation: 0.08053688370206033
violation: 0.05326543515970545
violation: 0.04043735611856619
violation: 0.03313291067865663
violation: 0.02814076295497202
violation: 0.024794339031466866
violation: 0.022376169738739793
violation: 0.020136596465642018
violation: 0.01762279319926174
violation: 0.014786297633888388
violation: 0.012565085444402407
violation: 0.010819151629662946
violation: 0.009425276217797929
Converged at iteration 16


### Model inspection

In [71]:
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

#### the hidden features

In [72]:
model.components_.shape

(250, 168253)

In [73]:
# user-'genre' matrix [611x55]
P=model.transform(R)

# movie-'genre' matrix [55x168253]
Q=model.components_

P.shape, Q.shape

violation: 1.0
violation: 1.2530538669923332
violation: 0.35187003646304804
violation: 0.14273615707169746
violation: 0.0569570533822573
violation: 0.02529047746528446
violation: 0.012387996037167443
violation: 0.006681851008656857
Converged at iteration 9


((611, 250), (250, 168253))

In [74]:
# user with id 1: sparse format
R[1,:]

<1x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 180 stored elements in Compressed Sparse Row format>

In [75]:
# user with id 1: dense embedding
P[1, :]

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 3.32887111e-03, 0.00000000e+00,
       0.00000000e+00, 2.73667551e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [76]:
# dense embedding for movie with id 1
Q[:, 1]

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 5.86072451e-03, 0.00000000e+00, 6.69790368e-01,
       4.52328245e-01, 1.61193990e-01, 0.00000000e+00, 4.05536059e+00,
       1.55737956e+00, 1.61497719e+00, 1.62481073e+00, 1.55401198e+00,
       2.83253738e+00, 0.00000000e+00, 0.00000000e+00, 1.22841955e+00,
       1.77424949e+00, 2.31922563e+00, 8.23286393e+00, 0.00000000e+00,
       0.00000000e+00, 1.12310759e+01, 3.85375885e+00, 1.24093282e-01,
       0.00000000e+00, 0.00000000e+00, 8.71267403e-01, 0.00000000e+00,
       2.75631152e+00, 2.94357317e-01, 3.24474753e+00, 0.00000000e+00,
       1.52755935e-06, 2.75228114e+00, 3.81275238e-02, 5.50768061e+00,
       0.00000000e+00, 0.00000000e+00, 1.78583484e+00, 0.00000000e+00,
       0.00000000e+00, 1.13944507e+00, 0.00000000e+00, 1.81496313e+00,
       0.00000000e+00, 9.64776861e-03, 1.27612834e+00, 2.33252837e+00,
       0.00000000e+00, 9.01040864e-02, 0.00000000e+00, 0.00000000e+00,
      

In [77]:
# reconstructed matrix Rhat
R_hat = P.dot(Q)
R_hat

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 4.01698316e+00, 2.06582624e-02, ...,
        0.00000000e+00, 0.00000000e+00, 2.03514451e-03],
       [0.00000000e+00, 7.22389109e-02, 2.59256610e-02, ...,
        0.00000000e+00, 0.00000000e+00, 3.12371299e-01],
       ...,
       [0.00000000e+00, 2.54453090e+00, 1.95458285e+00, ...,
        0.00000000e+00, 0.00000000e+00, 3.47216838e-02],
       [0.00000000e+00, 1.12186164e+00, 4.09861913e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.08012191e-02],
       [0.00000000e+00, 5.02590536e+00, 1.28849697e-01, ...,
        0.00000000e+00, 0.00000000e+00, 5.00816277e+00]])

In [78]:
# R -> encoding -> P -> decoding -> Rhat (one-liner to get R_hat)
R_hat = model.inverse_transform(model.transform(R))

violation: 1.0
violation: 1.2530538669923332
violation: 0.35187003646304804
violation: 0.14273615707169746
violation: 0.0569570533822573
violation: 0.02529047746528446
violation: 0.012387996037167443
violation: 0.006681851008656857
Converged at iteration 9


In [79]:
R_hat

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 4.01698316e+00, 2.06582624e-02, ...,
        0.00000000e+00, 0.00000000e+00, 2.03514451e-03],
       [0.00000000e+00, 7.22389109e-02, 2.59256610e-02, ...,
        0.00000000e+00, 0.00000000e+00, 3.12371299e-01],
       ...,
       [0.00000000e+00, 2.54453090e+00, 1.95458285e+00, ...,
        0.00000000e+00, 0.00000000e+00, 3.47216838e-02],
       [0.00000000e+00, 1.12186164e+00, 4.09861913e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.08012191e-02],
       [0.00000000e+00, 5.02590536e+00, 1.28849697e-01, ...,
        0.00000000e+00, 0.00000000e+00, 5.00816277e+00]])

In [80]:
R.shape, R_hat.shape

((611, 168253), (611, 168253))

In [81]:
# reconstruction error
np.sqrt(np.sum(np.square(R - R_hat)))

377.4592355293162

In [82]:
model.reconstruction_err_

378.5686530991186

## 2. Model deployment: Make recommendations for a new user

### Save the trained model on your hard drive

In [83]:
with open('./nmf_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

In [84]:
!ls

Project_10_Movie_Recommender.ipynb nmf_recommender.pkl
[34mml-latest-small[m[m


### Read the model from hard drive

In [85]:
with open('./nmf_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

In [86]:
model.reconstruction_err_

378.5686530991186

### Receive a user query

In [87]:
query

{12: 4, 92: 5, 177: 4, 196: 5, 891: 4, 1128: 5, 1258: 5, 1320: 4}

In [88]:
R[1,:]

<1x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 180 stored elements in Compressed Sparse Row format>

### Construct a user vector

we need the same input as was used during training!

In [89]:
list(query.values())

[4, 5, 4, 5, 4, 5, 5, 4]

In [90]:
data = list(query.values())   # the ratings of the new user
row_ind = [0]*len(data)       # we use just a single row 0 for this user 
col_ind = list(query.keys())  # the columns (=movieId) of the ratings
data, row_ind, col_ind

([4, 5, 4, 5, 4, 5, 5, 4],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [12, 92, 177, 196, 891, 1128, 1258, 1320])

In [91]:
# new user vector: needs to have the same format as the training data

user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R.shape[1]))
user_vec

<1x168253 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [92]:
R

<611x168253 sparse matrix of type '<class 'numpy.float64'>'
	with 66658 stored elements in Compressed Sparse Row format>

### Calculate the score

1. transform the user vector to its dense representation (encoding) 
2. inverse transform the dense vector into the sparse representation (decoding)

$$
\hat{r}_{ij} = p_i' \cdot q_j 
$$

In [93]:
# user_vec -> encoding -> p_user_vec -> decoding -> user_vec_hat


scores = model.inverse_transform(model.transform(user_vec))

# convert to a pandas series
scores = pd.Series(scores[0])
scores

violation: 1.0
violation: 1.8275764969325887
violation: 0.13327891106847606
violation: 0.031992128098199464
violation: 0.014482782766025762
violation: 0.009048532645141915
Converged at iteration 7


0         0.000000
1         0.049521
2         0.148514
3         0.033792
4         0.000000
            ...   
168248    0.000000
168249    0.000000
168250    0.000000
168251    0.000000
168252    0.000006
Length: 168253, dtype: float64

### Give recommendations

In [94]:
query.keys()

dict_keys([12, 92, 177, 196, 891, 1128, 1258, 1320])

In [95]:
# give a zero score to movies the user has allready seen
scores[query.keys()] = 0

In [96]:
# sort the scores from high to low 
scores = scores.sort_values(ascending=False)
scores

924      0.480519
1199     0.470290
1206     0.456220
541      0.427109
741      0.314480
           ...   
56753    0.000000
56754    0.000000
56755    0.000000
56756    0.000000
84126    0.000000
Length: 168253, dtype: float64

In [97]:
# get the movieIds of the top 10 entries
recommendations = scores.head(10).index
recommendations

Int64Index([924, 1199, 1206, 541, 741, 4226, 1274, 750, 47, 3000], dtype='int64')

In [98]:
movies.set_index('movieId').loc[recommendations]

Unnamed: 0,title,genres
924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi
1199,Brazil (1985),Fantasy|Sci-Fi
1206,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller
541,Blade Runner (1982),Action|Sci-Fi|Thriller
741,Ghost in the Shell (Kôkaku kidôtai) (1995),Animation|Sci-Fi
4226,Memento (2000),Mystery|Thriller
1274,Akira (1988),Action|Adventure|Animation|Sci-Fi
750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
3000,Princess Mononoke (Mononoke-hime) (1997),Action|Adventure|Animation|Drama|Fantasy


In [99]:
# collaborative filtering = look at ratings only!
def recommend_nmf(query, model, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained NMF model. 
    Returns a list of k movie ids.
    """
    # 1. candidate generation
    data = list(query.values())   # the ratings of the new user
    row_ind = list(query.keys())  # we use just a single row 0 for this user
    col_ind = list(query.keys())  # the columns (=movieId) of the ratings
    data, row_ind, col_ind
    
    # construct a user vector
    user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R.shape[1]))
   
    # 2. scoring
    
    # calculate the score with the NMF model
    # user_vec -> encoding -> p_user_vec -> decoding -> user_vec_hat
    scores = model.inverse_transform(model.transform(user_vec))
    # convert to a pandas series
    scores = pd.Series(scores[0])
    
    # 3. ranking
    
    # filter out movies allready seen by the user
    # give a zero score to movies the user has allready seen
    scores[query.keys()] = 0
    # return the top-k highst rated movie ids or titles
    recommendations = scores.head(10).index
    return recommendations
movies.set_index('movieId').loc[recommendations]

Unnamed: 0,title,genres
924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi
1199,Brazil (1985),Fantasy|Sci-Fi
1206,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller
541,Blade Runner (1982),Action|Sci-Fi|Thriller
741,Ghost in the Shell (Kôkaku kidôtai) (1995),Animation|Sci-Fi
4226,Memento (2000),Mystery|Thriller
1274,Akira (1988),Action|Adventure|Animation|Sci-Fi
750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
3000,Princess Mononoke (Mononoke-hime) (1997),Action|Adventure|Animation|Drama|Fantasy


# Neighborhood Based Filtering for Recommender Systems

In [100]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from scipy.sparse import csr_matrix
import sklearn
import pickle

---
## 1. Model Development

### Preprocessing (same as for the NMF model!)

- filter out movies rated by less than 20/ 50 / 100 ... users
- filter out movies with an average rating lower than 2
- create a sparse user item matrix

In [101]:
# calculate the number of ratings per movie
ratings_per_movie = ratings.groupby('movieId')['userId'].count()

# filter for movies with more than 20 ratings and extract the index
popular_movies = ratings_per_movie.loc[ratings_per_movie > 20].index

# filter the ratings matrix and only keep the popular movies
ratings = ratings.loc[ratings['movieId'].isin(popular_movies)]

# Initialize a sparse user-item rating matrix
# (data, (row_ind, col_ind)
R = csr_matrix(
    (ratings['rating'], (ratings['userId'], ratings['movieId'])))

R.shape

(611, 168253)

### Training (new!)

- initialize the model: pick a distance metric
- fit it to the user item matrix: only stores the data and doesn't do further. all the calculations take place later!

In [102]:
# which metrics can we use for sparse matrics?
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute'])

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

In [103]:
# initialize the unsupervised model
model = NearestNeighbors(metric='cosine')

# fit it to the user-item rating matrix
model.fit(R)

### Save the trained model on your hard drive

In [106]:
with open('./distance_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

---
## 2. Model deployment: Make recommendations for a new user

### Read the model from hard drive

In [108]:
with open('./distance_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

### Receive a user query

In [110]:
query

{12: 4, 92: 5, 177: 4, 196: 5, 891: 4, 1128: 5, 1258: 5, 1320: 4}

### Construct a user vector (same as before!)

we need the same input as was used during training!

In [112]:
# new user vector: needs to have the same format as the training data
# pre fill it with zeros
user_vec = np.repeat(2.5, 168253)

# fill in the ratings that arrived from the query
user_vec[list(query.keys())] = list(query.values())

In [114]:
user_vec[list(query.keys())]

array([4., 5., 4., 5., 4., 5., 5., 4.])

### Calculate the score (new!)

1. find the neighborhood of $n$ similar users
2. use their ratings to calculate a score

In [115]:
# calculates the distances to all other users in the data!
distances, userIds = model.kneighbors([user_vec], n_neighbors=10, return_distance=True)

# sklearn returns a list of predictions - extract the first and only value of the list
distances = distances[0]
userIds = userIds[0]

In [116]:
distances, userIds

(array([0.92244712, 0.92789635, 0.93143678, 0.93618135, 0.93659358,
        0.93804828, 0.94089089, 0.94244787, 0.94296981, 0.94357157]),
 array([414, 599,  68, 474, 274, 448, 380, 288, 480, 608]))

In [117]:
# only look at ratings for users that are similar!
neighborhood = ratings.set_index('userId').loc[userIds]
neighborhood

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
414,1,4.0,961438127
414,2,3.0,961594981
414,3,4.0,961439278
414,5,2.0,961437647
414,6,3.0,961515642
...,...,...,...
608,51255,4.5,1189563894
608,51662,5.0,1189563889
608,51935,4.0,1189563886
608,53996,5.0,1189380659


In [118]:
# calculate the summed up rating for each movie
# summing up introduces a bias for popular movies
# averaging introduces bias for movies only seen by few users in the neighboorhood
scores = neighborhood.groupby('movieId')['rating'].sum()
scores

movieId
1         37.5
2         29.5
3         19.0
5         10.5
6         27.5
          ... 
148626    11.0
152081    12.0
164179    14.5
166528    16.5
168252    16.5
Name: rating, Length: 1228, dtype: float64

### Give recommendations (same as before!)

In [119]:
query.keys()

dict_keys([12, 92, 177, 196, 891, 1128, 1258, 1320])

In [121]:
# give a zero score to movies the user has already seen
allready_seen = scores.index.isin(query.keys())
scores.loc[allready_seen] = 0

In [122]:
# sort the scores from high to low 
scores = scores.sort_values(ascending=False)
scores

movieId
1196    48.0
1210    45.0
296     45.0
4226    45.0
260     45.0
        ... 
315      2.0
186      1.5
196      0.0
1320     0.0
1258     0.0
Name: rating, Length: 1228, dtype: float64

In [123]:
# get the movieIds of the top 10 entries
recommendations = scores.head(10).index
recommendations

Int64Index([1196, 1210, 296, 4226, 260, 8636, 1136, 4993, 1291, 2959], dtype='int64', name='movieId')

In [124]:
# let's see the recommendations!
movies.set_index('movieId').loc[recommendations]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4226,Memento (2000),Mystery|Thriller
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
8636,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX
1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
1291,Indiana Jones and the Last Crusade (1989),Action|Adventure
2959,Fight Club (1999),Action|Crime|Drama|Thriller


In [None]:
# collaborative filtering = look at ratings only!
def recommend_neighborhood(query, model, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained nearest neighbors model. 
    Returns a list of k movie ids.
    """
    # 1. candiate generation
    query = 
    
    # construct a user vector
    
   
    # 2. scoring
    
    # find n neighbors
    
    # calculate their average rating
    
    
    # 3. ranking
    
    # filter out movies allready seen by the user
    
    # return the top-k highst rated movie ids or titles
    
    return [364, 372, 43, 34, 243]