In [2]:
import os
import pandas as pd
from functools import reduce
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.decomposition import NMF



In [3]:
df_list = []
for filename in os.listdir('./data/ml-latest-small'):
    if filename.endswith(".csv"):
        df = pd.read_csv('./data/ml-latest-small/' + f'{filename}')
        df_list.append(df)
        
for df in df_list:
    print(len(df))
    print(df.head(2))
    
    
df_movie_names = df_list[3]
df_ratings = df_list[2]

9742
   movieId  imdbId  tmdbId
0        1  114709   862.0
1        2  113497  8844.0
3683
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
100836
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
9742
   movieId             title                                       genres
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy
1        2    Jumanji (1995)                   Adventure|Children|Fantasy


### Check unique values

In [4]:
df_ratings.shape, df_ratings['movieId'].nunique(),df_movie_names.shape, df_movie_names['title'].nunique()
# Why less unique titles? -> some movies have more than one movieId
# movieId is NOT UNIQUE

((100836, 4), 9724, (9742, 3), 9737)

### Create unique movieId column in ratings_df and movie_title_df

#### Find duplicate names/ movies that are assigned to more than one movieId

In [5]:
duplicateRowsDF = df_movie_names[df_movie_names.duplicated(subset=['title'])]
duplicateRowsDF

Unnamed: 0,movieId,title,genres
5601,26958,Emma (1996),Romance
6932,64997,War of the Worlds (2005),Action|Sci-Fi
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
9135,147002,Eros (2004),Drama|Romance
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller


In [6]:
dup_titles = duplicateRowsDF['title'].tolist()
dup_titles

['Emma (1996)',
 'War of the Worlds (2005)',
 'Confessions of a Dangerous Mind (2002)',
 'Eros (2004)',
 'Saturn 3 (1980)']

In [7]:
dup = df_movie_names[df_movie_names['title'].isin(dup_titles)]
dup.sort_values(by='title', ascending=True)

Unnamed: 0,movieId,title,genres
4169,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
650,838,Emma (1996),Comedy|Drama|Romance
5601,26958,Emma (1996),Romance
5854,32600,Eros (2004),Drama
9135,147002,Eros (2004),Drama|Romance
2141,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller
5931,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
6932,64997,War of the Worlds (2005),Action|Sci-Fi


In [8]:
# create list out of these movieIds:
dup_ids = dup['movieId'].tolist()
dup_ids

[838, 2851, 6003, 26958, 32600, 34048, 64997, 144606, 147002, 168358]

In [9]:
# check which movieId these titles have in the ratings df:
df_ratings = df_list[2]

dup_in_rat = df_ratings[df_ratings['movieId'].isin(dup_ids)]
dup_in_rat['movieId'].unique()
# this dataframe also has the different movie Ids for the same films 

array([  2851,    838,  34048,  64997,   6003, 144606, 147002,  26958,
       168358,  32600])

#### change the movieIds that occur two times to the first occuring movieId

In [10]:
# in ratings_df & movie_name df

# create movieId_unique column 
# 1. copy the existing ids
df_ratings['movieId_unique'] = df_ratings['movieId'] 
df_movie_names['movieId_unique'] = df_movie_names['movieId']

# create a dictionary with unique movieIds for the films (take the lower number):
movie_id_dict = {144606: 6003, 
                 26958: 838, 
                 147002: 32600,
                 168358: 2851,
                 64997: 34048   } 

# Remap the values of the dataframe 
df_ratings = df_ratings.replace({"movieId_unique": movie_id_dict}) 
df_movie_names = df_movie_names.replace({"movieId_unique": movie_id_dict}) 

# test 1:
df_ratings[df_ratings['movieId']==64997]

Unnamed: 0,userId,movieId,rating,timestamp,movieId_unique
4747,28,64997,3.5,1234850075,34048
11451,68,64997,2.5,1230497715,34048


In [11]:
# test 2: 
df_movie_names[df_movie_names['movieId']==64997]

Unnamed: 0,movieId,title,genres,movieId_unique
6932,64997,War of the Worlds (2005),Action|Sci-Fi,34048


In [12]:
df_ratings['movieId_unique'].nunique(), df_movie_names['movieId_unique'].nunique()
# there are more movie names than rated movies !

(9719, 9737)

### -> some movies do not have a rating!

### left-merge ratings_df and movie_names_df

In [13]:
# left merge to keep only movies with existing ratings
df = pd.merge(df_ratings, df_movie_names, on='movieId_unique', how='left')
# check for number of unique ids
df['movieId_unique'].nunique()

9719

In [14]:
df.head()

Unnamed: 0,userId,movieId_x,rating,timestamp,movieId_unique,movieId_y,title,genres
0,1,1,4.0,964982703,1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,3,3,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,6,6,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,47,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,50,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [16]:
# export cleaned up dateframe
df.to_csv('./data/ml-latest-small/ratings_names_uniqueids.csv')

###  construct a dictionary for name: id 

In [13]:
# To map movie names to movie ID’s, construct a {name: id} dictionary
title_dict = dict(zip(df.title, df.movieId_unique))
len(title_dict)

9719

In [36]:
#title_dict

---

### Create rating matrix 
* with rows=users, 
* columns=movies, 
* values in the matrix= user_rating a for movie_id=1 etc

In [14]:
# creating ratings matrix (R)
R = df.pivot_table(index='userId', columns='movieId_unique', values='rating', dropna=False)
R.shape # 610 users , 9724 movieIds

(610, 9719)

In [15]:
R.head()

movieId_unique,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


### Handle missing data


In [16]:
med_values = R.median().median()
R.fillna(med_values,inplace=True)
R.head()

movieId_unique,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,3.5,4.0,3.5,3.5,4.0,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
2,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
3,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
4,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
5,4.0,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5


### Train NMF

In [17]:
m = NMF(n_components=20)

In [18]:
m.fit(R)
# 60 components: ((610, 60), (60, 9719), 228.1942897155037)
# 20 components: ((610, 20), (20, 9719), 274.6005132350494)



NMF(n_components=20)

### Check out the sub-matrices, and the reconstruction error

In [19]:
Q = m.components_
P = m.transform(R)
error = m.reconstruction_err_ #this is an absolute score, so no intuition from looking at in isolation! 
P.shape, Q.shape, error

((610, 20), (20, 9719), 274.1675308223677)

In [20]:
new_R = np.dot(P,Q)
pd.DataFrame(new_R.round(1), columns=R.columns, index=R.index)

movieId_unique,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.1,3.6,3.7,3.5,3.5,3.8,3.6,3.5,3.5,3.6,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
2,3.6,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
3,3.5,3.5,3.5,3.5,3.4,3.5,3.5,3.5,3.5,3.6,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
4,3.7,3.5,3.4,3.5,3.4,3.6,3.4,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
5,3.6,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.8,3.4,3.5,3.5,3.2,3.7,3.3,3.5,3.5,3.2,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
607,3.6,3.5,3.6,3.5,3.5,3.7,3.5,3.5,3.5,3.6,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
608,2.5,2.1,2.4,3.6,3.1,3.7,3.6,3.6,3.4,3.4,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
609,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5


---

### Making a prediction based on new user input

Use the NMF model to produce recommendations for one user.

- To map movie names to movie ID’s, construct a {name: id} dictionary
- To deal with small differences in the names, the fuzzywuzzy package is quite useful
- Create an vector of three movies the user likes. Set these to 5 and all others to zero.

#### Create user input

In [21]:
### use fuzzywuzzy to allow not exact movie title input from user











In [37]:
# List of movies the user likes:
# vals = ['Kicking and Screaming (1995)', 'White Squall (1996)', 'In the Bleak Midwinter (1995)']
vals = ['Toy Story (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)']

In [38]:
# vals = []
# user_mov_1 = input('Enter a movie that you liked:')
# user_mov_2 = input('Enter a 2nd movie that you liked:')
# user_mov_3 = input('Enter a 3rd movie that you liked:')

# vals = [user_mov_1, user_mov_2, user_mov_3]

In [39]:
keys = [title_dict.get(val) for val in vals]
keys, vals

([1, 3, 6], ['Toy Story (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)'])

In [40]:
# create ratings_list
ratings_5 = [5 for y in keys]
# create small dict
d_fill = dict(zip(keys, ratings_5))
d_fill

{1: 5, 3: 5, 6: 5}

### transform user input to user movie vector (for model input)

In [46]:
# create vector for user with zeros (missing values = 0 in this case)
dict_new_user = dict.fromkeys(df.movieId_unique,0)

In [52]:
# update the values with the rated movie ids 
dict_new_user.update(d_fill)

# look at the first entries
first5pairs = {k: dict_new_user[k] for k in list(dict_new_user)[:6]}
first5pairs

{1: 5, 3: 5, 6: 5, 47: 0, 50: 0, 70: 0}

In [55]:
# transform into an array/ vector of the values
user_arr = np.array(list(dict_new_user.values()))
# reshape
user_arr = user_arr.reshape(1,9719)

#### Prediction

In [58]:
#Prediction step 1 - generate extra a user_P
user_P = m.transform(user_arr)
user_P

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.00911187, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

In [79]:
#new user R - reconstruct R but for this new user only
# (column of first matrix = row of second matrix)
user_R = np.dot(user_P,Q)
user_R

array([[0.05359065, 0.02448565, 0.03699081, ..., 0.00074329, 0.00074329,
        0.0005409 ]])

In [80]:
user_R.shape

(1, 9719)

In [82]:
# get rid of one dimension of the array
user_R = user_R[0]
user_R

array([0.05359065, 0.02448565, 0.03699081, ..., 0.00074329, 0.00074329,
       0.0005409 ])

In [83]:
# get the corresponding array of movie titles
df.title.values

array(['Toy Story (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)', ...,
       'Get Out (2017)', 'Logan (2017)', 'The Fate of the Furious (2017)'],
      dtype=object)

In [119]:
# zip into tuples of rating and film title 
recommendations = list(zip(user_R,df.title.values))

In [118]:
# remove the first three ones (the films that the user has already seen):
rec = recommendations[3:]

In [105]:
sorted_top_5_rec = sorted(rec, key = lambda x: x[0], reverse=True)[:5]
sorted_top_5_rec

[(0.08722625834886101, 'Payback (1999)'),
 (0.07839718365675095, 'Before Sunrise (1995)'),
 (0.07782583944667583, 'Under the Sand (2000)'),
 (0.07516946926616076, "Buffalo '66 (a.k.a. Buffalo 66) (1998)"),
 (0.07200633481851017, 'Grosse Pointe Blank (1997)')]

In [117]:
# list comprehension to get the top 5 film titles for the current user
[x[1] for x in sorted_top_5_rec]

['Payback (1999)',
 'Before Sunrise (1995)',
 'Under the Sand (2000)',
 "Buffalo '66 (a.k.a. Buffalo 66) (1998)",
 'Grosse Pointe Blank (1997)']