In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
import glob

In [2]:
files = glob.glob("data/ml-latest-small/*csv")

In [3]:
names = [file.split("/")[-1].split(".csv")[0] for file in files]
pd_dict = {}
for file, name in zip(files, names):
    pd_dict[name] = pd.read_csv(file)

In [4]:
movies = pd_dict["movies"]
ratings = pd_dict["ratings"]

KeyError: 'movies'

In [5]:
# Get user ids
user_ids = ratings["userId"]

In [6]:
# Select user ids
n_user_ids = 50
np.random.seed(42)
selected_user_ids = np.random.choice(np.unique(list(user_ids)), size=n_user_ids, replace=False)

In [7]:
# create a boolean that has the above user ids
selection_boolean = []
for user_id in list(user_ids):
    if user_id in selected_user_ids:
        selection_boolean.append(True)
    else:
        selection_boolean.append(False)

In [8]:
# use the boolean above to select rows in ratings
ratings_subset = ratings[selection_boolean]

In [9]:
# set index for ratings_subset
ratings_subset.set_index("movieId", inplace=True)

In [10]:
# set index for movies
movies.set_index("movieId", inplace=True)

In [11]:
# join ratings_subset and movies
ratings_join = ratings_subset.join(movies)

In [12]:
ratings_subset

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31,3,0.5,1306463578
527,3,0.5,1306464275
647,3,0.5,1306463619
688,3,0.5,1306464228
720,3,0.5,1306463595
...,...,...,...
4022,607,4.0,997847173
4023,607,3.0,997847173
4054,607,3.0,997847173
4069,607,3.0,997847203


In [13]:
ratings_join

Unnamed: 0_level_0,userId,rating,timestamp,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,31,5.0,850466616,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,71,5.0,864737933,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,78,4.0,1252575124,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,82,2.5,1084467729,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,132,2.0,1157921785,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...
179817,599,3.0,1516604716,Darkest Hour (2017),Drama|War
180031,599,3.5,1518298493,The Shape of Water (2017),Adventure|Drama|Fantasy
180297,599,3.0,1516604804,The Disaster Artist (2017),Comedy|Drama
181315,599,3.5,1517370374,Phantom Thread (2017),Drama|Romance


In [14]:
# select only userid, rating and title
ratings_movie = ratings_join[["userId", "rating", "title"]]
ratings_movie


Unnamed: 0_level_0,userId,rating,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,31,5.0,Toy Story (1995)
1,71,5.0,Toy Story (1995)
1,78,4.0,Toy Story (1995)
1,82,2.5,Toy Story (1995)
1,132,2.0,Toy Story (1995)
...,...,...,...
179817,599,3.0,Darkest Hour (2017)
180031,599,3.5,The Shape of Water (2017)
180297,599,3.0,The Disaster Artist (2017)
181315,599,3.5,Phantom Thread (2017)


In [15]:
# Pivot the table (reshape to wide format)
ratings_movie = ratings_movie.pivot_table(index="userId", columns="title", aggfunc="mean")

In [16]:
# Save this reshaped table
ratings_movie

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zombieland (2009),Zone 39 (1997),Zookeeper (2011),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
3,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,
31,,,,,,,,,,,...,,,,,,,,,,
56,,,,,,,,,,,...,,,,,,,,,,
71,,,,,,,,,,,...,,,,,,,,,,
78,,,,,,,,,,,...,,,,,,,,,,
82,,,,,,,,1.0,,,...,,,,4.0,,,,3.5,,
102,,,,,,,,,,,...,,,,,,,,,,
105,,5.0,,,,,,,,,...,4.0,,,,4.5,,,3.5,,
132,,,,,,4.0,,,,,...,,,,,,,,,,


In [17]:
# Fill missing values with mean of the column
ratings_movie.fillna(ratings.mean(), inplace=True)
ratings_movie

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zombieland (2009),Zone 39 (1997),Zookeeper (2011),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
3,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,...,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557
11,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,...,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557
31,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,...,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557
56,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,...,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557
71,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,...,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557
78,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,...,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557
82,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,1.0,3.501557,3.501557,...,3.501557,3.501557,3.501557,4.0,3.501557,3.501557,3.501557,3.5,3.501557,3.501557
102,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,...,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557
105,3.501557,5.0,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,...,4.0,3.501557,3.501557,3.501557,4.5,3.501557,3.501557,3.5,3.501557,3.501557
132,3.501557,3.501557,3.501557,3.501557,3.501557,4.0,3.501557,3.501557,3.501557,3.501557,...,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557,3.501557


In [18]:
# Save reshaped file
ratings_movie.reset_index(inplace=True)


In [19]:
ratings_movie.columns = [i[1] for i in ratings_movie.columns]
ratings_movie.columns = ["userId" if column == "" else column for column in list(ratings_movie.columns)] 
ratings_movie.set_index("userId", inplace=True)
ratings_movie.to_csv("ratings_movie_reshaped.csv")


In [20]:
# Define a non-negative matrix factorization with 20 components
nmf = NMF(n_components=20, max_iter=5000)

In [21]:
# Fit the ratings to the nmf
nmf.fit(ratings_movie)



NMF(max_iter=5000, n_components=20)

In [22]:
# save the model
filename = 'nmf_movies.sav'
import pickle
pickle.dump(nmf, open(filename, "wb"))

In [23]:
sample_movies = np.random.choice(ratings_movie.columns, replace=False, size=10)
own_ratings = {}
for sample_movie in sample_movies:
    own_ratings[sample_movie] = np.random.randint(1, 6)
own_ratings

{'Gosford Park (2001)': 1,
 'High Noon (1952)': 4,
 'Presto (2008)': 2,
 'Iron Man (2008)': 1,
 'Transporter 2 (2005)': 5,
 'Horton Hears a Who! (2008)': 3,
 'Down in the Valley (2005)': 4,
 'Public Enemies (2009)': 4,
 'Muse, The (1999)': 4,
 'Alesha Popovich and Tugarin the Dragon (2004)': 3}

In [24]:
own_ratings = {'Omen, The (1976)': 5,
 'Parent Trap, The (1998)': 4,
 'Ivan Vasilievich: Back to the Future (Ivan Vasilievich menyaet professiyu) (1973)': 5,
 'Mutant Aliens (2001)': 5,
 'Death Note: Desu nôto (2006–2007)': 4,
 'Affair to Remember, An (1957)': 3,
 'Superman II (1980)': 3,
 'English Patient, The (1996)': 3,
 'Midnight Run (1988)': 4,
 'Best of the Best (1989)': 2}
 

In [25]:
# Read ratings movie reshaped
reshaped_data = pd.read_csv("ratings_movie_reshaped.csv", index_col=0)

In [26]:
# Get movies
movies = list(reshaped_data.columns)

In [27]:
# Create a full own ratings dictionary
own_ratings_full = {}
for movie in movies:
    if movie in own_ratings:
        own_ratings_full[movie] = own_ratings[movie]
    else:
        own_ratings_full[movie] = None

In [28]:
# Create a dataframe for the above movies
own_ratings_df = pd.DataFrame(own_ratings_full, index=[0])
own_ratings_df

Unnamed: 0,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zombieland (2009),Zone 39 (1997),Zookeeper (2011),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
0,,,,,,,,,,,...,,,,,,,,,,


In [29]:
# Fill missing values
own_ratings_df.fillna(own_ratings_df.iloc[0].mean(), inplace=True)
own_ratings_df

Unnamed: 0,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zombieland (2009),Zone 39 (1997),Zookeeper (2011),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
0,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,...,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8


In [30]:
# Calculate matrix P
P = nmf.transform(own_ratings_df)
Pdf = pd.DataFrame(P)
Pdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,4.561908,0.324778,0.629078,0.946687,0.208707,0.328516,0.748823,0.443471,0.460558,0.066432,0.337443,0.013264,0.22609,0.316076,0.347743,0.310077,0.581056,0.321206,0.224575,0.271615


In [31]:
# Extract matrix Q (movie-genre)
Q = nmf.components_
Qdf = pd.DataFrame(Q)
Qdf.columns = reshaped_data.columns

In [32]:
# Generating predicted ratings by multiplying P and Q.T
predicted_ratings = Pdf.dot(Qdf)

In [33]:
predicted_ratings

Unnamed: 0,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 (1979),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),...,Zombieland (2009),Zone 39 (1997),Zookeeper (2011),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
0,3.80169,3.799519,3.797765,3.795778,3.798252,3.810104,3.789351,3.761345,3.779373,3.753708,...,3.757217,3.80169,3.798648,3.775336,3.798783,3.795804,3.82387,3.804441,3.819994,3.784375


In [34]:
# Remove already rated movies
predicted_ratings.drop(list(own_ratings.keys()), axis=1, inplace=True)

In [35]:
predictions = predicted_ratings.T
predictions.columns = ["rating"]
top_3 = list(predictions.sort_values("rating", ascending=False).iloc[:3].index)
top_3

['Shawshank Redemption, The (1994)',
 'Forrest Gump (1994)',
 'Pulp Fiction (1994)']