In [45]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [46]:
movies_cluster_ratings_file = "data/movies_clusters_ratings.csv"
ratings_file = "data/ratings.csv"

In [47]:
movies = pd.read_csv(movies_cluster_ratings_file)
ratings = pd.read_csv(ratings_file)

In [48]:
# Remove movie ids and movies in ratings that are not listed in movies
ratings_ids = list(ratings["movieId"])
movies_ids = list(movies["movieid"])
boolean = []
for rating_id in ratings_ids:
    if rating_id in movies_ids:
        boolean.append(True)
    else:
        print(f'This movie id: <{rating_id}> is not listed under movies')
        boolean.append(False)

This movie id: <176601> is not listed under movies
This movie id: <147250> is not listed under movies
This movie id: <171749> is not listed under movies
This movie id: <171631> is not listed under movies
This movie id: <171891> is not listed under movies
This movie id: <140956> is not listed under movies
This movie id: <40697> is not listed under movies
This movie id: <140956> is not listed under movies
This movie id: <149334> is not listed under movies
This movie id: <171495> is not listed under movies
This movie id: <140956> is not listed under movies
This movie id: <167570> is not listed under movies
This movie id: <143410> is not listed under movies
This movie id: <162414> is not listed under movies
This movie id: <140956> is not listed under movies
This movie id: <40697> is not listed under movies
This movie id: <156605> is not listed under movies
This movie id: <171495> is not listed under movies


In [49]:
# Discard the unlisted movies
ratings_new = ratings[boolean]

In [50]:
 # Get movied and title table
movie_ids_and_title = movies[["movieid", "title"]]

In [51]:
# Get movieid and title dictionary
movie_collection_dict = movie_ids_and_title.set_index("title").to_dict()["movieid"]

In [83]:
# create user-item matrix through pivoting
user_item_matrix = ratings_new.pivot(index="userId", columns="movieId", values="rating")
user_item_matrix.columns = list(user_item_matrix.columns)
user_item_matrix

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [84]:
user_item_matrix

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [85]:
# Save this as a user-item matrix
user_item_matrix.reset_index(inplace=True)
user_item_matrix.to_csv("data/my_user_item_matrix.csv", index=False)
user_item_matrix.set_index("userId", inplace=True)

In [86]:
# we need to fill in the missing values. We user a constant 
imputer = SimpleImputer(strategy="constant", fill_value=2.5)

In [87]:
# Fit_transform the user-item matrix with imputer
imputed_df = pd.DataFrame(imputer.fit_transform(user_item_matrix))
imputed_df.columns = user_item_matrix.columns
imputed_df.index = user_item_matrix.index
user_item_matrix = imputed_df
len(user_item_matrix.columns), user_item_matrix

(9711,
         1       2       3       4       5       6       7       8       \
 userId                                                                   
 1          4.0     2.5     4.0     2.5     2.5     4.0     2.5     2.5   
 2          2.5     2.5     2.5     2.5     2.5     2.5     2.5     2.5   
 3          2.5     2.5     2.5     2.5     2.5     2.5     2.5     2.5   
 4          2.5     2.5     2.5     2.5     2.5     2.5     2.5     2.5   
 5          4.0     2.5     2.5     2.5     2.5     2.5     2.5     2.5   
 ...        ...     ...     ...     ...     ...     ...     ...     ...   
 606        2.5     2.5     2.5     2.5     2.5     2.5     2.5     2.5   
 607        4.0     2.5     2.5     2.5     2.5     2.5     2.5     2.5   
 608        2.5     2.0     2.0     2.5     2.5     2.5     2.5     2.5   
 609        3.0     2.5     2.5     2.5     2.5     2.5     2.5     2.5   
 610        5.0     2.5     2.5     2.5     2.5     5.0     2.5     2.5   
 
         9      

In [57]:
# load NMF
from sklearn.decomposition import NMF

In [88]:
# initialize the nmf model with 20 components
nmf_model = NMF(n_components=20)

In [89]:
# fit the model to user-item matrix to decompose it to P and Q
nmf_model.fit(user_item_matrix)



NMF(n_components=20)

In [90]:
# Save the model as pickle
import pickle
#pickle.dump(nmf_model, open("nmf_model.sav", "wb"))

In [61]:
# load the model
model = pickle.load(open("nmf_model.sav", "rb"))
model

NMF(n_components=20)

In [62]:
# Get some user ratings
user_rating = {
        "four Rooms": 5,
        "sudden DEath": 3,
        "oThelo": 4,
        "Nixons": 3,
        "Golden eye": 1,
        "Total Eclipse": 5,
        "NadJa": 3,
        "forbiden planet": 4
    }

In [63]:
# Get all movie titles
movie_collection_dict = movie_ids_and_title.set_index("movieid").to_dict()["title"]
movie_titles = list(movie_collection_dict.values())
movie_ids = list(movie_collection_dict.keys())

In [64]:
 # User fuzzywuzzy to get the correct movie rating
import utils
real_requested_movies = [utils.match_movie_title(i, movie_titles) for i in user_rating.keys()]
real_requested_movies

['Four Rooms',
 'Sudden Death',
 'Othello',
 'Nixon',
 'GoldenEye',
 'Total Eclipse',
 'Nadja',
 'Forbidden Planet']

In [65]:
# Get movie ids for the movies requested by user
user_movie_ids_dict = {}
for user_movie in real_requested_movies:
    for dict_movie_title, dict_movie_id in zip(movie_collection_dict.values(), movie_collection_dict.keys()):
        if user_movie == dict_movie_title:
            user_movie_ids_dict[user_movie] = dict_movie_id
user_movie_ids_dict




{'Four Rooms': 18,
 'Sudden Death': 9,
 'Othello': 26,
 'Nixon': 14,
 'GoldenEye': 10,
 'Total Eclipse': 202,
 'Nadja': 184,
 'Forbidden Planet': 1301}

In [66]:
# interchange the user movie title with ids in the user query
user_movie_ratings = {}
for user_rating_key, user_movie_id in zip(user_rating, user_movie_ids_dict):
    user_movie_ratings[user_movie_ids_dict[user_movie_id]] = user_rating[user_rating_key]
user_movie_ratings
    

{18: 5, 9: 3, 26: 4, 14: 3, 10: 1, 202: 5, 184: 3, 1301: 4}

In [67]:
# Add unrated movie ids to the user rating
user_rating_full = {}
user_item_movie_ids = list(user_item_matrix.columns)
for movieid in user_item_movie_ids:
    if movieid in user_movie_ratings.keys():
        user_rating_full[movieid] = user_movie_ratings[movieid]
    else:
        user_rating_full[movieid] = None

In [68]:
# Convert user ratings full into a dataframe
user_rating_table = pd.DataFrame(user_rating_full, index=[0])

In [69]:
# fill in missing values with a constant
user_rating_table.fillna(value=2.5, inplace=True)

In [70]:
user_rating_table

Unnamed: 0,userId,1,2,3,4,5,6,7,8,9,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,3,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5


In [71]:
# Calculate the user-genre matrix (P matrix)
pmatrix = model.transform(user_rating_table)
pmatrix



array([[0.00000000e+00, 4.15952241e+00, 1.22660411e-01, 7.87512405e-02,
        2.89582662e-02, 1.69958528e-01, 7.18603068e-02, 1.13485193e-01,
        6.59156114e-02, 1.15515556e-01, 1.01601430e-01, 9.95392759e-02,
        3.87650578e-02, 1.33758073e-03, 3.42842936e-02, 0.00000000e+00,
        1.03636289e-01, 1.05095706e-01, 2.23351724e-01, 4.51838813e-02]])

In [72]:
# pmatrix as a table
p_df = pd.DataFrame(pmatrix)

In [73]:
# Calculate the predicted ratings
qmatrix = model.components_

In [74]:
# q matrix as a dataframe
q_df = pd.DataFrame(qmatrix)

In [75]:
# calculate predictions
predicted_ratings = p_df.dot(q_df)
predicted_ratings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9702,9703,9704,9705,9706,9707,9708,9709,9710,9711
0,2.916771,2.592426,2.448799,2.555533,2.506641,2.577216,2.454367,2.561784,2.497317,2.518979,...,2.507897,2.504981,2.510813,2.510813,2.507897,2.510813,2.507897,2.507897,2.507897,2.502385


In [76]:
# Remove already rated items
predicted_ratings.drop(user_movie_ids_dict.values(), axis=1, inplace=True)

In [77]:
# select n top movies for recommendations
n = 5
predictions = predicted_ratings.T
predictions.columns = ["ratings"]
movie_ids_to_recommend = list(predictions.sort_values("ratings", ascending=False).head(n).index)
movie_ids_to_recommend

[0, 278, 315, 900, 225]

In [78]:
# look up movie titles
import utils
[utils.lookup_movieId(i) for i in movie_ids_to_recommend]

IndexError: list index out of range