### Test for my explore branch

calculate an ordered top list (with regards to the average rating) of movies for your dataset

recommend top movies that a user has not seen yet: Implement a function `recommend(user_id, top_list, ratings, k)` that gets 
- a user_id, 
- the top list of movies 
- and the ratings table 

- and returns a list of k movie_ids as recommondations.

Only consider movies that are rated by a number of users above a certain threshold. Try out different theshold values.


In [1]:
import os
import pandas as pd
from functools import reduce
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.decomposition import NMF



In [2]:
# load clean data
df = pd.read_csv('data/ml-latest-small/ratings_names_uniqueids.csv', index_col=0)

# create rating matrix R
R = df.pivot_table(index='userId', columns='movieId_unique', values='rating', dropna=False)
R.fillna(0,inplace=True) # not: (R.median().median())

# instantiate Non negative matrix factorization
m = NMF(n_components=20)
m.fit(R)

# get Predictors (p) - users and quantities (Q) - components
Q = m.components_
P = m.transform(R)

# calculate new rating matrix and put into dataframe
new_R = np.dot(P,Q)
new_R_df = pd.DataFrame(new_R.round(1), columns=R.columns, index=R.index)
new_R_df.head()



movieId_unique,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.5,1.1,1.1,0.0,0.1,1.6,0.3,0.1,0.2,1.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.3,0.1,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.1,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.7,0.4,0.1,0.0,0.2,0.8,0.2,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.2,1.0,0.5,0.1,0.5,0.8,0.6,0.1,0.1,1.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# find movies with top ratings:
# calculate the sum of every column and get the top 50 highest values
new_R_df.loc['rating_sum']= new_R_df.sum(axis=0)
new_R_df.tail()

movieId_unique,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
607,2.2,1.0,0.8,0.1,0.3,1.5,0.4,0.1,0.2,1.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,3.3,2.0,0.9,0.1,0.3,3.5,0.3,0.2,0.1,4.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.9,0.7,0.3,0.1,0.4,0.6,0.4,0.1,0.1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,4.9,0.1,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rating_sum,805.1,434.7,190.5,21.0,149.4,450.9,170.6,31.6,31.4,481.7,...,0.7,0.2,0.7,0.7,0.7,0.7,0.7,0.7,0.7,0.5


In [39]:
# show 100 movie_ids with the highest ratings
top_100_df = new_R_df.sort_values(by ='rating_sum', axis = 1, ascending=False).iloc[-1:, 0:50]
top_100_df

movieId_unique,318,356,296,593,2571,260,110,2959,589,480,...,377,1136,1265,1291,1197,1213,1704,79132,2329,1214
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rating_sum,1303.5,1298.8,1266.8,1123.2,1071.3,981.5,955.5,933.6,909.5,903.1,...,623.6,621.6,610.8,609.4,606.8,605.5,598.0,595.9,594.4,592.5


In [40]:
top100_list = top_50_df.columns.to_list()

In [23]:
new_R_df.shape

(611, 9719)

In [41]:
movies_seen= new_R_df.iloc[2][new_R_df.iloc[2] != 0].index.to_list()
#movies_seen_ids = movies_seen.movieId_unique
len(movies_seen)

211

In [5]:
# map the movie title to the movie_id

In [43]:
def recommend(user_id, top_list): # , top_list, ratings, k
    
    # get movie_ids of the films the user has already seen 
    movies_seen= new_R_df.iloc[2][new_R_df.iloc[2] != 0].index.to_list()

    # check which films in top 50 list the user has already seen:
    #top_films_seen = set(top100_list).intersection(movies_seen)
    
    # remove them from the top100_list
    top_films_to_watch = list(set(top100_list) - set(movies_seen))
    print('remaining top films to watch:', len(top_films_to_watch))  
    
    
    # GENERIC TOP LIST (OF ALL USERS FOR NOW), needs to be specific for the user!! 
    # create an instance of the NMF for the current user!!!
    
    # return top 5 to watch 
    return top_films_to_watch

In [44]:
rec = recommend(2, top_20_list)
rec

remaining top films to watch: 15


[5952,
 4993,
 4226,
 1704,
 588,
 364,
 590,
 527,
 7153,
 4306,
 150,
 2329,
 79132,
 318,
 58559]