### Simple recommender

calculation of an ordered top list (with regards to the average rating) of movies 

In [1]:
import os
import pandas as pd
from functools import reduce
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.decomposition import NMF



In [18]:
# load clean dataset
df = pd.read_csv('./data/ml-latest-small/dev_ds_ratings_names_uniqueids.csv', index_col=0)
df.head()

Unnamed: 0,userId,movieId_x,rating,timestamp,movieId_unique,movieId_y,title,genres
0,1,1,4.0,964982703,1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,3,3,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,6,6,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,47,47,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,50,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [34]:
R = df.pivot_table(index='userId', columns='movieId_unique', values='rating', dropna=False)

# count the number of ratings per film:
R.loc['rating_count']= R.count(axis=0)
R.tail()

movieId_unique,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,
610,5.0,,,,,5.0,,,,,...,,,,,,,,,,
rating_count,215.0,110.0,52.0,7.0,49.0,102.0,54.0,8.0,16.0,132.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
# filter for movieIds with more than 50 ratings
R_gt_50 = R[R.columns[R.loc['rating_count']>50]]
# drop rating count row again
R = R_gt_50.drop(['rating_count'])

In [36]:
# create rating matrix R
R.fillna(0,inplace=True) # not: (R.median().median())

# instantiate Non negative matrix factorization
m = NMF(n_components=20)
m.fit(R)

# get Predictors (p) - users and quantities (Q) - components
Q = m.components_
P = m.transform(R)

# calculate new rating matrix and put into dataframe
new_R = np.dot(P,Q)
new_R_df = pd.DataFrame(new_R.round(1), columns=R.columns, index=R.index)
new_R_df.head()



movieId_unique,1,2,3,6,7,10,11,16,17,19,...,81845,89745,91500,91529,99114,106782,109374,109487,112852,122904
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.0,1.2,1.4,2.5,0.2,1.4,0.2,1.4,0.1,0.7,...,0.0,0.0,0.0,0.1,0.2,0.1,0.0,0.1,0.1,0.0
2,0.4,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.8,1.5,0.8,1.8,1.5,1.1,1.2,1.7,1.4,1.2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.8,0.2,0.7,0.2,0.2,0.4,0.1,0.1,0.4,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0
5,1.2,1.2,0.2,0.8,0.4,1.0,1.4,0.8,1.1,0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# find movies with overall top ratings: calculate the sum of every column 
# Only consider movies that are rated by a number of users above a certain threshold. Try out different theshold values.
new_R_df.loc['rating_sum']= new_R_df.sum(axis=0)
new_R_df.tail()

movieId_unique,1,2,3,6,7,10,11,16,17,19,...,81845,89745,91500,91529,99114,106782,109374,109487,112852,122904
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
607,2.0,1.1,0.3,2.1,0.2,1.9,1.0,0.2,0.5,0.3,...,0.1,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.1,0.0
608,3.2,2.3,0.9,4.0,0.6,3.2,1.0,3.5,0.9,1.7,...,0.4,0.1,0.2,0.2,0.6,0.4,0.1,0.0,0.1,0.2
609,0.8,0.8,0.3,0.1,0.2,1.4,0.4,0.1,0.0,0.6,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
610,2.0,1.7,0.3,2.6,0.1,2.1,0.2,3.3,0.6,1.7,...,3.1,4.3,2.9,4.5,4.2,3.1,3.3,4.2,3.9,3.5
rating_sum,873.4,432.9,218.4,462.3,207.7,495.7,319.9,353.2,318.0,287.7,...,245.6,302.3,216.6,341.0,322.5,237.2,229.2,297.6,272.3,234.3


In [43]:
# show 100 movie_ids with the highest ratings
top_100_df = new_R_df.sort_values(by ='rating_sum', axis = 1, ascending=False).iloc[-1:, 0:100]
top_100_list = top_100_df.columns.to_list()

In [39]:
def recommend_top_films_unwatched(user_id, top_list, ratings_table, k): # , top_list, ratings, k
    
    """ Function that returns a top list of the 
    overall highest rated films the specific user has not seen yet
    
    Parameters:
    - user_id, 
    - the top list of movies 
    - and the ratings table 
    - k movie_title the user wants to get recommended

    """
    
    # get movie_ids of the films the user has already seen 
    movies_seen= ratings_table.iloc[user_id][ratings_table.iloc[user_id] != 0].index.to_list()
    
    # remove them from the top100_list
    top_films_to_watch = list(set(top_list) - set(movies_seen))
    print('remaining top films for this user to watch:', len(top_films_to_watch))  
    
    # map movieIds to movie names
    rec_names = []
    for movie_id in top_films_to_watch:
        movie_name = df.loc[df['movieId_unique'] == movie_id]['title'].unique()[0]
        rec_names.append(movie_name)

    # return top 5 (if 5 film lelft to watch) 
    return rec_names[:k]

In [40]:
# prediction -test 
recommend_top_films_unwatched(2, top_100_list, new_R_df, 3)

remaining top films for this user to watch: 97


['Toy Story (1995)', 'Batman Begins (2005)', 'Incredibles, The (2004)']

In [41]:
recommend_top_films_unwatched(34, top_100_list, new_R_df, 5)

remaining top films for this user to watch: 46


['Incredibles, The (2004)',
 'Batman Begins (2005)',
 'Memento (2000)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Lord of the Rings: The Fellowship of the Ring, The (2001)']