In [326]:
from sqlalchemy import create_engine
import pandas as pd
import os
import numpy as np
from sklearn.decomposition import NMF
from joblib import dump, load

In [298]:
HOST = 'localhost'
PORT = '5432'
DB = 'movies'

conn_string_mac = f'postgres://{HOST}:{PORT}/{DB}'

In [299]:
#Create engine
engine = create_engine(conn_string_mac)

In [300]:
#REad in table names
table_list = engine.table_names()

In [301]:
table_list

['links', 'tags', 'ratings', 'movies']

### Read in SQL tables into DFs

In [302]:
df_links = pd.read_sql_query('SELECT * from links',con=engine)

In [303]:
df_tags = pd.read_sql_query('SELECT * from tags',con=engine)

In [304]:
df_ratings = pd.read_sql_query('SELECT * from ratings',con=engine)

In [390]:
df_movies = pd.read_sql_query('SELECT * from movies',con=engine)

In [375]:
movie_id_dict = dict(zip(df_movies['movieId'], df_movies['title_new']))

KeyError: 'title_new'

In [391]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [392]:
df_movies['title_new'] = df_movies['title'].replace('[^a-z]*$', '', regex = True)

In [393]:
df_movies['title_new'] = df_movies['title_new'].str.lower()

In [394]:
df_movies['title_new'] = df_movies['title_new'].str.split('(')

In [401]:
df_movies['title_new'] = df_movies['title'].replace('\(.*', '', regex = True)

In [402]:
df_movies

Unnamed: 0,movieId,title,genres,title_new
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II
5,6,Heat (1995),Action|Crime|Thriller,Heat
6,7,Sabrina (1995),Comedy|Romance,Sabrina
7,8,Tom and Huck (1995),Adventure|Children,Tom and Huck
8,9,Sudden Death (1995),Action,Sudden Death
9,10,GoldenEye (1995),Action|Adventure|Thriller,GoldenEye


### Table Transformation

In [308]:
df_ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [309]:
user_movie_matrix = df_ratings.pivot(index = 'userId', columns = 'movieId', values = 'rating')

In [310]:
user_movie_matrix.head() #Sparse

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [311]:
user_movie_matrix.shape

(610, 9724)

In [312]:
mean_rating = df_ratings['rating'].mean()

In [313]:
user_movie_matrix = user_movie_matrix.fillna(value = mean_rating)

In [314]:
model = NMF(n_components=30, init='random', random_state=10)

In [315]:
model.fit(user_movie_matrix)

NMF(alpha=0.0, beta_loss='frobenius', init='random', l1_ratio=0.0, max_iter=200,
    n_components=30, random_state=10, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [316]:
Q = model.components_  # movie-genre matrix

In [317]:
Q.shape

(30, 9724)

In [318]:
#Component_Movie Matrix
df_Q = pd.DataFrame(data = Q, columns = user_movie_matrix.columns)

In [406]:
df_Q.columns

Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            193565, 193567, 193571, 193573, 193579, 193581, 193583, 193585,
            193587, 193609],
           dtype='int64', name='movieId', length=9724)

In [320]:
P = model.transform(user_movie_matrix) # user - genre matrix

In [321]:
#User_component matrix
df_P = pd.DataFrame(P)

In [322]:
df_P

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,2.167563,0.783799,0.412294,0.529339,0.450188,0.564225,0.164158,0.885578,0.267219,0.398983,...,0.503997,0.153103,0.226265,0.101880,0.317906,0.200466,0.152718,0.072218,0.319303,0.000000
1,2.484750,0.957430,0.396633,0.411962,0.408539,0.510850,0.118441,0.842188,0.432399,0.312622,...,0.157006,0.359686,0.196930,0.126154,0.174872,0.000000,0.253163,0.131336,0.280490,0.074124
2,2.105389,0.767018,0.294767,0.466784,0.444855,0.517322,0.031797,1.103336,0.408852,0.316506,...,0.132566,0.445259,0.235661,0.167101,0.154348,0.000000,0.232236,0.068563,0.216038,0.123341
3,2.216206,0.748616,0.253077,0.339846,0.326549,0.504502,0.140070,0.995863,0.362595,0.371049,...,0.169226,0.305632,0.408079,0.000000,0.240986,0.000000,0.185152,0.090075,0.482254,0.000000
4,2.518768,0.981807,0.410785,0.392255,0.444502,0.504112,0.134406,0.845596,0.440622,0.317558,...,0.153749,0.347758,0.193281,0.122876,0.157327,0.000000,0.253442,0.096002,0.332282,0.044913
5,2.258280,0.823681,0.307255,0.451683,0.468751,0.538206,0.011808,0.983637,0.437956,0.481944,...,0.280394,0.389221,0.304871,0.139839,0.142238,0.000000,0.332262,0.094713,0.176437,0.000000
6,2.625282,1.041719,0.472195,0.555607,0.387867,0.463907,0.193202,0.825787,0.575386,0.222669,...,0.135351,0.133043,0.287092,0.072057,0.242582,0.000000,0.206765,0.141366,0.380317,0.000000
7,2.552083,0.997914,0.435400,0.412708,0.457078,0.535683,0.131231,0.819655,0.465087,0.293656,...,0.163556,0.311798,0.138774,0.123553,0.160142,0.000000,0.252366,0.093792,0.333456,0.048291
8,2.482547,0.971507,0.404492,0.448926,0.408050,0.505354,0.127914,0.845385,0.403406,0.300228,...,0.168095,0.326236,0.204776,0.088360,0.178526,0.000000,0.258367,0.087094,0.276472,0.037189
9,2.330397,0.880345,0.290571,0.405005,0.385329,0.434702,0.009480,0.948432,0.364979,0.418521,...,0.129165,0.454539,0.268689,0.124366,0.173785,0.000000,0.258240,0.107118,0.272054,0.043100


In [116]:
#Create a fake user
user = np.repeat(mean_rating, Q.shape[1])

In [330]:
len(user)

9724

In [117]:
user[3] = 5

In [118]:
user[25] = 2

In [119]:
user[11] = 4

In [120]:
user[8] = 1

In [121]:
user.shape

(9724,)

In [123]:
reconstruct_user = np.dot(user, Q.T)

In [124]:
reconstruct_user.shape # user profile on genre-preference

(30,)

In [254]:
r1 = model.inverse_transform(reconstruct_user)

In [256]:
r1.shape

(9724,)

In [328]:
r1

array([98769.77147772, 89801.37168721, 96837.58827324, ...,
       82329.74520009, 82365.51935574, 61809.1571057 ])

In [126]:
prediction = np.dot(Q.T, reconstruct_user)

In [257]:
prediction.shape

(9724,)

In [329]:
prediction

array([98769.77147772, 89801.37168721, 96837.58827324, ...,
       82329.74520009, 82365.51935574, 61809.1571057 ])

In [275]:
#See which movies did the user see
user_df = pd.DataFrame([user, prediction], index = ['real', 'predicted'])

In [276]:
user_df = user_df.T

In [277]:
user_df['movie_ID'] = df_Q.columns

In [280]:
user_df['real'] = user_df['real'].round(3)

In [281]:
user_df

Unnamed: 0,real,predicted,movie_ID
0,3.502,98769.771478,1
1,3.502,89801.371687,2
2,3.502,96837.588273,3
3,5.000,79254.935576,4
4,3.502,87647.029404,5
5,3.502,86654.663090,6
6,3.502,87195.692031,7
7,3.502,71591.159522,8
8,1.000,65629.200392,9
9,3.502,90665.110195,10


In [271]:
mean_rating_round = round(mean_rating, ndigits = 3)

In [272]:
mean_rating_round

3.502

In [282]:
user_df = user_df[user_df['real']==mean_rating_round]

In [283]:
user_df.head()

Unnamed: 0,real,predicted,movie_ID
0,3.502,98769.771478,1
1,3.502,89801.371687,2
2,3.502,96837.588273,3
4,3.502,87647.029404,5
5,3.502,86654.66309,6


In [242]:
recomm_for_user = user_df.sort_values(by = 'predicted', ascending = False)

In [243]:
recomm_for_user.head(10)

Unnamed: 0,real,predicted,movie_ID
1938,3.5,129458.712996,2571
257,3.5,120982.776332,296
2224,3.5,117234.652875,2959
4345,3.5,117007.246123,6365
773,3.5,115157.296658,1015
314,3.5,114699.984994,356
838,3.5,114001.560672,1101
5009,3.5,113790.265389,7786
4751,3.5,113784.6322,7086
615,3.5,112173.485613,780


In [244]:
movies_ = recomm_for_user['movie_ID'].map(movie_id_dict)

In [294]:
movies_ = movies_.head(10)

In [295]:
movies_

1938                               Matrix, The (1999)
257                               Pulp Fiction (1994)
2224                                Fight Club (1999)
4345                      Matrix Reloaded, The (2003)
773     Homeward Bound: The Incredible Journey (1993)
314                               Forrest Gump (1994)
838                                    Top Gun (1986)
5009                             Genghis Blues (1999)
4751                                 Pygmalion (1938)
615              Independence Day (a.k.a. ID4) (1996)
Name: movie_ID, dtype: object

In [323]:
import pickle

In [327]:
dump(model, 'NMF.joblib') 

['NMF.joblib']

In [252]:
#Write the recommendation function

In [258]:
def sparse_matrix(ratings):
    user_movie_matrix = ratings.pivot(index = 'userId', columns = 'movieId', values = 'rating')
    return user_movie_matrix

In [332]:
def user_recommendation(sparse_matrix_df, user_input, model, movie_id_dict, number_of_recomm):
    #Convert component-movie matrix to df
    Q = model.components_
    df_Q = pd.DataFrame(data = Q, columns = sparse_matrix_df.columns)
    #Calculate the mean rating and impute it for NaNs
    mean_rating = df_ratings['rating'].mean()
    sparse_matrix_df = sparse_matrix_df.fillna(value = mean_rating)
    #Get user blueprint on movies
    user_blueprint = np.dot(user_input, Q.T)
    prediction = model.inverse_transform(user_blueprint)
    #Create user df
    user_df = pd.DataFrame([user_input, prediction], index = ['real', 'predicted'])
    #Transform (for stylistic reasons)
    user_df = user_df.T
    user_df['movie_ID'] = df_Q.columns
    #Round the values
    user_df['real'] = user_df['real'].round(3)
    mean_rating_round = round(mean_rating, ndigits = 3)
    #Filter unwatched movies
    user_df = user_df[user_df['real']==mean_rating_round]
    #Sort for recommendation
    recomm_for_user = user_df.sort_values(by = 'predicted', ascending = False)
    #Map the movie ids
    movies_ = recomm_for_user['movie_ID'].map(movie_id_dict)
    #Restrict the number
    movies_ = movies_.head(number_of_recomm)
    movies_list = list(movies_)
    return movies_list

In [285]:
a = sparse_matrix(df_ratings)

In [333]:
user_recommendation(a, user, model, movie_id_dict, 10)

['Matrix, The (1999)',
 'Pulp Fiction (1994)',
 'Fight Club (1999)',
 'Matrix Reloaded, The (2003)',
 'Homeward Bound: The Incredible Journey (1993)',
 'Forrest Gump (1994)',
 'Top Gun (1986)',
 'Genghis Blues (1999)',
 'Pygmalion (1938)',
 'Independence Day (a.k.a. ID4) (1996)']

array([3.50155698, 3.50155698, 3.50155698, ..., 3.50155698, 3.50155698,
       3.50155698])

In [353]:
movie_id_dict

{1: 'toy story',
 2: 'jumanji',
 3: 'grumpier old men',
 4: 'waiting to exhale',
 5: 'father of the bride part',
 6: 'heat',
 7: 'sabrina',
 8: 'tom and huck',
 9: 'sudden death',
 10: 'goldeneye',
 11: 'american president, the',
 12: 'dracula: dead and loving it',
 13: 'balto',
 14: 'nixon',
 15: 'cutthroat island',
 16: 'casino',
 17: 'sense and sensibility',
 18: 'four rooms',
 19: 'ace ventura: when nature calls',
 20: 'money train',
 21: 'get shorty',
 22: 'copycat',
 23: 'assassins',
 24: 'powder',
 25: 'leaving las vegas',
 26: 'othello',
 27: 'now and then',
 28: 'persuasion',
 29: 'city of lost children, the (cité des enfants perdus, la',
 30: 'shanghai triad (yao a yao yao dao waipo qiao',
 31: 'dangerous minds',
 32: 'twelve monkeys (a.k.a. 12 monkeys',
 34: 'babe',
 36: 'dead man walking',
 38: 'it takes two',
 39: 'clueless',
 40: 'cry, the beloved country',
 41: 'richard',
 42: 'dead presidents',
 43: 'restoration',
 44: 'mortal kombat',
 45: 'to die for',
 46: 'how to ma

In [416]:
movie_id_dict[2571]

'matrix, the'