In [1]:
#####################
# Model: NLP + ANN
#####################

import numpy as np
import pandas as pd
import os
import re
import math

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import tensorflow as tf
# from tensorflow import keras
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, LSTM,Input, Dropout, Embedding
from tensorflow.keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, Callback
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer

pd.options.display.max_colwidth = 100

SEED = 42
TEST_SIZE = 0.2

callback = EarlyStopping(monitor='loss', patience=3)

p_data_raw       = os.path.abspath('../data/raw') # raw data path
p_data_processed = os.path.abspath('../data/processed') # processed data path
p_data_cleaned   = os.path.abspath('../data/cleaned') # clearned data path
p_data_src       = os.path.abspath('../data/src') # src data path

k_ratings_f = pd.read_csv(os.path.abspath(p_data_processed+'/k_ratings_f.csv'), dtype={'label_c': object})

In [38]:
# # get rid of "Other" genre
# k_ratings_f = (k_ratings_f[k_ratings_f['genre1']!='Other'])
# vote_count=k_ratings_f.groupby('movieId')['userId'].count().sort_values(ascending=False)
# # get rid of movies with <10 ratings
# k_ratings_f = k_ratings_f[k_ratings_f.movieId.isin(vote_count[vote_count>=10].index.to_list())]  # 245 movies
# save data
# k_ratings_f.to_csv(os.path.abspath(os.path.abspath(p_data_processed+'/k_ratings_f.csv')),index=False,encoding='utf-8-sig')

In [11]:
# # MovieLens datasets
# ml_links   = pd.read_csv(p_data_raw+'/ml-25m/links.csv') # movie links
# ml_ratings = pd.read_csv(p_data_raw+'/ml-25m/ratings.csv') # user ratings
# ml_movies  = pd.read_csv(p_data_raw+'/ml-25m/movies.csv') # movie list

# # IMDB Kmovie list
# imdb_mat_orig = pd.read_pickle(p_data_raw+'/imdb_kr.pickle')

# # IMDB Kmovie list join MovieLens
# imdb_mat = pd.merge(imdb_mat_orig,ml_links,on='imdbId')

# # IMDB Kmovie list join MovieLens movie list
# imdb_mat = pd.merge(imdb_mat,ml_movies,on='movieId')

# imdb_mat_f = imdb_mat[imdb_mat.imdb_id.isin(k_ratings_f.imdb_id.to_list())]
# imdb_mat_f['genre1'] = imdb_mat_f['genres'].apply(lambda s:s.split('|')[0])
# imdb_mat_f['year'] = imdb_mat_f['title'].apply(lambda s:s.split()[-1].replace('(','').replace(')',''))
# imdb_mat_f.to_csv(os.path.abspath(p_data_processed+'/imdb_mat_f.csv'),index=False,encoding='utf-8-sig')

In [3]:
# prepare X
X = pd.concat([k_ratings_f.drop(['userId','movieId','rating_x','timestamp','name','year','rating_y',
                                 # 'metascore',
                                 'imdb_id','poster','genre',
                                 'certificate','runtime','director_actor', 
                                 # 'story', 
                                 'imdbId','tmdbId',
                                 'title','genres','genre1',
                                 'label','label_c',
                                 'y_5','y',
                                 # 'timestamp','vote',
                                 'director','director1','director2','actor1'
                                 ], axis = 1),
               pd.get_dummies(k_ratings_f[[
                   #'certificate',
                   #'genre1',
                   'label_c','director1','actor1']], 
                              drop_first = True, dummy_na = True)],axis = 1)

# fill metascore null values with mean
X['metascore'].fillna(X['metascore'].mean(), inplace=True)
X.fillna(0,inplace=True)

In [4]:
# prepare y
# y = k_ratings_f['y']
y = k_ratings_f['y_5']
# y = k_ratings_f['y'].cat.codes

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Encode target labels
y = label_encoder.fit_transform(y)
y = to_categorical(y)

In [5]:
# prepare X1 for NLP
def preprocess_text(sen):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sen)
    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)            
    # Removing "See full summary »"
    sentence = sentence.replace('See full summary', ' ')
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

In [6]:
X1 = []
sentences = list(X["story"])
for sen in sentences:
    X1.append(preprocess_text(sen))
    
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X1)

X1 = tokenizer.texts_to_sequences(X1)

vocab_size = len(tokenizer.word_index) + 1
maxlen = 50 # 200

X1 = pad_sequences(X1, padding='post', maxlen=maxlen)

# data scale
# scaler1 = MinMaxScaler()
# scaler1 = StandardScaler()
# X1 = scaler1.fit_transform(X1)

# embedding
embeddings_dictionary = dict()

glove_file_link = p_data_src+'/glove.6B/glove.6B.50d.txt' # glove file link

with open(glove_file_link, 'r') as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

glove_file.close()

embedding_matrix = np.zeros((vocab_size, 50))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [7]:
# prepare X2 for ANN
X2 = X.drop('story', axis=1).copy().values

# data scale
# scaler2 = MinMaxScaler()
scaler2 = StandardScaler()
X2 = scaler2.fit_transform(X2)

In [8]:
################
# model
################

input_1 = Input(shape=(maxlen,))
input_2 = Input(shape=(X.shape[1]-1,))

embedding_layer = Embedding(vocab_size, 50, weights=[embedding_matrix], trainable=False)(input_1)
LSTM_Layer_1 = LSTM(128)(embedding_layer)

def create_model():
    
    tf.keras.backend.clear_session()
    
    dense_layer_1 = Dense(972, activation='relu')(input_2)
    dense_layer_2 = Dense(1944, activation='relu')(dense_layer_1)
    dense_layer_3 = Dense(972, activation='relu')(dense_layer_2)
    dense_layer_4 = Dense(486, activation='relu')(dense_layer_3)
    dense_layer_5 = Dense(243, activation='relu')(dense_layer_4)    
    
    concat_layer = Concatenate()([LSTM_Layer_1, dense_layer_5])
    dense_layer_6 = Dense(10, activation='relu')(concat_layer)
    output = Dense(5, activation='softmax')(dense_layer_6)
    # output = Dense(3, activation='softmax')(dense_layer_6)
    
    model = Model(inputs=[input_1, input_2], outputs=output)
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    #plot_model(model, to_file='figures/model_plot_final.png', show_shapes=True, show_layer_names=True)

    return model

model=create_model()
# history = model.fit(x=[X1_train, X2_train], y=y_train, validation_data=([X1_test, X2_test],y_test),
#                     callbacks=[callback],
#                     batch_size=128, epochs=100, verbose=1)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 385)]        0                                            
__________________________________________________________________________________________________
dense (Dense)                   (None, 972)          375192      input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 1944)         1891512     dense[0][0]                      
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 50)]         0                                            
______________________________________________________________________________________________

In [9]:
model.fit(x=[X1, X2], y=y, epochs=10)

Train on 31378 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x146e072b0>

In [10]:
# save model
import joblib
from tensorflow.keras.models import load_model

model.save("model_final.h5")
#joblib.dump(scaler1,'scaler1_final.pkl')
joblib.dump(scaler2,'scaler2_final.pkl')

['scaler2_final.pkl']

In [12]:
# load model
import joblib
from tensorflow.keras.models import load_model

ratings_model = load_model('model_final.h5')
#ratings_scaler1 = joblib.load('scaler1_final.pkl')
ratings_scaler2 = joblib.load('scaler2_final.pkl')

In [13]:
#####################################
# prepare X for megamat
#####################################
reviewers_mat = k_ratings_f[['userId','label','label_c']].copy()
movies_mat = k_ratings_f.drop(['userId','rating_x','timestamp','label','label_c','y','y_5'], axis=1).copy()
# movies_mat = k_ratings_f.drop(['userId','rating_x','timestamp','label','label_c','story'], axis=1).copy()
# storys_mat = k_ratings_f[['movieId','story']].copy()

reviewers_mat = reviewers_mat.drop_duplicates(subset='userId')
movies_mat = movies_mat.drop_duplicates(subset='movieId')
# storys_mat = storys_mat.drop_duplicates(subset='movieId')

print('# reviewers: {}'.format(len(reviewers_mat)))
print('# movies: {}'.format(len(movies_mat)))

# create megamat
reviewers_mat['joincol'] = 1
movies_mat['joincol'] = 1
megamat = pd.merge(reviewers_mat,movies_mat,how='outer',on='joincol')
print('# movies x reviewers: {}'.format(len(megamat)))

# reviewers: 9765
# movies: 245
# movies x reviewers: 2392425


In [14]:
# exclude ratings with real "y"
k_ratings_f['joinId'] = k_ratings_f['userId'].astype(str) + '_' + k_ratings_f['movieId'].astype(str)
# take long time!
megamat['joinId'] = megamat['userId'].astype(str) + '_' + megamat['movieId'].astype(str)
megamat = megamat[~megamat.joinId.isin(k_ratings_f['joinId'].tolist())]
print('# movies x reviewers exclude real reviews: {}'.format(len(megamat)))

# movies x reviewers exclude real reviews: 2361047


In [15]:
X_final = pd.concat([megamat.drop(['joincol','joinId','userId','movieId','name','year','rating_y',
                                 # 'metascore',
                                 'imdb_id','poster','genre',
                                 'certificate','runtime','director_actor', 
                                 # 'story', 
                                 'imdbId','tmdbId',
                                 'title','genres','genre1',
                                 'label','label_c',
                                 # 'timestamp','vote',
                                 'director','director1','director2','actor1'
                                 ], axis = 1),
               pd.get_dummies(megamat[[
                   #'certificate',
                   #'genre1',
                   'label_c','director1','actor1']], 
                              drop_first = True, dummy_na = True)],axis = 1)

# fill metascore null values with mean
X_final['metascore'].fillna(X_final['metascore'].mean(), inplace=True)
X_final.fillna(0,inplace=True)

In [16]:
# X1_final
X1_final = []
sentences = list(X_final["story"])
for sen in sentences:
    X1_final.append(preprocess_text(sen))
    
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X1_final)

X1_final = tokenizer.texts_to_sequences(X1_final)

vocab_size = len(tokenizer.word_index) + 1
maxlen = 50 # 200

X1_final = pad_sequences(X1_final, padding='post', maxlen=maxlen)

# data scale
# scaler1 = MinMaxScaler()
# scaler1 = StandardScaler()
# X1_final = scaler1.fit_transform(X1_final)

# embedding
embeddings_dictionary = dict()

glove_file_link = p_data_src+'/glove.6B/glove.6B.50d.txt' # glove file link

with open(glove_file_link, 'r') as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

glove_file.close()

embedding_matrix = np.zeros((vocab_size, 50))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [17]:
# X2_final
X2_final = X_final.drop('story', axis=1).copy().values

# data scale
X2_final = ratings_scaler2.fit_transform(X2_final)

In [39]:
# def return_prediction(model,scaler1,scaler2,sample_json):
# def return_prediction(model, sample_json):
#     story = [[]]
#     # story = scaler1_final.transform(story)
#     reviewer = [[]]
#     # reviewer = scaler2_final.transform(reviewer)
#     rating_ind = model.predict_classes([story, reviewer])

# cutoff = 1000
# rating_ind = ratings_model.predict([X1_final[:cutoff], X2_final[:cutoff]])

rating_ind = ratings_model.predict([X1_final, X2_final])

In [40]:
predictions = np.argmax(rating_ind,axis=1)#+1
predictions=predictions+1

In [54]:
megamat.head()

Unnamed: 0,userId,label,label_c,joincol,movieId,name,year,rating_y,metascore,vote,...,title,genres,genre1,year_n,runtime_n,director,director1,director2,actor1,joinId
9,181,34,34,1,30803,3-Iron,(2004),8.0,72.0,47924.0,...,3-Iron (Bin-jip) (2004),Drama|Romance,Drama,2004,88,Kiduk Kim,KidukKim,,SeungYunLee,181_30803
10,181,34,34,1,27869,Tae Guk Gi: The Brotherhood of War,(2004),8.1,64.0,37060.0,...,Tae Guk Gi: The Brotherhood of War (Taegukgi hwinalrimyeo) (2004),Action|Drama|War,Action,2004,140,Jekyu Kang,JekyuKang,,DongGunJang,181_27869
11,181,34,34,1,31364,Memories of Murder,(2003),8.1,82.0,123625.0,...,Memories of Murder (Salinui chueok) (2003),Crime|Drama|Mystery|Thriller,Crime,2003,132,Bong Joon Ho,BongJoonHo,,KanghoSong,181_31364
12,181,34,34,1,6210,Hwasango,(2001),6.0,,3329.0,...,Volcano High (Whasango) (2001),Action|Comedy,Action,2001,120,Taegyun Kim,TaegyunKim,,HyukJang,181_6210
13,181,34,34,1,76091,Mother,(2009),7.8,79.0,46913.0,...,Mother (Madeo) (2009),Crime|Drama|Mystery|Thriller,Crime,2009,129,Bong Joon Ho,BongJoonHo,,HyejaKim,181_76091


In [90]:
predictions_df = pd.DataFrame(predictions, index=megamat.index)
predictions_df = pd.merge(megamat[['userId','movieId','title']],predictions_df,left_index=True, right_index=True)
predictions_df.set_axis(['userId','movieId','title','y'],axis=1,inplace=True)
predictions_df.head()

Unnamed: 0,userId,movieId,title,y
9,181,30803,3-Iron (Bin-jip) (2004),1
10,181,27869,Tae Guk Gi: The Brotherhood of War (Taegukgi hwinalrimyeo) (2004),1
11,181,31364,Memories of Murder (Salinui chueok) (2003),1
12,181,6210,Volcano High (Whasango) (2001),1
13,181,76091,Mother (Madeo) (2009),1


In [92]:
predictions_df.to_csv(os.path.abspath(p_data_processed+'/predictions_df.csv'),index=False)

In [93]:
original_df = k_ratings_f[['userId','movieId','title','y_5']].copy()
original_df.set_axis(['userId','movieId','title','y'],axis=1,inplace=True)
original_df.to_csv(os.path.abspath(p_data_processed+'/original_df.csv'),index=False)

In [115]:
intermediate_df, intermediate2_df = train_test_split(predictions_df, test_size=0.5)
intermediate_df = pd.concat([original_df,intermediate_df], axis=0)
intermediate_df.shape

(1211901, 4)

In [522]:
#intermediate2_df.shape
#beginner_df, beginner2_df = train_test_split(intermediate2_df, test_size=0.5)
intermediate2_df_copy = intermediate2_df.copy()
beginner_df = intermediate2_df_copy.sample(frac=0.5, random_state=0)

In [524]:
beginner_df = pd.concat([original_df,beginner_df], axis=0)
beginner_df.shape

(621640, 4)

In [116]:
advanced_df = pd.concat([original_df,predictions_df], axis=0)
advanced_df.shape

(2392425, 4)

In [117]:
intermediate_df.to_csv(os.path.abspath(p_data_processed+'/intermediate_df.csv'),index=False)
advanced_df.to_csv(os.path.abspath(p_data_processed+'/advanced_df.csv'),index=False)

In [525]:
beginner_df.to_csv(os.path.abspath(p_data_processed+'/beginner_df.csv'),index=False)

In [526]:
beginner_mat = beginner_df.pivot_table(index='userId',columns='movieId',values='y')
beginner_mat.to_csv(os.path.abspath(p_data_processed+'/beginner.csv'),index=False)

In [133]:
original_mat = original_df.pivot_table(index='userId',columns='movieId',values='y')
intermediate_mat = intermediate_df.pivot_table(index='userId',columns='movieId',values='y')
advanced_mat = advanced_df.pivot_table(index='userId',columns='movieId',values='y')

original_mat.to_csv(os.path.abspath(p_data_processed+'/original_mat.csv'),index=False)
intermediate_mat.to_csv(os.path.abspath(p_data_processed+'/intermediate_mat.csv'),index=False)
advanced_mat.to_csv(os.path.abspath(p_data_processed+'/advanced_mat.csv'),index=False)

In [288]:
# SVD
# df_movie=advanced_df[['movieId','title']].drop_duplicates(subset=None, keep='first')

# # df_movie_features = original_mat.fillna(0)
# # df_movie_features = intermediate_mat.fillna(0)
# df_movie_features = advanced_mat

# R = df_movie_features.values
# user_ratings_mean = np.mean(R, axis = 1)
# R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# from scipy.sparse.linalg import svds
# U, sigma, Vt = svds(R_demeaned, k = 50)
# sigma = np.diag(sigma)
# all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

# # original_preds_df = pd.DataFrame(all_user_predicted_ratings,
# # intermediate_preds_df = pd.DataFrame(all_user_predicted_ratings,
# advanced_preds_df = pd.DataFrame(all_user_predicted_ratings,
#                         #index=df_movie_features.index,
#                         columns=df_movie_features.columns)

In [507]:
# SVD
from scipy.sparse.linalg import svds

df_movie=advanced_df[['movieId','title']].drop_duplicates(subset=None, keep='first')

def SVD_prediction(mat):

    df_movie_features = mat.fillna(0)

    R = df_movie_features.values
    user_ratings_mean = np.mean(R, axis = 1)
    R_demeaned = R - user_ratings_mean.reshape(-1, 1)

    U, sigma, Vt = svds(R_demeaned, k = 50)
    sigma = np.diag(sigma)
    all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

    return pd.DataFrame(all_user_predicted_ratings,
                        index=df_movie_features.index,
                        columns=df_movie_features.columns)

original_mat_prediction = SVD_prediction(original_mat)
intermediate_mat_prediction = SVD_prediction(intermediate_mat)
advanced_mat_prediction = SVD_prediction(advanced_mat)

In [510]:
original_mat_prediction.to_csv(os.path.abspath(p_data_processed+'/original_mat_prediction.csv'),index=True)
intermediate_mat_prediction.to_csv(os.path.abspath(p_data_processed+'/intermediate_mat_prediction.csv'),index=True)
advanced_mat_prediction.to_csv(os.path.abspath(p_data_processed+'/advanced_mat_prediction.csv'),index=True)

In [527]:
beginner_mat_prediction = SVD_prediction(beginner_mat)
beginner_mat_prediction.to_csv(os.path.abspath(p_data_processed+'/beginner_mat_prediction.csv'),index=True)

In [512]:
test = pd.read_csv(os.path.abspath(p_data_processed+'/original_mat_prediction.csv'))

In [513]:
test

Unnamed: 0,userId,652,4026,5526,5687,5821,6161,6210,6648,7982,...,188623,189461,189527,191873,192265,192749,193904,195343,202439,205082
0,3,-0.003420,-0.004153,-0.000651,0.005368,-0.001252,-0.000754,-0.001282,-0.002114,0.000353,...,0.000558,-0.000572,-0.001695,-0.001106,0.006428,0.004483,0.003781,0.003341,0.000090,0.004576
1,57,-0.003147,-0.003455,0.000358,0.000401,-0.000333,-0.001277,-0.000851,0.000298,0.000479,...,0.000454,0.002605,0.000942,0.001984,0.000313,0.001194,0.003470,-0.001106,-0.000048,-0.000048
2,84,-0.020322,-0.037354,-0.007941,0.042005,0.001595,0.007016,0.020991,-0.025369,-0.001044,...,0.002840,-0.012109,-0.041006,-0.011411,-0.005969,-0.001447,-0.000885,-0.015731,0.000582,-0.000300
3,95,0.002922,-0.006625,-0.004784,0.001511,-0.002881,0.001375,-0.006298,0.000897,-0.001431,...,-0.000593,0.003991,-0.000977,-0.009682,-0.000529,-0.009420,-0.000680,0.001371,0.000878,-0.007028
4,113,-0.001888,-0.002073,0.000215,0.000240,-0.000200,-0.000766,-0.000511,0.000179,0.000287,...,0.000272,0.001563,0.000565,0.001190,0.000188,0.000716,0.002082,-0.000664,-0.000029,-0.000029
5,174,-0.003147,-0.003455,0.000358,0.000401,-0.000333,-0.001277,-0.000851,0.000298,0.000479,...,0.000454,0.002605,0.000942,0.001984,0.000313,0.001194,0.003470,-0.001106,-0.000048,-0.000048
6,181,-0.000350,-0.004177,-0.002146,-0.001587,0.000214,-0.000262,0.002170,0.009760,0.000179,...,0.001449,-0.006436,0.008461,0.011909,0.022642,0.006691,0.003291,0.001123,1.994476,-0.005085
7,185,0.003314,-0.000505,-0.001671,0.016901,0.013168,-0.028739,-0.005243,-0.004028,-0.001278,...,-0.000329,-0.026979,-0.004215,0.006480,0.019607,0.018441,0.001303,0.000272,0.002054,0.000581
8,187,0.089745,0.065102,0.077051,-0.088038,0.115968,0.061672,0.078595,0.069797,-0.024284,...,-0.026799,0.048856,0.052417,0.024892,0.095362,0.013675,0.020698,0.004048,-0.029129,0.048022
9,207,-0.003147,-0.003455,0.000358,0.000401,-0.000333,-0.001277,-0.000851,0.000298,0.000479,...,0.000454,0.002605,0.000942,0.001984,0.000313,0.001194,0.003470,-0.001106,-0.000048,-0.000048


In [480]:
intermediate_mat_prediction[27773].idxmax()

4015

In [497]:
advanced_mat_prediction[4026].idxmax()

6088

In [500]:
advanced_df[advanced_df.index==advanced_mat_prediction[4026].idxmax()]

Unnamed: 0,userId,movieId,title,y
6088,108778,8014,"Spring, Summer, Fall, Winter... and Spring (Bom yeoreum gaeul gyeoul geurigo bom) (2003)",4
6088,3409,121143,Flu (2013),4


In [506]:
def recommend_movies(preds_df, userIndex, movies_df, original_ratings_df, num_recommendations=5):
    
    user_row_number = userIndex
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False)
    user_data = original_ratings_df[original_ratings_df.index==user_row_number]
    recommendations = pd.DataFrame(sorted_user_predictions).reset_index().merge(
                (movies_df[~movies_df['movieId'].isin(user_data['movieId'])]),on='movieId').rename(
                columns = {user_row_number: 'Predictions'}).iloc[:num_recommendations]    
    
    print(user_data)
    print('\n')
    print(recommendations)
    
    return user_data, recommendations

movieIndex = 55705

# original
(user, recommendations) = recommend_movies(
    original_mat_prediction,original_mat_prediction[movieIndex].idxmax(),df_movie,
    original_df.reset_index(),num_recommendations=5)

      index  userId  movieId             title  y
4311   4440   75791   144532  Cold Eyes (2013)  3


   movieId  Predictions  \
0   106204     6.641260   
1     8805     6.094529   
2   179779     5.745242   
3    84116     5.431900   
4   134204     5.418847   

                                                    title  
0                                            Pieta (2013)  
1  Attack the Gas Station! (Juyuso seubgyuksageun) (1999)  
2                                    A Taxi Driver (2017)  
3                                     Poetry (Shi) (2010)  
4                                       A Hard Day (2014)  


In [502]:
# intermediate 
(user, recommendations) = recommend_movies(
    intermediate_mat_prediction,intermediate_mat_prediction[movieIndex].idxmax(),df_movie,
    intermediate_df.reset_index(),num_recommendations=5)

      index  userId  movieId                       title  y
2494   2543   43839    51709  Host, The (Gwoemul) (2006)  4


   movieId  Predictions  \
0     4026     5.775119   
1    55705     5.554806   
2   144380     5.479084   
3   183699     5.299484   
4   166494     5.266439   

                                                      title  
0  Nowhere to Hide (Injeong sajeong bol geot eobtda) (1999)  
1                          Secret Sunshine (Milyang) (2007)  
2                                The Fatal Encounter (2014)  
3                                          Forgotten (2017)  
4                    The Tiger: An Old Hunter's Tale (2015)  


In [503]:
# advanced
(user, recommendations) = recommend_movies(
    advanced_mat_prediction, advanced_mat_prediction[movieIndex].idxmax(), df_movie, 
    advanced_df.reset_index(), num_recommendations=5)

      index  userId  movieId                 title  y
6088   6248  111311    60551  Breath (Soom) (2007)  2


   movieId  Predictions                                     title
0   162448     5.064677                      Seoul Station (2016)
1   192265     5.061601  The Witch: Part 1. The Subversion (2018)
2   178477     5.059552        On the Beach at Night Alone (2017)
3   144282     5.059152   Better Tomorrow, A (Moo-jeok-ja) (2010)
4   158783     5.055537                     The Handmaiden (2016)


In [2]:






# SVDpp
reader = Reader(rating_scale=(1,5))

iids = original_df['movieId'].unique()

# original
alg = SVDpp()
data = Dataset.load_from_df(original_df[['userId', 'movieId', 'y']], reader)
original_preds_df = alg.fit(data.build_full_trainset())

# # intermediate
# alg2 = SVDpp()
# data = Dataset.load_from_df(intermediate_df[['userId', 'movieId', 'y']], reader)
# intermediate_preds_df = alg2.fit(data.build_full_trainset())

# # advanced
# alg3 = SVDpp()
# data = Dataset.load_from_df(advanced_df[['userId', 'movieId', 'y']], reader)
# advanced_preds_df = alg3.fit(data.build_full_trainset())

NameError: name 'Reader' is not defined

In [459]:
# userId = 29730
# movieId = 27592

# pred = original_preds_df.predict(uid=userId,iid=movieId)
# iids_userId = original_df.loc[original_df['userId']==userId,'movieId']
# iids_to_pred = np.setdiff1d(iids,iids_userId)

# testset =[[userId,iid,3] for iid in iids_to_pred]
# predictions_svdpp = alg.test(testset)

# pred_ratings = np.array([pred.est for pred in predictions_svdpp])

# # top 5 prediction
# iids_to_pred[pred_ratings.argsort()[::-1][:5].tolist()]

array([202439,  31364, 158783,  32705,  76091])

In [448]:
def svdpp_prediction(preds_df,original_df,iids,userId,movieId):
    
    pred = preds_df.predict(uid=userId,iid=movieId)
    iids_userId = original_df.loc[original_df['userId']==userId,'movieId']
    # iids = original_df['movieId'].unique()
    iids_to_pred = np.setdiff1d(iids,iids_userId)
    
    testset =[[userId,iid,3] for iid in iids_to_pred]
    predictions_svdpp = alg.test(testset)
    
    pred_ratings = np.array([pred.est for pred in predictions_svdpp])

    # top 5 prediction
    return iids_to_pred[pred_ratings.argsort()[::-1][:5].tolist()].tolist()

In [461]:
svdpp_prediction(original_preds_df,original_df,iids,29730,27592)
# svdpp_prediction(intermediate_preds_df,intermediate_df,iids,29730,27592)
# svdpp_prediction(advanced_preds_df,advanced_df,iids,29730,27592)

[202439, 31364, 158783, 32705, 76091]