In [3]:
import numpy as np
import pandas as pd
import os
import tensorflow as T
import keras
from keras import backend as K
from keras import initializers
from keras.initializers import RandomNormal
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape,  Flatten, Dropout
from keras.optimizers import Adagrad, Adam, SGD, RMSprop, Adamax
from keras.regularizers import l2
from keras.layers import Multiply, Concatenate
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from time import time
import multiprocessing as mp
import sys
import math
import argparse
import matplotlib.pyplot as plt


Using TensorFlow backend.


In [0]:
!pip install -U -q PyDrive ## you will have install for every colab session

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import files

# 1. Authenticate and create the PyDrive client.
# Interactive step: asks the Colab user to authorise access.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client the download cell uses to fetch the rating files.
drive = GoogleDrive(gauth)

In [0]:
# Local file names used by the notebook.
# Training and test splits of the ratings data (tab-separated, read with pd.read_csv).
RATING_DATA_FILE_TRAIN = 'u1.base'
RATING_DATA_FILE_TEST = 'u1.test'
# NOTE(review): the next four names are not referenced in the cells visible here — confirm before removing.
RATING_DATA_GEN_FILE = 'u_gen.data'
RATINGS_CSV_FILE_NORM = 'u_norm.data'
RATINGS_GEN_CSV_FILE = 'u_genr.data'
MODEL_WEIGHTS_FILE = 'u_emb_weights.h5'
# Prefix of the checkpoint file names passed to ModelCheckpoint.
MODEL_WEIGHTS_FILE_CORE = 'u_emb_weights'

In [0]:
# Fetch the two rating splits from Google Drive (by file id) into the local
# file names that the loading cells read with pd.read_csv.
rating_file_import_train = drive.CreateFile({'id':'1smKszlPQlT03Bbi7yLRIDIkd0c-XS-Y5'})
rating_file_import_train.GetContentFile(RATING_DATA_FILE_TRAIN)
rating_file_import_test = drive.CreateFile({'id':'1dxlfTQJiQ5MyewGyhk7Y4A9mETdnjknf'})
rating_file_import_test.GetContentFile(RATING_DATA_FILE_TEST)

In [7]:
# Training split: tab-separated rows; column names are supplied here.
ratings = pd.read_csv(
    RATING_DATA_FILE_TRAIN,
    sep='\t',
    names=['userid', 'movieid', 'rating', 'timestamp'],
    encoding='latin-1',
    engine='python',
)

# Largest raw ids in this split; set_ncf_model uses them to size the embedding tables.
max_userid = ratings['userid'].max()
max_movieid = ratings['movieid'].max()

# Zero-based copies of the ids, used as embedding indices.
ratings['user_emb_id'] = ratings['userid'].sub(1)
ratings['movie_emb_id'] = ratings['movieid'].sub(1)

print('%d ratings loaded' % len(ratings))


80000 ratings loaded


In [8]:
# Held-out split, read with the same layout as the training file.
test_ratings = pd.read_csv(
    RATING_DATA_FILE_TEST,
    sep='\t',
    names=['userid', 'movieid', 'rating', 'timestamp'],
    encoding='latin-1',
    engine='python',
)

# Zero-based copies of the ids, used as embedding indices.
test_ratings['user_emb_id'] = test_ratings['userid'].sub(1)
test_ratings['movie_emb_id'] = test_ratings['movieid'].sub(1)

print('%d ratings loaded' % len(test_ratings))

20000 ratings loaded


In [0]:
def get_ncf_model(num_users, num_items, latent_dim,hidden_dim,do):
    """Build the (uncompiled) neural collaborative-filtering rating model.

    Each user id and item id is looked up in its own embedding table. The two
    latent vectors are concatenated, fed through one ReLU hidden layer followed
    by dropout, and reduced to a single ReLU (non-negative) output.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        latent_dim: length of each user / item embedding vector.
        hidden_dim: number of units in the hidden Dense layer.
        do: dropout rate applied to the hidden layer's output.

    Returns:
        A keras Model taking ``[user_input, item_input]`` and producing one
        predicted rating per row. The model summary is printed as a side effect.
    """
    # Input variables: one integer id per sample for each side.
    user_input = Input(shape=(1,), dtype='int32', name = 'user_input')
    item_input = Input(shape=(1,), dtype='int32', name = 'item_input')

    MF_Embedding_User = Embedding(input_dim = num_users, output_dim = latent_dim, name = 'user_embedding', input_length=1)
    MF_Embedding_Item = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embedding', input_length=1)

    # Crucial to flatten an embedding vector! (None, 1, latent_dim) -> (None, latent_dim)
    user_latent = Flatten()(MF_Embedding_User(user_input))
    item_latent = Flatten()(MF_Embedding_Item(item_input))

    # Concatenate (not multiply) the user and item embeddings.
    conc = Concatenate()([user_latent, item_latent])
    # NOTE(review): the original also built Dropout(0.5)(conc) but never connected
    # it to the network, so no dropout was ever applied to the embeddings. The dead
    # layer is removed here; the trained architecture is unchanged.
    hid1 = Dense(hidden_dim, activation='relu')(conc)
    drop2 = Dropout(do)(hid1)
    prediction = Dense(1, activation='relu', kernel_initializer='lecun_uniform', name = 'prediction')(drop2)

    # Keras 2 keyword names; `input=` / `output=` are the deprecated Keras 1 spelling.
    model = Model(inputs=[user_input, item_input], outputs=prediction)
    print("ncf model")
    model.summary()

    return model

In [0]:

# Hyper-parameters of the most recently built model. They are read at module
# level when the ModelCheckpoint file name is assembled, so set_ncf_model must
# update these module-level names rather than shadow them.
K_LATENT=None
hidden_dim=None
do=None

def set_ncf_model(parameter_hidden_dim, parameter_loss,parameter_optimizer,):
  """Build and compile an NCF model, recording its hyper-parameters globally.

  The embedding tables are sized from the module-level ``max_userid`` and
  ``max_movieid``. The latent size is fixed at 20 and the dropout rate at 0.5.

  Args:
    parameter_hidden_dim: number of units in the hidden Dense layer.
    parameter_loss: loss passed to ``compile``.
    parameter_optimizer: optimizer passed to ``compile``.

  Returns:
    The compiled model, tracking 'mae' as a metric.
  """
  # Without this declaration the assignments below create function locals and
  # the module-level values stay None, so every checkpoint path ends in
  # '_None_None_None.h5' and successive runs overwrite each other.
  global K_LATENT, hidden_dim, do
  K_LATENT = 20
  hidden_dim = parameter_hidden_dim
  do = 0.5
  NCF_model = get_ncf_model(max_userid,max_movieid,K_LATENT,hidden_dim,do)
  NCF_model.compile(loss=parameter_loss,optimizer=parameter_optimizer,metrics=['mae'])
  return NCF_model

In [24]:

# Main configuration: 20 hidden units, MSE loss, Adamax optimizer.
NCF_model=set_ncf_model(20,'mse','Adamax')
# Training inputs (zero-based user / movie ids) and targets (ratings) as arrays.
Users = ratings['user_emb_id'].values
Movies = ratings['movie_emb_id'].values
Ratings = ratings['rating'].values

ncf model
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 20)        18860       user_input[0][0]                 
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 20)        33640       item_input[0][0]                 
__________________________________________________________________________________________________




In [27]:
# Stop when 'val_loss' has not improved for 10 epochs, and checkpoint only the
# best model. The checkpoint file name is assembled from the module-level
# do / K_LATENT / hidden_dim values.
callbacks_ncf = [EarlyStopping('val_loss', patience=10),
             ModelCheckpoint(MODEL_WEIGHTS_FILE_CORE+'_ncf_'+str(do)+'_'+str(K_LATENT)+'_'+str(hidden_dim)+'.h5', save_best_only=True)]
# `epochs` is the Keras 2 argument name; the Keras 1 spelling `nb_epoch` is
# deprecated and only produced a warning. The last 10% of the data is used for validation.
history_history_ncf = NCF_model.fit([Users, Movies], Ratings, epochs=60, validation_split=.1, verbose=1, callbacks=callbacks_ncf, batch_size = 32)

Train on 72000 samples, validate on 8000 samples
Epoch 1/60
 3168/72000 [>.............................] - ETA: 3s - loss: 0.8492 - mean_absolute_error: 0.7294

  This is separate from the ipykernel package so we can avoid doing imports until


Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60


In [38]:

# Held-out inputs and targets as arrays.
test_Users = test_ratings['user_emb_id'].values
test_Movies = test_ratings['movie_emb_id'].values
test_Ratings = test_ratings['rating'].values

# One predicted rating per test row.
predict = NCF_model.predict([test_Users, test_Movies])

# Side-by-side table: prediction, true rating, absolute error.
df_predict = pd.DataFrame(predict, columns=['prediction'])
df_predict['ratings'] = test_Ratings
df_predict['diff'] = (df_predict['ratings'] - df_predict['prediction']).abs()

print(df_predict)

       prediction  ratings      diff
0        3.779286        5  1.220714
1        4.035730        3  1.035730
2        4.533484        5  0.466516
3        4.060013        5  0.939987
4        3.229557        3  0.229557
5        3.900399        4  0.099601
6        4.291782        4  0.291782
7        3.475376        3  0.475376
8        3.112710        2  1.112710
9        3.757715        3  0.757715
10       3.478665        4  0.521335
11       2.020246        2  0.020246
12       3.155388        4  0.844612
13       3.393221        5  1.606779
14       3.926667        4  0.073333
15       3.265915        3  0.265915
16       3.581975        4  0.418025
17       3.019182        3  0.019182
18       3.248034        3  0.248034
19       4.297423        4  0.297423
20       4.239299        5  0.760701
21       4.008194        4  0.008194
22       3.137779        3  0.137779
23       4.654284        5  0.345716
24       3.836649        4  0.163351
25       2.838459        3  0.161541
2

according to the first 

In [40]:
# Test MAE of the model trained in the earlier cells; df_predict still holds
# that model's predictions at this point.
print('main method - '+str(sum(df_predict['diff'])/len(df_predict['diff'])))

# Variants to compare against the main run (20 hidden units, 'mse', 'Adamax').
# Each entry: (hidden layer size, loss, optimizer, label printed before the test MAE).
ncf_variants = [
    (16, 'mse', 'Adamax', 'second method hidden layer size=16 - '),
    (20, 'mse', 'sgd', 'third method optimizer=SGD - '),
    (20, 'mape', 'Adamax', 'fourth method loss function=mean absolute precentage error - '),
]

# Train and evaluate every variant with the same procedure instead of three
# copy-pasted stanzas. The module-level names (NCF_model, callbacks_ncf,
# history_history_ncf, predict, df_predict) end up holding the last variant's
# values, exactly as before.
for variant_hidden_dim, variant_loss, variant_optimizer, variant_label in ncf_variants:
    NCF_model = set_ncf_model(variant_hidden_dim, variant_loss, variant_optimizer)
    callbacks_ncf = [EarlyStopping('val_loss', patience=10),
                     ModelCheckpoint(MODEL_WEIGHTS_FILE_CORE+'_ncf_'+str(do)+'_'+str(K_LATENT)+'_'+str(hidden_dim)+'.h5', save_best_only=True)]
    # `epochs` replaces the deprecated Keras 1 keyword `nb_epoch`.
    history_history_ncf = NCF_model.fit([Users, Movies], Ratings, epochs=60, validation_split=.1, verbose=0, callbacks=callbacks_ncf, batch_size = 32)
    predict = NCF_model.predict([test_Users, test_Movies])
    df_predict = pd.DataFrame(data=predict, columns=['prediction'])
    df_predict['ratings'] = test_Ratings
    df_predict['diff'] = abs(df_predict['ratings']-df_predict['prediction'])
    # Mean absolute error on the test split.
    print(variant_label+str(sum(df_predict['diff'])/len(df_predict['diff'])))

main method - 0.823009756035
ncf model
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 20)        18860       user_input[0][0]                 
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 20)        33640       item_input[0][0]                 
______________________________________________________________________

  """


second method hidden layer size=16 - 0.746050732756
ncf model
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 20)        18860       user_input[0][0]                 
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 20)        33640       item_input[0][0]                 
_______________________________________________

  


third method optimizer=SGD - 0.74714355818
ncf model
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 20)        18860       user_input[0][0]                 
__________________________________________________________________________________________________
item_embedding (Embedding)      (None, 1, 20)        33640       item_input[0][0]                 
________________________________________________________



fourth method loss function=mean absolute precentage error - 0.808057893771


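The first method showed the best results; the other methods use the same parameters except for the optimizer, the loss function and the hidden dimension. (Note: in the run recorded above, the test MAE values were 0.823 for the first method, 0.746 for the second, 0.747 for the third and 0.808 for the fourth, where lower is better, so this conclusion should be re-checked against those numbers.)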
From our research on the web, we concluded that the Adamax optimizer is considered to be the best optimizer for this specific problem. In addition, the MSE loss function gives the closest approximation to the real error value. As for the number of nodes in the hidden layer, there isn't a good explanation for this specific number, but through trial and error we deduced that 20 nodes give a decent result.
