In [1]:
from math import sqrt
from keras.layers import Concatenate, Dense, Dot, Dropout, Embedding, Input, Reshape
from keras.models import Model
from keras.callbacks import Callback, ModelCheckpoint
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error


In [2]:
# Function to calculate RMSE
def rmse(pred, actual):
    """Return the root-mean-squared error over entries where ``actual`` is nonzero.

    Positions where ``actual`` equals zero are dropped from both arrays
    before the error is computed (presumably zero marks a missing rating --
    confirm against the data).

    Parameters
    ----------
    pred : numpy.ndarray
        Predicted values; must be indexable with ``actual.nonzero()``.
    actual : numpy.ndarray
        Ground-truth values; zero entries are skipped.

    Returns
    -------
    float
        Square root of the mean squared error over the kept entries.
    """
    # Ignore zero terms: keep only positions whose ground truth is nonzero.
    # The index is computed once so both arrays are filtered identically.
    observed = actual.nonzero()
    pred = pred[observed].flatten()
    actual = actual[observed].flatten()
    # scikit-learn's signature is (y_true, y_pred); MSE is symmetric, so the
    # value is unchanged, but the call now reads the way the API documents it.
    return sqrt(mean_squared_error(actual, pred))


def build_cfmodel(n_users, n_items, embed_size, output_layer='dot',
                  hidden_units=(64, 32), dropout_rate=0.15):
    """Build a two-tower embedding model that scores a (user, item) pair.

    Each input is a single integer index that is looked up in its own
    embedding table and reshaped to a flat vector of length ``embed_size``.
    The two vectors are then combined by the selected output head.

    Parameters
    ----------
    n_users : int
        Number of rows in the user embedding table.
    n_items : int
        Number of rows in the item embedding table.
    embed_size : int
        Dimension of every embedding vector.
    output_layer : {'dot', 'mlp'}
        ``'dot'`` takes the dot product of the two embeddings; ``'mlp'``
        concatenates them and feeds a stack of ReLU ``Dense`` layers, each
        followed by ``Dropout``, ending in a single linear unit.
    hidden_units : sequence of int, optional
        Width of each hidden ``Dense`` layer of the MLP head, in order.
        Defaults to ``(64, 32)``, the architecture that used to be
        hard-coded.  Ignored for ``'dot'``.
    dropout_rate : float, optional
        Dropout rate applied after every hidden layer.  Defaults to ``0.15``.
        Ignored for ``'dot'``.

    Returns
    -------
    keras.models.Model
        Uncompiled model taking ``[user_input, item_input]``.

    Raises
    ------
    NotImplementedError
        If ``output_layer`` is neither ``'dot'`` nor ``'mlp'``.
    """
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # The Embedding output has a length-1 sequence axis; Reshape drops it.
    user_emb = Embedding(output_dim=embed_size, input_dim=n_users, input_length=1)(user_input)
    user_emb = Reshape((embed_size,))(user_emb)
    item_emb = Embedding(output_dim=embed_size, input_dim=n_items, input_length=1)(item_input)
    item_emb = Reshape((embed_size,))(item_emb)

    if output_layer == 'dot':
        model_output = Dot(axes=1)([user_emb, item_emb])
    elif output_layer == 'mlp':
        hidden = Concatenate()([user_emb, item_emb])
        # One Dense+Dropout pair per requested width, created in the same
        # order as the former hand-written dense_1/dense_2 layers.
        for units in hidden_units:
            hidden = Dense(units, activation='relu')(hidden)
            hidden = Dropout(dropout_rate)(hidden)
        model_output = Dense(1)(hidden)
    else:
        raise NotImplementedError(
            "output_layer must be 'dot' or 'mlp', got {!r}".format(output_layer))

    model = Model(inputs=[user_input, item_input],
                  outputs=model_output)
    return model


In [3]:
if __name__ == "__main__":
    # Load the three splits; only train/valid have their ratings read below.
    tr_df = pd.read_csv("data/train.csv")
    val_df = pd.read_csv("data/valid.csv")
    te_df = pd.read_csv("data/test.csv")

    # Build User/Item vocabulary from the training split only.
    # Indices are assigned in order of first appearance (Series.unique keeps
    # that order) instead of by iterating a set: set order over string ids
    # can change between interpreter runs because of hash randomization,
    # which would silently change which embedding row an id maps to.
    user_ids = tr_df.user_id.unique()
    business_ids = tr_df.business_id.unique()
    user_set = set(user_ids)
    business_set = set(business_ids)
    user_vocab = dict(zip(user_ids, range(1, len(user_ids) + 1)))
    user_vocab['unk'] = 0  # index 0 is reserved for ids unseen in training
    n_users = len(user_vocab)
    business_vocab = dict(zip(business_ids, range(1, len(business_ids) + 1)))
    business_vocab['unk'] = 0
    n_items = len(business_vocab)

    def _encode(ids, vocab):
        """Map a Series of raw ids to vocabulary indices (unseen ids -> 0)."""
        return ids.apply(lambda raw_id: vocab.get(raw_id, 0)).values

    tr_users = _encode(tr_df.user_id, user_vocab)
    tr_items = _encode(tr_df.business_id, business_vocab)
    tr_ratings = tr_df.stars.values
    val_users = _encode(val_df.user_id, user_vocab)
    val_items = _encode(val_df.business_id, business_vocab)
    val_ratings = val_df.stars.values
    te_users = _encode(te_df.user_id, user_vocab)
    te_items = _encode(te_df.business_id, business_vocab)

    # Baseline run: MLP head, 50-dim embeddings, a single epoch.
    model = build_cfmodel(
        n_users, n_items,
        embed_size=50,
        output_layer='mlp')

    model.compile(optimizer='adam', loss='mse')
    history = model.fit(
        [tr_users, tr_items],
        tr_ratings,
        epochs=1,
        verbose=1,
        callbacks=[ModelCheckpoint('model.h5')])
    y_pred = model.predict([tr_users, tr_items])
    print("TRAIN RMSE: ", rmse(y_pred, tr_ratings))
    y_pred = model.predict([val_users, val_items])
    print("VALID RMSE: ", rmse(y_pred, val_ratings))

TRAIN RMSE:  0.9897315237507579
VALID RMSE:  1.042941577297077


In [4]:
from scipy.sparse import coo_matrix

In [5]:
# Dense rating matrix indexed by (user index, item index).
# The shape is pinned to the vocabulary sizes rather than inferred from the
# largest index present, so it always lines up with n_users / n_items, which
# the averages below assume.
# NOTE(review): coo_matrix sums duplicate (user, item) entries on conversion;
# confirm the training data has at most one rating per pair.
train_matrix_sparse = coo_matrix((tr_ratings, (tr_users, tr_items)),
                                 shape=(n_users, n_items))

train_matrix = train_matrix_sparse.toarray()
print(train_matrix.shape)
# Sum the full matrix once and reuse it for both averages.
rating_total = train_matrix.sum()
print(rating_total/n_users)
print(rating_total/n_items)

(2970, 5938)
78.12929292929293
39.07780397440216


In [5]:
def build_ncf_model(n_users, n_items, embed_size, output_layer='dot',
                    hidden_units=(512, 512), dropout_rate=0.15):
    '''
    Build an NCF-style model that scores a (user, item) index pair.

    params:
        -n_users: number of user embedding vectors
        -n_items: number of item embedding vectors
        -embed_size: dimension of each embedding vector
        -output_layer: which instantiation of NCF to use ('dot' or 'mlp')
        -hidden_units: widths of the hidden Dense layers of the 'mlp' head,
                       in order; defaults to the previously hard-coded
                       (512, 512). Ignored for 'dot'.
        -dropout_rate: dropout rate applied after every hidden layer of the
                       'mlp' head; defaults to 0.15. Ignored for 'dot'.

    return:
        a keras Model object (uncompiled) for the constructed ncf model

    raises:
        NotImplementedError if output_layer is neither 'dot' nor 'mlp'
    '''

    # Get the users and items input (one integer index each)
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Get the embeddings of users and items; Reshape drops the length-1
    # sequence axis so each embedding is a flat vector of size embed_size
    user_emb = Embedding(output_dim=embed_size, input_dim=n_users, input_length=1)(user_input)
    user_emb = Reshape((embed_size,))(user_emb)

    item_emb = Embedding(output_dim=embed_size, input_dim=n_items, input_length=1)(item_input)
    item_emb = Reshape((embed_size,))(item_emb)

    if output_layer == 'dot':
        # Compute the dot product of users' and items' embeddings as the model output
        model_output = Dot(axes=1)([user_emb, item_emb])

    elif output_layer == 'mlp':
        # Concatenate the users' and items' embeddings as the input of MLP
        hidden = Concatenate()([user_emb, item_emb])

        # Fully-connected ReLU layers, each followed by dropout
        for units in hidden_units:
            hidden = Dense(units, activation='relu')(hidden)
            hidden = Dropout(dropout_rate)(hidden)

        # Final fully-connected layer to compute model output
        model_output = Dense(1)(hidden)
    else:
        raise NotImplementedError(
            "output_layer must be 'dot' or 'mlp', got {!r}".format(output_layer))

    model = Model(inputs=[user_input, item_input],
                  outputs=model_output)
    return model


In [6]:
def case(embed_size=10, output_layer='dot', epochs=1):
    """Train one ``build_cfmodel`` configuration and return its validation RMSE.

    Reads the notebook globals ``n_users``, ``n_items``, ``tr_users``,
    ``tr_items``, ``tr_ratings``, ``val_users``, ``val_items`` and
    ``val_ratings``.  A ``ModelCheckpoint('model.h5')`` callback is attached
    to every fit, so successive calls write to the same file.

    Parameters
    ----------
    embed_size : int
        Embedding dimension passed to ``build_cfmodel``.
    output_layer : str
        Output head passed to ``build_cfmodel`` (``'dot'`` or ``'mlp'``).
    epochs : int
        Number of training epochs.

    Returns
    -------
    float
        RMSE of the trained model on the validation split.
    """
    cf_model = build_cfmodel(
        n_users,
        n_items,
        embed_size=embed_size,
        output_layer=output_layer,
    )
    cf_model.compile(optimizer='adam', loss='mse')

    checkpoint = ModelCheckpoint('model.h5')
    cf_model.fit(
        x=[tr_users, tr_items],
        y=tr_ratings,
        epochs=epochs,
        verbose=1,
        callbacks=[checkpoint],
    )

    val_pred = cf_model.predict([val_users, val_items])
    return rmse(val_pred, val_ratings)
    

In [11]:
# Exhaustive grid search over embedding size, output head and epoch count
# for the CF model; every combination is trained from scratch by case().
size=[30,40,50,60,70,80,90,100]
layer=['dot','mlp']
epoch=[3,4,5,6,7]
# Start from +inf (not a guessed upper bound) so the first finite score always
# becomes the incumbent, and pre-define the best_* names so a later cell
# cannot hit a NameError when no run improves on the initial value.
best_rmse=float('inf')
best_size=None
best_layer=None
best_epoch=None

rmse_score=[]
sizes=[]
layers=[]
epoches=[]
for s in size:
    for l in layer:
        for e in epoch:
            score=case(embed_size=s, output_layer=l, epochs=e)
            rmse_score.append(score)
            sizes.append(s)
            layers.append(l)
            epoches.append(e)
            if score<best_rmse:
                best_rmse=score
                best_size=s
                best_layer=l
                best_epoch=e

# Build the result table in one constructor call (same columns, same order).
CF_Result=pd.DataFrame({
    "embed_size": sizes,
    "method": layers,
    "epoch": epoches,
    "score": rmse_score,
})
# NOTE(review): descending sort puts the worst (highest) RMSE first -- confirm
# this is the intended order for the saved CSV.
CF_Result=CF_Result.sort_values(by=["score"],ascending=False)
CF_Result.to_csv("CF_result.csv")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7


Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [13]:
# Report the winning CF configuration found by the grid search.
print(
    "Best Model: Best size=", best_size,
    "Best method=", best_layer,
    "Best epoch=", best_epoch,
    "The RMSE is ", best_rmse,
)

Best Model: Best size= 30 Best method= mlp Best epoch= 3 The RMSE is  1.0318438056131205


In [7]:
def case_ncf(embed_size=10, output_layer='dot', epochs=1):
    """Fit one ``build_ncf_model`` configuration; return its validation RMSE.

    Uses the notebook-level training arrays (``tr_users``, ``tr_items``,
    ``tr_ratings``), validation arrays (``val_users``, ``val_items``,
    ``val_ratings``) and vocabulary sizes (``n_users``, ``n_items``).  Each
    call attaches a ``ModelCheckpoint('model.h5')`` callback, so repeated
    calls target the same file.

    Parameters
    ----------
    embed_size : int
        Embedding dimension forwarded to ``build_ncf_model``.
    output_layer : str
        Output head forwarded to ``build_ncf_model`` (``'dot'`` or ``'mlp'``).
    epochs : int
        Number of epochs to train for.

    Returns
    -------
    float
        Validation RMSE of the fitted model.
    """
    train_inputs = [tr_users, tr_items]
    valid_inputs = [val_users, val_items]

    ncf_model = build_ncf_model(n_users, n_items,
                                embed_size=embed_size,
                                output_layer=output_layer)
    ncf_model.compile(optimizer='adam', loss='mse')
    ncf_model.fit(x=train_inputs,
                  y=tr_ratings,
                  epochs=epochs,
                  verbose=1,
                  callbacks=[ModelCheckpoint('model.h5')])

    return rmse(ncf_model.predict(valid_inputs), val_ratings)

In [11]:
# Exhaustive grid search over embedding size, output head and epoch count
# for the NCF model; every combination is trained from scratch by case_ncf().
ncf_size=[30,40,50,60,70,80,90,100]
ncf_layer=['dot','mlp']
ncf_epoch=[3,4,5,6,7]
# Start from +inf (not a guessed upper bound) so the first finite score always
# becomes the incumbent, and pre-define the best_ncf_* names so a later cell
# cannot hit a NameError when no run improves on the initial value.
best_ncf_rmse=float('inf')
best_ncf_size=None
best_ncf_layer=None
best_ncf_epoch=None

ncf_RMSE=[]
ncf_sizes=[]
ncf_layers=[]
ncf_epoches=[]
for s in ncf_size:
    for l in ncf_layer:
        for e in ncf_epoch:
            ncf_score=case_ncf(embed_size=s, output_layer=l, epochs=e)
            ncf_RMSE.append(ncf_score)
            ncf_sizes.append(s)
            ncf_layers.append(l)
            ncf_epoches.append(e)
            if ncf_score<best_ncf_rmse:
                best_ncf_rmse=ncf_score
                best_ncf_size=s
                best_ncf_layer=l
                best_ncf_epoch=e

# Build the result table in one constructor call (same columns, same order).
NCF_Result=pd.DataFrame({
    "embed_size": ncf_sizes,
    "method": ncf_layers,
    "epoch": ncf_epoches,
    "score": ncf_RMSE,
})
# NOTE(review): descending sort puts the worst (highest) RMSE first -- confirm
# this is the intended order for the saved CSV.
NCF_Result=NCF_Result.sort_values(by=["score"],ascending=False)
NCF_Result.to_csv("NCF_result.csv")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7


Epoch 5/6
Epoch 6/6
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [12]:
# Report the winning NCF configuration found by the grid search.
print(
    "Best NCF Model: Best size=", best_ncf_size,
    "Best method=", best_ncf_layer,
    "Best epoch=", best_ncf_epoch,
    "The RMSE is ", best_ncf_rmse,
)

Best NCF Model: Best size= 40 Best method= mlp Best epoch= 3 The RMSE is  1.047717113111594


In [10]:
# Retrain the NCF configuration selected by the grid search and report its
# RMSE on the training split.  The hyper-parameters are read from the
# best_ncf_* variables rather than copied by hand from the printed result, so
# this cell stays in sync when the search is re-run.
model = build_ncf_model(n_users, n_items, embed_size=best_ncf_size, output_layer=best_ncf_layer)
model.compile(optimizer='adam', loss='mse')
history = model.fit(x=[tr_users, tr_items], y=tr_ratings, epochs=best_ncf_epoch, verbose=1, callbacks=[ModelCheckpoint('model.h5')])
y_pred = model.predict([tr_users, tr_items])
tr_ncf_score=rmse(y_pred, tr_ratings)
print(tr_ncf_score)

Epoch 1/3
Epoch 2/3
Epoch 3/3
0.860275508060453


In [12]:
# Retrain the CF configuration selected by the grid search and report its
# RMSE on the training split.  The hyper-parameters are read from best_size /
# best_layer / best_epoch rather than copied by hand from the printed result,
# so this cell stays in sync when the search is re-run.
# Note: this rebinds `model` / `history` / `y_pred` and writes to the same
# 'model.h5' path as the NCF cell.
model = build_cfmodel(n_users, n_items, embed_size=best_size, output_layer=best_layer)
model.compile(optimizer='adam', loss='mse')
history = model.fit(x=[tr_users, tr_items], y=tr_ratings, epochs=best_epoch, verbose=1, callbacks=[ModelCheckpoint('model.h5')])
y_pred = model.predict([tr_users, tr_items])
tr_cf_score=rmse(y_pred, tr_ratings)
print(tr_cf_score)

Epoch 1/3
Epoch 2/3
Epoch 3/3
0.9182000099528242
