In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the ratings table from the local dataset folder and preview the first rows.
rating_df = pd.read_csv(filepath_or_buffer='the-movies-dataset/ratings_small.csv')
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Order the ratings chronologically (oldest first) so that the train/validation
# split further down is a split in time.
rating_df = rating_df.sort_values(by='timestamp', ascending=True)
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
52635,383,21,3.0,789652009
52641,383,47,5.0,789652009
52684,383,1079,3.0,789652009
56907,409,21,5.0,828212412
56909,409,25,4.0,828212412


In [6]:
# Re-index the raw user / movie ids as contiguous integers
# (0..N-1 for the N users, 0..M-1 for the M movies) so they can index embeddings.
from sklearn.preprocessing import LabelEncoder
user_encoder, movie_encoder = LabelEncoder(), LabelEncoder()

user_ids = user_encoder.fit_transform(rating_df['userId'])
movie_ids = movie_encoder.fit_transform(rating_df['movieId'])

In [7]:
# train / val split: rating_df was sorted by timestamp above, so the first 80% of
# the rows (the oldest ratings) are used for training and the rest for validation.
num_train = int(0.8 * len(user_ids))
train_user_ids, val_user_ids = np.split(user_ids, [num_train])
train_movie_ids, val_movie_ids = np.split(movie_ids, [num_train])
train_ratings, val_ratings = np.split(rating_df['rating'].values, [num_train])

In [8]:
# Sizes of the embedding tables: encoded ids start at 0, so the number of
# distinct ids is the largest id plus one.
num_users = np.max(user_ids) + 1
num_movies = np.max(movie_ids) + 1

In [9]:
# normalize ratings by dividing by 5 (assumes ratings are on a 0-5 scale - TODO confirm)
# New arrays are built from rating_df instead of dividing in place: the slices of
# rating_df.rating.values can share memory with the frame, so an in-place `/=`
# would rescale rating_df itself, divide a second time whenever this cell is
# re-run, and raise when pandas returns a read-only array (Copy-on-Write).
train_ratings = rating_df.rating.values[:num_train] / 5
val_ratings = rating_df.rating.values[num_train:] / 5

In [10]:
# credit to https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k):
    '''
    Discounted cumulative gain of the first k relevance scores.
    args:
        r: np.array, relevance scores listed in ranked order
        k: int, cut-off; only r[:k] contributes to the score

    returns:
        dcg: float, sum over kept positions i (0-based) of r[i] / log2(i + 2)

    '''
    top_k = r[:k]
    # Position discounts log2(2), log2(3), ...: the first entry is divided by 1.
    discounts = np.log2(np.arange(2, len(top_k) + 2))
    return np.sum(top_k / discounts)


In [11]:
def ndcg_at_k(r, k, method=0):
    '''
    Compute NDCG: the DCG of r divided by the DCG of r sorted in descending order.
    args:
        r: np.array, relevance scores listed in ranked order
        k: int, number of entries to be considered
        method: unused; kept so existing callers that pass it keep working

    returns:
        ndcg: float, computed ndcg; 0.0 when the ideal DCG is zero
              (e.g. r is empty or all of its scores are zero)

    '''
    # DCG of the best possible ordering of the same scores.
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    # Guard the division: without it an all-zero / empty r yields NaN and a
    # RuntimeWarning, and that NaN would poison any average taken over users.
    if not dcg_max:
        return 0.

    return dcg_at_k(r, k) / dcg_max

In [12]:
# compute average ndcg for all users
def evaluate_prediction(predictions, k=30):
    '''
    Return the NDCG@k averaged over every user that appears in val_user_ids.
    args:
        predictions: np.array, predicted scores; indexed with the same boolean
            masks as val_ratings, so it must line up row-for-row with
            val_user_ids / val_ratings
        k: int, NDCG cut-off (defaults to 30, the value that used to be hard-coded)
    returns:
        ndcg: float, mean of the per-user NDCG@k values
    '''
    ndcgs = []
    for target_user in np.unique(val_user_ids):
        # Rows of the validation split that belong to this user (computed once).
        user_mask = val_user_ids == target_user

        # Order the user's true ratings by descending predicted score.
        ranking = np.argsort(-predictions[user_mask])
        ranked_ratings = val_ratings[user_mask][ranking]

        ndcgs.append(ndcg_at_k(ranked_ratings, k=k))
    return np.mean(ndcgs)


In [13]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Activation, Add, Dropout, Multiply
from keras.optimizers import Adam
def get_mf_model():
    '''
    Build and compile the matrix-factorisation style model.

    Each id is looked up in a 50-dimensional embedding; the user and item
    vectors are multiplied element-wise and a single sigmoid unit turns the
    product into the predicted (normalized) rating.

    returns:
        model: compiled keras Model taking [user ids, item ids] as inputs,
               trained with MSE loss and Adam (lr=0.0005)
    '''
    # user branch: id -> embedding -> flat vector
    user_inp = Input((1,))
    user_embedding = Embedding(input_dim=num_users, output_dim=50)(user_inp)
    user_vec = Flatten()(user_embedding)

    # item branch: same structure, sized by the number of movies
    item_inp = Input((1,))
    item_embedding = Embedding(input_dim=num_movies, output_dim=50)(item_inp)
    item_vec = Flatten()(item_embedding)

    # element-wise product of the two latent vectors
    interaction = Multiply()([user_vec, item_vec])

    rating_out = Dense(1, activation='sigmoid')(interaction)

    mf_model = Model(inputs=[user_inp, item_inp], outputs=rating_out)
    mf_model.compile(loss='mse', optimizer=Adam(lr=0.0005))
    return mf_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [14]:
# Build the matrix-factorisation model and print its layer / parameter summary.
# NOTE: the name `model` is rebound to other architectures in later cells.
model = get_mf_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 50)        33550       input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 50)        453300      input_2[0][0]                    
__________________________________________________________________________________________________
flatten_1 

In [15]:
from keras.callbacks import EarlyStopping
# Early stopping: end training once val_loss has gone 1 epoch without improving.
callbacks = [EarlyStopping(monitor='val_loss', patience=1)]

# Train for at most 50 epochs; early stopping may end the run sooner.
model.fit(
    x=[train_user_ids, train_movie_ids],
    y=train_ratings,
    validation_data=([val_user_ids, val_movie_ids], val_ratings),
    batch_size=128,
    epochs=50,
    callbacks=callbacks,
)

Train on 80003 samples, validate on 20001 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


<keras.callbacks.History at 0x127d74ba8>

In [16]:
# Score every validation (user, movie) pair, then report the mean per-user NDCG.
predictions = model.predict(x=[val_user_ids, val_movie_ids])
# The model emits one column per row; pass it on as a 1-D vector.
evaluate_prediction(predictions[:, 0])

0.8348814003225187

In [17]:
def get_mlp_model():
    '''
    Build and compile the MLP model.

    User and item ids are embedded into 64 dimensions each, the two vectors are
    concatenated and passed through Dense(128) and Dense(64) ReLU layers, each
    followed by 20% dropout, before a single sigmoid output unit.

    returns:
        model: compiled keras Model taking [user ids, item ids] as inputs,
               trained with MSE loss and the 'adam' optimizer
    '''
    # user branch: id -> embedding -> flat vector
    user_inp = Input((1,))
    user_embedding = Embedding(input_dim=num_users, output_dim=64)(user_inp)
    user_vec = Flatten()(user_embedding)

    # item branch
    item_inp = Input((1,))
    item_embedding = Embedding(input_dim=num_movies, output_dim=64)(item_inp)
    item_vec = Flatten()(item_embedding)

    # fully connected stack on top of the concatenated embeddings
    features = concatenate([user_vec, item_vec])
    for units in (128, 64):
        features = Dense(units, activation='relu')(features)
        features = Dropout(0.2)(features)
    rating_out = Dense(1, activation='sigmoid')(features)

    mlp_model = Model(inputs=[user_inp, item_inp], outputs=rating_out)
    mlp_model.compile(loss='mse', optimizer='adam')
    return mlp_model
# Rebind `model` to a freshly built MLP model and print its layer summary.
model = get_mlp_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 64)        42944       input_3[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 1, 64)        580224      input_4[0][0]                    
__________________________________________________________________________________________________
flatten_3 

In [18]:
# Train for at most 50 epochs; the `callbacks` list defined in an earlier cell
# (early stopping) may end the run sooner.
model.fit(
    x=[train_user_ids, train_movie_ids],
    y=train_ratings,
    validation_data=([val_user_ids, val_movie_ids], val_ratings),
    batch_size=128,
    epochs=50,
    callbacks=callbacks,
)

Train on 80003 samples, validate on 20001 samples
Epoch 1/50
Epoch 2/50


<keras.callbacks.History at 0x12805ca58>

In [19]:
# Score every validation (user, movie) pair, then report the mean per-user NDCG.
predictions = model.predict(x=[val_user_ids, val_movie_ids])
# The model emits one column per row; pass it on as a 1-D vector.
evaluate_prediction(predictions[:, 0])

0.8752340572923161

In [20]:
def get_ncf_model():
    '''
    Build and compile the combined (NCF-style) model.

    The same 64-dimensional user / item embeddings feed two branches:
      * an element-wise product of the two vectors, and
      * an MLP over their concatenation (Dense(128) ReLU, 20% dropout,
        Dense(64) ReLU).
    Both branch outputs are concatenated and mapped to one sigmoid unit.

    returns:
        model: compiled keras Model taking [user ids, item ids] as inputs,
               trained with MSE loss and the 'adam' optimizer
    '''
    # user branch: id -> embedding -> flat vector
    user_inp = Input((1,))
    user_embedding = Embedding(input_dim=num_users, output_dim=64)(user_inp)
    user_vec = Flatten()(user_embedding)

    # item branch
    item_inp = Input((1,))
    item_embedding = Embedding(input_dim=num_movies, output_dim=64)(item_inp)
    item_vec = Flatten()(item_embedding)

    # branch 1: element-wise multiplication of the embeddings
    mf_branch = Multiply()([user_vec, item_vec])

    # branch 2: MLP over the concatenated embeddings
    mlp_branch = concatenate([user_vec, item_vec])
    mlp_branch = Dense(128, activation='relu')(mlp_branch)
    mlp_branch = Dropout(0.2)(mlp_branch)
    mlp_branch = Dense(64, activation='relu')(mlp_branch)

    # fuse both branches into the final prediction
    fused = concatenate([mf_branch, mlp_branch])
    rating_out = Dense(1, activation='sigmoid')(fused)

    ncf_model = Model(inputs=[user_inp, item_inp], outputs=rating_out)
    ncf_model.compile(loss='mse', optimizer='adam')
    return ncf_model
# Rebind `model` to a freshly built NCF model and print its layer summary.
model = get_ncf_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 64)        42944       input_5[0][0]                    
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1, 64)        580224      input_6[0][0]                    
__________________________________________________________________________________________________
flatten_5 

In [21]:
# Train for at most 50 epochs; the `callbacks` list defined in an earlier cell
# (early stopping) may end the run sooner.
model.fit(
    x=[train_user_ids, train_movie_ids],
    y=train_ratings,
    validation_data=([val_user_ids, val_movie_ids], val_ratings),
    batch_size=128,
    epochs=50,
    callbacks=callbacks,
)

Train on 80003 samples, validate on 20001 samples
Epoch 1/50
Epoch 2/50


<keras.callbacks.History at 0x12935a978>

In [22]:
# Score every validation (user, movie) pair, then report the mean per-user NDCG.
predictions = model.predict(x=[val_user_ids, val_movie_ids])
# The model emits one column per row; pass it on as a 1-D vector.
evaluate_prediction(predictions[:, 0])

0.875748165354793