In [1]:
import keras
import os, sys
import numpy as np
import pandas as pd
import tensorflow as tf
import keras.backend as K
from google.colab import drive
from sklearn.metrics import mean_squared_error
from keras.models import Model
from tensorflow.keras import regularizers
from keras.layers import Flatten, Dense, Input, Embedding, Dot
from keras.layers import GlobalAveragePooling1D, MaxPooling1D, Dropout, BatchNormalization
from keras.optimizers import Adam

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# load data
movie = [i.strip().split("::") for i in open("/content/drive/My Drive/MovieLens10M/movies.dat").readlines()]
movie = pd.DataFrame(movie)
movie = movie.rename(columns={0: "movieId", 1: "title"})
rating = [i.strip().split("::") for i in open("/content/drive/My Drive/MovieLens10M/ratings.dat").readlines()]
rating = pd.DataFrame(rating)
rating = rating.rename(columns={0: "userId", 1: "movieId", 2: "rating", 3: "timestamp"})
movie = movie.loc[:,["movieId"]]
rating = rating.loc[:,["userId","movieId","rating","timestamp"]]
data = pd.merge(rating, movie)
data['rating'] = pd.to_numeric(data['rating'])
data['timestamp'] = pd.to_numeric(data['timestamp'])

In [3]:
# preprocessing
n_movies = len(data['movieId'].unique())
n_users = len(data['userId'].unique())
users = data.userId.unique()
movies = data.movieId.unique()

userid2idx = {o:i for i,o in enumerate(users)}
movieid2idx = {o:i for i,o in enumerate(movies)}

data['userId'] = data['userId'].apply(lambda x: userid2idx[x])
data['movieId'] = data['movieId'].apply(lambda x: movieid2idx[x])

min_timestamp = pd.DataFrame(data.loc[:,["movieId","timestamp"]].groupby(['movieId'], as_index=False).min())
min_timestamp = min_timestamp.sort_values(by=["movieId"], axis=0)
min_timestamp = min_timestamp.rename(columns={"timestamp": "min_timestamp"})
data = pd.merge(data, min_timestamp)
data['timestamp'] = data['timestamp'] - data['min_timestamp']
data['timestamp'] = data['timestamp'] / (60 * 60 * 24 * 365)
data = data.drop(columns=['min_timestamp'])
data['day'] = data['timestamp'] * 365 / (7 * 2)
data['day'] = data['day'].apply(lambda x: int(x))
days = data.day.unique()
n_days = len(days)
day2idx = {o:i for i,o in enumerate(days)}
data['day'] = data['day'].apply(lambda x: day2idx[x])
print(n_movies, n_users, len(rating), n_days)

10677 69878 10000054 365


In [4]:
# split train and test data
data = data.sample(frac=1)
split = np.random.rand(len(data)) < 0.9
train = data[split]
valid = data[~split]
data.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,day
5444806,50493,1097,4.0,4.527218,310
3261983,30692,415,5.0,6.362466,90
1830069,11143,175,3.0,0.691981,24
5540956,48673,1134,5.0,2.341491,246
6928167,45199,1614,4.0,3.03572,315
8164102,3019,2502,3.0,2.582119,289
5763711,50354,1248,3.0,3.140409,316
8409532,17663,2723,5.0,8.309191,52
8844837,567,3191,4.0,6.961249,108
632093,7379,40,3.5,11.440678,23


In [8]:
# generate model
embedding_l2 = 1e-6 * 1                                  # regularization
dense_l2 = 1e-3 * 1                                      # regularization
keras.backend.clear_session()

embedding_l2 /= 2
dense_l2 /= 2
user_input = Input(shape=(1,), name='user_input', dtype='int32')
movie_input = Input(shape=(1,), name='movie_input', dtype='int32')
day_input = Input(shape=(1,), name='day_input', dtype='int32')
time_input = Input(shape=(1,), name='time_input', dtype='float32')
time_root = tf.math.sqrt(time_input + 1.0)
time_square = tf.math.square(time_input)
time_vector = tf.concat([time_input, time_root, time_square], 1)

def create_model(n_user_embedding):
    n_movie_embedding = n_user_embedding

    user_embedding = Embedding(n_users, n_user_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(user_input)
    user_vector = Flatten()(user_embedding)
    movie_embedding = Embedding(n_movies, n_movie_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(movie_input)
    movie_vector = Flatten()(movie_embedding)
    day_embedding = Embedding(n_days, int(n_user_embedding/8), embeddings_regularizer=regularizers.l2(embedding_l2))(day_input)
    day_vector = Flatten()(day_embedding)

    concat_layer = tf.concat([user_vector, movie_vector, time_vector, day_vector], 1)
    mlp_output = Dense(int(n_user_embedding/2), activation='relu', kernel_regularizer=regularizers.l2(dense_l2))(concat_layer)

    user_embedding2 = Embedding(n_users, n_user_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(user_input)
    user_vector2 = Flatten()(user_embedding2)
    movie_embedding2 = Embedding(n_movies, n_movie_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(movie_input)
    movie_vector2 = Flatten()(movie_embedding2)
    MF = Dot(axes=1)([user_vector2, movie_vector2])

    user_embedding3 = Embedding(n_users, n_user_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(user_input)
    user_vector3 = Flatten()(user_embedding3)
    movie_embedding3 = Embedding(n_movies, n_movie_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(movie_input)
    movie_vector3 = Flatten()(movie_embedding3)
    diff = tf.math.subtract(user_vector3, movie_vector3)
    W = tf.Variable(tf.random.normal(shape=[n_user_embedding, 1], stddev=0.1))
    MF2 = tf.linalg.matmul(diff, W)

    output_layer = tf.concat([mlp_output, MF, MF2], 1)
    return output_layer

output_layer = tf.concat([create_model(16*x) for x in range(1,9)], 1)
output = Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(dense_l2))(output_layer)
output = output*5.5

model = Model([user_input, movie_input, time_input, day_input], output)
model.summary()

The following Variables were used a Lambda layer's call (tf.linalg.matmul), but
are not present in its tracked objects:
  <tf.Variable 'Variable:0' shape=(16, 1) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
The following Variables were used a Lambda layer's call (tf.linalg.matmul_1), but
are not present in its tracked objects:
  <tf.Variable 'Variable:0' shape=(32, 1) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
The following Variables were used a Lambda layer's call (tf.linalg.matmul_2), but
are not present in its tracked objects:
  <tf.Variable 'Variable:0' shape=(48, 1) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an

In [9]:
# training
def rmse (y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

model.compile(optimizer=Adam(), loss='mse', metrics=[rmse])
batch_size = 4096*8
epochs = 128

callback = tf.keras.callbacks.EarlyStopping(monitor='val_rmse', patience=1, restore_best_weights=True)
history = model.fit([train.userId, train.movieId, train.timestamp, train.day], train.rating,
                batch_size=batch_size, epochs=epochs, callbacks=[callback],
                validation_data = ([valid.userId, valid.movieId, valid.timestamp, valid.day], valid.rating), verbose = 1)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128


In [10]:
# check final RMSE
valid_pred = model.predict([valid.userId, valid.movieId, valid.timestamp, valid.day], batch_size = batch_size)
valid_pred = [max(min(x, 5), 0.5) for x in valid_pred]
test_rmse = mean_squared_error(valid.rating, valid_pred, squared=False) # squared=False -> RMSE
print(test_rmse)

0.7771434789665826


  return array(a, dtype, copy=False, order=order)
