In [1]:
import keras
import os, sys
import numpy as np
import pandas as pd
import tensorflow as tf
import keras.backend as K
from google.colab import drive
from sklearn.metrics import mean_squared_error
from keras.models import Model
from tensorflow.keras import regularizers
from keras.layers import Flatten, Dense, Input, Embedding, Dot
from keras.layers import GlobalAveragePooling1D, MaxPooling1D, Dropout, BatchNormalization
from keras.optimizers import Adam

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# load data
movie = [i.strip().split("::") for i in open("/content/drive/My Drive/MovieLens10M/movies.dat").readlines()]
movie = pd.DataFrame(movie)
movie = movie.rename(columns={0: "movieId", 1: "title", 2: "genres"})
rating = [i.strip().split("::") for i in open("/content/drive/My Drive/MovieLens10M/ratings.dat").readlines()]
rating = pd.DataFrame(rating)
rating = rating.rename(columns={0: "userId", 1: "movieId", 2: "rating", 3: "timestamp"})
movie = movie.loc[:,["movieId","genres"]]
rating = rating.loc[:,["userId","movieId","rating","timestamp"]]
data = pd.merge(rating, movie)
data['rating'] = pd.to_numeric(data['rating'])
data['timestamp'] = pd.to_numeric(data['timestamp'])

In [3]:
# preprocessing
n_movies = len(data['movieId'].unique())
n_users = len(data['userId'].unique())
genres_split = data.genres.str.split('|').tolist()
genres_unique = set()
for movie_genres in genres_split:
    for genre in movie_genres:
        genres_unique.add(genre)
genres2idx = {o:i+1 for i,o in enumerate(genres_unique)}
genres_split = [[genres2idx[x] for x in movie_genres] for movie_genres in genres_split]
padded_genres = tf.keras.preprocessing.sequence.pad_sequences(
    genres_split, padding="post"
)

users = data.userId.unique()
movies = data.movieId.unique()

userid2idx = {o:i for i,o in enumerate(users)}
movieid2idx = {o:i for i,o in enumerate(movies)}

data['userId'] = data['userId'].apply(lambda x: userid2idx[x])
data['movieId'] = data['movieId'].apply(lambda x: movieid2idx[x])

data = data.drop(columns=['genres'])
for i in range(len(padded_genres[0])):
    data['genre'+str(i)] = padded_genres[:,i]
data['genre_ratio'] = [len(movie_genres) for movie_genres in genres_split]

min_timestamp = pd.DataFrame(data.loc[:,["movieId","timestamp"]].groupby(['movieId'], as_index=False).min())
min_timestamp = min_timestamp.sort_values(by=["movieId"], axis=0)
min_timestamp = min_timestamp.rename(columns={"timestamp": "min_timestamp"})
data = pd.merge(data, min_timestamp)
data['timestamp'] = data['timestamp'] - data['min_timestamp']
data['timestamp'] = data['timestamp'] / (60 * 60 * 24 * 365)
data = data.drop(columns=['min_timestamp'])
data['day'] = data['timestamp'] * 365 / (7 * 2)
data['day'] = data['day'].apply(lambda x: int(x))
days = data.day.unique()
n_days = len(days)
day2idx = {o:i for i,o in enumerate(days)}
data['day'] = data['day'].apply(lambda x: day2idx[x])
print(n_movies, n_users, len(rating), n_days)

10677 69878 10000054 365


In [4]:
# split train and test data
data = data.sample(frac=1)
split = np.random.rand(len(data)) < 0.9
train = data[split]
valid = data[~split]
data.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,genre0,genre1,genre2,genre3,genre4,genre5,genre6,genre7,genre_ratio,day
305163,52468,19,4.0,4.728982,9,2,0,0,0,0,0,0,2,243
9804972,40870,6387,4.0,0.86733,6,1,0,0,0,0,0,0,2,235
8457587,2323,2757,1.0,0.395376,1,0,0,0,0,0,0,0,1,80
5805885,21805,1265,3.5,5.471151,1,5,17,18,10,0,0,0,5,200
5841688,7486,1281,5.0,0.250679,6,1,3,0,0,0,0,0,3,44
6788497,25351,1556,3.0,0.890645,9,0,0,0,0,0,0,0,1,262
6590800,3252,1478,3.0,1.655284,6,1,10,0,0,0,0,0,3,210
1481463,23711,135,4.0,0.404079,1,0,0,0,0,0,0,0,1,80
4395497,33388,697,4.0,3.525184,1,0,0,0,0,0,0,0,1,192
5581462,16429,1151,4.0,4.987586,1,18,0,0,0,0,0,0,2,117


In [5]:
# generate model
embedding_l2 = 1e-6 * 1                                  # regularization
dense_l2 = 1e-3 * 1                                      # regularization
keras.backend.clear_session()

user_input = Input(shape=(1,), name='user_input', dtype='int32')
movie_input = Input(shape=(1,), name='movie_input', dtype='int32')
genres_input = Input(shape=(len(padded_genres[0]),), name='genres_input', dtype='int32')
genre_ratio_input = Input(shape=(1, ), name='genre_ratio_input', dtype='float32')
day_input = Input(shape=(1,), name='day_input', dtype='int32')
time_input = Input(shape=(1,), name='time_input', dtype='float32')
time_root = tf.math.sqrt(time_input + 1.0)
time_square = tf.math.square(time_input)
time_vector = tf.concat([time_input, time_root, time_square], 1)

def create_model(n_user_embedding):
    n_movie_embedding = n_user_embedding - 1
    n_genres_embedding = 1
    genre_ratio_vector = tf.where(tf.less(tf.constant(list(range(len(padded_genres[0])))*n_genres_embedding, dtype='float32'), genre_ratio_input), 1.0 / genre_ratio_input, 0)
    genre_ratio_vector = tf.transpose(tf.reshape(genre_ratio_vector, [-1, n_genres_embedding, len(padded_genres[0])]), perm=[0,2,1])

    user_embedding = Embedding(n_users, n_user_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(user_input)
    user_vector = Flatten()(user_embedding)
    movie_embedding = Embedding(n_movies, n_movie_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(movie_input)
    movie_vector = Flatten()(movie_embedding)
    genres_embedding = Embedding(len(genres_unique)+1, n_genres_embedding, mask_zero=True, embeddings_regularizer=regularizers.l2(embedding_l2/len(padded_genres[0])))(genres_input)
    genres_embedding = tf.math.multiply(genres_embedding, genre_ratio_vector)
    genres_average_embedding = tf.math.reduce_sum(genres_embedding, axis=1)
    genres_vector = Flatten()(genres_average_embedding)
    day_embedding = Embedding(n_days, int(n_user_embedding/8), embeddings_regularizer=regularizers.l2(embedding_l2))(day_input)
    day_vector = Flatten()(day_embedding)

    concat_layer = tf.concat([user_vector, movie_vector, genres_vector, time_vector, day_vector], 1)
    mlp_layer = Dense(int(n_user_embedding/2), activation='relu', kernel_regularizer=regularizers.l2(dense_l2))(concat_layer)

    user_embedding2 = Embedding(n_users, n_user_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(user_input)
    user_vector2 = Flatten()(user_embedding2)
    movie_embedding2 = Embedding(n_movies, n_movie_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(movie_input)
    movie_vector2 = Flatten()(movie_embedding2)
    genres_embedding2 = Embedding(len(genres_unique)+1, n_genres_embedding, mask_zero=True, embeddings_regularizer=regularizers.l2(embedding_l2/len(padded_genres[0])))(genres_input)
    genres_embedding2 = tf.math.multiply(genres_embedding2, genre_ratio_vector)
    genres_average_embedding2 = tf.math.reduce_sum(genres_embedding2, axis=1)
    genres_vector2 = Flatten()(genres_average_embedding2)
    matrix_factorization = Dot(axes=1)([user_vector2, tf.concat([movie_vector2, genres_vector2], 1)])

    user_embedding3 = Embedding(n_users, n_user_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(user_input)
    user_vector3 = Flatten()(user_embedding3)
    movie_embedding3 = Embedding(n_movies, n_movie_embedding, embeddings_regularizer=regularizers.l2(embedding_l2))(movie_input)
    movie_vector3 = Flatten()(movie_embedding3)
    genres_embedding3 = Embedding(len(genres_unique)+1, n_genres_embedding, mask_zero=True, embeddings_regularizer=regularizers.l2(embedding_l2/len(padded_genres[0])))(genres_input)
    genres_embedding3 = tf.math.multiply(genres_embedding3, genre_ratio_vector)
    genres_average_embedding3 = tf.math.reduce_sum(genres_embedding3, axis=1)
    genres_vector3 = Flatten()(genres_average_embedding3)
    diff = tf.math.subtract(user_vector3, tf.concat([movie_vector3, genres_vector3], 1))
    W = tf.Variable(tf.random.normal(shape=[n_user_embedding, 1], stddev=0.1))
    diff = tf.linalg.matmul(diff, W)

    output_layer = tf.concat([matrix_factorization, mlp_layer, diff], 1)
    return output_layer

output_layer = tf.concat([create_model(8*8), create_model(8*7), create_model(8*6), create_model(8*5), create_model(8*4), create_model(8*3), create_model(8*2), create_model(8*1)], 1)
output = Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(dense_l2))(output_layer)
output = output*5.5

model = Model([user_input, movie_input, time_input, genres_input, genre_ratio_input, day_input], output)
model.summary()

The following Variables were used a Lambda layer's call (tf.linalg.matmul), but
are not present in its tracked objects:
  <tf.Variable 'Variable:0' shape=(64, 1) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
The following Variables were used a Lambda layer's call (tf.linalg.matmul_1), but
are not present in its tracked objects:
  <tf.Variable 'Variable:0' shape=(56, 1) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
The following Variables were used a Lambda layer's call (tf.linalg.matmul_2), but
are not present in its tracked objects:
  <tf.Variable 'Variable:0' shape=(48, 1) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an

In [6]:
# training
def rmse (y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

model.compile(optimizer=Adam(), loss='mse', metrics=[rmse])
batch_size = 4096*8
epochs = 128

callback = tf.keras.callbacks.EarlyStopping(monitor='val_rmse', patience=1, restore_best_weights=True)
history = model.fit([train.userId, train.movieId, train.timestamp, train.iloc[:,4:4+len(padded_genres[0])], train.genre_ratio, train.day], train.rating,
                batch_size=batch_size, epochs=epochs, callbacks=[callback],
                validation_data = ([valid.userId, valid.movieId, valid.timestamp, valid.iloc[:,4:4+len(padded_genres[0])], valid.genre_ratio, valid.day], valid.rating), verbose = 1)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128


In [7]:
# check final RMSE
valid_pred = model.predict([valid.userId, valid.movieId, valid.timestamp, valid.iloc[:,4:4+len(padded_genres[0])], valid.genre_ratio, valid.day], batch_size = batch_size)
valid_pred = [max(min(x, 5), 0.5) for x in valid_pred]
test_rmse = mean_squared_error(valid.rating, valid_pred, squared=False) # squared=False -> RMSE
print(test_rmse)

0.7780977052883264


  return array(a, dtype, copy=False, order=order)
