In [1]:
pip install keras-rectified-adam



In [2]:
import keras
import os, sys
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from sklearn.metrics import mean_squared_error
from keras.models import Model
from tensorflow.keras import regularizers
from keras.layers import Flatten, Dense, Input, Embedding, Dot
from keras.layers import GlobalAveragePooling1D, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras_radam import RAdam

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
movie = [i.strip().split("::") for i in open("/content/drive/My Drive/MovieLens10M/movies.dat").readlines()]
movie = pd.DataFrame(movie)
movie = movie.rename(columns={0: "movieId", 1: "title", 2: "genres"})
rating = [i.strip().split("::") for i in open("/content/drive/My Drive/MovieLens10M/ratings.dat").readlines()]
rating = pd.DataFrame(rating)
rating = rating.rename(columns={0: "userId", 1: "movieId", 2: "rating", 3: "timestamp"})
movie = movie.loc[:,["movieId","genres"]]
rating = rating.loc[:,["userId","movieId","rating","timestamp"]]
data = pd.merge(rating, movie)

In [4]:
data['rating'] = pd.to_numeric(data['rating'])
data['timestamp'] = pd.to_numeric(data['timestamp'])

In [5]:
n_movies = len(data['movieId'].unique())
n_users = len(data['userId'].unique())
print(n_movies, n_users, len(rating))

10677 69878 10000054


In [6]:
genres_split = data.genres.str.split('|').tolist()
genres_unique = set()
for movie_genres in genres_split:
    for genre in movie_genres:
        genres_unique.add(genre)
genres2idx = {o:i+1 for i,o in enumerate(genres_unique)}
genres_split = [[genres2idx[x] for x in movie_genres] for movie_genres in genres_split]
padded_genres = tf.keras.preprocessing.sequence.pad_sequences(
    genres_split, padding="post"
)

users = data.userId.unique()
movies = data.movieId.unique()

userid2idx = {o:i for i,o in enumerate(users)}
movieid2idx = {o:i for i,o in enumerate(movies)}

data['userId'] = data['userId'].apply(lambda x: userid2idx[x])
data['movieId'] = data['movieId'].apply(lambda x: movieid2idx[x])

data = data.drop(columns=['genres'])
for i in range(len(padded_genres[0])):
    data['genre'+str(i)] = padded_genres[:,i]

In [7]:
min_timestamp = pd.DataFrame(data.loc[:,["movieId","timestamp"]].groupby(['movieId'], as_index=False).min())
min_timestamp = min_timestamp.sort_values(by=["movieId"], axis=0)
min_timestamp = min_timestamp.rename(columns={"timestamp": "min_timestamp"})
data = pd.merge(data, min_timestamp)
data['timestamp'] = data['timestamp'] - data['min_timestamp']
data['timestamp'] = data['timestamp'] / (60 * 60 * 24 * 365)
data = data.drop(columns=['min_timestamp'])

In [8]:
split = np.random.rand(len(data)) < 0.9
train = data[split]
valid = data[~split]
data.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,genre0,genre1,genre2,genre3,genre4,genre5,genre6,genre7
0,0,0,5.0,0.347135,10,6,0,0,0,0,0,0
1,1,0,3.0,4.638027,10,6,0,0,0,0,0,0
2,2,0,2.5,9.015238,10,6,0,0,0,0,0,0
3,3,0,3.0,3.659976,10,6,0,0,0,0,0,0
4,4,0,4.5,8.702935,10,6,0,0,0,0,0,0
5,5,0,3.0,0.519782,10,6,0,0,0,0,0,0
6,6,0,3.0,0.520016,10,6,0,0,0,0,0,0
7,7,0,3.0,0.347532,10,6,0,0,0,0,0,0
8,8,0,1.0,4.052296,10,6,0,0,0,0,0,0
9,9,0,3.0,3.736893,10,6,0,0,0,0,0,0


In [15]:
n_user_embedding = 64
n_movie_embedding = int(n_user_embedding * 0.75)
n_genres_embedding = int(n_user_embedding * 0.25)
embedding_l2 = 1e-6 * 5                                  # regularization
dense_l2 = 1e-4 * 5                                      # regularization
keras.backend.clear_session()

user_input = Input(shape=(1,), name='user_input', dtype='int64')
movie_input = Input(shape=(1,), name='movie_input', dtype='int64')
genres_input = Input(shape=(len(padded_genres[0]),), name='genres_input', dtype='int64') # input = (10, 1)
time_input = Input(shape=(1,), name='time_input', dtype='float32')
time_root = tf.math.sqrt(time_input)
time_square = tf.math.square(time_input)
time_vector = tf.concat([time_input, time_root, time_square], 1)

user_embedding = Embedding(n_users, n_user_embedding, name='user_embedding', embeddings_regularizer=regularizers.l2(embedding_l2))(user_input)
user_vector = Flatten()(user_embedding)
movie_embedding = Embedding(n_movies, n_movie_embedding, name='movie_embedding', embeddings_regularizer=regularizers.l2(embedding_l2))(movie_input)
movie_vector = Flatten()(movie_embedding)
genres_embedding = Embedding(len(genres_unique)+1, n_genres_embedding, mask_zero=True, name='genres_embedding', embeddings_regularizer=regularizers.l2(embedding_l2/(len(genres_unique)+1)))(genres_input)
genres_average_embedding = GlobalAveragePooling1D()(genres_embedding)
genres_vector = Flatten()(genres_average_embedding)

concat_layer = tf.concat([user_vector, movie_vector, genres_vector, time_vector], 1, name='concat_layer')
mlp_layer = Dense(32, activation='relu', kernel_regularizer=regularizers.l2(dense_l2))(concat_layer)

user_embedding2 = Embedding(n_users, n_user_embedding, name='user_embedding2', embeddings_regularizer=regularizers.l2(embedding_l2))(user_input)
user_vector2 = Flatten()(user_embedding2)
movie_embedding2 = Embedding(n_movies, n_movie_embedding, name='movie_embedding2', embeddings_regularizer=regularizers.l2(embedding_l2))(movie_input)
movie_vector2 = Flatten()(movie_embedding2)
genres_embedding2 = Embedding(len(genres_unique)+1, n_genres_embedding, mask_zero=True, name='genres_embedding2', embeddings_regularizer=regularizers.l2(embedding_l2/(len(genres_unique)+1)))(genres_input)
genres_average_embedding2 = GlobalAveragePooling1D()(genres_embedding2)
genres_vector2 = Flatten()(genres_average_embedding2)
matrix_factorization = Dot(axes=1)([user_vector2, tf.concat([movie_vector2, genres_vector2], 1)])

output_layer = tf.concat([matrix_factorization, mlp_layer], 1)
output = Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(dense_l2))(output_layer)
output = output*4.5 + 0.5

model = Model([user_input, movie_input, time_input, genres_input], output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
genres_input (InputLayer)       [(None, 8)]          0                                            
__________________________________________________________________________________________________
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
movie_input (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
genres_embedding2 (Embedding)   (None, 8, 16)        336         genres_input[0][0]               
______________________________________________________________________________________________

In [16]:
model.compile(optimizer=Adam(0.003), loss='mse')
batch_size = 4096*8
epochs = 128

In [17]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history = model.fit([train.userId, train.movieId, train.timestamp, train.iloc[:,4:12]],train.rating,
                batch_size=batch_size, epochs=epochs, callbacks=[callback],
                validation_data = ([valid.userId, valid.movieId, valid.timestamp, valid.iloc[:,4:12]],valid.rating), verbose = 1)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128


In [18]:
valid_predictions = model.predict([valid.userId, valid.movieId, valid.timestamp, valid.iloc[:,4:12]], batch_size = batch_size)
test_rmse = mean_squared_error(valid.rating, valid_predictions, squared=False) # squared=False -> RMSE
print(test_rmse)

0.792867734946506
