Download the MovieLens 100K dataset and extract it into `data/` (this notebook reads `data/ml-100k/u.data`):
`http://files.grouplens.org/datasets/movielens/ml-100k.zip`

In [None]:
# Reload imported modules before each cell runs, so edits to imported .py files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import mlflow
import pandas as pd

from sklearn.model_selection import train_test_split

from keras import Model, Sequential
from keras.constraints import non_neg
from keras.layers import Embedding, Input, Flatten, Dot
from keras.optimizers import Adam
from keras.regularizers import l2

In [None]:
# Point MLflow at the tracking server and select the experiment runs are recorded under.
TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "recommendation_system"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
# Read the tab-separated ratings file; column names are supplied explicitly via `names`.
header = ["userId", "movieId", "rating", "timestamp"]
ratings = pd.read_csv(
    "data/ml-100k/u.data",
    sep="\t",
    names=header,
)
ratings.head(10)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of `ratings`.
ratings.describe()

In [None]:
# Map every raw user / movie ID to a contiguous index, in order of first appearance.
# Indices start at 1, so index 0 is never assigned to a real ID.
u_unique = ratings["userId"].unique()
user2Idx = dict(zip(u_unique, range(1, len(u_unique) + 1)))

m_unique = ratings["movieId"].unique()
movie2Idx = dict(zip(m_unique, range(1, len(m_unique) + 1)))

In [None]:
# Replace the raw IDs with their 1-based indices (KeyError if an ID is missing from the map).
# NOTE: the columns are overwritten in place, so re-running this cell alone would
# apply the mapping a second time; reload `ratings` first.
ratings["userId"] = ratings["userId"].map(user2Idx.__getitem__)
ratings["movieId"] = ratings["movieId"].map(movie2Idx.__getitem__)

In [None]:
# Inspect the last rows to check the re-indexed userId / movieId columns.
ratings.tail()

In [None]:
# Split Dataset: hold out 20% of the ratings for validation.
# A fixed random_state keeps the split identical across kernel restarts, so the
# metrics logged for different runs are computed on the same validation rows.
ratings_train, ratings_val = train_test_split(ratings, test_size=0.2, random_state=42)

In [None]:
# Number of distinct users / movies in the full data and in the training split.
n_users = int(ratings["userId"].nunique())
n_movies = int(ratings["movieId"].nunique())
n_users_train = int(ratings_train["userId"].nunique())
n_movies_train = int(ratings_train["movieId"].nunique())

print(
    f"{n_users = }",
    f"{n_movies = }",
    f"{n_users_train = }",
    f"{n_movies_train = }",
    sep="\n",
)

In [None]:
# Length of each user / movie embedding vector.
n_latent_factors = 5

# Input Layers: one integer index per sample for each input.
movie_input = Input(shape=(1,), name="Item")
user_input = Input(shape=(1,), name="User")

# Movie Embedding Layer
# n_movies + 1 rows because indices start at 1 and index 0 is treated as
# padding (mask_zero=True). The embedding weights are L2-regularised.
movie_embedding_layer = Embedding(
    input_dim=n_movies + 1,
    output_dim=n_latent_factors,
    mask_zero=True,
    embeddings_regularizer=l2(0.001),
    name="Movie-Embedding",
)
movie_embedding = movie_embedding_layer(movie_input)
movie_vec = Flatten(name="FlattenMovies")(movie_embedding)

# User Embedding Layer
# Same sizing rule as the movie embedding. NOTE(review): unlike the movie
# embedding, no regularizer is applied here; confirm this is intentional.
user_embedding_layer = Embedding(
    input_dim=n_users + 1,
    output_dim=n_latent_factors,
    mask_zero=True,
    name="User-Embedding",
)
user_embedding = user_embedding_layer(user_input)
user_vec = Flatten(name="FlattenUsers")(user_embedding)

In [None]:
# The model output is the dot product of the movie and user vectors.
prod = Dot(axes=1, name="DotProduct")([movie_vec, user_vec])

# Inputs are ordered [user, movie]; data passed to the model must use the same order.
model = Model(inputs=[user_input, movie_input], outputs=prod)
model.summary()

In [None]:
# Optimise mean squared error with Adam; RMSE is tracked as an additional metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="mean_squared_error",
    metrics=["root_mean_squared_error"],
)

In [None]:
# Train the model inside an MLflow run and record both the hyperparameters
# and the final train / validation RMSE.
with mlflow.start_run(run_name="keras_model"):
    batch_size = 320
    epochs = 100

    # Log the hyperparameters so the run's metrics can be tied to its configuration.
    mlflow.log_params(
        {
            "n_latent_factors": n_latent_factors,
            "batch_size": batch_size,
            "epochs": epochs,
        }
    )

    # Inputs follow the model's input order: [user, movie].
    train_inputs = [ratings_train.userId, ratings_train.movieId]
    val_inputs = [ratings_val.userId, ratings_val.movieId]

    # Fit Model
    history = model.fit(
        train_inputs,
        ratings_train.rating,
        batch_size=batch_size,
        validation_data=(val_inputs, ratings_val.rating),
        epochs=epochs,
        verbose=1,
    )

    # Log Metrics
    # evaluate() returns [loss, root_mean_squared_error]; index 1 is the RMSE.
    metrics_train = model.evaluate(train_inputs, ratings_train.rating)
    metrics_val = model.evaluate(val_inputs, ratings_val.rating)
    mlflow.log_metrics(
        {"RMSE_TRAIN": metrics_train[1], "RMSE_EVAL": metrics_val[1]}
    )