In [1]:
import os
# Set before TensorFlow is imported on the next line. Judging by the variable
# name this adjusts TensorFlow's native (C++) log verbosity.
# NOTE(review): confirm '3' is the intended level, and keep this assignment
# above the import so the setting is in place when TensorFlow loads.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

## Load data

In [2]:
# Locations of the train / test splits, relative to the notebook's folder.
train_path = '../data/02_model_input/df_train_oos.parquet.gzip'
test_path = '../data/02_model_input/df_test_oos.parquet.gzip'

# Read both parquet files, train first and then test.
df_train, df_test = map(pd.read_parquet, (train_path, test_path))

In [3]:
# Show the first five rows of the training frame as a quick check of the load.
df_train.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
14008643,96763,2713,4.0,944893475
4465052,30519,26360,0.5,1269639265
2086368,14111,17,4.0,862685983
9538082,66021,289,5.0,830299340
15170315,104859,1721,4.0,951793622


## Encode users and movies

In [4]:
# Stack the train and test user ids into one Series (the same input later
# handed to the user encoder) and show its first five entries.
pd.concat((df_train['userId'], df_test['userId']), axis=0).head(n=5)

14008643     96763
4465052      30519
2086368      14111
9538082      66021
15170315    104859
Name: userId, dtype: int64

In [5]:
# Fit one encoder per id column on the ids from train and test together, so
# every id that appears in either split is known to the encoder.
le_user = LabelEncoder().fit(pd.concat([df_train['userId'], df_test['userId']]))
le_movie = LabelEncoder().fit(pd.concat([df_train['movieId'], df_test['movieId']]))

In [6]:
# Add the encoded user / movie id columns to both splits, using the encoders
# fitted above (train frame first, then test frame).
for frame in (df_train, df_test):
    frame['user_encoded'] = le_user.transform(frame['userId'])
    frame['movie_encoded'] = le_movie.transform(frame['movieId'])

## Center the target

In [7]:
# Centre the rating in both splits on a single shared mean.
# The mean is taken over the train and test ratings pooled together and is
# computed once here; previously the same full concat + mean was evaluated
# twice, once per frame, yielding the identical value both times.
# NOTE(review): pooling the test ratings into the centring constant lets a
# little test information into the training target. Consider switching to
# df_train['rating'].mean() after confirming nothing downstream relies on the
# pooled mean (e.g. when adding the mean back to predictions).
rating_mean = pd.concat([df_train['rating'], df_test['rating']]).mean()

df_train["rating_centered"] = df_train["rating"] - rating_mean
df_test["rating_centered"] = df_test["rating"] - rating_mean

## Define model

In [15]:
# Width of each embedding vector and of the hidden dense layer.
EMBEDDING_DIM = 8
HIDDEN_UNITS = 32

keras_layers = tf.keras.layers

# Two inputs, each carrying a single value per example: user id and movie id.
u = keras_layers.Input(shape=(1, ))
m = keras_layers.Input(shape=(1, ))

# One embedding table per input, with as many rows as the matching encoder has classes.
user_vectors = keras_layers.Embedding(input_dim=len(le_user.classes_), output_dim=EMBEDDING_DIM)(u)
movie_vectors = keras_layers.Embedding(input_dim=len(le_movie.classes_), output_dim=EMBEDDING_DIM)(m)

# Flatten each embedding output before joining them.
u_emb = keras_layers.Flatten()(user_vectors)
m_emb = keras_layers.Flatten()(movie_vectors)

# Concatenate both vectors, pass them through one ReLU layer, then a
# single-unit output layer (no activation specified).
features = keras_layers.Concatenate()([u_emb, m_emb])
hidden = keras_layers.Dense(HIDDEN_UNITS, activation='relu')(features)

x = keras_layers.Dense(1)(hidden)

In [16]:
# Wrap the graph defined above (inputs u and m, output x) in a Model.
model = tf.keras.Model(inputs=[u, m], outputs=x)

# SGD with momentum and weight decay, optimising mean-squared error.
sgd = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9, weight_decay=1e-4)
model.compile(optimizer=sgd, loss='mse')

# Print the layer-by-layer overview.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 embedding_4 (Embedding)     (None, 1, 8)                 1107944   ['input_5[0][0]']             
                                                                                                  
 embedding_5 (Embedding)     (None, 1, 8)                 213952    ['input_6[0][0]']             
                                                                                            

## Train

In [17]:
# Encoded user / movie id columns, in the same order as the model inputs [u, m].
train_inputs = [df_train['user_encoded'], df_train['movie_encoded']]
test_inputs = [df_test['user_encoded'], df_test['movie_encoded']]

# Fit on the centred training ratings; the test frame is passed as the
# validation data. The returned history object is kept in `r`.
r = model.fit(
    x=train_inputs,
    y=df_train['rating_centered'],
    batch_size=2048,
    epochs=15,
    validation_data=(test_inputs, df_test['rating_centered']),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Evaluate