In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

## Load data

In [2]:
df_train = pd.read_parquet('../data/02_model_input/df_train.parquet.gzip')
df_test = pd.read_parquet('../data/02_model_input/df_test.parquet.gzip')

In [3]:
df_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
4182421,28507,1176,4.0,789652004
18950979,131160,1079,3.0,789652009
18950936,131160,47,5.0,789652009
18950930,131160,21,3.0,789652009
12341178,85252,45,3.0,822873600


## Encode users and movies

In [4]:
le_user, le_movie = LabelEncoder(), LabelEncoder()
df_train['user_encoded'] = le_user.fit_transform(df_train['userId'])
df_train['movie_encoded'] = le_movie.fit_transform(df_train['movieId'])

In [5]:
le_user.classes_

array([     1,      2,      3, ..., 138491, 138492, 138493])

In [6]:
le_user.transform(le_user.classes_)

array([     0,      1,      2, ..., 112463, 112464, 112465])

In [7]:
df_test.shape

(4000053, 4)

In [8]:
known_classes = df_test["userId"].isin(le_user.classes_) & df_test["movieId"].isin(le_movie.classes_)

In [9]:
df_test[known_classes].shape

(490844, 4)

In [10]:
df_test_known = df_test[known_classes].copy()

In [11]:
df_test_known['user_encoded'] = le_user.fit_transform(df_test_known['userId'])
df_test_known['movie_encoded'] = le_movie.fit_transform(df_test_known['movieId'])

## Center the target

In [12]:
df_train["rating_centered"] = df_train["rating"] - df_train["rating"].mean()
df_test_known["rating_centered"] = df_test_known["rating"] - df_train["rating"].mean()

## Define model

In [13]:
u = tf.keras.layers.Input(shape=(1, ))
m = tf.keras.layers.Input(shape=(1, ))

u_emb = tf.keras.layers.Embedding(input_dim=len(le_user.classes_), output_dim=8)(u)
m_emb = tf.keras.layers.Embedding(input_dim=len(le_movie.classes_), output_dim=8)(m)

u_emb = tf.keras.layers.Flatten()(u_emb)
m_emb = tf.keras.layers.Flatten()(m_emb)

x = tf.keras.layers.Concatenate()([u_emb, m_emb])
x = tf.keras.layers.Dense(32, activation='relu')(x)

x = tf.keras.layers.Dense(1)(x)

In [14]:
model = tf.keras.Model(inputs=[u, m], outputs=x)
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9), loss='mse')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            


                                                                                                  
 embedding (Embedding)       (None, 1, 8)                 44272     ['input_1[0][0]']             
                                                                                                  
 embedding_1 (Embedding)     (None, 1, 8)                 89600     ['input_2[0][0]']             
                                                                                                  
 flatten (Flatten)           (None, 8)                    0         ['embedding[0][0]']           
                                                                                                  
 flatten_1 (Flatten)         (None, 8)                    0         ['embedding_1[0][0]']         
                                                                                                  
 concatenate (Concatenate)   (None, 16)                   0         ['flatten[0][0]',             
          

## Train

In [15]:
r = model.fit(
    [df_train['user_encoded'], df_train['movie_encoded']], 
    df_train['rating_centered'], 
    validation_data=([df_test_known['user_encoded'], df_test_known['movie_encoded']], df_test_known['rating_centered']), 
    epochs=5,
    batch_size=4096
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Evaluate