In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.setrecursionlimit(1000)

In [2]:
import pandas as pd
import numpy as np

from keras import Model
from keras.layers import Embedding, Input, Flatten, Dot, Add
from keras.optimizers import Adam
from keras.regularizers import l2
from sklearn.model_selection import train_test_split

# Train model

In [56]:
ratings.head()

Unnamed: 0.1,Unnamed: 0,user_id,movie_id,rating,Date
0,0,1,1,3,1997-12-04 15:55:49
1,1,2,2,3,1998-04-04 19:22:22
2,2,3,3,1,1997-11-07 07:18:36
3,3,4,4,2,1997-11-27 05:02:03
4,4,5,5,1,1998-02-02 05:33:16


In [63]:
ratings = pd.read_csv("Data/scores.csv")

# create a dictionary of indexes for the users and movies mapping every ID to a given index
u_unique = ratings.user_id.unique()
user2Idx = {o:i+1 for i,o in enumerate(u_unique)}

m_unique = ratings.movie_id.unique()
movie2Idx = {o:i+1 for i,o in enumerate(m_unique)}

# Replace the users and movies ids with the new index value
ratings["user_id"] = ratings["user_id"].apply(lambda x: user2Idx[x])
ratings["movie_id"] = ratings["movie_id"].apply(lambda x: movie2Idx[x])

n_users = int(ratings.user_id.nunique())
n_movies = int(ratings.movie_id.nunique())

n_users, n_movies

(943, 1682)

In [64]:
def build_keras_model(users_number, movies_number, latent_factors = 5, add_bias = False, loss="mean_squared_error", learning_rate=0.001, metrics=[keras.metrics.RootMeanSquaredError()]):
    user_input = Input(shape=[1],name="User")
    movie_input = Input(shape=[1],name="Item")
    
    # Movie embedding layer
    movie_embedding = Embedding(
        movies_number + 1,
        latent_factors,
        embeddings_regularizer=l2(0.001),
        name = "Movie-Embedding"
    )(movie_input)
    
    movie_vec = Flatten(name = "FlattenMovies")(movie_embedding)

    # User embedding layer
    user_embedding = Embedding(
        users_number + 1,
        latent_factors,
        #embeddings_regularizer=l2(0.001),
        name = "User-Embedding"
    )(user_input)
    
    user_vec = Flatten(name = "FlattenUsers")(user_embedding)
    
    prod = Dot(axes=1, name="DotProduct")([movie_vec,user_vec])
    
    if add_bias:
        # Movie bias Embedding layer
        movie_bias_embedding = Embedding(
            movies_number+1,
            1,
            embeddings_regularizer = l2(0.001),
            name = "Movie-Bias-Embedding"
        )(movie_input)
        
        movie_bias = Flatten(name="FlattenMoviesBias")(movie_bias_embedding)
        
        # User bias Embedding layer
        user_bias_embedding = Embedding(
            users_number+1,
            1,
            #embeddings_regularizer = l2(0.001),
            name = "User-Bias-Embedding"
        )(user_input)
        
        user_bias = Flatten(name="FlattenUserBias")(user_bias_embedding)
        
        prod = Add()([prod,user_bias,movie_bias])
        
    model = Model([user_input,movie_input],prod)
    model.compile(Adam(learning_rate = learning_rate), loss, metrics=metrics)
    return model

In [65]:
import keras
keras.metrics.RootMeanSquaredError()

<keras.metrics.RootMeanSquaredError at 0x1e00807db70>

In [66]:
# split dataset

eval_size = 0.2
ratings_train, ratings_val = train_test_split(ratings, test_size = eval_size)

best_lf = 0
best_mse = float('inf')
epochs = 100
latent_factor = 13
model = build_keras_model(n_users,n_movies,latent_factors = latent_factor, add_bias = True)
history = model.fit(
    [ratings_train.user_id, ratings_train.movie_id],
    ratings_train.rating,
    batch_size=320,
    validation_data = ([ratings_val.user_id, ratings_val.movie_id],ratings_val.rating),
    epochs = epochs,
    verbose = 1
)

metrics_train = model.evaluate([ratings_train.user_id,ratings_train.movie_id],ratings_train.rating)
metrics_val = model.evaluate([ratings_val.user_id,ratings_val.movie_id],ratings_val.rating)

print(f"RMSE: {metrics_val[1]} for latent_factors = {latent_factor}")

Train on 80000 samples, validate on 20000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/1

In [51]:
import tensorflow as tf
tf.reset_default_graph()




In [None]:
movies = pd.read_csv("Data/peliculas.csv")
users = pd.read_csv("Data/usuarios.csv")

Unnamed: 0.1,Unnamed: 0,user_id,movie_id,rating,Date
0,0,196,242,3,1997-12-04 15:55:49
1,1,186,302,3,1998-04-04 19:22:22
2,2,22,377,1,1997-11-07 07:18:36


In [14]:
users.head(3)

Unnamed: 0,id,Occupation,Active Since
0,1,technician,1997-09-22 21:57:58
1,2,other,1998-02-27 03:26:00
2,3,writer,1998-03-07 02:15:39


In [15]:
movies.head(3)

Unnamed: 0,id,Name,Release Date,IMDB URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
users["id"].apply(lambda x: user)

0        1
1        2
2        3
3        4
4        5
      ... 
938    939
939    940
940    941
941    942
942    943
Name: id, Length: 943, dtype: int64