In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

  return f(*args, **kwds)


In [2]:
%matplotlib inline

In [3]:
# Read the ratings and movie tables from data/ml-small. The movie table is
# keyed by movieId so that a title can be looked up directly from an id.
ratings = pd.read_csv("data/ml-small/ratings.csv")
movies = (
    pd.read_csv("data/ml-small/movies.csv")
    .set_index("movieId")
)
# Plain dict {movieId: title} built from the indexed movie table.
movie_names = movies["title"].to_dict()

In [4]:
# Preview the first rows of the movie table (indexed by movieId).
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first rows of the ratings table (userId, movieId, rating, timestamp).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
# Distinct raw ids found in the ratings table, in order of first appearance.
user_ids = ratings["userId"].unique()
movie_ids = ratings["movieId"].unique()

In [7]:
# Embedding lookups index rows of a table, so the raw ids must be remapped to
# contiguous integers 0..n-1 (the position of each id's first appearance).

userid2idx = dict(zip(user_ids, range(len(user_ids))))
movieid2idx = dict(zip(movie_ids, range(len(movie_ids))))

# NOTE: this overwrites the id columns of `ratings` in place, so the cell is
# not idempotent -- re-run the loading cell before running it a second time.
ratings["movieId"] = ratings["movieId"].apply(lambda raw_id: movieid2idx[raw_id])
ratings["userId"] = ratings["userId"].apply(lambda raw_id: userid2idx[raw_id])

# Sanity check: both index ranges should start at 0 and end at (count - 1).
user_min, user_max = ratings["userId"].min(), ratings["userId"].max()
movie_min, movie_max = ratings["movieId"].min(), ratings["movieId"].max()
user_min, user_max, movie_min, movie_max

(0, 670, 0, 9065)

In [8]:
# Number of distinct users / movies; these are the row counts of the two
# embedding tables created further down.
n_users, n_movies = ratings["userId"].nunique(), ratings["movieId"].nunique()
n_users, n_movies

(671, 9066)

In [9]:
# Width of each embedding vector (number of latent factors per user / movie).
n_factors = 50

# Seed NumPy's global RNG so the random train/validation split is reproducible.
# This must be a *call*: assigning `np.random.seed = 42` only rebinds the
# function name to an int, seeds nothing, and breaks later calls to it.
np.random.seed(42)

In [10]:
# Row-wise random split: each rating is drawn independently into the training
# set with probability 0.80; the remaining rows form the validation set.
mask = np.random.rand(len(ratings)) < 0.80
train, valid = ratings[mask], ratings[~mask]

In [11]:
# Inputs to the model (each is a column of shape [batch, 1]):
# - train_user_inputs: placeholder for the remapped user indices (int32)
# - train_move_inputs: placeholder for the remapped movie indices (int32);
#   the name is misspelled ("move") but later cells reference it as-is
# - train_rating_labels: placeholder for the rating labels (float32)

with tf.variable_scope("foo", reuse=tf.AUTO_REUSE):
    train_user_inputs = tf.placeholder(tf.int32, shape=[None, 1])
    train_move_inputs = tf.placeholder(tf.int32, shape=[None, 1])
    train_rating_labels = tf.placeholder(tf.float32, shape=[None, 1])

In [12]:
# Embedding layers
# The model contains two different embeddings that are concatenated: users and movies.
# Each table has one row per (remapped) id and n_factors columns. No initializer
# is passed, so tf.get_variable's default initializer applies.

# AUTO_REUSE lets this cell be re-run without a "variable already exists" error.
with tf.variable_scope("foo", reuse=tf.AUTO_REUSE):
    user_embeddings = tf.get_variable("user_embeddings",
        [n_users, n_factors])
    # ids have shape [batch, 1], so the lookup has shape [batch, 1, n_factors]
    embedded_user_ids = tf.nn.embedding_lookup(user_embeddings, train_user_inputs)

    movie_embeddings = tf.get_variable("movie_embeddings",
        [n_movies, n_factors])
    embedded_movie_ids = tf.nn.embedding_lookup(movie_embeddings, train_move_inputs)
    
    # Stack the user and movie vectors along axis 1 -> [batch, 2, n_factors]
    full_embedding = tf.concat([embedded_user_ids, embedded_movie_ids], axis=1)

In [65]:
# Model definition:
#
# embedding (user and movie vectors, concatenated)
# flatten
# dropout
# dense (ReLU)
# batch normalization
# dropout
# dense (single linear unit = predicted rating)

with tf.variable_scope("foo4", reuse=tf.AUTO_REUSE):
    # tf.layers.dropout and tf.layers.batch_normalization default to
    # training=False, which turns dropout into a no-op and keeps batch norm on
    # its initial moving statistics. Expose the mode as a feedable flag instead.
    # It defaults to False (inference), so a feed_dict that does not mention it
    # behaves exactly as before; feed `is_training: True` on training steps only.
    is_training = tf.placeholder_with_default(False, shape=[], name="is_training")

    x = tf.layers.flatten(full_embedding)
    x = tf.layers.dropout(x, rate=0.5, training=is_training)
    x = tf.layers.dense(x, 170, activation=tf.nn.relu)
    x = tf.layers.batch_normalization(x, training=is_training)
    x = tf.layers.dropout(x, rate=0.75, training=is_training)
    x = tf.layers.dense(x, 1)

    # Mean squared error between the predicted and the true ratings.
    loss = tf.reduce_mean(tf.square(x - train_rating_labels))

    # The learning rate is a hyper-parameter, not a weight: keep it out of the
    # trainable-variables collection that minimize() differentiates against.
    learning_rate = tf.Variable(initial_value=0.001, trainable=False)

    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    # Batch norm registers its moving-average updates in UPDATE_OPS; they only
    # run if the train op depends on them explicitly.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        # Despite the name, `optimizer` is the op that performs one Adam step.
        optimizer = adam.minimize(loss)

In [66]:
# Training
# Open a session on the default graph and initialize every variable created
# so far (embedding tables, dense / batch-norm weights, learning rate, Adam slots).
# The initializer op must be built after all variables exist, so this cell has
# to run after the model-definition cell.

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [71]:
# Full-batch training: each epoch performs one Adam step on the entire training
# set, then evaluates the loss on the held-out validation ratings.
n_epochs = 8

# The feeds do not change between epochs, so build them once. reshape(-1, 1)
# turns each column into the [batch, 1] shape the placeholders expect without
# hard-coding row counts, which differ for every random train/valid split.
train_feed = {
    train_user_inputs: train.userId.values.reshape(-1, 1),
    train_move_inputs: train.movieId.values.reshape(-1, 1),
    train_rating_labels: train.rating.values.reshape(-1, 1),
}
valid_feed = {
    train_user_inputs: valid.userId.values.reshape(-1, 1),
    train_move_inputs: valid.movieId.values.reshape(-1, 1),
    train_rating_labels: valid.rating.values.reshape(-1, 1),
}

for i in range(n_epochs):
    # One optimization step; the fetched loss is the value for this batch.
    _, current_loss = sess.run([optimizer, loss], feed_dict=train_feed)
    print("train loss: {}".format(current_loss))

    # Fetch the tensor itself (not a one-element list) so the validation loss
    # prints as a scalar, consistent with the training loss above.
    current_loss = sess.run(loss, feed_dict=valid_feed)
    print("valid loss: {}".format(current_loss))

train loss: 2.888119697570801
valid loss: [2.5963454]
train loss: 2.5267887115478516
valid loss: [2.2628913]
train loss: 2.1965110301971436
valid loss: [1.9649183]
train loss: 1.9023816585540771
valid loss: [1.7069895]
train loss: 1.6489369869232178
valid loss: [1.4927915]
train loss: 1.4397634267807007
valid loss: [1.3246644]
train loss: 1.2770464420318604
valid loss: [1.2031169]
train loss: 1.1610841751098633
valid loss: [1.1263803]
