# Movie recommendations with TensorFlow Wide & Deep

In this notebook, we are going to implement the Wide & Deep Model and train it on movie ratings.

## Setup

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import sys

module_path = os.path.abspath("../scripts")
sys.path.append(module_path)
import recommendation_helper as rh

In [None]:
# Hyperparameters and seeds shared by the cells below.
seed = 99               # seeds NumPy's global generator before the train/test split
seed_init = 42          # op-level seed for the truncated-normal variable initializers
training_epochs = 70    # number of iterations of the outer training loop
learning_rate = 0.005   # base learning rate handed to the decay schedule
beta = 0.0              # factor on the L2 regularization term of the loss (0.0 switches it off)
batch_size = 128        # number of rows sampled for each training step
n_hidden = 128          # width of the hidden layer, per the task description — TODO confirm once build_model is filled in

In [None]:
# Columns removed (by exact name) before a frame is fed to the wide component.
not_wide_columns = ["movieId", "userId", "title", "year", "rating"]
# Column-name prefixes removed before a frame is fed to the deep component;
# get_input_data matches them with str.startswith, so every column whose name
# begins with one of these entries is dropped.
not_deep_columns = ["movieId", "userId", "title", "year_bucket", "rating"]

Load the data and generate a train and a test set.

In [None]:
def load_data():
    """Load the movie ratings and split them into a train and a test set.

    The ``;``-separated CSV is read from ``../data/05/movielens.csv``.
    NumPy's global random generator is seeded with ``seed`` first, which makes
    the split reproducible (and also affects later draws from that generator).

    Returns:
        A ``(train, test)`` tuple of DataFrames. Each row is assigned to the
        training set with probability 0.9, otherwise to the test set.
    """
    # Use a context manager so the file handle is closed once pandas is done.
    with tf.gfile.Open("../data/05/movielens.csv") as csv_file:
        data = pd.read_csv(csv_file, sep=";")

    np.random.seed(seed)
    msk = np.random.rand(len(data)) < 0.9     # roughly 90% train and 10% test data
    return data.loc[msk], data.loc[~msk]

Remove columns which are not used for training, e.g. movie and user ID which are only used for generating recommendations later on.

In [None]:
def get_input_data(input, type):
    """Return ``input`` without the columns one model component must not see.

    Args:
        input: DataFrame holding all columns of the data set.
        type: ``"wide"`` drops exactly the columns named in
            ``not_wide_columns``; ``"deep"`` drops every column whose name
            starts with one of the entries of ``not_deep_columns``.

    Returns:
        A new DataFrame without the dropped columns.

    Raises:
        ValueError: If ``type`` is neither ``"wide"`` nor ``"deep"``.
    """
    if type not in ("wide", "deep"):
        # Fail loudly instead of falling through and implicitly returning None.
        raise ValueError('type must be "wide" or "deep", got %r' % (type,))

    if type == "wide":
        return input.drop(not_wide_columns, axis=1)

    # Build the prefix tuple once rather than once per column.
    prefixes = tuple(not_deep_columns)
    drop_cols = [col for col in input.columns if col.startswith(prefixes)]
    return input.drop(drop_cols, axis=1)

In [None]:
# Split the data once, then derive model inputs and targets from both parts.
df_train, df_test = load_data()
print("Set contains %i training samples and %i test samples." % (len(df_train), len(df_test)))

# Feature frames for the wide and the deep component
train_w, test_w = (get_input_data(part, "wide") for part in (df_train, df_test))
train_d, test_d = (get_input_data(part, "deep") for part in (df_train, df_test))

# Targets as single-column frames
train_y, test_y = (pd.DataFrame({"rating": part.rating}) for part in (df_train, df_test))

# Display a few random training rows
df_train.sample(5)

## Model implementation

First, define the placeholders and variables needed for the model.

In [None]:
# Helper for choosing the spread used when initializing weights and biases
def compute_stddev(x):
    """Return sqrt(2 / k) cast to float, where k is the product of all but the first dimension of ``x``."""
    n_inputs = tf.reduce_prod(x.get_shape()[1:])
    spread = tf.sqrt(2 / n_inputs)
    return tf.to_float(spread)

In [None]:
# Number of feature columns of the wide and the deep input
_, n_w = train_w.shape
_, n_d = train_d.shape

# Graph inputs; the first dimension is left open
x_w = tf.placeholder("float", shape=[None, n_w], name="x_w")
x_d = tf.placeholder("float", shape=[None, n_d], name="x_d")
y = tf.placeholder("float", shape=[None, 1], name="y")
keep_prob = tf.placeholder("float", name="keep_prob")

# Trainable model parameters, collected in dicts so later cells can add to them
weights = dict(
    linear=tf.Variable(
        tf.truncated_normal(shape=[n_w, 1], stddev=compute_stddev(x_w), seed=seed_init),
        name="weights_linear",
    ),
)
biases = dict()

The Wide & Deep Model combines a linear model and a deep neural network.

**Task**: Implement the Wide & Deep Model.
* Create the linear component of the model by multiplying the input *x_w* with the weights for the linear layer, which we already defined for you.


* Create one hidden layer in the deep component
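    * Create weights with shape [*n_d, n_hidden*] and biases with shape [*n_hidden*]. Store the weights as *weights["h1"]*, because the L2 regularizer defined further below looks them up under this key.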
    * Both the weights and the biases are initialized with a truncated normal distribution which uses *seed_init* as seed and a standard deviation depending on *x_d*. (Hint: *compute_stddev(x)* above)
    * Multiply the input *x_d* and the weights. Add the biases to the matrix product.
    * Apply dropout (using *keep_prob*) and then the ReLU activation function.
    
    
* Create the output layer of the deep component
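    * Create weights with shape [*n_hidden*, 1] which are initialized like the weights above. The standard deviation depends on the activations of the hidden layer. Store them as *weights["deep"]*, because the L2 regularizer defined further below looks them up under this key.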
    * Multiply the hidden layer and the new weights.

In [None]:
def build_model(keep_prob):
    # Exercise template: assembles the prediction tensor of the Wide & Deep model.
    # `keep_prob` is the keep probability intended for the dropout step of the
    # hidden layer. The assignments to `linear` and `deep` are deliberately left
    # blank for the reader, so this cell does not run until they are filled in.
    ## YOUR CODE HERE
    # Linear layer with linear activation
    linear = 
    
    # Hidden layer with relu activation
    
    # Deep output layer
    deep =

    ##
    # Output layer with linear activation
    # The output bias is registered in the module-level `biases` dict, and the
    # final sum is named "prediction" so it can be fetched from the graph by name.
    biases["out"] = tf.Variable(tf.truncated_normal([1], seed=seed_init, 
                                                    stddev=compute_stddev(deep)), name="biases_out")
    out_layer = tf.add(tf.add(linear, deep), biases['out'], name="prediction")

    return out_layer

We optimize the model using the Mean Squared Error with L2-regularization over the weights as loss function.

In [None]:
def create_optimizer(y_hat):
    """Build the training op for the predictions ``y_hat``.

    The loss is the mean squared error against the placeholder ``y`` plus
    ``beta`` times the summed L2 losses of the "linear", "h1" and "deep"
    entries of ``weights``.

    Returns:
        A tuple ``(optimizer, mse, loss)``: the Adam minimization op, the
        unregularized MSE tensor and the regularized loss tensor.
    """
    step_counter = tf.Variable(0)

    # Mean squared error between targets and predictions
    squared_error = tf.square(tf.subtract(y, y_hat))
    mse = tf.reduce_mean(squared_error)

    # L2 penalty summed over the three weight matrices
    l2_linear, l2_hidden, l2_deep = (
        tf.nn.l2_loss(weights[key]) for key in ("linear", "h1", "deep")
    )
    regularizer = l2_linear + l2_hidden + l2_deep
    total_loss = mse + beta * regularizer

    # Staircase schedule: the rate is multiplied by 0.9 per 10 counted steps.
    # NOTE(review): step_counter is not passed to minimize() and nothing in this
    # function increments it, so the rate presumably stays at learning_rate —
    # confirm whether decay is actually wanted before wiring it up.
    decayed_rate = tf.train.exponential_decay(learning_rate, step_counter, 10, 0.9, staircase=True)
    train_op = tf.train.AdamOptimizer(learning_rate=decayed_rate).minimize(total_loss)
    return train_op, mse, total_loss

In [None]:
def create_metrics(y_hat):
    """Return the ``(mae, rmse)`` tensors comparing ``y_hat`` with the placeholder ``y``."""
    residual = tf.subtract(y, y_hat)

    # Mean Absolute Error: mean of |y - y_hat|
    mae = tf.reduce_mean(tf.abs(residual))

    # Root Mean Squared Error: square root of the mean of (y - y_hat)^2
    mean_squared = tf.reduce_mean(tf.square(residual))
    rmse = tf.sqrt(mean_squared)
    return mae, rmse

## Train and evaluate the model

In [None]:
# Single session shared by the training, evaluation and recommendation cells below.
sess = tf.InteractiveSession()

In [None]:
def train_and_evaluate(kp=1.0):
    """Build the model, train it on random mini-batches and print the errors.

    Args:
        kp: Keep probability. It is passed to ``build_model`` and fed to the
            ``keep_prob`` placeholder on every training step.

    Side effects:
        Adds model, optimizer and metric ops to the graph, runs the global
        variable initializer in ``sess``, prints the training metrics after
        every fifth epoch and the test metrics once training has finished.
    """
    # Set up the model, optimizer and metrics
    # NOTE(review): build_model receives kp as a plain number, so dropout is
    # presumably applied with the same rate during evaluation — confirm.
    y_hat = build_model(kp)
    optimizer, mse, loss = create_optimizer(y_hat)
    mae, rmse = create_metrics(y_hat)

    # IMPORTANT: initialize variables before computing anything else
    sess.run(tf.global_variables_initializer())

    # The number of batches per epoch never changes, so compute it once.
    total_batch = df_train.shape[0] // batch_size
    train_feed = {x_w: train_w, x_d: train_d, y: train_y}
    test_feed = {x_w: test_w, x_d: test_d, y: test_y}

    # Training loop
    for epoch in range(training_epochs):
        for _ in range(total_batch):
            # Draw a random batch; the deep features and targets are aligned
            # with the wide features through the shared row index.
            batch_x_w = train_w.sample(batch_size)
            batch_x_d = train_d.loc[batch_x_w.index]
            batch_y = pd.DataFrame({"rating": df_train.rating.loc[batch_x_w.index]})

            # Run one optimization step (backprop)
            sess.run(optimizer, feed_dict={x_w: batch_x_w, x_d: batch_x_d, y: batch_y, keep_prob: kp})

        # Report the metrics on the full training set after every fifth epoch
        if epoch % 5 == 4:
            error, cost, mean_abs, root_mean = sess.run([mse, loss, mae, rmse], feed_dict=train_feed)
            print("Epoch %i\tLoss (regularized error): %.3f\tMSE: %.3f\tRMSE: %.3f\tMAE: %.3f"
                  % ((epoch + 1), cost, error, root_mean, mean_abs))

    # Evaluate all test metrics in a single pass over the test set
    test_mse, test_rmse, test_mae = sess.run([mse, rmse, mae], feed_dict=test_feed)
    print('MSE Test:', test_mse)
    print('RMSE Test:', test_rmse)
    print('MAE Test:', test_mae)


In [None]:
# Train and evaluate with a keep probability of 1.0.
train_and_evaluate(kp=1.0)

## Get recommendations

Now that we've trained our model, we can use it to recommend movies to users.

For this, we need the nodes in the session graph which take the input and compute the estimates for the user's ratings.

In [None]:
# Look up the prediction and input tensors by name in the session graph
prediction, x_w, x_d = (
    sess.graph.get_tensor_by_name(tensor_name)
    for tensor_name in ("prediction:0", "x_w:0", "x_d:0")
)

Compute the first *n* recommendations for a user:

In [None]:
def get_n_recommendations(user_id, n):
    """Return the ``n`` movies with the highest predicted rating for a user.

    Args:
        user_id: ID of the user, passed on to ``rh.get_user_data``.
        n: Number of rows to return.

    Returns:
        DataFrame with the columns ``title``, ``movieId`` and ``rating``,
        sorted by descending predicted rating and cut off after ``n`` rows.
    """
    input_w, input_d = rh.get_user_data(user_id, "wide_n_deep")

    # Work on a copy of the column subset: assigning a new column to a slice
    # triggers pandas' SettingWithCopyWarning and may or may not write through
    # to rh.movies.
    recommendations = rh.movies[["title", "movieId"]].copy()

    # Run the module-level `prediction` tensor on the user's wide and deep inputs
    recommendations["rating"] = sess.run(prediction, feed_dict={x_w: input_w, x_d: input_d})
    return recommendations.sort_values("rating", ascending=False).head(n)

In [None]:
# Show the 25 best-rated predictions for the user with ID 1.
get_n_recommendations(1, 25)

**Optional tasks**:
* Experiment with the model parameters. Try, e.g., different learning rates or hidden layer sizes.
* Enable L2 regularization by setting *beta* to a value greater than 0 (it is switched off by the default *beta = 0.0*), or vary the dropout rate.
* Use several hidden layers. We implemented the function *create_relu_layer(n_this, n_previous, x, keep_prob, name)*, shown below, for faster creation of layers with ReLU activation.
* Try different optimization algorithms or activation functions (see [TensorFlow Activation Functions](https://www.tensorflow.org/api_guides/python/nn#Activation_Functions)).
* Compare the recommendations for different users.

In [None]:
# Example usage: hidden = rh.create_relu_layer(n_hidden, n_d, x_d, keep_prob, "h1")
def create_relu_layer(n_this, n_previous, x, keep_prob, name):
    """Create a dense layer on ``x`` with dropout followed by ReLU.

    The layer's variables are registered in the module-level ``weights`` and
    ``biases`` dicts under ``name``; the activated layer tensor is returned.
    """
    spread = compute_stddev(x)

    # Parameters, both drawn from a truncated normal with the same seed and spread
    layer_weights = tf.Variable(
        tf.truncated_normal([n_previous, n_this], seed=seed_init, stddev=spread),
        name="weights_" + name,
    )
    weights[name] = layer_weights
    layer_biases = tf.Variable(
        tf.truncated_normal([n_this], seed=seed_init, stddev=spread),
        name="biases_" + name,
    )
    biases[name] = layer_biases

    # Affine transform, then dropout, then the ReLU non-linearity
    pre_activation = tf.matmul(x, layer_weights) + layer_biases
    dropped = tf.nn.dropout(pre_activation, keep_prob)
    return tf.nn.relu(dropped)