In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

from recsys_utils import *

## Load curated dataset with 4778 movies and ratings for 443 users

In [2]:
# Movie catalogue (mean rating, number of ratings, title);
# the first CSV column is used as the row index.
df = pd.read_csv(
    "./data/small_movie_list.csv",
    index_col=[0],
)
df

Unnamed: 0,mean rating,number of ratings,title
0,3.400000,5,"Yards, The (2000)"
1,3.250000,6,Next Friday (2000)
2,2.000000,4,Supernova (2000)
3,2.000000,4,Down to You (2000)
4,2.672414,29,Scream 3 (2000)
...,...,...,...
4773,3.500000,1,Jon Stewart Has Left the Building (2015)
4774,4.000000,1,Black Butler: Book of the Atlantic (2017)
4775,3.500000,1,No Game No Life: Zero (2017)
4776,3.500000,1,Flint (2017)


In [3]:
# Pre-computed parameters and the ratings data for the small dataset.
X, W, b, num_movies, num_features, num_users = load_precalc_params_small()
Y, R = load_ratings_small()

# Report the array shapes and dataset dimensions, one entry per line.
print(f"Y {Y.shape} R {R.shape}")
print(
    f"X {X.shape}",
    f"W {W.shape}",
    f"b {b.shape}",
    f"num_features {num_features}",
    f"num_movies {num_movies}",
    f"num_users {num_users}",
    sep="\n",
)

Y (4778, 443) R (4778, 443)
X (4778, 10)
W (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


## Define cost function for collaborative filtering

In [26]:
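# Loop-based version of the collaborative filtering cost, left commented out;
# cofi_cost_func_v below is the vectorized form that is actually used for training.
# def cofi_cost_func(X, W, b, Y, R, lambda_):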
#     """
#     Returns the cost for the collaborative filtering
#     Args:
#       X (ndarray (num_movies,num_features)): matrix of item features
#       W (ndarray (num_users,num_features)) : matrix of user parameters
#       b (ndarray (1, num_users)            : vector of user parameters
#       Y (ndarray (num_movies,num_users)    : matrix of user ratings of movies
#       R (ndarray (num_movies,num_users)    : matrix, where R(i, j) = 1 if the i-th movies was rated by the j-th user
#       lambda_ (float): regularization parameter
#     Returns:
#       J (float) : Cost
#     """
#     nm, nu = Y.shape
#     J = 0
    
#     for i in range(nm):
#         for j in range(nu):
        
#             if R[i,j] == 1:
#                 J += np.squeeze((np.dot(W[j,:],X[i,:]) + b[:,j]  - Y[i,j])**2)
            
#     J += lambda_ * sum(W.flatten()**2) + lambda_ * sum(X.flatten()**2)
     
#     J = J/2                            
    
#     return J

In [27]:
def cofi_cost_func_v(X, W, b, Y, R, lambda_):
    """
    Regularized collaborative filtering cost, computed with whole-matrix
    TensorFlow operations so it can be differentiated inside a GradientTape.

    Args:
      X (ndarray (num_movies,num_features)): matrix of item features
      W (ndarray (num_users,num_features)) : matrix of user parameters
      b (ndarray (1, num_users))           : vector of user parameters
      Y (ndarray (num_movies,num_users))   : matrix of user ratings of movies
      R (ndarray (num_movies,num_users))   : matrix, where R(i, j) = 1 if the i-th movie was rated by the j-th user
      lambda_ (float): regularization parameter
    Returns:
      J : scalar cost, i.e. half the sum of squared errors over rated entries
          plus lambda_/2 times the sum of squares of X and W
    """
    # Prediction error for every (movie, user) pair; multiplying by R zeroes
    # the entries that have no rating so they do not contribute to the cost.
    residual = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y) * R

    fit_term = 0.5 * tf.reduce_sum(residual ** 2)
    reg_term = (lambda_ / 2) * (tf.reduce_sum(X ** 2) + tf.reduce_sum(W ** 2))
    return fit_term + reg_term

## Initialize my ratings

In [16]:
movieList, movieList_df = load_Movie_List_pd()

# One slot per movie; a value of 0 means the movie has not been rated.
my_ratings = np.zeros(num_movies)

# Movie ids are the row ids found in small_movie_list.csv.
my_ratings[4758] = 4   # Mission: Impossible - Fallout (2018)
my_ratings[929]  = 5   # Lord of the Rings: The Return of the King
my_ratings[2716] = 5   # Inception
my_ratings[2162] = 5   # Kung Fu Panda (2008)
my_ratings[2257] = 3   # Body of Lies (2008)
my_ratings[366]  = 5   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my_ratings[988]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
my_ratings[3618] = 5   # Interstellar (2014)
my_ratings[261]  = 1   # Fast and the Furious, The (2001)
my_ratings[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
my_ratings[3084] = 5   # Bourne Legacy, The (2012)
my_ratings[3102] = 3   # Revenant, The (2009)
my_ratings[3110] = 5   # Einstein and Eddington (2008)
my_ratings[446]  = 2   # Resident Evil (2002)
my_ratings[1774] = 4   # Déjà Vu (Deja Vu) (2006)
my_ratings[3904] = 1   # Avengers: Age of Ultron (2015)

# Indices of the movies that received a rating, in ascending order.
my_rated = [idx for idx, rating in enumerate(my_ratings) if rating > 0]

print('\nNew user ratings:\n')
for i in my_rated:
    print(f'Rated {my_ratings[i]} for  {movieList_df.loc[i,"title"]}')
        


New user ratings:

Rated 1.0 for  Fast and the Furious, The (2001)
Rated 5.0 for  Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for  Resident Evil (2002)
Rated 5.0 for  Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 5.0 for  Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for  Eternal Sunshine of the Spotless Mind (2004)
Rated 4.0 for  Déjà Vu (Deja Vu) (2006)
Rated 5.0 for  Kung Fu Panda (2008)
Rated 3.0 for  Body of Lies (2008)
Rated 5.0 for  Inception (2010)
Rated 5.0 for  Bourne Legacy, The (2012)
Rated 3.0 for  Revenant, The (2009)
Rated 5.0 for  Einstein and Eddington (2008)
Rated 5.0 for  Interstellar (2014)
Rated 1.0 for  Avengers: Age of Ultron (2015)
Rated 4.0 for  Mission: Impossible - Fallout (2018)


## Add my ratings to the dataset and start training using TensorFlow's GradientTape

In [17]:
# Start again from the stored ratings so that re-running this cell
# does not prepend my column a second time.
Y, R = load_ratings_small()

# My ratings become column 0, ahead of the existing users;
# the matching R column is 1 wherever I gave a rating.
Y = np.column_stack((my_ratings, Y))
R = np.column_stack(((my_ratings != 0).astype(int), R))

# Normalize the ratings; Ymean is added back when predictions are made.
Ynorm, Ymean = normalizeRatings(Y, R)

In [18]:
# Dimensions are re-read from Y because a column for the new user was prepended,
# so num_users is one larger than in the originally loaded data.
num_movies, num_users = Y.shape
# Number of latent features learned per movie and per user.
num_features = 100

# Initial parameters W, X and b are created as tf.Variable so that GradientTape
# tracks them and the optimizer can update them in place.
# NOTE(review): the seed is set once and W, X, b are then drawn in this order;
# presumably reordering the draws would change the initial values — confirm before rearranging.
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

# Adam optimizer with a learning rate of 0.1, used by the training loop.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [19]:
iterations = 200
lambda_ = 1
# The loop variable is named `step` rather than `iter` so that the built-in
# iter() is not shadowed in the notebook namespace for later cells.
for step in range(iterations):
    # Record the operations of the forward pass (contained in the cost
    # function) so the tape can differentiate the cost afterwards.
    with tf.GradientTape() as tape:
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Gradients of the cost with respect to the trainable variables.
    grads = tape.gradient(cost_value, [X, W, b])

    # Apply one optimizer update to X, W and b using those gradients.
    optimizer.apply_gradients(zip(grads, [X, W, b]))

    # Log the loss every 20th step.
    if step % 20 == 0:
        print(f"Training loss at iteration {step}: {cost_value:0.1f}")

Training loss at iteration 0: 2321264.1
Training loss at iteration 20: 136175.6
Training loss at iteration 40: 51861.6
Training loss at iteration 60: 24598.8
Training loss at iteration 80: 13630.8
Training loss at iteration 100: 8488.1
Training loss at iteration 120: 5808.3
Training loss at iteration 140: 4312.4
Training loss at iteration 160: 3436.1
Training loss at iteration 180: 2903.0


## Make predictions and compare them with my ratings

In [21]:
# Predicted (normalized) ratings for every movie/user pair from the trained parameters.
p = X.numpy() @ W.numpy().T + b.numpy()

# Restore the mean that was removed by normalizeRatings.
pm = p + Ymean

# Column 0 belongs to the user that was prepended to the ratings matrix (my ratings).
my_predictions = pm[:,0]

# Movie indices ordered from highest to lowest predicted rating.
ix = tf.argsort(my_predictions, direction='DESCENDING')

# Compare the ratings I supplied with the model's predictions for the same movies.
print('\n\nOriginal vs Predicted ratings:\n')
for i, rating in enumerate(my_ratings):
    if rating > 0:
        print(f'Original {rating}, Predicted {my_predictions[i]:0.2f} for {movieList[i]}')



Original vs Predicted ratings:

Original 1.0, Predicted 1.25 for Fast and the Furious, The (2001)
Original 5.0, Predicted 4.93 for Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Original 2.0, Predicted 2.16 for Resident Evil (2002)
Original 5.0, Predicted 4.91 for Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Original 5.0, Predicted 4.86 for Lord of the Rings: The Return of the King, The (2003)
Original 3.0, Predicted 3.07 for Eternal Sunshine of the Spotless Mind (2004)
Original 4.0, Predicted 3.79 for Déjà Vu (Deja Vu) (2006)
Original 5.0, Predicted 4.88 for Kung Fu Panda (2008)
Original 3.0, Predicted 3.15 for Body of Lies (2008)
Original 5.0, Predicted 4.95 for Inception (2010)
Original 5.0, Predicted 4.61 for Bourne Legacy, The (2012)
Original 3.0, Predicted 3.21 for Revenant, The (2009)
Original 5.0, Predicted 4.82 for Einstein and Eddington (2008)
Original 5.0, Predicted 4.97 for Interstellar (2014)
Original 1.0, 

In [22]:
# Load the movie list again under separate names (trailing underscore), leaving
# the existing movieList / movieList_df untouched, and display the DataFrame.
movieList_, movieList_df_ = load_Movie_List_pd()

movieList_df_

Unnamed: 0,mean rating,number of ratings,title
0,3.400000,5,"Yards, The (2000)"
1,3.250000,6,Next Friday (2000)
2,2.000000,4,Supernova (2000)
3,2.000000,4,Down to You (2000)
4,2.672414,29,Scream 3 (2000)
...,...,...,...
4773,3.500000,1,Jon Stewart Has Left the Building (2015)
4774,4.000000,1,Black Butler: Book of the Atlantic (2017)
4775,3.500000,1,No Game No Life: Zero (2017)
4776,3.500000,1,Flint (2017)


## Recommend movies to me based on my ratings

In [25]:
# Boolean mask of movies with more than 20 ratings. Named `enough_ratings`
# rather than `filter` so the built-in filter() is not shadowed for later cells.
enough_ratings = (movieList_df["number of ratings"] > 20)

# Attach my predicted rating to every movie and put it in the first column.
movieList_df["pred"] = my_predictions
movieList_df = movieList_df.reindex(columns=["pred", "mean rating", "number of ratings", "title"])

# From the 500 movies with the highest predicted rating, keep those with
# enough ratings and list them from highest to lowest prediction.
movieList_df.loc[ix[:500]].loc[enough_ratings].sort_values("pred", ascending=False)


Unnamed: 0,pred,mean rating,number of ratings,title
3618,4.973061,3.993151,73,Interstellar (2014)
2420,4.958126,4.004762,105,Up (2009)
2716,4.946407,4.066434,143,Inception (2010)
366,4.934438,3.761682,107,Harry Potter and the Sorcerer's Stone (a.k.a. ...
793,4.91475,3.778523,149,Pirates of the Caribbean: The Curse of the Bla...
2162,4.875541,3.444444,54,Kung Fu Panda (2008)
174,4.875221,3.9,70,Traffic (2000)
929,4.863057,4.118919,185,"Lord of the Rings: The Return of the King, The..."
151,4.856411,3.836364,110,"Crouching Tiger, Hidden Dragon (Wo hu cang lon..."
397,4.686115,3.81,50,Black Hawk Down (2001)


### I realize I have watched some of the movies that the recommender system predicted and I liked them all! I guess I would like the rest!