<a href="https://colab.research.google.com/github/harsh194/machine_learning/blob/main/Collaborative_Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import os
# Make the course folder the current working directory.
# NOTE(review): this is a hard-coded absolute path that looks like a Colab-mounted
# Google Drive location; os.chdir raises if the folder does not exist, so the
# notebook only runs where this exact path is available — TODO make configurable.
new_directory = '/content/drive/MyDrive/Harsh/Unsupervised Learning/Week 2/Collaborative Filtering'
os.chdir(new_directory)

In [10]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from recsys_utils import *

In [11]:
# Load the pre-computed parameters and the small ratings data set.
X, W, b, num_movies, num_features, num_users = load_precalc_params_small()
Y, R = load_ratings_small()

# Show the array shapes first, then the scalar dimensions.
print("Y", Y.shape, "R", R.shape)
for label, arr in (("X", X), ("W", W), ("b", b)):
    print(label, arr.shape)
for label, count in (
    ("num_features", num_features),
    ("num_movies", num_movies),
    ("num_users", num_users),
):
    print(label, count)

Y (4778, 443) R (4778, 443)
X (4778, 10)
W (443, 10)
b (1, 443)
num_features 10
num_movies 4778
num_users 443


In [12]:
# Row 0 of R, cast to bool, selects which entries of row 0 of Y are averaged.
movie_1_mask = R[0, :].astype(bool)
tsmean = np.mean(Y[0, movie_1_mask])
print(f"Average rating for movie 1 : {tsmean : 0.3f} / 5")

Average rating for movie 1 :  3.400 / 5


In [13]:
def cofi_cost_func(X, W, b, Y, R, lambda_):
    """
    Collaborative-filtering cost, computed with NumPy array operations.

    For every (i, j) in Y the prediction is ``dot(W[j], X[i]) + b[0, j]``; the
    squared error against ``Y[i, j]`` is weighted by ``R[i, j]`` and summed.
    ``lambda_`` times the sum of squares of all entries of W and X is added,
    and the total is halved.

    Args:
      X: 2-D array, one row per row of Y (extra rows only affect regularization).
      W: 2-D array, one row per column of Y (extra rows only affect regularization).
      b: 2-D array whose first row holds one offset per column of Y.
      Y: 2-D array of target values.
      R: 2-D array of weights with Y's shape (in this notebook, 0/1 indicators).
      lambda_: regularization strength.

    Returns:
      The scalar cost J.
    """
    X, W, b = np.asarray(X), np.asarray(W), np.asarray(b)
    Y, R = np.asarray(Y), np.asarray(R)
    nm, nu = Y.shape

    # Predictions for every (i, j) pair at once; slicing mirrors the index
    # ranges of the original element-by-element loop (i < nm, j < nu).
    predictions = X[:nm] @ W[:nu].T + b[0, :nu]
    squared_error = np.square(predictions - Y)

    # Entries where R is 0 contribute nothing to the data term.
    J = np.sum(R * squared_error)
    # Regularization covers all of W and X, as before.
    J += lambda_ * (np.sum(np.square(W)) + np.sum(np.square(X)))

    return J / 2


In [15]:
# Work on a small corner of the data so the loop-based cost runs quickly.
num_users_r, num_movies_r, num_features_r = 4, 5, 3

# Keep only the leading rows/columns of each array.
X_r = X[:num_movies_r, :num_features_r]
W_r = W[:num_users_r, :num_features_r]
b_r = np.reshape(b[0, :num_users_r], (1, -1))
Y_r = Y[:num_movies_r, :num_users_r]
R_r = R[:num_movies_r, :num_users_r]

# Cost without regularization (lambda_ = 0).
J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, lambda_=0)
print(f"Cost : {J:0.2f}")

Cost : 13.67


In [17]:
# Evaluate the cost function with regularization (lambda_ = 1.5)
J = cofi_cost_func(X_r, W_r, b_r, Y_r, R_r, lambda_=1.5)
print(f"Cost (with regularization): {J:0.2f}")

Cost (with regularization): 28.09


In [19]:
def cofi_cost_func_v(X,W,b,Y,R, lambda_):
    """
    Collaborative-filtering cost expressed with TensorFlow ops.

    Computes ``(X @ W^T + b - Y) * R``, takes half the sum of its squares, and
    adds ``lambda_/2`` times the sum of squares of X and W.

    In this notebook the arguments are called with X as (movies, features),
    W as (users, features), b as (1, users) and Y, R as (movies, users).

    Returns:
      The scalar cost.
    """
    # Prediction error for every entry, zeroed wherever R is 0.
    masked_error = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    data_term = 0.5*tf.reduce_sum(masked_error**2)
    reg_term = (lambda_/2)*(tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return data_term + reg_term

In [20]:
# Vectorized cost without regularization
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, lambda_=0)
print(f"Cost: {J:0.2f}")

# Vectorized cost with regularization (lambda_ = 1.5)
J = cofi_cost_func_v(X_r, W_r, b_r, Y_r, R_r, lambda_=1.5)
print(f"Cost (with regularization): {J:0.2f}")

Cost: 13.67
Cost (with regularization): 28.09


In [26]:
movieList, movieList_df = load_Movie_List_pd()
my_ratings = np.zeros(num_movies)

# Ratings for the selected movies, keyed by row index in the movie list.
selected_ratings = {
    2700: 5,   # Toy Story 3 (2010)
    2609: 2,   # Persuasion (2007)
    929:  5,   # Lord of the Rings: The Return of the King, The
    246:  5,   # Shrek (2001)
    2716: 3,   # Inception
    1150: 5,   # Incredibles, The (2004)
    382:  2,   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
    366:  5,   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
    622:  5,   # Harry Potter and the Chamber of Secrets (2002)
    988:  3,   # Eternal Sunshine of the Spotless Mind (2004)
    2925: 1,   # Louis Theroux: Law & Disorder (2008)
    2937: 1,   # Nothing to Declare (Rien à déclarer)
    793:  5,   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)
}
for movie_idx, rating in selected_ratings.items():
    my_ratings[movie_idx] = rating

# Indices of every movie that received a (non-zero) rating.
my_rated = [idx for idx, rating in enumerate(my_ratings) if rating > 0]

print('\n New user ratings:\n')
for i, rating in enumerate(my_ratings):
    if rating > 0:
        print(f'Rated {rating} for {movieList_df.loc[i, "title"]}')


 New user ratings:

Rated 5.0 for Shrek (2001)
Rated 5.0 for Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Rated 5.0 for Harry Potter and the Chamber of Secrets (2002)
Rated 5.0 for Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 5.0 for Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for Eternal Sunshine of the Spotless Mind (2004)
Rated 5.0 for Incredibles, The (2004)
Rated 2.0 for Persuasion (2007)
Rated 5.0 for Toy Story 3 (2010)
Rated 3.0 for Inception (2010)
Rated 1.0 for Louis Theroux: Law & Disorder (2008)
Rated 1.0 for Nothing to Declare (Rien à déclarer) (2010)


In [27]:
# Reload the ratings, then insert the new user as the first column of Y and R.
Y, R = load_ratings_small()
Y = np.column_stack((my_ratings, Y))
# The new user's R column is 1 where a rating was entered and 0 elsewhere.
R = np.column_stack(((my_ratings != 0).astype(int), R))

Ynorm, Ymean = normalizeRatings(Y, R)

In [28]:
# Dimensions now include the extra user column added to Y.
num_movies, num_users = Y.shape
num_features = 100

# Seed TensorFlow's RNG, then draw W, X and b in this order so the initial
# values are reproducible.
tf.random.set_seed(1234)
W = tf.Variable(
    tf.random.normal(shape=(num_users, num_features), dtype=tf.float64),
    name='W',
)
X = tf.Variable(
    tf.random.normal(shape=(num_movies, num_features), dtype=tf.float64),
    name='X',
)
b = tf.Variable(
    tf.random.normal(shape=(1, num_users), dtype=tf.float64),
    name='b',
)

optimizer = keras.optimizers.Adam(learning_rate=0.1)

In [None]:
# Custom training loop: minimise the vectorized cost with respect to X, W and b.
# NOTE(review): the saved output stops at iteration 100 and this cell has no
# execution count, so the run may not have completed — confirm that 100000
# iterations (one log line every 20 steps) is really intended.
iterations = 100000
lambda_ = 1
trainable = [X, W, b]

# `step` is used instead of `iter` so the built-in iter() is not shadowed for
# every cell that runs after this one.
for step in range(iterations):
    # Record the operations that produce the cost so it can be differentiated.
    with tf.GradientTape() as tape:
        cost_value = cofi_cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Gradients of the cost with respect to each trainable variable.
    grads = tape.gradient(cost_value, trainable)

    # One optimizer update, pairing each gradient with its variable.
    optimizer.apply_gradients(zip(grads, trainable))

    # Log periodically
    if step % 20 == 0:
        print(f"Training loss at iteration {step}: {cost_value:0.1f}")

Training loss at iteration 0: 2566.6
Training loss at iteration 20: 2348.7
Training loss at iteration 40: 2202.9
Training loss at iteration 60: 2102.4
Training loss at iteration 80: 2031.2
Training loss at iteration 100: 1979.4
