In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat

# Loading movie ratings dataset

In [3]:
movie_dataset = loadmat('data/ex8_movies.mat')
print(movie_dataset.keys())

Y = movie_dataset['Y']
R = movie_dataset['R']
print('Y shape: ', Y.shape)
print('R shape: ', R.shape)

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])
Y shape:  (1682, 943)
R shape:  (1682, 943)


# Collaborative Filtering Cost Function

In [4]:
params = loadmat('data/ex8_movieParams.mat')
params.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [5]:
X = params['X']
Theta = params['Theta']
num_users = params['num_users']
num_movies = params['num_movies']
num_features = params['num_features']
print(f'Number of users: {num_users}')
print(f'Number of movies: {num_movies}')
print(f'Number of features: {num_features}')

Number of users: [[943]]
Number of movies: [[1682]]
Number of features: [[10]]


In [6]:
num_users = 4
num_movies = 5
num_features = 3

In [7]:
X = X[0:num_movies, 0:num_features]
Theta = Theta[0:num_users, 0:num_features]
Y = Y[0:num_movies, 0:num_users]
R = R[0:num_movies, 0:num_users]

In [10]:
from utils import cofi_cost_function, cofi_gradient

In [11]:
# Evaluate cost function
Xs = np.reshape(X, (-1, 1), order='F')
Thetas = np.reshape(Theta, (-1, 1), order='C')
paras = np.vstack((Xs, Thetas))
J = cofi_cost_function(paras, Y, R, num_users, num_movies, num_features, 0)

In [12]:
J

22.224603725685675

In [13]:
J = cofi_cost_function(paras, Y, R, num_users, num_movies, num_features, 1.5)

In [14]:
J

31.34405624427422

In [15]:
with open('data/movie_ids.txt') as f:
    movies = f.read().strip()
movie_list = movies.split('\n')
movie_dict = {}
for movie in movie_list:
    mov = movie.split()
    idx = mov[0]
    movi = " ".join(mov[1:])
    movie_dict[int(idx)-1] = movi

In [16]:
movie_dict[0]

'Toy Story (1995)'

In [17]:
ratings = np.zeros((1682, 1))
ratings[0] = 4
ratings[97] = 2
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5

In [18]:
for i in range(0, len(ratings)):
    if ratings[i] > 0:
        print(f'Rated {ratings[i]} for {movie_dict[i]}')

Rated [4.] for Toy Story (1995)
Rated [3.] for Twelve Monkeys (1995)
Rated [5.] for Usual Suspects, The (1995)
Rated [4.] for Outbreak (1995)
Rated [5.] for Shawshank Redemption, The (1994)
Rated [3.] for While You Were Sleeping (1995)
Rated [5.] for Forrest Gump (1994)
Rated [2.] for Silence of the Lambs, The (1991)
Rated [4.] for Alien (1979)
Rated [5.] for Die Hard 2 (1990)
Rated [5.] for Sphere (1998)


# Learning Movie Ratings

In [19]:
movie_dataset = loadmat('data/ex8_movies.mat')
print(movie_dataset.keys())

Y = movie_dataset['Y']
R = movie_dataset['R']
print('Y shape: ', Y.shape)
print('R shape: ', R.shape)

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])
Y shape:  (1682, 943)
R shape:  (1682, 943)


In [20]:
Y = np.hstack((ratings, Y))
R = np.hstack((np.where(ratings != 0, 1, 0), R))

In [21]:
from utils import normalize_ratings

In [22]:
Ynorm, Ymean = normalize_ratings(Y, R)

In [23]:
num_movies, num_users = Y.shape
num_features = 10

In [24]:
# Set Initial Parameters (Theta, X)
X = np.random.randn(num_movies, num_features)
Theta = np.random.randn(num_users, num_features)
x = np.reshape(X, (-1, 1), order='F')
theta = np.reshape(Theta, (-1, 1), order='F')
initial_parameters = np.vstack((x, theta))

In [25]:
initial_parameters[num_movies*num_features:]

array([[ 1.40435258],
       [-0.65484419],
       [-1.1282191 ],
       ...,
       [-0.05789206],
       [ 0.04829022],
       [-0.07625768]])

In [28]:
from utils import train

In [29]:
lambd = 10
options = {'maxiter': 1000, 'disp': False}

In [30]:
theta = train(initial_parameters, Ynorm, R, num_users, num_movies, num_features, options, lambd)

In [31]:
X = np.reshape(theta[0:num_movies*num_features], (num_movies, num_features), order='F')
# theta = params[num_movies*num_features:, :]
theta = theta[num_movies*num_features:]
Theta = np.reshape(theta, (num_users, num_features))

# Recommendations

In [32]:
p = X.dot(Theta.T)
p.shape

(1682, 944)

In [33]:
predictions = p[:, 1].reshape(-1, 1) + Ymean

In [34]:
predictions.shape

(1682, 1)

In [35]:
p_sorted = np.sort(predictions, axis=0)
r = p_sorted[::-1].flatten().tolist()
p_idx = np.argsort(predictions, axis=0)
idx = p_idx[::-1].flatten().tolist()

In [36]:
for i in range(10):
    j = idx[i]
    print(f'Prediction rating {predictions[j][0]:.1f} for movie {movie_dict[j]}')

Prediction rating 10.6 for movie Bitter Sugar (Azucar Amargo) (1996)
Prediction rating 9.9 for movie Golden Earrings (1947)
Prediction rating 9.7 for movie I Don't Want to Talk About It (De eso no se habla) (1993)
Prediction rating 9.6 for movie Letter From Death Row, A (1998)
Prediction rating 9.3 for movie Visitors, The (Visiteurs, Les) (1993)
Prediction rating 9.3 for movie Pharaoh's Army (1995)
Prediction rating 9.2 for movie Substance of Fire, The (1996)
Prediction rating 9.1 for movie Safe (1995)
Prediction rating 9.0 for movie Fear, The (1995)
Prediction rating 9.0 for movie Big One, The (1997)


In [37]:
for i in range(0, len(ratings)):
    if ratings[i] > 0:
        print(f'Rated {ratings[i]} for {movie_dict[i]}')

Rated [4.] for Toy Story (1995)
Rated [3.] for Twelve Monkeys (1995)
Rated [5.] for Usual Suspects, The (1995)
Rated [4.] for Outbreak (1995)
Rated [5.] for Shawshank Redemption, The (1994)
Rated [3.] for While You Were Sleeping (1995)
Rated [5.] for Forrest Gump (1994)
Rated [2.] for Silence of the Lambs, The (1991)
Rated [4.] for Alien (1979)
Rated [5.] for Die Hard 2 (1990)
Rated [5.] for Sphere (1998)
