# 2-推荐系统

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.io as sio
import matplotlib.pyplot as plt

In [2]:
# Global seaborn plot styling for this notebook: 'notebook' context,
# 'white' style and the 'RdBu' colour palette.
sns.set(context = 'notebook', style = 'white', palette = 'RdBu')

# Load data and set up

In [4]:
# Notes: X - num_movies (1682) x num_features (10) matrix of movie features
#        Theta - num_users (943) x num_features (10) matrix of user features
#        Y - num_movies x num_users matrix of user ratings of movies
#        R - num_movies x num_users matrix, where R(i, j) = 1 if the
#            i-th movie was rated by the j-th user

In [5]:
# Load the ratings data (Y) and the "has rating" indicator matrix (R).
# The loadmat result is indexed directly: a missing variable then raises
# KeyError right here, instead of `.get` silently returning None and the
# failure surfacing one line later as an AttributeError on `.shape`.
movies_mat = sio.loadmat('./data/ex8_movies.mat')
Y, R = movies_mat['Y'], movies_mat['R']

Y.shape, R.shape

((1682, 943), (1682, 943))

In [16]:
# Problem dimensions, read off the ratings matrix Y (movies x users).
m, u = Y.shape
# m: number of movies (rows of Y)
# u: number of users (columns of Y)

n = 10 # number of features learned per movie

In [17]:
# Load the parameter matrices that ship with the exercise data.
# Direct indexing (rather than `.get`) makes a missing variable fail
# immediately with KeyError instead of propagating None into `.shape`.
param_mat = sio.loadmat('./data/ex8_movieParams.mat')
theta, X = param_mat['Theta'], param_mat['X']

theta.shape, X.shape

((943, 10), (1682, 10))

# cost

$$ J(x^{(1)}, ..., x^{(n_m)}, \theta^{(1)}, ..., \theta^{(n_u)}) = 
\frac{1}{2}\sum\limits_{(i,j):r(i,j) = 1}((\theta^{(j)})^Tx^{(i)} - y^{(i,j)})^2 $$

In [19]:
def serialize(X, theta):
    '''
    Flatten two matrices into one 1-D parameter vector.

    Args:
        X: movie-feature matrix, (movie, feature), e.g. (1682, 10)
        theta: user-preference matrix, (user, feature), e.g. (943, 10)

    Returns:
        1-D array: the ravel()-ed entries of X followed by those of theta.
    '''
    flat_X = X.ravel()
    flat_theta = theta.ravel()

    # X first, theta second -- deserialize() relies on this order.
    return np.concatenate([flat_X, flat_theta])

def deserialize(param, n_movie, n_user, n_features):
    '''
    Split a serialized parameter vector back into its two matrices.

    Args:
        param: 1-D array laid out as produced by serialize() (X, then theta)
        n_movie: number of rows of X
        n_user: number of rows of theta
        n_features: number of columns of both X and theta

    Returns:
        (X, theta) with shapes (n_movie, n_features) and
        (n_user, n_features), e.g. X(1682, 10), theta(943, 10)
    '''
    # The first n_movie * n_features entries belong to X, the rest to theta.
    boundary = n_movie * n_features

    X = param[:boundary].reshape(n_movie, n_features)
    theta = param[boundary:].reshape(n_user, n_features)

    return X, theta

# recommendation fn
def cost(param, Y, R, n_features):
    """
    Unregularized collaborative-filtering cost, summed over every
    (movie, user) pair with r(i, j) = 1.

    Args:
        param: serialized X, theta
        Y (movie, user), (1682, 943): (movie, user) rating
        R (movie, user), (1682, 943): (movie, user) has rating
        n_features: number of columns of X and theta

    Returns:
        Half the sum of squared prediction errors over the rated entries.
    """
    # X (movie, feature), (1682, 10): movie features
    # theta (user, feature), (943, 10): user preference
    movie_count, user_count = Y.shape
    X, theta = deserialize(param, movie_count, user_count, n_features)

    predicted = X @ theta.T  # (movie, user) predicted ratings

    # Multiplying element-wise by R zeroes the error wherever no rating exists.
    masked_error = np.multiply(predicted - Y, R)
    squared_error = np.power(masked_error, 2)

    return squared_error.sum() / 2


def gradient(param, Y, R, n_features):
    """
    Gradient of the unregularized cost with respect to X and theta.

    Args:
        param: serialized X, theta
        Y (movie, user): ratings
        R (movie, user): 1 where a rating exists, 0 otherwise
        n_features: number of columns of X and theta

    Returns:
        1-D array: serialized (X_grad, theta_grad), same layout as param.
    """
    # X (movie, feature), (1682, 10): movie features
    # theta (user, feature), (943, 10): user preference
    movie_count, user_count = Y.shape
    X, theta = deserialize(param, movie_count, user_count, n_features)

    # Prediction error, zeroed on unrated entries: (1682, 943)
    masked_error = np.multiply(X @ theta.T - Y, R)

    X_grad = masked_error @ theta        # (1682, 10)
    theta_grad = masked_error.T @ X      # (943, 10)

    # Pack both gradients into one vector, in the same order as param.
    return serialize(X_grad, theta_grad)


def regularized_cost(param, Y, R, n_features, l=1):
    """
    Collaborative-filtering cost plus an L2 penalty on all parameters.

    Args:
        param: serialized X, theta
        Y (movie, user): ratings
        R (movie, user): 1 where a rating exists, 0 otherwise
        n_features: number of columns of X and theta
        l: regularization strength (lambda)

    Returns:
        cost(...) + (l / 2) * sum of squares of every entry of param.
    """
    squared_norm = np.power(param, 2).sum()
    penalty = squared_norm * (l / 2)

    return cost(param, Y, R, n_features) + penalty


def regularized_gradient(param, Y, R, n_features, l=1):
    """
    Gradient of regularized_cost with respect to the serialized parameters.

    Args:
        param: serialized X, theta
        Y (movie, user): ratings
        R (movie, user): 1 where a rating exists, 0 otherwise
        n_features: number of columns of X and theta
        l: regularization strength (lambda)

    Returns:
        1-D array: gradient(...) plus l * param (derivative of the L2 penalty).
    """
    data_grad = gradient(param, Y, R, n_features)
    penalty_grad = l * param

    return data_grad + penalty_grad

In [20]:
# use subset of data to calculate the cost as in pdf...
# Evaluate the cost on a small slice of the data (4 users, 5 movies,
# 3 features), as the exercise PDF does.
users, movies, features = 4, 5, 3

X_sub, theta_sub = X[:movies, :features], theta[:users, :features]
Y_sub, R_sub = Y[:movies, :users], R[:movies, :users]

param_sub = serialize(X_sub, theta_sub)
cost(param_sub, Y_sub, R_sub, features)

22.224603725685675

In [21]:
# Serialize the full parameter set once; `param` is reused by the
# gradient and regularized-cost cells further down.
param = serialize(X, theta)  # total real params

# Reuse `param` instead of serializing a second time, and take the feature
# count from the notebook-wide `n` rather than repeating the literal 10.
cost(param, Y, R, n)  # this is real total cost

27918.64012454421

# gradient

$$ \frac{\partial{J}}{\partial{x_k^{(i)}}} = \sum\limits_{j:r(i,j) = 1}((\theta^{(j)})^Tx^{(i)} - y^{(i, j)})\theta^{(j)}_k $$

$$ \frac{\partial{J}}{\partial{\theta_k^{(j)}}} = \sum\limits_{i:r(i,j) = 1}((\theta^{(j)})^Tx^{(i)} - y^{(i, j)})x^{(i)}_k $$

In [22]:
n_movie, n_user = Y.shape

# Unregularized gradient at the loaded parameters, split back into its
# movie-feature and user-preference parts. The feature count comes from
# the notebook-wide `n` instead of a repeated literal 10.
X_grad, theta_grad = deserialize(gradient(param, Y, R, n),
                                 n_movie, n_user, n)

$$ X_{grad}(i,:) = (X(i, :)*\Theta^T_{temp} - Y_{temp}) * \Theta_{temp} $$

In [23]:
# Each gradient must have the same shape as the parameter it belongs to.
# The (actual, expected) pair is attached so a failure shows both shapes.
assert X_grad.shape == X.shape, (X_grad.shape, X.shape)
assert theta_grad.shape == theta.shape, (theta_grad.shape, theta.shape)

# regularized cost

In [24]:
# in the ex8_confi.m, lambda = 1.5, and it's using sub data set
# ex8_cofi.m uses lambda = 1.5 on the small sub data set; do the same here
regularized_cost(param_sub, Y_sub, R_sub, features, l = 1.5)

31.34405624427422

In [25]:
# Feature count taken from the notebook-wide `n` instead of a literal 10.
regularized_cost(param, Y, R, n, l=1)  # total regularized cost

32520.682450229557

# regularized gradient

$$ \frac{\partial{J}}{\partial{x_k^{(i)}}} = \sum\limits_{j:r(i,j) = 1}((\theta^{(j)})^Tx^{(i)} - y^{(i, j)})\theta^{(j)}_k + \lambda x_k^{(i)} $$

$$ \frac{\partial{J}}{\partial{\theta_k^{(j)}}} = \sum\limits_{i:r(i,j) = 1}((\theta^{(j)})^Tx^{(i)} - y^{(i, j)})x^{(i)}_k + \lambda \theta_k^{(j)}$$

In [26]:
n_movie, n_user = Y.shape

# Regularized gradient (default l=1) at the loaded parameters, split back
# into its movie-feature and user-preference parts. The feature count comes
# from the notebook-wide `n` instead of a repeated literal 10.
X_grad, theta_grad = deserialize(regularized_gradient(param, Y, R, n),
                                 n_movie, n_user, n)

# Each gradient must have the same shape as the parameter it belongs to.
assert X_grad.shape == X.shape
assert theta_grad.shape == theta.shape

# parse movie_ids.txt

In [27]:
# Read one movie title per line. Everything up to and including the first
# space is dropped (presumably the numeric movie id -- confirm against the
# file) and the remainder of the stripped line is kept as the title.
with open('./data/movie_ids.txt', encoding='latin-1') as f:
    movie_list = np.array([line.strip().partition(' ')[2] for line in f])

In [28]:
# My own ratings, one slot per movie; 0 means "not rated".
ratings = np.zeros(1682)

# Row 1: 0-based movie indices.  Row 2: the rating given to each of them.
ratings[[0, 6, 11, 53, 63, 65, 68, 97, 182, 225, 354]] = \
        [4, 3,  5,  4,  5,  3,  5,  2,   4,   5,   5]

# prepare data

In [29]:
# Start again from the matrices in the loaded .mat dict, so re-running this
# cell does not insert my column a second time. Direct indexing (rather than
# `.get`) makes a missing variable fail here with KeyError instead of
# passing None on to np.insert.
Y, R = movies_mat['Y'], movies_mat['R']

# NOTE(review): only Y gets the extra column here; R still has its original
# number of columns -- presumably a later cell extends R to match; confirm.
Y = np.insert(Y, 0, ratings, axis=1)  # now I become user 0
Y.shape

(1682, 944)