In [50]:
import pandas as pd
import numpy as np
import json

# Load the data

In [51]:
# Read the train and test CSV files from the data/ directory into DataFrames.
# df_train is the frame later passed to utility_matrix_AND_binary_matrix.
df_train = pd.read_csv("data/data_train.csv")
df_test = pd.read_csv("data/data_test.csv")

In [52]:
# Load the id lookup tables. Two mappings are read from this JSON below:
# "moviesIDs" and "userIDs"; only their values are used in this cell.
with open("data/data_ids.json") as f:
    ids = json.load(f)

# Largest mapped value on each axis. The "+ 1" used for sizing below presumes
# the mapped values are 0-based indices -- TODO confirm against the JSON file.
max_movieId = max(ids["moviesIDs"].values())
max_userId = max(ids["userIDs"].values())

# Zero-filled matrix with one row per movie index and one column per user index.
base_matrix = np.zeros((max_movieId + 1, max_userId + 1))
base_matrix.shape

(9633, 385)

In [53]:
# Utility matrix and Binary matrix
from scipy.sparse import lil_matrix

def utility_matrix_AND_binary_matrix(dfTrain, zerosMatrix = base_matrix, jsonIds = ids):
    """Build the sparse rating matrix Y and its indicator matrix R.

    Parameters
    ----------
    dfTrain : pandas.DataFrame
        Must provide the columns "movieId", "userId" and "rating".
    zerosMatrix : array-like, optional
        Only its ``.shape`` is read; both outputs are created with that shape.
    jsonIds : dict, optional
        Must contain the lookups "moviesIDs" and "userIDs". Each is keyed by
        the string form of the original integer id and yields the row
        (movie) or column (user) index to write to.

    Returns
    -------
    tuple of scipy.sparse.lil_matrix
        ``(Y, R)`` where ``Y[movie, user]`` holds the rating and
        ``R[movie, user]`` is set to 1 for every (movie, user) pair seen.
        If a pair occurs more than once, the last occurrence wins.

    Raises
    ------
    KeyError
        If a required column or lookup is missing, or an id in ``dfTrain``
        has no entry in the corresponding lookup.

    Notes
    -----
    The default arguments are evaluated when this ``def`` is executed, so
    ``base_matrix`` and ``ids`` must already exist at that point.
    """
    Y = lil_matrix(zerosMatrix.shape)  # sparse: most (movie, user) pairs stay empty
    R = lil_matrix(zerosMatrix.shape)

    movie_index = jsonIds["moviesIDs"]
    user_index = jsonIds["userIDs"]

    # Walk the three columns in lockstep rather than DataFrame.iterrows():
    # iterrows builds a Series per row (slow) and upcasts the row to a common
    # dtype, which is why the ids need the int() round-trip before str().
    triples = zip(dfTrain["movieId"], dfTrain["userId"], dfTrain["rating"])
    for raw_movie, raw_user, rating in triples:
        movie = movie_index[str(int(raw_movie))]
        user = user_index[str(int(raw_user))]
        Y[movie, user] = rating
        R[movie, user] = 1
    return Y, R

# Build the sparse rating matrix Y and indicator matrix R from the training
# frame, using the function's default shape source and id lookups.
Y, R = utility_matrix_AND_binary_matrix(df_train)

In [54]:
def cofiCostFunc(params, Y, R, num_users, num_movies, num_features, Lambda):
    """
    Returns the cost and gradient for the collaborative filtering objective.

    Parameters
    ----------
    params : 1-D numpy.ndarray
        Flattened X (``num_movies * num_features`` values) followed by
        flattened Theta (``num_users * num_features`` values).
    Y : array-like or scipy sparse matrix, shape (num_movies, num_users)
        Target values subtracted from the predictions ``X @ Theta.T``.
    R : array-like or scipy sparse matrix, same shape as Y
        Element-wise weight applied to the error; the notebook passes 1 for
        observed entries and 0 elsewhere.
    num_users, num_movies, num_features : int
        Dimensions used to unfold ``params``.
    Lambda : float
        Regularization parameter.

    Returns
    -------
    J : float
        Unregularized cost, ``1/2 * sum(err**2 * R)``.
    grad : numpy.ndarray
        Unregularized gradient, laid out like ``params``.
    reg_J : float
        ``J`` plus ``Lambda/2 * (sum(Theta**2) + sum(X**2))``.
    reg_grad : numpy.ndarray
        Regularized gradient, laid out like ``params``.

    Notes
    -----
    Sparse inputs are converted to dense arrays on every call; when calling
    this repeatedly (e.g. inside an optimizer), densify once beforehand.
    """
    # scipy sparse matrices (and np.matrix) overload `*` and `**` as matrix
    # product / matrix power, which would silently change the math below.
    # Work on plain ndarrays so every operation is element-wise.
    if hasattr(Y, "toarray"):
        Y = Y.toarray()
    if hasattr(R, "toarray"):
        R = R.toarray()
    Y = np.asarray(Y)
    R = np.asarray(R)

    # Unfold the params
    split = num_movies * num_features
    X = params[:split].reshape(num_movies, num_features)
    Theta = params[split:].reshape(num_users, num_features)

    err = X @ Theta.T - Y
    masked_err = err * R  # reused by both gradients
    J = 1/2 * np.sum((err**2) * R)

    # Compute regularized cost function
    reg_Theta = Lambda/2 * np.sum(Theta**2)
    reg_X = Lambda/2 * np.sum(X**2)
    reg_J = J + reg_Theta + reg_X

    # Compute gradient
    X_grad = masked_err @ Theta
    Theta_grad = masked_err.T @ X
    grad = np.append(X_grad.flatten(), Theta_grad.flatten())

    # Compute regularized gradient
    reg_X_grad = X_grad + Lambda*X
    reg_Theta_grad = Theta_grad + Lambda*Theta
    reg_grad = np.append(reg_X_grad.flatten(), reg_Theta_grad.flatten())

    return J, grad, reg_J, reg_grad

Os nossos dados têm 20 géneros diferentes, então vamos usar 20 features (visto no 02 data_vis).

In [55]:
# Matrix dimensions come from base_matrix (rows = movies, columns = users).
num_movies, num_users = base_matrix.shape
num_features = 20

# Seeded local generator: a fresh kernel + Run All now yields the same starting
# point every time, without touching NumPy's global random state.
init_rng = np.random.default_rng(42)

# Small random initial values for the movie factors X and user factors Theta.
X = init_rng.normal(loc=0, scale=0.3, size=(num_movies, num_features))
Theta = init_rng.normal(loc=0, scale=0.3, size=(num_users, num_features))

print(X.shape, Theta.shape)

# Flat parameter vector in the layout cofiCostFunc unfolds: X first, then Theta.
params = np.append(X.flatten(),Theta.flatten())

(9633, 20) (385, 20)


In [56]:
# Convert the sparse matrices to dense arrays before evaluating the cost:
# cofiCostFunc applies element-wise `*` and `**` to them.
# The hasattr guard keeps this cell idempotent -- on a second run Y and R are
# already ndarrays (no .toarray method) and the conversion is skipped instead
# of raising AttributeError.
if hasattr(Y, "toarray"):
    Y = Y.toarray()
if hasattr(R, "toarray"):
    R = R.toarray()
cofiCostFunc(params, Y, R, num_users, num_movies, num_features, 1)

(np.float64(502274.353732395),
 array([-34.53112541, -16.95835109,  -0.94750721, ..., -16.43014244,
        -73.16253632,  41.29122414], shape=(200360,)),
 np.float64(511291.081563786),
 array([-34.62630289, -16.85193177,  -0.90215495, ..., -16.18290818,
        -73.75762825,  41.47983194], shape=(200360,)))