In [1]:
import pandas as pd
import numpy as np
import json

# Load the data

In [2]:
# Read the training and test tables from the data directory in one statement.
df_train, df_test = (
    pd.read_csv("data/data_train.csv"),
    pd.read_csv("data/data_test.csv"),
)

In [3]:
# Load the ID lookup tables; the "moviesIDs" and "userIDs" entries are
# dictionaries keyed by the original ID written as a string.
with open("data/data_ids.json") as f:
    ids = json.load(f)

# Largest integer key of each table. The leading 0 keeps the previous floor:
# the result is never negative, and an empty table yields 0.
# NOTE(review): these bounds come from the dictionary *keys*, while
# utility_matrix_AND_binary_matrix indexes the matrices with the mapped
# *values* -- confirm that every mapped index fits inside this shape (and
# whether a tighter shape derived from the values was intended).
max_movieId = max([0, *map(int, ids["moviesIDs"])])
max_userId = max([0, *map(int, ids["userIDs"])])

# Shape-only placeholder for the (movies x users) matrix.
# A real np.zeros of this size asks for rows * cols * 8 bytes (about 145 GB
# for the 131263 x 138494 shape shown below) although only `.shape` is read.
# np.broadcast_to returns a read-only, zero-stride float64 view with the same
# shape that is backed by a single scalar, so it costs no memory.
base_matrix = np.broadcast_to(np.float64(0.0), (max_movieId + 1, max_userId + 1))
base_matrix.shape

(131263, 138494)

In [4]:
# Utility matrix and Binary matrix
from scipy.sparse import lil_matrix

def utility_matrix_AND_binary_matrix(dfTrain, zerosMatrix = None, jsonIds = None):
    """
    Build the sparse rating matrix Y and the sparse indicator matrix R.

    Parameters
    ----------
    dfTrain : DataFrame with "movieId", "userId" and "rating" columns.
    zerosMatrix : any object exposing ``.shape``; only the shape is used.
        When omitted, the notebook-level ``base_matrix`` is looked up at call
        time, so re-running the cell that defines it is picked up without
        having to re-run this definition.
    jsonIds : mapping with "moviesIDs" and "userIDs" dictionaries that
        translate ``str(original id)`` into a matrix index. When omitted, the
        notebook-level ``ids`` is looked up at call time.

    Returns
    -------
    (Y, R) : two ``lil_matrix`` objects of shape ``zerosMatrix.shape``.
        ``Y[movie, user]`` holds the rating and ``R[movie, user]`` is 1 for
        every row of ``dfTrain``. If a (movie, user) pair occurs more than
        once, the last row wins.

    Raises
    ------
    KeyError : if an ID of ``dfTrain`` is missing from ``jsonIds``.
    """
    if zerosMatrix is None:
        zerosMatrix = base_matrix
    if jsonIds is None:
        jsonIds = ids

    movie_index = jsonIds["moviesIDs"]
    user_index = jsonIds["userIDs"]

    Y = lil_matrix(zerosMatrix.shape)  # LIL supports cheap item assignment
    R = lil_matrix(zerosMatrix.shape)

    # Walk the three columns directly instead of DataFrame.iterrows():
    # iterrows() builds one Series per row and upcasts mixed int/float rows
    # to float64, which is slow and can corrupt very large integer IDs.
    triples = zip(dfTrain["movieId"], dfTrain["userId"], dfTrain["rating"])
    for original_movieId, original_userId, rating in triples:
        movieId = movie_index[str(int(original_movieId))]
        userId = user_index[str(int(original_userId))]
        Y[movieId, userId] = rating
        R[movieId, userId] = 1

    return Y, R

# Build the rating matrix Y and the observed-entry indicator R from the training ratings.
Y, R = utility_matrix_AND_binary_matrix(df_train)

In [5]:
# Test: converting the sparse matrices to dense arrays (disabled)

#dense_arrayY = Y.toarray()
#dense_arrayR = R.toarray()

#dense_arrayR

In [6]:
def cofiCostFunc(params, Y, R, num_users, num_movies, num_features, Lambda):
    """
    Collaborative-filtering cost and gradient, plain and regularized.

    params is a flat vector: the first num_movies*num_features values are the
    movie matrix X, the remaining values are the user matrix Theta.
    Y holds the ratings, R weights each entry (zero entries are ignored) and
    Lambda is the regularization strength.

    Returns the tuple (J, grad, reg_J, reg_grad).
    """
    # Cut the flat vector at the X / Theta boundary and restore both shapes.
    boundary = num_movies * num_features
    X = params[:boundary].reshape(num_movies, num_features)
    Theta = params[boundary:].reshape(num_users, num_features)

    # Prediction error for every (movie, user) pair.
    err = X @ Theta.T - Y

    # Unregularized cost: half of the R-weighted squared error.
    J = np.sum(np.square(err) * R) / 2

    # Squared-norm penalties, added in the same order as before (Theta, then X).
    theta_penalty = Lambda / 2 * np.sum(np.square(Theta))
    x_penalty = Lambda / 2 * np.sum(np.square(X))
    reg_J = J + theta_penalty + x_penalty

    # Both gradients share the R-weighted error, so compute it once.
    weighted_err = err * R
    X_grad = weighted_err @ Theta
    Theta_grad = weighted_err.T @ X
    grad = np.append(X_grad.flatten(), Theta_grad.flatten())

    # Regularized gradient, packed in the same X-then-Theta layout as params.
    reg_grad = np.append(
        (X_grad + Lambda * X).flatten(),
        (Theta_grad + Lambda * Theta).flatten(),
    )

    return J, grad, reg_J, reg_grad

Os nossos dados têm 20 géneros diferentes, então vamos usar 20 features (visto no 02 data_vis).

In [7]:
import gc
from scipy.sparse import csr_matrix

# Problem dimensions: the placeholder matrix is laid out as (movies, users).
num_movies, num_users = base_matrix.shape
num_features = 20

# Random starting point for both factor matrices (X is drawn before Theta).
# NOTE(review): no random seed is set, so `params` changes from run to run.
X = np.random.normal(loc=0, scale=0.3, size=(num_movies, num_features))
Theta = np.random.normal(loc=0, scale=0.3, size=(num_users, num_features))

print(X.shape, Theta.shape)

# Pack both matrices into the flat vector the cost functions expect: X first, then Theta.
params = np.concatenate([X.ravel(), Theta.ravel()])

#print(cofiCostFunc(params, dense_arrayY, dense_arrayR, num_users, num_movies, num_features, 0)[0])

# Keep CSR copies of the rating and indicator matrices for the sparse cost function.
Y_sparse = csr_matrix(Y)
R_sparse = csr_matrix(R)

# Drop the names that are no longer needed and ask the collector to free them.
del X, Theta, Y, R
gc.collect()

(131263, 20) (138494, 20)


0

: 

In [None]:
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix

def cofiCostFuncSparse(params, Y_sparse, R_sparse, num_users, num_movies, num_features, Lambda):
    """
    Cost and regularized gradient for collaborative filtering on sparse ratings.

    The prediction ``X @ Theta.T`` is evaluated only at the entries stored in
    ``R_sparse``; the dense (num_movies x num_users) prediction matrix is never
    built, so memory scales with the number of stored ratings instead of with
    num_movies * num_users.

    Parameters
    ----------
    params : 1-D array. The first num_movies*num_features values are the movie
        matrix X, the remaining values are the user matrix Theta.
    Y_sparse : ratings, shape (num_movies, num_users); scipy sparse matrix
        (anything accepted by ``csr_matrix`` works).
    R_sparse : indicator / weights with the same shape; only its stored
        entries contribute to the cost and gradient.
    num_users, num_movies, num_features : dimensions used to unpack ``params``.
    Lambda : regularization parameter.

    Returns
    -------
    (J, reg_J, reg_grad) : unregularized cost, regularized cost, and the
        regularized gradient flattened in the same X-then-Theta layout as
        ``params``.

    Raises
    ------
    ValueError : if ``Y_sparse`` or ``R_sparse`` is not of shape
        (num_movies, num_users).
    """
    # Number of stored entries handled per step; bounds the size of the
    # temporary X[rows] / Theta[cols] gathers.
    chunk_size = 1 << 18

    # Unfold the params
    split_at = num_movies * num_features
    X = params[:split_at].reshape(num_movies, num_features)
    Theta = params[split_at:].reshape(num_users, num_features)

    # CSR for Y gives fast lookups at given coordinates; COO for R exposes the
    # coordinates and values of every stored entry.
    Y_csr = csr_matrix(Y_sparse)
    R_coo = csr_matrix(R_sparse).tocoo()

    expected_shape = (num_movies, num_users)
    if Y_csr.shape != expected_shape or R_coo.shape != expected_shape:
        raise ValueError(
            "Y_sparse and R_sparse must have shape (num_movies, num_users) = "
            f"{expected_shape}; got {Y_csr.shape} and {R_coo.shape}"
        )

    rows, cols, weights = R_coo.row, R_coo.col, R_coo.data

    # Weighted error R_ij * (x_i . theta_j - Y_ij) for stored entries only.
    err_vals = np.empty(
        weights.shape[0],
        dtype=np.result_type(X.dtype, Y_csr.dtype, weights.dtype),
    )
    for start in range(0, err_vals.shape[0], chunk_size):
        part = slice(start, start + chunk_size)
        r, c = rows[part], cols[part]
        # Row-wise dot product of the gathered movie and user vectors.
        predicted = np.einsum("ij,ij->i", X[r], Theta[c])
        # Fancy indexing a sparse matrix may return a 1 x n matrix; flatten it.
        rated = np.asarray(Y_csr[r, c]).ravel()
        err_vals[part] = weights[part] * (predicted - rated)

    # Compute the cost function
    J = 1 / 2 * np.sum(err_vals ** 2)

    # Add regularization to the cost
    user_penalty = Lambda / 2 * np.sum(Theta ** 2)
    movie_penalty = Lambda / 2 * np.sum(X ** 2)
    reg_J = J + user_penalty + movie_penalty

    # Sparse error matrix with the same sparsity pattern as R.
    err = csr_matrix((err_vals, (rows, cols)), shape=expected_shape)

    # Compute gradients (sparse @ dense products return dense arrays)
    X_grad = err @ Theta
    Theta_grad = err.T @ X

    # Add regularization to the gradients
    reg_X_grad = X_grad + Lambda * X
    reg_Theta_grad = Theta_grad + Lambda * Theta

    # Flatten gradients into a single vector
    reg_grad = np.append(reg_X_grad.flatten(), reg_Theta_grad.flatten())

    return J, reg_J, reg_grad


# Evaluate the sparse cost function once on the initial parameters with Lambda = 1.5;
# as the last expression of the cell, the returned (J, reg_J, reg_grad) tuple is displayed.
cofiCostFuncSparse(params, Y_sparse, R_sparse, num_users, num_movies, num_features, 1.5)