In [30]:
import numpy as np
import pandas as pd
import sklearn
import time
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

In [2]:
# Directory holding the ratings matrices, relative to the notebook.
data_path = '../datasets/'
# Training ratings, loaded unchanged from disk.
data = np.load(f'{data_path}ratings_train.npy')
# Test ratings with NaN entries replaced by 0 (np.nan_to_num).
data_test = np.nan_to_num(np.load(f'{data_path}ratings_test.npy'))

In [3]:
# Display the dimensions of the training matrix loaded above.
data.shape

(610, 4980)

In [55]:
class Solve:
    """Matrix-factorisation solver for a ratings matrix whose missing entries are NaN.

    Two independent factor pairs are kept on the instance:

    * ``I`` (rows x k) and ``U`` (k x columns) are used by ``compute_sgd``,
      ``train``, ``train_masked``, ``rmse`` and ``predict``;
    * ``I_2`` (rows x k) and ``U_2`` (columns x k) are used by
      ``matrix_completion_als`` and the ``*_als`` / ``*_als2`` methods.
    """

    def __init__(self, k, mu, alpha, beta, train_data, descent_method='SGD', n_steps=100, seed=10):
        """Store the hyper-parameters and draw random initial factors.

        Args:
            k: number of latent factors.
            mu: regularisation weight used by ``train``, ``train_masked`` and
                ``compute_sgd``.
            alpha: step size applied to ``I``.
            beta: step size applied to ``U``.
            train_data: 2-D ratings matrix with NaN for missing entries; a
                copy is stored, the argument itself is not modified.
            descent_method: stored as ``self.descent``; no method of this
                class reads it.
            n_steps: number of passes performed by ``train`` / ``train_masked``.
            seed: stored as ``self.seed`` only. It does NOT seed the generator:
                the factors are drawn from NumPy's global RNG, so call
                ``np.random.seed`` before constructing for reproducible runs.
        """
        self.k = k
        self.mu = mu
        self.alpha = alpha
        self.beta = beta
        self.data = np.copy(train_data)
        # (row, column) index pairs of every observed (non-NaN) rating.
        self.non_nan = np.argwhere(~np.isnan(train_data))
        self.descent = descent_method
        self.seed = seed

        # The four draws below keep their original order so that results under
        # a fixed global seed are unchanged.
        self.I = np.random.rand(len(self.data), self.k)
        self.U = np.random.rand(len(self.data[0]), self.k).T

        self.I_2 = np.random.rand(len(self.data), self.k)
        self.U_2 = np.random.rand(len(self.data[0]), self.k)

        self.n_steps = n_steps

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _round_to_half(values):
        """Round every entry to the nearest multiple of 0.5 (via ``np.around``)."""
        return np.around(values * 2, 0) / 2

    @staticmethod
    def _round_with_half_band(values):
        """Round to a whole number unless the fractional part lies in [0.45, 0.55].

        The fractional part is measured from the value truncated towards zero.
        Below 0.45 the value is floored, above 0.55 it is ceiled, otherwise it
        becomes ``truncated + 0.5``.
        """
        whole = np.trunc(values)
        fraction = values - whole
        return np.where(
            fraction < 0.45,
            np.floor(values),
            np.where(fraction > 0.55, np.ceil(values), whole + 0.5),
        )

    @staticmethod
    def _masked_rmse(predictions, test_matrix):
        """RMSE between ``predictions`` and the non-NaN entries of ``test_matrix``."""
        masked = np.ma.array(test_matrix, mask=np.isnan(test_matrix))
        diff = np.ma.subtract(predictions, masked)
        squared = np.ma.power(diff, 2)
        return np.ma.sqrt(np.ma.mean(squared))

    def _regularised_loss(self):
        """Squared error on observed entries plus the ``mu/2`` factor penalty."""
        rows, cols = self.non_nan[:, 0], self.non_nan[:, 1]
        left = self.I[rows, :]        # one factor row per observed entry
        right = self.U[:, cols].T     # matching factor column per observed entry
        residual = self.data[rows, cols] - np.sum(left * right, axis=1)
        return np.sum(residual ** 2) + (self.mu / 2) * (np.sum(left ** 2) + np.sum(right ** 2))

    # ----------------------------------------------------------------- training

    def compute_sgd(self):
        """Sum the factor values one SGD step would produce, without applying it.

        For every observed entry and latent dimension the would-be updated
        entries of ``I`` and ``U`` are accumulated into two scalars.

        Returns:
            tuple ``(d_I, d_U)`` of the two accumulated sums.
        """
        d_I, d_U = 0, 0
        for (i, j) in self.non_nan:
            row, col = self.I[i, :], self.U[:, j]
            # Use the instance's own matrix, not a module-level ``data``.
            eij = self.data[i][j] - np.dot(row, col)
            d_I += np.sum(row + self.alpha * (2 * eij * col - self.mu * row))
            d_U += np.sum(col + self.beta * (2 * eij * row - self.mu * col))

        return d_I, d_U

    def train(self, output_loss=False):
        """Run ``n_steps`` passes of stochastic gradient descent on ``I`` and ``U``.

        Args:
            output_loss: when true, the regularised loss is recorded at the
                start of every pass.

        Returns:
            list of recorded losses (empty when ``output_loss`` is false).
        """
        loss = []
        for _ in range(self.n_steps):
            if output_loss:
                loss.append(self._regularised_loss())

            for (i, j) in self.non_nan:
                eij = self.data[i][j] - np.dot(self.I[i, :], self.U[:, j])
                # I is updated first; the U update then deliberately sees the
                # freshly updated I row, exactly as the per-factor loop did.
                self.I[i, :] = self.I[i, :] + self.alpha * (2 * eij * self.U[:, j] - self.mu * self.I[i, :])
                self.U[:, j] = self.U[:, j] + self.beta * (2 * eij * self.I[i, :] - self.mu * self.U[:, j])

        return loss

    def matrix_completion_als(self, max_iter=100, tol=1e-6, lambda_reg=0.1):
        """Fit ``I_2`` / ``U_2`` by alternating ridge-regularised least squares.

        Entries of the training matrix that are strictly positive count as
        observed; NaN (and non-positive) entries are ignored. Regularisation
        uses ``lambda_reg``, not ``self.mu``.

        Args:
            max_iter: maximum number of alternating sweeps.
            tol: stop once the observed-entry error changes by less than this
                between two consecutive sweeps.
            lambda_reg: ridge weight added to each normal-equation system.
        """
        m, n = self.data.shape
        with np.errstate(invalid='ignore'):
            observed = self.data > 0   # NaN compares False, i.e. missing
        # The observation pattern never changes, so index it once.
        row_obs = [np.where(observed[i, :])[0] for i in range(m)]
        col_obs = [np.where(observed[:, j])[0] for j in range(n)]
        ridge = lambda_reg * np.eye(self.k)

        error = 1e10
        for _ in range(max_iter):
            # Update the row factors while the column factors are fixed.
            for i, indices in enumerate(row_obs):
                Vi = self.U_2[indices, :]
                Yi = self.data[i, indices]
                self.I_2[i, :] = np.linalg.solve(Vi.T @ Vi + ridge, Vi.T @ Yi)

            # Update the column factors while the row factors are fixed.
            for j, indices in enumerate(col_obs):
                Uj = self.I_2[indices, :]
                Yj = self.data[indices, j]
                self.U_2[j, :] = np.linalg.solve(Uj.T @ Uj + ridge, Uj.T @ Yj)

            # Error over observed entries only. Multiplying a 0/1 mask with the
            # full residual would yield NaN (0 * NaN) and disable the stop test.
            Y_hat = self.I_2 @ self.U_2.T
            new_error = np.linalg.norm(self.data[observed] - Y_hat[observed])
            if abs(new_error - error) < tol:
                break
            error = new_error

    def train_masked(self):
        """Run ``n_steps`` full-batch gradient steps using masked-array arithmetic.

        The data terms go through ``np.ma.dot`` on the NaN-masked matrix.
        NOTE(review): the reconstruction terms use the full ``I @ U`` product
        rather than only the observed entries - confirm this is intended.
        """
        # The masked view of the data does not change between steps.
        masked = np.ma.array(self.data, mask=np.isnan(self.data))
        masked_T = np.ma.transpose(masked)
        for _ in range(self.n_steps):
            d_U = np.ma.add(np.ma.add(-2*np.ma.dot(masked_T,self.I), 2*self.U.T@self.I.T@self.I) , 2*self.mu*self.U.T)
            d_I = np.ma.add(np.ma.add(-2*np.ma.dot(masked,self.U.T) ,2*self.I@self.U@self.U.T), 2*self.mu*self.I)
            self.I -= self.alpha*d_I
            self.U -= self.beta*d_U.T

    # --------------------------------------------------------------- evaluation

    def rmse(self, test_matrix):
        """RMSE of the half-step rounded ``I @ U`` against non-NaN test entries."""
        return self._masked_rmse(self.predict(), test_matrix)

    def rmse_als(self, test_matrix):
        """RMSE of the half-step rounded ALS prediction against non-NaN test entries."""
        return self._masked_rmse(self.predict_als(), test_matrix)

    def rmse_als2(self, test_matrix):
        """RMSE of the band-rounded ALS prediction against non-NaN test entries."""
        return self._masked_rmse(self.predict_als2(), test_matrix)

    # --------------------------------------------------------------- prediction

    def predict(self):
        """Return ``I @ U`` rounded to the nearest 0.5."""
        return self._round_to_half(self.I @ self.U)

    def predict_als(self):
        """Return ``I_2 @ U_2.T`` rounded to the nearest 0.5."""
        return self._round_to_half(self.I_2 @ self.U_2.T)

    def predict_als2(self):
        """Return ``I_2 @ U_2.T`` rounded with the 0.45 / 0.55 band rule."""
        return self._round_with_half_band(self.I_2 @ self.U_2.T)
    

if __name__ == '__main__':
    # Load the train / test rating matrices (NaN marks a missing rating).
    data_path = '../datasets/'
    data = np.load(data_path + 'ratings_train.npy')
    test_data = np.load(data_path + 'ratings_test.npy')

    # Factor initialisation draws from NumPy's global RNG, so fix it here.
    np.random.seed(42)

    # --- Stochastic gradient descent ------------------------------------
    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # wall-clock adjustments.
    t_1 = time.perf_counter()
    solver = Solve(k=3, mu = 0.02, alpha = 0.0005, beta = 0.0005, train_data=data, n_steps=50)
    # train() returns the per-pass loss history (empty unless output_loss=True),
    # not predictions.
    loss_history = solver.train()
    t_2 = time.perf_counter()
    print(f'Elapsed time solver without mask: {t_2 - t_1}')
    rmse = solver.rmse(test_data)
    train_rmse = solver.rmse(data)
    print("\nGD Solver no mask")
    print(f"RMSE against TRAIN: {train_rmse}")
    print(f"RMSE against TEST: {rmse}")
    table = solver.predict()
    print(table)

    # Disabled experiment: masked full-batch gradient descent.
    '''
    t_1 = time.time()
    solver_2 = Solve(k=3, mu = 0.0002, alpha = 0.00005, beta = 0.000005, train_data=data, n_steps=50)
    pred = solver_2.train_masked()
    t_2 = time.time()
    print(f'\nElapsed time solver with mask: {t_2 - t_1}')
    rmse = solver_2.rmse(test_data)
    train_rmse = solver_2.rmse(data)
    print("\nSolver mask")
    print(f"RMSE against TRAIN: {train_rmse}")
    print(f"RMSE against TEST: {rmse}")
    '''

    # --- Alternating least squares --------------------------------------
    t_1 = time.perf_counter()
    solver_als = Solve(k=1, mu=0.02, alpha=0.0005, beta=0.0005, train_data=data, n_steps=50)
    # Fits the ALS factors in place; the call has no return value.
    solver_als.matrix_completion_als()
    t_2 = time.perf_counter()
    print(f'\nElapsed time ALS solver: {t_2 - t_1}')
    rmse = solver_als.rmse_als(test_data)
    train_rmse = solver_als.rmse_als(data)
    rmse2 = solver_als.rmse_als2(test_data)
    train_rmse2 = solver_als.rmse_als2(data)
    print("\nALS Solver")
    print(f"RMSE against TRAIN: {train_rmse}")
    print(f"RMSE against TEST: {rmse}")
    print(f"RMSE 2 against TRAIN: {train_rmse2}")
    print(f"RMSE 2 against TEST: {rmse2}")
    table_als = solver_als.predict_als()
    print(table_als)
    table_als2 = solver_als.predict_als2()
    print(table_als2)

Elapsed time solver without mask: 12.304697751998901

GD Solver no mask
RMSE against TRAIN: 0.8881930842280643
RMSE against TEST: 1.0065218682587045
[[4.5 4.  4.  ... 4.  3.  3. ]
 [3.5 3.  2.5 ... 3.  2.  2. ]
 [1.5 1.5 1.5 ... 1.  0.5 1. ]
 ...
 [3.5 3.5 3.5 ... 3.  2.5 2.5]
 [3.5 3.  2.5 ... 3.  2.  2. ]
 [4.5 4.  4.  ... 4.  3.  3. ]]

Elapsed time ALS solver: 18.15254831314087

ALS Solver
RMSE against TRAIN: 0.7649857367404258
RMSE against TEST: 0.9239338246109781
RMSE 2 against TRAIN: 0.7856804796253423
RMSE 2 against TEST: 0.9432895844888124
[[4.5 4.  3.5 ... 4.5 4.  4.5]
 [4.  3.5 3.  ... 3.5 3.5 4. ]
 [1.5 1.5 1.  ... 1.5 1.5 1.5]
 ...
 [3.5 3.  3.  ... 3.5 3.  3.5]
 [3.5 3.  3.  ... 3.5 3.  3.5]
 [4.  3.5 3.  ... 4.  3.5 4. ]]
[[5.  4.  4.  ... 4.  4.  5. ]
 [4.  3.  3.  ... 4.  3.  4. ]
 [1.5 1.  1.  ... 1.5 1.  1.5]
 ...
 [4.  3.  3.  ... 3.  3.  4. ]
 [3.5 3.  3.  ... 3.  3.  4. ]
 [4.  4.  3.  ... 4.  4.  4. ]]


In [20]:
import numpy as np

def initialize_with_svd(Y, rank):
    """Build rank-limited starting factors from a truncated SVD of ``Y``.

    NaN entries of ``Y`` are replaced by zero before the decomposition. The
    square root of the leading ``rank`` singular values is folded into both
    sides, so the product of the first result with the transpose of the
    second is the rank-``rank`` SVD approximation of the zero-filled matrix.

    Returns:
        tuple ``(U_initialized, V_initialized)`` with ``rank`` columns each.
    """
    dense = np.nan_to_num(Y)
    left, singular_values, right_t = np.linalg.svd(dense, full_matrices=False)

    # Diagonal matrix holding the square roots of the kept singular values.
    root_scale = np.sqrt(np.diag(singular_values[:rank]))

    row_factors = left[:, :rank] @ root_scale
    col_factors = (root_scale @ right_t[:rank, :]).T
    return row_factors, col_factors


def matrix_completion_als(data, rank, max_iter=50, tol=1e-6, lambda_reg=0.1):
    """Complete a ratings matrix by alternating ridge-regularised least squares.

    Entries of ``data`` that are strictly positive count as observed; NaN
    (and non-positive) entries are treated as missing. Factors start from
    NumPy's global random generator, so seed it beforehand for reproducible
    results.

    Args:
        data: 2-D ratings matrix with NaN for missing entries.
        rank: number of latent factors.
        max_iter: maximum number of alternating sweeps.
        tol: stop once the observed-entry error changes by less than this
            between two consecutive sweeps.
        lambda_reg: ridge weight added to each normal-equation system.

    Returns:
        Dense ``(rows, columns)`` matrix ``I @ U.T`` of predicted ratings.
    """
    m, n = data.shape
    I = np.random.rand(m, rank)
    U = np.random.rand(n, rank)
    # initialize_with_svd(data, rank) is an alternative starting point.

    with np.errstate(invalid='ignore'):
        observed = data > 0   # NaN compares False, i.e. missing
    # The observation pattern never changes, so index it once.
    row_obs = [np.where(observed[i, :])[0] for i in range(m)]
    col_obs = [np.where(observed[:, j])[0] for j in range(n)]
    ridge = lambda_reg * np.eye(rank)

    error = 1e10
    for _ in range(max_iter):
        # Update the row factors while the column factors are fixed.
        for i, indices in enumerate(row_obs):
            Vi = U[indices, :]
            Yi = data[i, indices]
            I[i, :] = np.linalg.solve(Vi.T @ Vi + ridge, Vi.T @ Yi)

        # Update the column factors while the row factors are fixed.
        for j, indices in enumerate(col_obs):
            Uj = I[indices, :]
            Yj = data[indices, j]
            U[j, :] = np.linalg.solve(Uj.T @ Uj + ridge, Uj.T @ Yj)

        # Error over observed entries only. Multiplying a 0/1 mask with the
        # full residual would yield NaN (0 * NaN) and disable the stop test.
        Y_hat = I @ U.T
        new_error = np.linalg.norm(data[observed] - Y_hat[observed])
        if abs(new_error - error) < tol:
            break
        error = new_error

    return I @ U.T


data_path = '../datasets/'
data = np.load(data_path + 'ratings_train.npy')
test_data = np.load(data_path + 'ratings_test.npy')

# The matrix is printed as loaded, so missing ratings appear as NaN; the
# observed-entry mask is built inside matrix_completion_als.
print("Original matrix (missing values are NaN):")
print(data)

# ALS starts from random factors; fix the global RNG so the completed matrix
# and the RMSE values below are reproducible across runs.
np.random.seed(42)

# Use the ALS method to complete the matrix
completed_matrix = matrix_completion_als(data, rank=1)

print("\nCompleted matrix:")
print(completed_matrix)


def rmse(completed_matrix, test_matrix):
    """Root-mean-square error of ``completed_matrix`` on the non-NaN test entries.

    NaN entries of ``test_matrix`` are masked out, so only positions that
    carry a value contribute to the mean.
    """
    observed = np.ma.array(test_matrix, mask=np.isnan(test_matrix))
    residual = np.ma.subtract(completed_matrix, observed)
    mean_squared = np.ma.mean(np.ma.power(residual, 2))
    return np.ma.sqrt(mean_squared)

# RMSE of the completed matrix on the non-NaN entries of the training matrix ...
print(rmse(completed_matrix, data))
# ... and on the non-NaN entries of the test matrix.
print(rmse(completed_matrix, test_data))

Original matrix (with missing values set to 0):
[[ 4. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan  2. nan ... nan nan nan]
 [ 3. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]

Completed matrix:
[[4.63298058 4.10174728 3.62796024 ... 4.42211141 4.13562518 4.69652646]
 [3.78638699 3.35222698 2.96501597 ... 3.61405035 3.37991431 3.838321  ]
 [1.52128492 1.34684922 1.19127657 ... 1.45204395 1.35797337 1.54215084]
 ...
 [3.61787839 3.20304016 2.8330615  ... 3.45321138 3.22949529 3.66750114]
 [3.53151891 3.12658294 2.76543576 ... 3.37078253 3.15240659 3.57995715]
 [4.13282101 3.65893771 3.23629897 ... 3.9447165  3.68915828 4.18950671]]
0.7511346297201029
0.9122731566891428
