In [21]:
import numpy as np
import pandas as pd
import sklearn
import time
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

In [54]:
data_path = '../datasets/'
data = np.load(data_path + 'ratings_train.npy')
data_test = np.nan_to_num(np.load(data_path + 'ratings_test.npy'))

In [65]:
data.shape

(610, 4980)

In [None]:
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import inv
A = csc_matrix(data)
Ainv = inv(A.T@A)
Ainv

In [78]:
A.shape
print(data)
print(len(data))
print(len(data[0]))

[[ 4. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan  2. nan ... nan nan nan]
 [ 3. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
610
4980


In [73]:
class Solve:

    def __init__(self, k, mu, alpha,beta, train_data, descent_method = 'SGD', n_steps = 100, seed = 10):
        self.k = k
        self.mu = mu
        self.alpha = alpha
        self.beta = beta
        self.data = np.copy(train_data)
        self.non_nan = np.argwhere(~np.isnan(train_data))
        self.descent = descent_method
        self.I = np.random.rand(len(self.data), self.k) #Generating random matrices, maybe a better initialization can be initialized
        self.U = np.random.rand(len(self.data[0]), self.k).T
        
        self.data2 = np.nan_to_num(data, copy=True)
        
        self.n_steps = n_steps
    
    def compute_sgd(self):
        d_I, d_U = 0, 0
        for (i, j) in self.non_nan:
            eij = data[i][j] - np.dot(self.I[i,:],self.U[:,j])
            for k in range(self.k):
                d_I += self.I[i][k] + self.alpha * (2 * eij * self.U[k][j] - self.mu * self.I[i][k])
                d_U += self.U[k][j] + self.beta * (2 * eij * self.I[i][k] - self.mu * self.U[k][j])

        return d_I,d_U
    
    def train(self, output_loss=False):
        loss = []
        for _ in range(self.n_steps):
            if output_loss:
                e = 0
                for (i,j) in self.non_nan:
                    e = e + pow(self.data[i][j] - np.dot(self.I[i,:],self.U[:,j]), 2)
                    for k in range(self.k):
                        e = e + (self.mu/2) * (pow(self.I[i][k],2) + pow(self.U[k][j],2))

                loss.append(e)

            for (i, j) in self.non_nan:
                eij = self.data[i][j] - np.dot(self.I[i,:],self.U[:,j])
                for k in range(self.k):
                    self.I[i, k] = self.I[i, k] + self.alpha * (2 * eij * self.U[k, j] - self.mu * self.I[i, k])
                    self.U[k, j] = self.U[k, j] + self.beta * (2 * eij * self.I[i, k] - self.mu * self.U[k, j])
                    
                    #self.I[i, k] = np.dot(eij, self.U[k, j]) * np.linalg.inv(np.dot(self.U[k, j].T, self.U[k, j]) + self.mu * np.eye(self.k))
                    #self.U[k, j] = np.dot(eij.T, self.I[i, k]) * np.linalg.inv(np.dot(self.I[i, k].T, self.I[i, k]) + self.mu * np.eye(self.k))
                    
                    #self.I[i, k] = np.linalg.solve(np.dot(self.U[k, j].T, self.U[k, j]) + self.mu * np.eye(self.k), np.dot(self.U[k, j].T, eij.T)).T
                    #self.U[k, j] = np.linalg.solve(np.dot(self.I[i, k].T, self.I[i, k]) + self.mu * np.eye(self.k), np.dot(self.I[i, k].T, eij))

        self.I = np.around(self.I*2, 0)/2
        self.U = np.around(self.U*2, 0)/2
        return loss
    
    
    ####################################################################################################
    ####################################################################################################
    
    def als_train(self, output_loss=False):
        loss = []
        for _ in range(self.n_steps):
            if output_loss:
                e = 0
                for (i, j) in self.non_nan:
                    e = e + pow(self.data[i][j] - np.dot(self.I[i, :], self.U[:, j]), 2)
                    for k in range(self.k):
                        e = e + (self.mu / 2) * (pow(self.I[i][k], 2) + pow(self.U[k][j], 2))

                loss.append(e)

            for (i, j) in self.non_nan:
                eij = self.data[i][j] - np.dot(self.I[i, :], self.U[:, j])
                
                self.I[i, :] = np.linalg.solve(
                    np.dot(self.U[:, j].T, self.U[:, j]) + self.mu * np.eye(self.k),
                    np.dot(self.U[:, j].T, eij)
                )
                self.U[:, j] = np.linalg.solve(
                    np.dot(self.I[i, :].T, self.I[i, :]) + self.mu * np.eye(self.k),
                    np.dot(self.I[i, :].T, eij)
                )

        self.I = np.around(self.I * 2, 0) / 2
        self.U = np.around(self.U * 2, 0) / 2
        return loss
    
    ####################################################################################################
    ####################################################################################################
    
    def matrix_completion_als(self, max_iter=1000, tol=1e-6, lambda_reg=0.1):
        m, n = self.data.shape
        #U = np.random.rand(m, rank)
        #V = np.random.rand(n, rank)
        error = 1e10
        Omega = (self.data > 0).astype(int)
        for it in range(max_iter):
            # Update U while fixing V
            for i in range(m):
                indices = np.where(Omega[i, :] == 1)[0]
                Vi = self.U[indices, :]
                Yi = self.data[i, indices]
                self.I[i, :] = np.linalg.solve(Vi.T @ Vi + lambda_reg * np.eye(self.k), Vi.T @ Yi)
        
            # Update V while fixing U
            for j in range(n):
                indices = np.where(Omega[:, j] == 1)[0]
                Uj = self.I[indices, :]
                Yj = self.data[indices, j]
                self.U[j, :] = np.linalg.solve(Uj.T @ Uj + lambda_reg * np.eye(self.k), Uj.T @ Yj)

            # Compute the matrix approximation and error
            Y_hat = self.I @ self.U.T
            diff = Omega * (self.data - Y_hat)
            new_error = np.linalg.norm(diff, 'fro')
            if abs(new_error - error) < tol:
                break
            error = new_error
    
        #return self.I @ self.U.T

    ####################################################################################################
    ####################################################################################################
    
    
    def train_als(self, output_loss = False):
        loss = []
        for _ in range(self.n_steps):
            if output_loss:
                e = 0
                for (i, j) in self.non_nan:
                    e = e + pow(self.data[i][j] - np.dot(self.I[i, :], self.U[:, j]), 2)
                    for k in range(self.k):
                        e = e + (self.mu / 2) * (pow(self.I[i][k], 2) + pow(self.U[k][j], 2))

                loss.append(e)
                
        for _ in range(self.n_steps):
            # Fix U and estimate I
            for i in range(self.data.shape[0]):
                Ai = np.dot(self.U, self.U.T) + self.mu * np.eye(self.k) + 1e-6 * np.eye(self.k)
                det = np.linalg.det(Ai)
                if abs(det) < 1e-25:
                    Ai += 1e-6 * np.eye(self.k)
                print("NaN in Ai:", np.isnan(Ai).sum())
                
                data_slice = self.data[i, :].T
                print("NaN in data_slice:", np.isnan(data_slice).sum())
                print("Inf in data_slice:", np.isinf(data_slice).sum())
                Vi = np.dot(self.U, data_slice)
                
                if np.isnan(Vi).sum() > 0:
                    print("First appearance of NaN in Vi!")
                    print("i:", i)
                    print("Ui:", self.U)
                    print("data_slice:", data_slice)
                    break

                #Vi = np.dot(self.U, self.data[i, :].T)
                print("NaN in Vi:", np.isnan(Vi).sum())
                self.I[i, :] = np.linalg.solve(Ai, Vi)
                print("NaN in I after update:", np.isnan(self.I).sum())

            # Fix I and estimate U
            for j in range(self.data.shape[1]):
                Aj = np.dot(self.I.T, self.I) + self.mu * np.eye(self.k) + 1e-6 * np.eye(self.k)
                det = np.linalg.det(Aj)
                if abs(det) < 1e-25:
                    Aj += 1e-6 * np.eye(self.k)
                print("NaN in Aj:", np.isnan(Aj).sum())
                Vj = np.dot(self.I.T, self.data[:, j])
                print("NaN in Vj:", np.isnan(Vj).sum())
                self.U[:, j] = np.linalg.solve(Aj, Vj)
                print("NaN in U after update:", np.isnan(self.U).sum())
                
        return loss



    def train_masked(self):
        for _ in range(self.n_steps):
            masked = np.ma.array(self.data, mask=np.isnan(self.data))
            masked_T = np.ma.transpose(masked)
            d_U = np.ma.add(np.ma.add(-2*np.ma.dot(masked_T,self.I), 2*self.U.T@self.I.T@self.I) , 2*self.mu*self.U.T)
            #d_U = np.ma.add(np.ma.add(-2*masked_T@self.I, 2*self.U@self.I.T@self.I),2*self.mu*self.U)
            d_I = np.ma.add(np.ma.add(-2*np.ma.dot(masked,self.U.T) ,2*self.I@self.U@self.U.T), 2*self.mu*self.I)
            self.I -= self.alpha*d_I
            self.U -= self.beta*d_U.T

    def rmse(self, test_matrix):
        # diffs = 0
        # predictions = self.predict()
        # T = len(np.argwhere(~np.isnan(test_matrix)))
        # for (i, j) in np.argwhere(~np.isnan(test_matrix)):
        #     diff = (test_matrix[i, j] - predictions[i, j])**2
        #     diffs += diff
        # return np.sqrt(diffs/T)
        masked = np.ma.array(test_matrix, mask=np.isnan(test_matrix))
        predictions = self.I@self.U
        diff = np.ma.subtract(predictions, masked)
        squared = np.ma.power(diff, 2)
        return np.ma.sqrt(np.ma.mean(squared))
    
    def rmse_als(self, test_matrix):
        predictions = self.I @ self.U
        nans = np.isnan(test_matrix)
        non_nans = ~nans
        num_valid_entries = np.sum(non_nans)
    
        # Only compute RMSE on non-NaN entries
        squared_errors = (test_matrix[non_nans] - predictions[non_nans]) ** 2
        mean_squared_error = np.sum(squared_errors) / num_valid_entries
    
        return np.sqrt(mean_squared_error)




    def predict(self):
        return self.I@self.U

if __name__ == '__main__':
    data_path = '../datasets/'
    data = np.load(data_path + 'ratings_train.npy')
    test_data = np.load(data_path + 'ratings_test.npy')
    
    np.random.seed(42)
    t_1 = time.time()
    solver_2 = Solve(k=5,mu = 0.0002,alpha = 0.00005,beta = 0.000005,train_data=data, n_steps=50)
    pred = solver_2.train_masked()
    t_2 = time.time()
    print(f'elapsed time solver with mask: {t_2 - t_1}')
    t_1 = time.time()
    solver = Solve(k=5,mu = 0.02,alpha = 0.0005,beta = 0.0005,train_data=data, n_steps=50)
    pred = solver.train()
    t_2 = time.time()
    print(f'elapsed time solver without mask: {t_2 - t_1}')
    rmse = solver.rmse(test_data)
    train_rmse = solver.rmse(data)
    print("Solver no mask")
    print(f"RMSE against TRAIN: {train_rmse}")
    print(f"RMSE against TEST: {rmse}")
    rmse = solver_2.rmse(test_data)
    train_rmse = solver_2.rmse(data)
    print("Solver mask")
    print(f"RMSE against TRAIN: {train_rmse}")
    print(f"RMSE against TEST: {rmse}")
    
    '''
    solver_als = Solve(k=5, mu=0.002, alpha=0.00005, beta=0.000005, train_data=data, n_steps=50)
    loss = solver_als.als_train(output_loss=True)
    t_2 = time.time()
    print(f'elapsed time solver with ALS: {t_2 - t_1}')
    rmse = solver_als.rmse(test_data)
    train_rmse = solver_als.rmse(data)
    print("Solver with ALS")
    print(f"RMSE against TRAIN: {train_rmse}")
    print(f"RMSE against TEST: {rmse}")
    '''
    t_1 = time.time()
    solver_als = Solve(k=3, mu=0.02, alpha=0.0005, beta=0.0005, train_data=data, n_steps=50)
    pred = solver_als.matrix_completion_als()
    t_2 = time.time()
    print(f'elapsed time ALS solver: {t_2 - t_1}')
    rmse = solver_als.rmse(test_data)
    train_rmse = solver_als.rmse(data)
    predictions = solver_als.predict()
    print("Number of NaNs in predictions:", np.isnan(predictions).sum())
    print("ALS Solver")
    print(f"RMSE against TRAIN: {train_rmse}")
    print(f"RMSE against TEST: {rmse}")

elapsed time solver with mask: 4.7507569789886475
elapsed time solver without mask: 18.428391456604004
Solver no mask
RMSE against TRAIN: 0.9284165251272595
RMSE against TEST: 1.0298078582682044
Solver mask
RMSE against TRAIN: 3.5704404164604746
RMSE against TEST: 3.562575181679188


IndexError: index 44 is out of bounds for axis 0 with size 3

In [None]:
m = data.shape[0]
n = data.shape[1]
k = 5
mu = .1
lam = .1
step_I = .00007
step_U = .00007
I = np.ones((m, k))
U = np.ones((n, k))
V = 3 * np.random.rand(k,m)
W = 3 * np.random.rand(k,n)
R = np.nan_to_num(data, copy=True)

iters = 100

for i in range(iters):
  loss = np.linalg.norm((R - I@U.T), ord='fro')**2 + mu*np.linalg.norm(I, ord='fro')**2 + lam*np.linalg.norm(U, ord='fro')**2

  #print(f'Loss at iter {i+1}: {loss}')

  grad_U = -2*R.T@I + 2*U@I.T@I + 2*mu*U
  grad_I = -2*R@U + 2*I@U.T@U + 2*lam*I
  
  grad_I_als = R@U.dot(U.T@U + lam*np.eye(k))**(-1)
  grad_U_als = R.T@I.dot(I.T@I + mu*np.eye(k))**(-1)
  
  V = grad_I_als
  W = grad_U_als

  U -= step_U*grad_U
  I -= step_I*grad_I


rmse = np.sqrt(np.mean((I@U.T-R)**2))
print(rmse)

rmse_als = np.sqrt(np.mean((V@W.T-R)**2))
print(rmse_als)

In [27]:
data_path = '../datasets/'
data = np.load(data_path + 'ratings_train.npy')
test_data = np.load(data_path + 'ratings_test.npy')

step_I = .00007
step_U = .00007

mu = .1
lam = .1

K = 5
I = np.random.rand(len(data),K) #Generating random matrices, maybe a better initialization can be initialized
U = np.random.rand(len(data[0]),K).T


non_nan = np.argwhere(~np.isnan(data))

for (i, j) in non_nan:
  eij = data[i][j] - np.dot(I[i,:],U[:,j])

  for k in range(K):
    I[i][k] = I[i][k] + step_I * (2 * eij * U[k][j] - mu * I[i][k])
    U[k][j] = U[k][j] + step_U * (2 * eij * I[i][k] - lam * U[k][j])

In [4]:
import numpy as np

def initialize_with_svd(Y, rank):
    # Fill missing values with zeros
    Y_filled = np.nan_to_num(Y)

    # Perform SVD
    U, Sigma, Vt = np.linalg.svd(Y_filled, full_matrices=False)

    # Take top `rank` singular vectors/values
    U_init = U[:, :rank]
    Sigma_init = np.diag(Sigma[:rank])
    Vt_init = Vt[:rank, :]

    # Initialize U and V
    U_initialized = U_init @ np.sqrt(Sigma_init)
    V_initialized = (np.sqrt(Sigma_init) @ Vt_init).T

    return U_initialized, V_initialized


def matrix_completion_als(data, rank, max_iter=100, tol=1e-6, lambda_reg=0.1):
    m, n = data.shape
    I = np.random.rand(m, rank)
    U = np.random.rand(n, rank)
    #I, U = initialize_with_svd(data, rank)
    error = 1e10
    
    Omega = (data > 0).astype(int)
    #R = np.nan_to_num(data, copy=True)
    
    for _ in range(max_iter):
        # Update U while fixing V
        for i in range(m):
            indices = np.where(Omega[i, :] == 1)[0]
            Vi = U[indices, :]
            Yi = data[i, indices]
            I[i, :] = np.linalg.solve(Vi.T @ Vi + lambda_reg * np.eye(rank), Vi.T @ Yi)
        
        # Update V while fixing U
        for j in range(n):
            indices = np.where(Omega[:, j] == 1)[0]
            Uj = I[indices, :]
            Yj = data[indices, j]
            U[j, :] = np.linalg.solve(Uj.T @ Uj + lambda_reg * np.eye(rank), Uj.T @ Yj)

        # Compute the matrix approximation and error
        Y_hat = I @ U.T
        diff = Omega * (data - Y_hat)
        new_error = np.linalg.norm(diff, 'fro')
        if abs(new_error - error) < tol:
            break
        error = new_error
    
    return I @ U.T


data_path = '../datasets/'
data = np.load(data_path + 'ratings_train.npy')
test_data = np.load(data_path + 'ratings_test.npy')


# Create the Omega matrix: 1 where Y has an entry, 0 otherwise
Omega = (data > 0).astype(int)

# Display the generated matrix
print("Original matrix (with missing values set to 0):")
print(data)


# Use the ALS method to complete the matrix
completed_matrix = matrix_completion_als(data, rank=1)

print("\nCompleted matrix:")
print(completed_matrix)


def rmse(completed_matrix, test_matrix):
    masked = np.ma.array(test_matrix, mask=np.isnan(test_matrix))
    predictions = completed_matrix
    diff = np.ma.subtract(predictions, masked)
    squared = np.ma.power(diff, 2)
    return np.ma.sqrt(np.ma.mean(squared))

print(rmse(completed_matrix, data))
print(rmse(completed_matrix, test_data))

Original matrix (with missing values set to 0):
[[ 4. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan  2. nan ... nan nan nan]
 [ 3. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]

Completed matrix:
[[4.62862644 4.09904717 3.6204652  ... 4.41902928 4.13320775 4.68862969]
 [3.78812708 3.35471264 2.96303503 ... 3.61659009 3.3826701  3.83723452]
 [1.52249891 1.34830385 1.19088339 ... 1.4535559  1.35954033 1.54223584]
 ...
 [3.6135064  3.20007099 2.82644849 ... 3.44987672 3.2267397  3.66035013]
 [3.53761119 3.13285925 2.76708402 ... 3.37741826 3.15896783 3.58347106]
 [4.1267091  3.65455616 3.2278705  ... 3.93984017 3.68501245 4.18020575]]
0.7510645330526745
0.9122087962686874
