In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import sys
import torch
import matplotlib.pyplot as plt
import timeit

In [3]:
# File locations used throughout the notebook.
input_csv = 'data_train.csv'                  # ratings used for fitting
relevant_values_csv = 'sampleSubmission.csv'  # template marking the cells to export
output_csv = 'result.csv'                     # destination of the exported predictions

In [4]:
def convert_csv_to_matrix(input_csv, format):
    """Load an ``Id,Prediction`` CSV into a dense 2-D float matrix.

    Each ``Id`` has the form ``<x><row>_<y><col>`` (e.g. ``r12_c7``): the text
    before the first underscore, minus its first character, is the 1-based row
    number, and the second underscore-separated part, minus its first
    character, is the 1-based column number.

    Parameters
    ----------
    input_csv : str or path-like
        CSV file with the columns ``Id`` and ``Prediction``.
    format : str
        ``"zero"`` fills cells that are absent from the file with ``0``;
        any other value fills them with ``NaN``.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(max_row, max_col)``. A file without data rows
        yields an empty ``(0, 0)`` matrix.
    """
    df = pd.read_csv(input_csv)

    fill_value = 0.0 if format == "zero" else np.nan

    # Nothing to place: return an empty matrix instead of failing on a NaN shape.
    if df.empty:
        return np.full((0, 0), fill_value)

    # Split "r12_c7" into "r12" / "c7", drop the leading letter, shift to 0-based.
    id_parts = df['Id'].str.split('_', expand=True)
    row_idx = id_parts[0].str[1:].astype(int).to_numpy() - 1
    col_idx = id_parts[1].str[1:].astype(int).to_numpy() - 1

    n_rows = int(row_idx.max()) + 1
    n_cols = int(col_idx.max()) + 1

    # One vectorised scatter replaces the per-row Python loops.
    matrix = np.full((n_rows, n_cols), fill_value)
    matrix[row_idx, col_idx] = df['Prediction'].to_numpy()

    return matrix

In [5]:
def mean_matrix(matrix):
    """Fill the NaN entries of every row with that row's mean of non-NaN values.

    The matrix is modified in place and the same object is returned.
    """
    n_rows = matrix.shape[0]
    for idx in range(n_rows):
        current = matrix[idx]
        fill_value = np.nanmean(current)
        missing = np.isnan(current)
        matrix[idx] = np.where(missing, fill_value, current)
    return matrix

In [6]:
def normalize_matrix(matrix):
    """Centre every row on its mean of non-NaN values.

    NaN entries are first replaced by the row mean, so they become 0 after
    centring. The matrix is modified in place and the same object is returned.
    """
    n_rows = matrix.shape[0]
    for idx in range(n_rows):
        current = matrix[idx]
        centre = np.nanmean(current)
        filled = np.where(np.isnan(current), centre, current)
        matrix[idx] = filled - centre

    return matrix

In [7]:
def scale_matrix(matrix):
    """Return a copy of ``matrix`` rescaled to the range [0, 1] with ``MinMaxScaler``."""
    return MinMaxScaler(feature_range=(0, 1)).fit_transform(matrix)

In [8]:
def shrink(S, shrinkage_tau):
    """Subtract ``shrinkage_tau`` from every entry of ``S`` and clip negatives to 0.

    The result is a new tensor; ``S`` itself is left untouched, so the caller's
    values are not altered as a side effect.

    Parameters
    ----------
    S : torch.Tensor
        Values to shrink.
    shrinkage_tau : float or torch.Tensor
        Amount subtracted from every entry.

    Returns
    -------
    torch.Tensor
        ``max(S - shrinkage_tau, 0)`` evaluated element-wise.
    """
    return torch.clamp(S - shrinkage_tau, min=0)

In [10]:
def save_matrix_to_csv(matrix, relevant_values_csv, output_csv, placeholder_value=3):
    """Write the requested cells of ``matrix`` to ``output_csv`` as ``Id,Prediction`` rows.

    The cells to export are the ones whose value in ``relevant_values_csv``
    (loaded with ``convert_csv_to_matrix`` in ``'zero'`` mode) equals
    ``placeholder_value``. Rows are written in row-major order with ids of the
    form ``r<row>_c<col>`` (1-based). Cells whose prediction is NaN are skipped.

    Parameters
    ----------
    matrix : array-like or torch.Tensor
        2-D matrix of predictions.
    relevant_values_csv : str or path-like
        CSV marking the cells that must be exported.
    output_csv : str or path-like
        Destination file.
    placeholder_value : number, default 3
        Value in ``relevant_values_csv`` that flags a cell for export.
    """
    # Convert tensors explicitly so the exported values are plain numbers.
    if hasattr(matrix, "detach"):
        matrix = matrix.detach().cpu().numpy()
    values = np.asarray(matrix)

    requested = convert_csv_to_matrix(relevant_values_csv, 'zero') == placeholder_value

    # Only cells present in both matrices can be exported.
    n_rows = min(values.shape[0], requested.shape[0])
    n_cols = min(values.shape[1], requested.shape[1])
    values = values[:n_rows, :n_cols]
    requested = requested[:n_rows, :n_cols]

    # Select the cells explicitly instead of relying on implicit NaN dropping.
    keep = requested & ~np.isnan(values)
    row_idx, col_idx = np.nonzero(keep)  # row-major order

    row_labels = pd.Series(row_idx + 1).astype(str)
    col_labels = pd.Series(col_idx + 1).astype(str)
    result_df = pd.DataFrame({
        'Id': 'r' + row_labels + '_c' + col_labels,
        'Prediction': values[row_idx, col_idx],
    })

    result_df.to_csv(output_csv, index=False)

In [37]:
# Dense ratings matrix; cells without a rating are stored as 0.
matrix = convert_csv_to_matrix(input_csv, format='zero')

In [38]:
# Wrap the ratings in a tensor and flag the observed entries.
torch_matrix = torch.from_numpy(matrix)
# The matrix was loaded in 'zero' mode, so a value of 0 marks a missing rating.
# For a NaN-filled matrix use instead: known_values_mask = ~torch.isnan(torch_matrix)
known_values_mask = torch_matrix.ne(0)

In [39]:
# ---- Optional hold-out split for cross validation ----
# By default the evaluation mask covers every known rating.
# Uncomment the two assignments below to enable the split; comment them to disable it.

testing_values_mask = torch.clone(known_values_mask)
#testing_values_mask[:,:990] = False # testing data mask: keep only columns 990 and up

#known_values_mask[:,990:] = False # training data mask: keep only columns below 990

# ---- End of the cross-validation split ----

In [51]:
# --- ALS configuration ---
rank = 15              # number of latent factors per row / column
lam = 0.01             # regularisation weight (added as 2 * lam * I to each linear system)
num_iterations = 100   # number of alternating sweeps
predictions = []       # one full prediction matrix is appended per sweep

# Fixed seed so the random initialisation of U and V is reproducible.
torch.manual_seed(31415)

A = torch_matrix.float()

# Take the dimensions from the data so the factors always match the ratings matrix.
rows, cols = A.shape

U = torch.randn(rows, rank)
V = torch.randn(cols, rank)

In [52]:
# Sanity check: number of rows in the factor matrix U.
U.size(0)

10000

In [53]:
def calculate_Q(input_matrix):
    """Return the outer product of every row of ``input_matrix`` with itself.

    Parameters
    ----------
    input_matrix : torch.Tensor
        2-D tensor of shape ``(n, rank)``.

    Returns
    -------
    torch.Tensor
        Tensor ``Q`` of shape ``(n, rank, rank)`` in torch's default dtype with
        ``Q[i, a, b] = input_matrix[i, a] * input_matrix[i, b]``.
    """
    dim_1, dim_rank = input_matrix.shape

    Q = torch.zeros(dim_1, dim_rank, dim_rank)
    # A single broadcasted product builds all outer products at once;
    # copy_ casts the result into the default-dtype output tensor.
    Q.copy_(input_matrix.unsqueeze(2) * input_matrix.unsqueeze(1))

    return Q

In [54]:
def optimize_matrix_ALS(Q, B, optimizable_matrix, mask=None, reg=None):
    """Run one regularised least-squares update of ``optimizable_matrix``.

    For every row ``j`` of ``optimizable_matrix`` the system

        (sum of Q[i] over all i with mask[i, j] set  +  2 * reg * I) x = B[:, j]

    is solved and ``x`` is stored as the new row ``j``.

    Parameters
    ----------
    Q : torch.Tensor
        Shape ``(dim_fixed, rank, rank)``; one matrix per row of the factor
        that is held fixed.
    B : torch.Tensor
        Shape ``(rank, dim_optimizable)``; right-hand sides, one per column.
    optimizable_matrix : torch.Tensor
        Shape ``(dim_optimizable, rank)``; overwritten in place.
    mask : torch.Tensor, optional
        Non-zero entries mark known values. Expected shape is
        ``(dim_fixed, dim_optimizable)``; a mask of shape
        ``(dim_optimizable, dim_fixed)`` is transposed automatically. When both
        dimensions are equal the mask is read as ``mask[fixed, optimizable]``.
        Defaults to the module-level ``known_values_mask``.
    reg : float, optional
        Regularisation weight. Defaults to the module-level ``lam``.

    Returns
    -------
    torch.Tensor
        ``optimizable_matrix`` (the same object, updated in place).

    Raises
    ------
    ValueError
        If the mask shape matches neither orientation.
    """
    dim_optimizable, dim_rank = optimizable_matrix.shape
    dim_fixed = Q.shape[0]

    if mask is None:
        mask = known_values_mask
    if reg is None:
        reg = lam

    mask = torch.as_tensor(mask)
    if mask.shape != (dim_fixed, dim_optimizable):
        if mask.shape == (dim_optimizable, dim_fixed):
            mask = mask.T
        else:
            raise ValueError(
                f"mask has shape {tuple(mask.shape)}, expected "
                f"{(dim_fixed, dim_optimizable)} or {(dim_optimizable, dim_fixed)}"
            )

    # Do all arithmetic in one floating dtype so mixed float32/float64 inputs work.
    work_dtype = torch.promote_types(Q.dtype, B.dtype)
    if not work_dtype.is_floating_point:
        work_dtype = torch.get_default_dtype()

    # gram[j] = sum_i mask[i, j] * Q[i], computed for all j with one matmul.
    weights = (mask != 0).to(work_dtype)
    flat_Q = Q.reshape(dim_fixed, -1).to(work_dtype)
    gram = (weights.T @ flat_Q).reshape(dim_optimizable, dim_rank, dim_rank)

    systems = gram + 2 * reg * torch.eye(dim_rank, dtype=work_dtype)
    rhs = B.to(work_dtype).T.unsqueeze(-1)  # (dim_optimizable, rank, 1)

    # Batched solve instead of forming explicit inverses.
    solution = torch.linalg.solve(systems, rhs).squeeze(-1)

    optimizable_matrix.copy_(solution)
    return optimizable_matrix

In [55]:
# Alternating least squares: V and U are re-fitted in turn, each by solving one
# small regularised linear system per column / row.
#
# Only entries selected by known_values_mask contribute to the fit, so a
# hold-out split applied to that mask is respected during training.
train_mask = known_values_mask.to(A.dtype)   # (rows, cols), 1.0 where a rating is used
A_train = A * train_mask                     # ratings with the unused entries zeroed
ridge = 2 * lam * torch.eye(rank)            # regulariser added to every system

predicted_ratings = A.clone()

for iteration in range(num_iterations):

    # --- Update V with U fixed ---
    Q_U = calculate_Q(U)                     # (rows, rank, rank) outer products
    # gram_V[col] = sum of Q_U[row] over the rows with a known rating in that column
    gram_V = (train_mask.T @ Q_U.reshape(rows, -1)).reshape(cols, rank, rank)
    B_U = U.T @ A_train                      # (rank, cols) right-hand sides
    V = torch.linalg.solve(gram_V + ridge, B_U.T.unsqueeze(-1)).squeeze(-1)

    # --- Update U with V fixed ---
    Q_V = calculate_Q(V)                     # (cols, rank, rank) outer products
    # gram_U[row] = sum of Q_V[col] over the columns with a known rating in that row
    gram_U = (train_mask @ Q_V.reshape(cols, -1)).reshape(rows, rank, rank)
    B_V = V.T @ A_train.T                    # (rank, rows) right-hand sides
    U = torch.linalg.solve(gram_U + ridge, B_V.T.unsqueeze(-1)).squeeze(-1)

    predicted_ratings = U @ V.T
    # Every sweep keeps a full prediction matrix; memory grows with num_iterations.
    predictions.append(predicted_ratings)
    loss = torch.nn.functional.mse_loss(predicted_ratings[testing_values_mask], A[testing_values_mask])
    print(f"Iteration {iteration + 1}/{num_iterations}, Loss: {loss.item()}")

# Clip the final predictions to the range [1, 5].
matrix_out = torch.clamp(predicted_ratings, min=1.0, max=5.0)

Iteration 1/100, Loss: 3.659222364425659
Iteration 2/100, Loss: 0.838337242603302
Iteration 3/100, Loss: 0.7951938509941101
Iteration 4/100, Loss: 0.77431720495224
Iteration 5/100, Loss: 0.7625187635421753
Iteration 6/100, Loss: 0.7551910877227783
Iteration 7/100, Loss: 0.7502580285072327
Iteration 8/100, Loss: 0.7467105984687805
Iteration 9/100, Loss: 0.7440354228019714
Iteration 10/100, Loss: 0.7419514060020447
Iteration 11/100, Loss: 0.7402875423431396
Iteration 12/100, Loss: 0.7389310598373413
Iteration 13/100, Loss: 0.7378060221672058
Iteration 14/100, Loss: 0.736859917640686
Iteration 15/100, Loss: 0.7360562086105347
Iteration 16/100, Loss: 0.7353681921958923
Iteration 17/100, Loss: 0.7347750067710876
Iteration 18/100, Loss: 0.7342600226402283
Iteration 19/100, Loss: 0.733809769153595
Iteration 20/100, Loss: 0.7334129810333252
Iteration 21/100, Loss: 0.7330607771873474
Iteration 22/100, Loss: 0.7327459454536438
Iteration 23/100, Loss: 0.7324627637863159
Iteration 24/100, Loss: 0.

In [56]:
# Export the final predictions to the output CSV.
# To export an intermediate result instead, pass predictions[iteration - 1]
# (the prediction matrix stored after the given iteration).
save_matrix_to_csv(
    matrix=matrix_out,
    relevant_values_csv=relevant_values_csv,
    output_csv=output_csv,
)