# Imports

In [25]:
import pandas as pd
import numpy as np
import torch
import math

# Supporting Functions

In [26]:
# File names used by this notebook: the CSV that is loaded as the rating
# matrix, the CSV whose entries select which cells get exported, and the
# CSV that the final result is written to
input_csv = 'data_train.csv'
relevant_values_csv = 'sampleSubmission.csv'
output_csv = 'result.csv'

In [27]:
# Converts the input CSV to a usable format
def convert_csv_to_matrix(input_csv, format):
    """Load an ``Id,Prediction`` CSV into a dense 2-D NumPy array.

    Every ``Id`` is expected to look like ``r<row>_c<col>`` with 1-based
    indices; its ``Prediction`` is stored at ``matrix[row - 1, col - 1]``.
    The matrix shape is ``(largest row, largest col)`` found in the file.

    Parameters
    ----------
    input_csv : str or path-like
        CSV file containing the columns ``Id`` and ``Prediction``.
    format : str
        ``"zero"`` fills cells that do not appear in the file with ``0``;
        any other value fills them with ``NaN``.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(max_row, max_col)``.

    Raises
    ------
    ValueError
        If the file has no data rows (the matrix shape would be undefined).
    """
    df = pd.read_csv(input_csv)
    if df.empty:
        raise ValueError(f"{input_csv!r} contains no data rows")

    # 'r12_c34' -> 'r12' / 'c34'; drop the leading letter and shift to 0-based indices
    id_parts = df['Id'].str.split('_', expand=True)
    row_idx = id_parts[0].str[1:].astype(int).to_numpy() - 1
    col_idx = id_parts[1].str[1:].astype(int).to_numpy() - 1

    shape = (int(row_idx.max()) + 1, int(col_idx.max()) + 1)
    matrix = np.zeros(shape) if format == "zero" else np.full(shape, np.nan)

    # A single vectorized scatter replaces the per-row iterrows() loops,
    # which are very slow on files with many entries
    matrix[row_idx, col_idx] = df['Prediction'].to_numpy()

    return matrix

In [28]:
# Converts a given matrix into the required CSV output format
def save_matrix_to_csv(matrix, relevant_values_csv, output_csv):
    """Write selected cells of ``matrix`` to a CSV with ``Id,Prediction`` columns.

    The cells to export are taken from ``relevant_values_csv``: that file is
    loaded with ``convert_csv_to_matrix(..., 'zero')`` and every position
    whose value equals ``3`` is exported. Ids are written as ``r<row>_c<col>``
    with 1-based indices, in row-major order.

    Parameters
    ----------
    matrix : 2-D array-like
        Values to export; anything ``pandas.DataFrame`` accepts.
    relevant_values_csv : str or path-like
        CSV in ``Id,Prediction`` format marking the cells to export.
    output_csv : str or path-like
        Destination file.

    Returns
    -------
    None
    """
    # Import the relevant values csv and convert to dataframe
    relevant_values_df = pd.DataFrame(convert_csv_to_matrix(relevant_values_csv, 'zero'))

    matrix_df = pd.DataFrame(matrix)

    # Keep only cells whose marker equals 3, everything else becomes NaN.
    # NOTE(review): 3 is presumably the placeholder prediction used in the
    # sample submission file -- confirm against the data.
    filtered_matrix_df = matrix_df.where(relevant_values_df == 3, other=np.nan)

    # Reshape into one long column. NaN cells are dropped explicitly because
    # the newer pandas stack() implementation no longer drops them implicitly.
    stacked_df = filtered_matrix_df.stack().dropna().reset_index()

    # Rename the columns
    stacked_df.columns = ['row', 'col', 'val']

    # Create the desired rN_cN format (1-based) for the final output
    stacked_df['row'] = (stacked_df['row'] + 1).astype(str)
    stacked_df['col'] = (stacked_df['col'] + 1).astype(str)
    stacked_df['r_c'] = 'r' + stacked_df['row'] + '_c' + stacked_df['col']

    result_df = stacked_df[['r_c', 'val']]

    result_df.to_csv(output_csv, index=False, header=['Id', 'Prediction'])

# Import Matrix and Choose Parameters

In [29]:
# Dimensions used for the factor matrices and the ALS loops
rows = 10000
cols = 1000

# Factorization rank, regularization strength and number of ALS sweeps
rank = 7
regularization_lambda = 0.01
num_iterations = 100

# Fix the torch RNG so the random initialization of U and V is reproducible
torch.manual_seed(31415)

<torch._C.Generator at 0x10bfbc330>

In [30]:
# Load the ratings as a dense array (cells missing from the CSV are 0)
# and turn it into a float32 torch tensor
matrix = convert_csv_to_matrix(input_csv, 'zero')
torch_matrix = torch.from_numpy(matrix)
A = torch_matrix.to(torch.float32)

# Every non-zero entry counts as a known value
known_values_mask = A.ne(0)

# Random (standard normal) starting factors; U is drawn before V
U = torch.randn((rows, rank))
V = torch.randn((cols, rank))

# Core ALS Algorithm
Adapted from the lecture script

In [31]:
# Calculate the quadratic part of formula (2.36), for easier use during the summation

def calculate_Q(input_matrix):
    """Return the outer product of every row of ``input_matrix`` with itself.

    Parameters
    ----------
    input_matrix : torch.Tensor
        2-D tensor of shape ``(n, rank)``.

    Returns
    -------
    torch.Tensor
        Tensor ``Q`` of shape ``(n, rank, rank)`` with
        ``Q[i, j, k] = input_matrix[i, j] * input_matrix[i, k]``.
        The result has the dtype and device of ``input_matrix``.
    """
    # Broadcasting (n, rank, 1) * (n, 1, rank) yields all n outer products at
    # once, replacing a Python loop over the rows
    return input_matrix.unsqueeze(2) * input_matrix.unsqueeze(1)

In [32]:
# Result used if the loop below does not run at all (num_iterations == 0)
predicted_ratings = A.clone()

# Loop-invariant terms, hoisted out of the iterations
known_values_float = known_values_mask.to(A.dtype)        # (rows, cols), 1.0 where a value is known
ridge = 2 * regularization_lambda * torch.eye(rank)       # (rank, rank) regularization term

for iteration in range(num_iterations):
    # Optimize V, while U is fixed
    Q_U = calculate_Q(U)                                   # (rows, rank, rank)

    B_U = U.T @ A                                          # (rank, cols); unknown entries of A are 0

    # For every column, sum Q_U over the rows with a known value. The masked
    # matmul replaces the rows x cols Python loop of the original version
    # (same sums, possibly differing at floating-point rounding level).
    sum_U_fixed = (known_values_float.T @ Q_U.reshape(rows, -1)).reshape(cols, rank, rank)

    # Solve all `cols` regularized systems in one batch instead of inverting each matrix
    V = torch.linalg.solve(sum_U_fixed + ridge, B_U.T.unsqueeze(-1)).squeeze(-1)

    # Optimize U, while V is fixed
    Q_V = calculate_Q(V)                                   # (cols, rank, rank)

    B_V = V.T @ A.T                                        # (rank, rows)

    # For every row, sum Q_V over the columns with a known value
    sum_V_fixed = (known_values_float @ Q_V.reshape(cols, -1)).reshape(rows, rank, rank)

    U = torch.linalg.solve(sum_V_fixed + ridge, B_V.T.unsqueeze(-1)).squeeze(-1)

    predicted_ratings = U @ V.T

    # Calculate and display the loss (RMSE) for the known values
    print(f'Iteration {iteration + 1}/{num_iterations}, Error: {round(math.sqrt(torch.nn.functional.mse_loss(predicted_ratings[known_values_mask], A[known_values_mask]).item()),5)}')

# Clamp the result matrix to [1,5]
matrix_out = torch.clamp(predicted_ratings, min=1.0, max=5.0)

Iteration 1/100, Error: 2.54482
Iteration 2/100, Error: 0.964
Iteration 3/100, Error: 0.94186
Iteration 4/100, Error: 0.93445
Iteration 5/100, Error: 0.93049
Iteration 6/100, Error: 0.92795
Iteration 7/100, Error: 0.92615
Iteration 8/100, Error: 0.9248
Iteration 9/100, Error: 0.92375
Iteration 10/100, Error: 0.92292
Iteration 11/100, Error: 0.92227
Iteration 12/100, Error: 0.92178
Iteration 13/100, Error: 0.92143
Iteration 14/100, Error: 0.92118
Iteration 15/100, Error: 0.92101
Iteration 16/100, Error: 0.92088
Iteration 17/100, Error: 0.92079
Iteration 18/100, Error: 0.92072
Iteration 19/100, Error: 0.92067
Iteration 20/100, Error: 0.92062
Iteration 21/100, Error: 0.92059
Iteration 22/100, Error: 0.92056
Iteration 23/100, Error: 0.92053
Iteration 24/100, Error: 0.92051
Iteration 25/100, Error: 0.92049
Iteration 26/100, Error: 0.92047
Iteration 27/100, Error: 0.92046
Iteration 28/100, Error: 0.92045
Iteration 29/100, Error: 0.92043
Iteration 30/100, Error: 0.92042
Iteration 31/100, Erro

# Export Matrix to CSV

In [33]:
# Export the clamped predictions for the cells selected by the relevant-values file
save_matrix_to_csv(
    matrix=matrix_out,
    relevant_values_csv=relevant_values_csv,
    output_csv=output_csv,
)