In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import sys
import torch

def convert_csv_to_matrix(input_csv, format):
    """Load a CSV of ``rN_cM`` cell ids into a dense 2-D NumPy array.

    The CSV must have an ``Id`` column whose entries look like ``r<row>_c<col>``
    (1-based indices) and a ``Prediction`` column holding the cell value.

    Args:
        input_csv: Path or file-like object accepted by ``pandas.read_csv``.
        format: ``"zero"`` fills cells absent from the CSV with 0; any other
            value fills them with NaN.

    Returns:
        A float array of shape ``(max row index, max column index)``.
    """
    df = pd.read_csv(input_csv)

    # Same parsing as before: drop the leading letter of each '_'-separated part.
    cells = pd.DataFrame({
        'row': df['Id'].map(lambda cell_id: int(cell_id.split('_')[0][1:])),
        'col': df['Id'].map(lambda cell_id: int(cell_id.split('_')[1][1:])),
        'val': df['Prediction'],
    })

    max_row = cells['row'].max()
    max_col = cells['col'].max()

    # NumPy does not guarantee which value wins when an index repeats inside a
    # single fancy assignment, so keep only the last occurrence of each cell
    # explicitly (the row-by-row loops this replaces also let the last one win).
    cells = cells.drop_duplicates(subset=['row', 'col'], keep='last')

    fill_value = 0.0 if format == "zero" else np.nan
    matrix = np.full((max_row, max_col), fill_value)

    # One vectorised assignment instead of per-row Python loops.
    matrix[cells['row'].to_numpy() - 1, cells['col'].to_numpy() - 1] = cells['val'].to_numpy()

    return matrix

def mean_matrix(matrix):
    """Replace the NaN entries of every row with that row's NaN-ignoring mean.

    The array is modified in place and the same object is returned.
    """
    for idx, values in enumerate(matrix):
        fill = np.nanmean(values)
        missing = np.isnan(values)
        # Known entries are kept as they are; only the gaps receive the mean.
        matrix[idx] = np.where(missing, fill, values)
    return matrix


def normalize_matrix(matrix):
    """Centre every row on its NaN-ignoring mean.

    Missing (NaN) entries are first set to the row mean, so they end up at 0
    after the mean is subtracted. The array is modified in place and the same
    object is returned.
    """
    for idx, values in enumerate(matrix):
        centre = np.nanmean(values)
        imputed = np.where(np.isnan(values), centre, values)
        matrix[idx] = imputed - centre

    return matrix

def scale_matrix(matrix):
    """Return a min-max scaled copy of ``matrix`` with values in [0, 1].

    Scaling is done by scikit-learn's ``MinMaxScaler``, which fits and rescales
    each column (feature) independently.
    """
    return MinMaxScaler(feature_range=(0, 1)).fit_transform(matrix)

def svd_approximation(matrix, num_discarded=200):
    """Return a low-rank approximation of ``matrix`` via truncated SVD.

    Args:
        matrix: Floating-point torch tensor to approximate.
        num_discarded: How many of the smallest singular values to set to
            zero. The default of 200 keeps the previous hard-coded behaviour.
            If it is at least the number of singular values, all of them are
            zeroed and the result is a zero matrix.

    Returns:
        A tensor with the same shape as ``matrix``, rebuilt from the retained
        singular values.

    Raises:
        ValueError: If ``num_discarded`` is negative.
    """
    if num_discarded < 0:
        raise ValueError("num_discarded must be non-negative")

    # SVD Decomposition; singular values come back in descending order
    U, S, Vh = torch.linalg.svd(matrix, full_matrices=False)

    # Compute the cut index explicitly: a negative slice such as S[-n:] would
    # select *every* singular value when n == 0.
    num_kept = max(S.shape[-1] - num_discarded, 0)
    S[..., num_kept:] = 0

    # Scaling the columns of U by S equals U @ diag(S) without building the
    # dense diagonal matrix.
    return (U * S.unsqueeze(-2)) @ Vh

def save_matrix_to_csv(matrix, relevant_values_csv, output_csv):
    """Write the requested cells of ``matrix`` to a CSV in ``rN_cM`` id format.

    Args:
        matrix: 2-D array-like of values (anything ``pandas.DataFrame`` accepts).
        relevant_values_csv: CSV in the same ``Id``/``Prediction`` layout as the
            input data; cells whose ``Prediction`` equals 3 are the ones written.
        output_csv: Destination path for the ``Id,Prediction`` output file.
    """
    # Import the relevant values csv and convert to dataframe
    relevant_values_df = pd.DataFrame(convert_csv_to_matrix(relevant_values_csv, 'zero'))

    matrix_df = pd.DataFrame(matrix)

    # Keep only the cells selected by the criterion relevant_values_df == 3;
    # every other cell becomes NaN.
    filtered_matrix_df = matrix_df.where(relevant_values_df == 3)

    # Reshape into one long column. NaN cells are dropped explicitly because
    # the newer DataFrame.stack implementation no longer removes them itself.
    stacked_df = filtered_matrix_df.stack().dropna().reset_index()
    stacked_df.columns = ['row', 'col', 'val']

    # Build the 1-based rN_cN ids expected in the final output
    row_labels = (stacked_df['row'] + 1).astype(str)
    col_labels = (stacked_df['col'] + 1).astype(str)
    stacked_df['r_c'] = 'r' + row_labels + '_c' + col_labels

    result_df = stacked_df[['r_c', 'val']]

    result_df.to_csv(output_csv, index=False, header=['Id', 'Prediction'])


def main(input_csv, relevant_values_csv, output_csv, format):
    """Load the input matrix, process it according to ``format`` and save it.

    Args:
        input_csv: CSV with the known values (``Id``/``Prediction`` columns).
        relevant_values_csv: CSV selecting which cells are written out.
        output_csv: Destination path of the result CSV.
        format: One of ``'zero'``, ``'mean'``, ``'normalize'``, ``'scale'`` or
            ``'svd'``. Any other value prints an error and exits with status 1.
    """
    matrix = convert_csv_to_matrix(input_csv, format)

    if format == 'zero':
        result = matrix
    elif format == 'mean':
        result = mean_matrix(matrix)
    elif format == 'normalize':
        # Normalize the matrix
        result = normalize_matrix(matrix)
    elif format == 'scale':
        # Normalize and scale the matrix
        result = scale_matrix(normalize_matrix(matrix))
    elif format == 'svd':
        # Use SVD for low rank approximation

        # Record which entries are known, and copy their values, BEFORE
        # imputation: mean_matrix fills the NaNs of `matrix` in place and
        # torch.from_numpy shares memory with it.
        original = torch.from_numpy(matrix)
        known_values_mask = ~torch.isnan(original)
        known_values = original[known_values_mask]

        # Mean imputed matrix
        svd_matrix = torch.from_numpy(mean_matrix(matrix))

        # Iterate through SVD approximating and re-filling the known original values
        for _ in range(5):
            svd_matrix = svd_approximation(svd_matrix)
            # Metric on how close the approximated values match the known values
            print(torch.dist(svd_matrix[known_values_mask], known_values))

            # Re-fill the known original values
            svd_matrix[known_values_mask] = known_values

        result = svd_matrix
    else:
        print(f"Unknown action: {format}")
        sys.exit(1)

    # Save whichever matrix the chosen format produced. Previously this always
    # referenced the SVD result, which is only defined for format == 'svd'.
    save_matrix_to_csv(result, relevant_values_csv, output_csv)

# Run the SVD pipeline only when executed as a script or notebook cell, so that
# importing this module does not start reading and writing CSV files.
if __name__ == "__main__":
    main('data_train.csv', 'sampleSubmission.csv', 'result.csv', "svd")

tensor(149.8536, dtype=torch.float64)
tensor(86.2147, dtype=torch.float64)
tensor(51.7964, dtype=torch.float64)
tensor(32.4075, dtype=torch.float64)
tensor(21.0684, dtype=torch.float64)
