# SVD

- During training, the missing entries of the user-item interaction matrix are filled with each user's mean rating.

In [6]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
from torch.utils.data import DataLoader
import torch
import yaml

import dataset
import utils

# Read the experiment configuration and pull out the dataset settings
config_path = '../config/config.yaml'
config = utils.load_config(config_path)

# Dataset file locations and matrix dimensions, all under 'data_config'
train_path, test_path, num_users, num_items = (
    config['data_config'][key]
    for key in ('train_path', 'test_path', 'num_users', 'num_items')
)

def load_data_to_matrix(dataset, num_users, num_items):
    """Build a sparse ``num_users x num_items`` rating matrix from a dataset.

    Every sample ``dataset[i]`` is unpacked as ``(user, item, rating)`` and
    each element is converted to a Python scalar with ``.item()``. User and
    item IDs are shifted down by one to obtain the row and column indices.

    Returns:
        A ``csr_matrix`` of shape ``(num_users, num_items)`` holding the ratings.
    """
    # Index explicitly: only __len__ and __getitem__ are relied upon here
    samples = [dataset[i] for i in range(len(dataset))]

    user_rows = [user.item() - 1 for user, _, _ in samples]
    item_cols = [item.item() - 1 for _, item, _ in samples]
    ratings = [rating.item() for _, _, rating in samples]

    return csr_matrix((ratings, (user_rows, item_cols)), shape=(num_users, num_items))

def fill_missing_with_user_means(matrix):
    """Fill the missing (zero) entries of each row with that row's mean.

    The mean of a row is the sum of its entries divided by the number of
    stored entries in that row. Rows with no stored entries get a mean of 0
    and are therefore left as all zeros.

    Args:
        matrix: scipy sparse matrix with one row per user; zero cells are
            treated as missing ratings.

    Returns:
        A ``csr_matrix`` of the same shape in which every zero cell has been
        replaced by its row mean. Integer-typed input is promoted to float64
        so the fractional means are not truncated.
    """
    # Clamp the count to 1 so that empty rows yield 0 instead of dividing by zero
    row_sums = np.asarray(matrix.sum(axis=1)).ravel()
    row_counts = np.maximum(matrix.getnnz(axis=1), 1)
    user_means = row_sums / row_counts

    dense = matrix.toarray()
    # Assigning float means into an integer array would silently truncate them
    if not np.issubdtype(dense.dtype, np.floating):
        dense = dense.astype(np.float64)

    # Vectorized fill: broadcast each row's mean across the row, then copy it
    # into the missing cells only
    missing = dense == 0
    row_mean_grid = np.broadcast_to(user_means[:, np.newaxis], dense.shape)
    dense[missing] = row_mean_grid[missing]

    return csr_matrix(dense)

# Build the train and test datasets from the configured file paths
train_dataset = dataset.MovieLensDataset(data_path=train_path)
test_dataset = dataset.MovieLensDataset(data_path=test_path)

# Loaders over the same datasets, created without shuffling
train_loader = DataLoader(train_dataset, shuffle=False, batch_size=None)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=None)

# Convert both splits into sparse user-item rating matrices
train_matrix, test_matrix = (
    load_data_to_matrix(split, num_users, num_items)
    for split in (train_dataset, test_dataset)
)

# Replace the unobserved training entries with each user's mean rating
train_matrix_filled = fill_missing_with_user_means(train_matrix)

# Truncated SVD of the filled training matrix
k = 10  # number of latent factors (singular triplets) to keep
U, sigma, Vt = svds(train_matrix_filled, k=k)
print(U.shape, Vt.shape, sigma, sep='\n')
sigma = np.diag(sigma)

# Rank-k reconstruction gives a predicted rating for every user-item pair
all_user_predicted_ratings = U @ sigma @ Vt

# Score the reconstruction only on the cells that hold a test rating
test_nonzero = test_matrix.nonzero()
test_actual = np.asarray(test_matrix[test_nonzero]).ravel()
test_predictions = all_user_predicted_ratings[test_nonzero].ravel()

rmse = np.sqrt(mean_squared_error(test_actual, test_predictions))
print(f'Test RMSE: {rmse}')


(943, 10)
(10, 1682)
[  26.83057367   26.96800995   28.60706748   29.62950085   30.17979715
   31.6563303    33.61237247   41.47937465   54.23470322 4555.66003329]
Test RMSE: 1.0163891510512737
