In [1]:
import pandas as pd
import numpy as np
import torch
import torch.optim as optim # To use SGD
from scipy.sparse import coo_matrix
import os

# Directory containing the per-fold CSV splits; load_data() reads
# '<d>/<name>_data_fold<fold>.csv' from here.
# NOTE(review): absolute path tied to one machine's workspace layout.
d = '/mnt/workspace/Book-Rec-Sys/input/folds'
# Directory handed to matrix_factorization() (through init) as `output_dir`,
# i.e. where the P/Q checkpoint files are written.
o = '/mnt/workspace/Book-Rec-Sys/output'

def load_data(name, fold):
    """Read the ratings CSV for one data split of one fold.

    The file is looked up as ``<d>/<name>_data_fold<fold>.csv`` where ``d`` is
    the module-level input directory.

    Args:
        name: Split name used in the file name (the notebook uses 'train' and 'test').
        fold: Fold identifier used in the file name.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the file does not
        exist (a message is printed in that case).
    """
    csv_path = f'{d}/{name}_data_fold{fold}.csv'

    # Guard clause: report the missing file and signal it with None.
    if not os.path.exists(csv_path):
        print(f"File not found: {csv_path}")
        return None

    fold_ratings = pd.read_csv(csv_path)
    print(f"{name} Ratings Data for Fold {fold} Loaded")
    # Preview of the first rows as a quick sanity check of the columns.
    print(fold_ratings.head())
    return fold_ratings



In [2]:
def _to_dense_tensor(matrix, device):
    """Return `matrix` (SciPy sparse matrix or dense array-like) as a float32 tensor on `device`."""
    dense = matrix.toarray() if hasattr(matrix, 'toarray') else matrix
    return torch.as_tensor(dense, dtype=torch.float32, device=device)


def _masked_rmse(actual, predicted):
    """Return the RMSE between `actual` and `predicted` over the entries where `actual` > 0."""
    observed = actual > 0
    return torch.sqrt(torch.mean((actual[observed] - predicted[observed]) ** 2))


def matrix_factorization(R, K, steps=500, alpha=0.1, beta=0, save_interval=500, output_dir='output', device='cuda', R_test=None):
    """Factorize a rating matrix as ``R ~= P @ Q`` using Adam on a masked squared error.

    Only entries of ``R`` greater than zero are treated as observed ratings and
    contribute to the data term of the loss; ``beta / 2 * (||P||^2 + ||Q||^2)``
    is added as regularization. After every optimizer step the loss and the
    RMSE on the observed training entries (and on the test entries, when a
    test matrix is available) are printed.

    Args:
        R: Training rating matrix, shape (num_users, num_books). A SciPy sparse
            matrix (anything exposing ``toarray()``) or a dense array-like.
        K: Number of latent factors.
        steps: Maximum number of optimizer steps.
        alpha: Learning rate passed to Adam.
        beta: Regularization weight.
        save_interval: P and Q are saved whenever ``step % save_interval == 0``
            and on the last scheduled step.
        output_dir: Directory for the saved ``.pt`` files; created if missing.
        device: Torch device on which all tensors are placed.
        R_test: Optional held-out rating matrix with the same shape as ``R``.
            When omitted, a module-level ``trating_matrix`` is used if one is
            defined (this keeps existing callers, which relied on that global,
            working); if neither exists the test RMSE is simply not reported.

    Returns:
        Tuple ``(P, Q)`` with ``P`` of shape (num_users, K) and ``Q`` of shape
        (num_books, K). Both are tensors with ``requires_grad=True``.

    Raises:
        ValueError: If the test matrix does not have the same shape as ``R``.
    """
    # Densify once, outside the training loop (these conversions are loop-invariant).
    R = _to_dense_tensor(R, device)
    num_users, num_books = R.shape

    if R_test is None:
        # Legacy fallback: earlier versions read this notebook-level global directly.
        R_test = globals().get('trating_matrix')
    if R_test is not None:
        R_test = _to_dense_tensor(R_test, device)
        if R_test.shape != R.shape:
            raise ValueError(
                f"Test matrix shape {tuple(R_test.shape)} does not match "
                f"training matrix shape {tuple(R.shape)}"
            )

    # Factors are drawn on the CPU and then moved to `device`, so the function
    # honours the `device` argument instead of unconditionally calling .cuda().
    # requires_grad_ makes them leaf parameters that autograd tracks.
    P = torch.rand(num_users, K).to(device).requires_grad_(True)
    # Q is stored already transposed, shape (K, num_books), so P @ Q gives the prediction.
    Q = torch.rand(num_books, K).T.contiguous().to(device).requires_grad_(True)

    # Adam updates P and Q with learning rate alpha.
    optimizer = optim.Adam([P, Q], lr=alpha)

    # Observed-entry mask of the training matrix; constant across steps.
    train_mask = R > 0

    # Make sure the checkpoint directory exists before the first torch.save.
    os.makedirs(output_dir, exist_ok=True)

    for step in range(steps):
        optimizer.zero_grad()
        eR = torch.matmul(P, Q)
        loss = torch.sum((train_mask * (R - eR)) ** 2)
        loss = loss + beta / 2 * (torch.sum(P ** 2) + torch.sum(Q ** 2))
        loss.backward()
        optimizer.step()
        loss_value = loss.item()
        print('step:', step)
        print("loss:", loss_value)

        # Evaluation only: no need to record these operations for autograd.
        with torch.no_grad():
            predicted_ratings = torch.matmul(P, Q)
            train_error = _masked_rmse(R, predicted_ratings)
            print("Prediction Error in train dataset:", train_error.item())
            if R_test is not None:
                test_error = _masked_rmse(R_test, predicted_ratings)
                print("Prediction Error in test dataset:", test_error.item())

        # Save P and Q every 'save_interval' steps and on the final scheduled step.
        if step % save_interval == 0 or step == steps - 1:
            torch.save(P, os.path.join(output_dir, f'P_step_{step}_alpha_{alpha}_beta_{beta}.pt'))
            torch.save(Q, os.path.join(output_dir, f'Q_step_{step}_alpha_{alpha}_beta_{beta}.pt'))
            print(f'Saved P and Q')

        # Early stop once the (pre-step) loss is negligible.
        if loss_value < 0.001:
            break

    return P, Q.T

In [3]:
def init(K=2):
    """Train one matrix-factorization model with `K` latent factors.

    Uses the module-level `rating_matrix` as training data and writes the
    checkpoints to the module-level output directory `o`. CUDA is used when
    available, otherwise the CPU.

    Args:
        K: Number of latent factors.

    Returns:
        The ``(P, Q)`` pair returned by ``matrix_factorization`` so callers can
        inspect or reuse the trained factors (previously they were discarded).
    """
    # Run the matrix factorization on the best available device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    P, Q = matrix_factorization(rating_matrix, K, device=device, output_dir=o)
    return P, Q

In [4]:
# Ask PyTorch to release the cached, currently unused CUDA memory it is holding,
# so that memory is free again before the training runs in the next cell.
torch.cuda.empty_cache()

In [None]:
# Load training and test data for fold 1
ratings = load_data('train',1)
tratings = load_data('test',1)

# load_data returns None for a missing file; stop here with a clear error
# instead of failing later on an unrelated TypeError.
if ratings is None or tratings is None:
    raise FileNotFoundError("Train and/or test ratings file for fold 1 was not found")

# Both matrices must share ONE shape: the per-step test evaluation indexes the
# (num_users, num_books) prediction matrix with a mask built from the test
# matrix, which only works when the two shapes are identical. IDs are 1-based,
# so the largest ID seen in either split gives the required dimension.
num_users = int(max(ratings['user_id'].max(), tratings['user_id'].max()))
num_books = int(max(ratings['book_id'].max(), tratings['book_id'].max()))
tnum_users, tnum_books = num_users, num_books

# Convert the training ratings to a sparse matrix (shift IDs to 0-based indices)
rows = ratings['user_id'] - 1
cols = ratings['book_id'] - 1
values = ratings['rating']
rating_matrix = coo_matrix((values, (rows, cols)), shape=(num_users, num_books))

# Convert the test ratings to a sparse matrix of the same shape
trows = tratings['user_id'] - 1
tcols = tratings['book_id'] - 1
tvalues = tratings['rating']
trating_matrix = coo_matrix((tvalues, (trows, tcols)), shape=(tnum_users, tnum_books))

# Train one model per latent dimension K
for K in [3,6,8,12,15,20,40]:
    print(f'Now K = {K}')
    init(K)


train Ratings Data for Fold 1 Loaded
   user_id  book_id  rating
0        2     4081       4
1        2      260       5
2        2     2318       3
3        2       26       4
4        2      315       3
test Ratings Data for Fold 1 Loaded
   user_id  book_id  rating
0        1      258       5
1        2     9296       5
2        2      301       5
3        2     8519       5
4        4       18       5
Now K = 3
step: 0
loss: 53535720.0
Prediction Error in train dataset: 3.0446360111236572
Prediction Error in test dataset: 3.0453174114227295
Saved P and Q
step: 1
loss: 44320648.0
Prediction Error in train dataset: 2.695993185043335
Prediction Error in test dataset: 2.6965444087982178
step: 2
loss: 34751448.0
Prediction Error in train dataset: 2.3095920085906982
Prediction Error in test dataset: 2.3100061416625977
step: 3
loss: 25503864.0
Prediction Error in train dataset: 1.9061341285705566
Prediction Error in test dataset: 1.9064195156097412
step: 4
loss: 17371700.0
Prediction Erro