In [15]:
import pandas as pd
import numpy as np
import torch
from scipy.sparse import coo_matrix
import os

# Filesystem locations shared by the notebook cells:
#   d -> directory the ratings CSV is read from
#   o -> directory handed to the training routine for its checkpoints
d, o = (
    '/mnt/workspace/Book-Rec-Sys/input/dataset',
    '/mnt/workspace/Book-Rec-Sys/output',
)

def load_data(data_dir=None):
    """Read ``ratings.csv`` into a DataFrame and print a short preview.

    Args:
        data_dir: Directory containing ``ratings.csv``. When omitted, the
            notebook-level dataset directory ``d`` is used, so existing
            ``load_data()`` calls behave exactly as before.

    Returns:
        pandas.DataFrame: The parsed contents of ``ratings.csv``.
    """
    if data_dir is None:
        # Fall back to the notebook-wide dataset directory.
        data_dir = d
    ratings = pd.read_csv(f'{data_dir}/ratings.csv')
    print("Ratings Data Loaded")
    print(ratings.head())
    return ratings

# Load the ratings table once; the cells below reuse it.
ratings = load_data()

# Matrix dimensions come from the largest id found in each id column.
num_users, num_books = (ratings[column].max() for column in ('user_id', 'book_id'))

Ratings Data Loaded
   user_id  book_id  rating
0        1      258       5
1        2     4081       4
2        2      260       5
3        2     9296       5
4        2     2318       3


In [16]:
# Build the sparse user x book rating matrix.
# Ids are shifted down by one so they can serve as 0-based row/column indices.
rows = ratings['user_id'].sub(1)
cols = ratings['book_id'].sub(1)
values = ratings['rating']
rating_matrix = coo_matrix(
    (values, (rows, cols)),
    shape=(num_users, num_books),
)

In [20]:
def matrix_factorization(R, K, steps=5, alpha=0.0002, beta=0.02, save_interval=500, output_dir='output', device='cuda', seed=None):
    """Factorize a rating matrix as ``R ~ P @ Q.T`` with per-row gradient updates.

    Every step first updates each user's factor row from the entries that are
    non-zero in that user's row of ``R``, then updates each book's factor
    column from the non-zero entries in that book's column. After both sweeps
    the regularized squared error over the strictly positive entries of ``R``
    is computed and used as the early-stop criterion.

    Args:
        R: Rating matrix of shape (num_users, num_books). Either an object
            exposing ``toarray()`` (e.g. a scipy sparse matrix) or a dense
            array-like. Zero entries are treated as "not rated".
        K: Number of latent factors.
        steps: Maximum number of full user+book sweeps.
        alpha: Learning rate.
        beta: Regularization weight.
        save_interval: P and Q are checkpointed whenever
            ``step % save_interval == 0`` (which includes step 0) and after
            the final step. A falsy value (0 / None) disables the periodic
            checkpoints and keeps only the final one.
        output_dir: Directory receiving ``P_step_<n>.pt`` / ``Q_step_<n>.pt``.
            It is created if it does not exist.
        device: Torch device on which all tensors are allocated.
        seed: Optional integer seed for the random initialisation of P and Q.
            When None, torch's global random state is used.

    Returns:
        tuple: ``(P, Q)`` where P has shape (num_users, K) and Q has shape
        (num_books, K).

    Note:
        The checkpointed ``Q_step_<n>.pt`` holds Q in its internal
        (K, num_books) orientation, i.e. the transpose of the returned Q.
    """
    # Accept both sparse matrices and already-dense inputs.
    dense = R.toarray() if hasattr(R, 'toarray') else R
    R = torch.as_tensor(dense, dtype=torch.float32, device=device)
    num_users, num_books = R.shape

    generator = None
    if seed is not None:
        generator = torch.Generator(device=device).manual_seed(seed)
    P = torch.rand(num_users, K, device=device, generator=generator)
    # Q is kept as (K, num_books) internally so that P @ Q predicts R directly.
    Q = torch.rand(num_books, K, device=device, generator=generator).T

    # R never changes during training, so the index lookups and the
    # observed-entry mask are computed once instead of once per step.
    books_of_user = [R[i, :].nonzero().view(-1) for i in range(num_users)]
    users_of_book = [R[:, j].nonzero().view(-1) for j in range(num_books)]
    observed = R > 0
    observed_ratings = R[observed]

    for step in range(steps):
        # Sweep 1: update every user's factors against the books they rated.
        for i in range(num_users):
            rated_indices = books_of_user[i]
            Q_i = Q[:, rated_indices]
            e_i = R[i, rated_indices] - torch.matmul(P[i, :], Q_i)
            P[i, :] += alpha * (torch.matmul(e_i, Q_i.T) - beta * P[i, :])

        # Sweep 2: update every book's factors against the users who rated it.
        for j in range(num_books):
            rated_indices = users_of_book[j]
            P_j = P[rated_indices, :]
            e_j = R[rated_indices, j] - torch.matmul(P_j, Q[:, j])
            Q[:, j] += alpha * (torch.matmul(P_j.T, e_j) - beta * Q[:, j])

        # Regularized squared error over the observed (positive) ratings.
        eR = torch.matmul(P, Q)
        e = torch.sum((observed_ratings - eR[observed]) ** 2)
        e += beta / 2 * (torch.sum(P ** 2) + torch.sum(Q ** 2))

        # Save P and Q every 'save_interval' steps and after the last step.
        periodic = bool(save_interval) and step % save_interval == 0
        if periodic or step == steps - 1:
            # torch.save does not create missing directories itself.
            os.makedirs(output_dir, exist_ok=True)
            torch.save(P, os.path.join(output_dir, f'P_step_{step}.pt'))
            torch.save(Q, os.path.join(output_dir, f'Q_step_{step}.pt'))
            print(f'Saved P and Q at step {step}')

        if e < 0.001:
            break

    return P, Q.T

In [21]:
# Number of latent factors learned per user and per book
K = 2

# Train on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
P, Q = matrix_factorization(rating_matrix, K, output_dir=o, device=device)

# RMSE over the positively rated entries. Note this is measured on the same
# ratings the factors were trained on, not on a held-out set.
actual_ratings = torch.FloatTensor(rating_matrix.toarray()).to(device)  # dense copy on the training device
predicted_ratings = P @ Q.T
mask = actual_ratings > 0
error = (actual_ratings[mask] - predicted_ratings[mask]).pow(2).mean().sqrt()

print("Prediction Error:", error.item())  # .item() yields a plain Python float

Saved P and Q at step 0
Saved P and Q at step 4
Prediction Error: 2.3262786865234375
