# In [2]:
import pandas as pd
import numpy as np
import os  # For path operations


def load_data(d):
    """Read ``ratings.csv`` from directory ``d`` and return it as a DataFrame.

    A confirmation line and the first rows of the table are printed as a
    quick sanity check before the frame is returned.
    """
    csv_path = f'{d}/ratings.csv'
    frame = pd.read_csv(csv_path)

    # Show a short preview so the caller can eyeball the loaded columns.
    print("Ratings Data Loaded")
    preview = frame.head()
    print(preview)

    return frame
    
    
# SGD-based Matrix Factorization with saving mechanism
def sgd_matrix_factorization(R, K, iterations, alpha, beta, save_interval, output_dir):
    """Factorize ``R`` into ``P @ Q.T`` using stochastic gradient descent.

    Only strictly positive entries of ``R`` are treated as observed ratings;
    all other cells are ignored during training.

    Args:
        R: 2-D NumPy array of shape ``(num_users, num_books)``.
        K: Number of latent factors (also sets the init scale ``1 / K``).
        iterations: Number of full passes over the observed ratings.
        alpha: Learning rate.
        beta: L2 regularization strength.
        save_interval: Write ``P`` and ``Q`` to CSV whenever
            ``iteration % save_interval == 0``. A value of ``0``, a negative
            value or ``None`` disables checkpointing.
        output_dir: Directory receiving ``P_iteration_<n>.csv`` and
            ``Q_iteration_<n>.csv``; created if it does not exist and
            checkpointing is enabled.

    Returns:
        Tuple ``(P, Q)`` with shapes ``(num_users, K)`` and ``(num_books, K)``.
    """
    num_users, num_books = R.shape
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_books, K))

    # Collect the observed cells once instead of rescanning the whole dense
    # matrix on every iteration. argwhere yields row-major order, i.e. the
    # same visiting order as a nested user/book loop.
    observed = np.argwhere(R > 0)

    saving_enabled = bool(save_interval) and save_interval > 0
    if saving_enabled:
        os.makedirs(output_dir, exist_ok=True)

    for iteration in range(iterations):
        for i, j in observed:
            eij = R[i, j] - P[i] @ Q[j]
            # Compute both steps from the current factors before applying
            # either, so Q's gradient is not evaluated at the updated P.
            p_step = alpha * (eij * Q[j] - beta * P[i])
            q_step = alpha * (eij * P[i] - beta * Q[j])
            P[i] += p_step
            Q[j] += q_step

        # Save P and Q every 'save_interval' iterations
        if saving_enabled and iteration % save_interval == 0:
            np.savetxt(os.path.join(output_dir, f'P_iteration_{iteration}.csv'), P, delimiter=',')
            np.savetxt(os.path.join(output_dir, f'Q_iteration_{iteration}.csv'), Q, delimiter=',')
            print(f'Saved P and Q at iteration {iteration}')

    return P, Q

# Main
def main():
    """Train a matrix-factorization model on the ratings CSV and print its RMSE.

    Loads the ratings, builds a dense user-by-book matrix (ids are shifted by
    one to become zero-based indices), runs SGD matrix factorization and
    prints the root-mean-square error over the observed (positive) ratings.

    Raises:
        ValueError: If the ratings table is empty or contains an id below 1.
    """
    d = '/mnt/workspace/Book-Rec-Sys/input/dataset'
    o = '/mnt/workspace/Book-Rec-Sys/output'

    # Ensure the output directory exists (no check-then-create race)
    os.makedirs(o, exist_ok=True)

    # Load and prepare data
    ratings = load_data(d)
    if ratings.empty:
        raise ValueError("ratings.csv contains no rows")

    user_idx = ratings['user_id'].to_numpy() - 1
    book_idx = ratings['book_id'].to_numpy() - 1
    # An id of 0 or below would become a negative index and silently write to
    # the end of the matrix, so reject it up front.
    if user_idx.min() < 0 or book_idx.min() < 0:
        raise ValueError("user_id and book_id must be >= 1")

    num_users = int(user_idx.max()) + 1
    num_books = int(book_idx.max()) + 1

    # Fill the dense matrix in one vectorized assignment instead of row by row.
    rating_matrix = np.zeros((num_users, num_books))
    rating_matrix[user_idx, book_idx] = ratings['rating'].to_numpy()

    # Matrix factorization parameters
    K = 2
    iterations = 1
    alpha = 0.0002
    beta = 0.02
    save_interval = 20  # Save every 20 iterations

    # Perform matrix factorization using SGD
    P, Q = sgd_matrix_factorization(rating_matrix, K, iterations, alpha, beta, save_interval, o)

    # Prediction and Error Calculation (RMSE over observed ratings only)
    observed = rating_matrix > 0
    predicted_ratings = np.dot(P, Q.T)[observed]
    actual_ratings = rating_matrix[observed]
    error = np.sqrt(np.mean((actual_ratings - predicted_ratings) ** 2))

    print("Prediction Error:", error)

# Run the main function only when executed directly (script or notebook
# cell), so importing this module does not start a training run.
if __name__ == "__main__":
    main()

# Cell output:
# Ratings Data Loaded
#    user_id  book_id  rating
# 0        1      258       5
# 1        2     4081       4
# 2        2      260       5
# 3        2     9296       5
# 4        2     2318       3
# Saved P and Q at iteration 0
# Prediction Error: 4.053971476650242
