# Full Model Training & Recommendation Generation:

This modeling is added as a future improvement due to the runtime limitation.

In [2]:
import numpy as np
import pandas as pd
import pymc as pm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, precision_score, recall_score

In [3]:
# Load dataset
df = pd.read_csv("../book_ratings_cleaned.csv")

# Select relevant columns
df = df[['User-ID', 'ISBN', 'Book-Rating']]

In [4]:
# Drop zero values as they are not the actual rating
df = df[df['Book-Rating'] > 0]

In [5]:
# Check the size of the data
len(df)

383840

In [6]:
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276744,038550120X,7
13,276747,0060517794,9


In [7]:
# Create a dataframe grouped by unique users
def create_df_user_reviews(df):
    # Count the number of reviews per user
    user_counts = df['User-ID'].value_counts()
    
    # Store number of reviews per user in a new DataFrame
    df_user_reviews = user_counts.reset_index()
    df_user_reviews.columns = ['User-ID', 'Review-Count']
    
    return df_user_reviews

In [8]:
df_user_reviews = create_df_user_reviews(df)
df_user_reviews

Unnamed: 0,User-ID,Review-Count
0,11676,6943
1,98391,5691
2,189835,1899
3,153662,1845
4,23902,1180
...,...,...
68086,114767,1
68087,114771,1
68088,114772,1
68089,114777,1


In [9]:
# Create a dataframe with the users that made less then or equal to 20 reviews
def create_df_filtered(df):
    # Count the number of reviews per user
    user_counts = df['User-ID'].value_counts()
    
    # Remove users with 20 or more reviews
    valid_users = user_counts[user_counts >= 2].index
    df_filtered = df[df['User-ID'].isin(valid_users)]
    
    return df_filtered

In [10]:
df = create_df_filtered(df)
len(df)

344617

In [11]:
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
3,276729,052165615X,3
4,276729,0521795028,6
13,276747,0060517794,9
16,276747,0671537458,9
17,276747,0679776818,8


In [12]:
# Encode User-ID and ISBN as categorical for indexing
df['User-Index'] = df['User-ID'].astype("category").cat.codes
df['Book-Index'] = df['ISBN'].astype("category").cat.codes

# **Remap indices to contiguous range** (Fixes the IndexError)
df['User-Index'] = df['User-Index'].astype("category").cat.codes
df['Book-Index'] = df['Book-Index'].astype("category").cat.codes

In [13]:
# Compute rating counts per user and book
user_rating_counts = df.groupby('User-Index')['Book-Rating'].count()
book_rating_counts = df.groupby('Book-Index')['Book-Rating'].count()

# Avoid division by zero
user_rating_counts[user_rating_counts == 0] = 1
book_rating_counts[book_rating_counts == 0] = 1

In [14]:
# Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
# Convert to numpy arrays for modeling
train_user_ids = train_df['User-Index'].values
test_user_ids = test_df['User-Index'].values
train_book_ids = train_df['Book-Index'].values
test_book_ids = test_df['Book-Index'].values
train_ratings = train_df['Book-Rating'].values # Using raw ratings for Poisson
test_ratings = test_df['Book-Rating'].values

In [16]:
len(train_user_ids)

275693

In [17]:
# Get updated number of unique users and books
num_users = df['User-Index'].nunique()
num_books = df['Book-Index'].nunique()

print("Number of unique users:", num_users)
print("Number of unique books:", num_books)

Number of unique users: 28868
Number of unique books: 141472


#### Model Training Using NUTS

In [18]:
# Set latent dimension 
latent_dim = 5

best_sigma_u = 1.2595968179742412
best_sigma_b = 0.23533042331948476
best_alpha = 4.871402042323151
best_beta = 1.2405795681084912

# Bayesian Probabilistic Matrix Factorization Model with Gamma-Poisson
with pm.Model() as best_model:
    mu = pm.Gamma("mu", alpha=best_alpha, beta=best_beta)
    user_bias = pm.Normal("user_bias", mu=0, sigma=0.5 / np.sqrt(user_rating_counts + 1), shape=num_users)
    book_bias = pm.Normal("book_bias", mu=0, sigma=0.5 / np.sqrt(book_rating_counts + 1), shape=num_books)

    sigma_u = pm.HalfCauchy("sigma_u", beta=best_sigma_u)
    sigma_b = pm.HalfCauchy("sigma_b", beta=best_sigma_b)

    user_factors = pm.Normal("user_factors", mu=0, sigma=sigma_u, shape=(num_users, latent_dim))
    book_factors = pm.Normal("book_factors", mu=0, sigma=sigma_b, shape=(num_books, latent_dim))

    lambda_rating = pm.math.exp(
        mu + user_bias[train_user_ids] + book_bias[train_user_ids] +
        (user_factors[train_user_ids] * book_factors[train_user_ids]).sum(axis=1)
    )

    ratings_obs = pm.Poisson("ratings_obs", mu=lambda_rating, observed=train_ratings)

    # using jax to accelarate the sampling (numpyro) and utilize multiprocessing
    best_trace = pm.sample(
        draws=2000, tune=1000, chains=2,
        nuts_sampler="numpyro",
        nuts_sampler_kwargs={"chain_method": "vectorized"}
    )
   
# **Extract posterior values manually since PyMC won't sample `ratings_obs`**
with best_model:
    print("\nManually Generating Predictions Using Posterior Samples...")
    
    # Extract posterior values
    mu_post = best_trace.posterior["mu"].mean().item()
    user_bias_post = best_trace.posterior["user_bias"].mean(dim=("chain", "draw")).values
    book_bias_post = best_trace.posterior["book_bias"].mean(dim=("chain", "draw")).values
    user_factors_post = best_trace.posterior["user_factors"].mean(dim=("chain", "draw")).values
    book_factors_post = best_trace.posterior["book_factors"].mean(dim=("chain", "draw")).values

    # Compute expected ratings
    predicted_ratings = np.exp(
        mu_post + 
        user_bias_post[test_user_ids] + 
        book_bias_post[test_book_ids] +
        (user_factors_post[test_user_ids] * book_factors_post[test_book_ids]).sum(axis=1)
    )

    print("\nExample of Predicted Ratings (posterior predictive mean):")
    print(predicted_ratings[:5])
    
# Evaluation Metrics
mae = mean_absolute_error(test_ratings, predicted_ratings)
rmse = np.sqrt(mean_squared_error(test_ratings, predicted_ratings))

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

: 

#### Evaluation of Precision, Recall, MAE, and RMSE

In [None]:
# Evaluation of Precision, Recall, MAE, and RMSE

def evaluate_predictions(true_ratings, predicted_ratings, threshold=7):
    mae = mean_absolute_error(true_ratings, predicted_ratings)
    rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    
    # Convert to binary relevance (1 if rating >= threshold, else 0)
    true_binary = (true_ratings >= threshold).astype(int)
    predicted_binary = (predicted_ratings >= threshold).astype(int)
    
    precision = precision_score(true_binary, predicted_binary, average='micro')
    recall = recall_score(true_binary, predicted_binary, average='micro')
    
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

# Running evaluation
predicted_train_ratings = np.exp(
    best_trace.posterior["mu"].mean().item() +
    best_trace.posterior["user_bias"].mean(dim=("chain", "draw")).values[train_user_ids] +
    best_trace.posterior["book_bias"].mean(dim=("chain", "draw")).values[train_book_ids]
)
predicted_test_ratings = np.exp(
    best_trace.posterior["mu"].mean().item() +
    best_trace.posterior["user_bias"].mean(dim=("chain", "draw")).values[test_user_ids] +
    best_trace.posterior["book_bias"].mean(dim=("chain", "draw")).values[test_book_ids]
)

evaluate_predictions(train_ratings, predicted_train_ratings)
evaluate_predictions(test_ratings, predicted_test_ratings)

MAE: 1.2467
RMSE: 1.4158
Precision: 0.6050
Recall: 0.6050
MAE: 2.2279
RMSE: 2.5468
Precision: 0.1950
Recall: 0.1950


#### Bayes General Multi-Step Lookahead Recommendation 

In [None]:
# ---- Bayes General Multi-Step Lookahead Recommendation ---- #

def bayes_general_recommendation(user_index, book_indices, trace, top_k=3, exploration_factor=0.5, regret_threshold=0.8, max_regret=2.0):
    """
    Multi-step lookahead Bayesian regret minimization for recommending 5 books.
    """
    mu_samples = trace.posterior["mu"].values
    user_bias_samples = trace.posterior["user_bias"].values[:, :, user_index]
    book_bias_samples = trace.posterior["book_bias"].values[:, :, book_indices]
    user_factors_samples = trace.posterior["user_factors"].values[:, :, user_index, :]
    book_factors_samples = trace.posterior["book_factors"].values[:, :, book_indices, :]

    num_samples = mu_samples.shape[1]  # Number of posterior samples
    
    # Compute expected rewards using posterior sampling
    expected_rewards = np.mean(
        np.exp(mu_samples[:, :, None] + user_bias_samples[:, :, None] + book_bias_samples +
               np.sum(user_factors_samples[:, :, None, :] * book_factors_samples, axis=-1)), axis=1
    )

    # Compute variance (uncertainty measure)
    rating_uncertainty = np.var(
        np.exp(mu_samples[:, :, None] + user_bias_samples[:, :, None] + book_bias_samples +
               np.sum(user_factors_samples[:, :, None, :] * book_factors_samples, axis=-1)), axis=1
    )
    
    # Compute Bayesian regret
    best_expected_reward = np.max(expected_rewards, axis=1)
    regrets = best_expected_reward[:, None] - expected_rewards

    # Cap regret to prevent extreme exploration
    regrets = np.clip(regrets, 0, max_regret)

    # Apply regret threshold
    should_explore = regrets > regret_threshold

    # Compute future learning potential
    expected_future_gain = exploration_factor * rating_uncertainty

    # Compute exploration-adjusted score
    exploration_score = expected_rewards + expected_future_gain

    # Rank books
    ranked_books = np.argsort(-exploration_score, axis=1)  # Sort in descending order

    # Select top-k books for recommendation
    selected_books = [book_indices[i] for i in ranked_books[0, :top_k]]

    return selected_books

# Example usage: Recommend 5 books for a user
user_id_example = 42  # Replace with an actual user ID
book_pool = np.arange(num_books)  # Assuming all books are available

recommended_books = bayes_general_recommendation(user_id_example, book_pool, best_trace, top_k=3)
print("\nTop-3 Recommended Books for User", user_id_example, ":", recommended_books)


Top-3 Recommended Books for User 42 : [738, 849, 625]


#### Map the recommended books to the book metadata
Saved as 'book_recommendations.nc'

In [None]:
import xarray as xr

# Get all user indices
all_users = df['User-Index'].unique()

# Create an empty dictionary to store recommendations
recommended_isbns = {}

# Create a dictionary mapping Book-Index to ISBN
book_index_to_isbn = df.set_index('Book-Index')['ISBN'].to_dict()

# Iterate through each user and get book recommendations
for user_id in all_users:
    recommended_books = bayes_general_recommendation(user_id, np.arange(num_books), best_trace, top_k=3)
    recommended_isbns[user_id] = [book_index_to_isbn[idx] for idx in recommended_books if idx in book_index_to_isbn]
    #recommendations[user_id] = recommended_books

# Convert to DataFrame
rec_df = pd.DataFrame.from_dict(recommended_isbns, orient='index', columns=[f'Rec_{i+1}' for i in range(3)])

# Reset index for merging
rec_df.reset_index(inplace=True)
rec_df.rename(columns={'index': 'User-Index'}, inplace=True)

# Merge with the original dataframe on "User-Index"
df_combined = df.merge(rec_df, on='User-Index', how='left')

df_combined = df_combined.drop(columns=['User-Index', 'Book-Index'])

# Convert the dataframe to xarray Dataset for NetCDF format
ds = xr.Dataset.from_dataframe(df_combined)

# Save as a .nc file
ds.to_netcdf("book_recommendations.nc")

print("✅ Recommendation data saved as 'book_recommendations.nc'")



Top-5 Recommended Books (ISBNs): {}
✅ Recommendation data saved as 'book_recommendations.nc'
