In [131]:
#import pytensor
#print(pytensor.config.cxx)

#set up g++ and openBLAS

#import pytensor
#print(dir(pytensor.config))

#import pytensor
#pytensor.config.blas__ldflags = '-LC:\\OpenBLAS\\lib -lopenblas'
#print(pytensor.config.blas__ldflags)

#import pytensor
#print("BLAS flags:", pytensor.config.blas__ldflags)
# print("Computation Mode:", pytensor.config.mode)


In [132]:
np.__version__

'1.26.4'

In [133]:
# pip install openpyxl

# USE ADVI (only 1000 rows for testing):

In [134]:
import numpy as np
import pandas as pd
import pymc as pm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, precision_score, recall_score

In [109]:
# Load dataset
df = pd.read_excel("book_ratings.xlsx")

# Select relevant columns
df = df[['User-ID', 'ISBN', 'Book-Rating']]

In [110]:
# Drop zero values as they are not the actual rating
df = df[df['Book-Rating'] > 0]

In [111]:
# Check the size of the data
len(df)

383840

In [112]:
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276744,038550120X,7
13,276747,0060517794,9


In [113]:
# Create a dataframe grouped by unique users
def create_df_user_reviews(df):
    # Count the number of reviews per user
    user_counts = df['User-ID'].value_counts()
    
    # Store number of reviews per user in a new DataFrame
    df_user_reviews = user_counts.reset_index()
    df_user_reviews.columns = ['User-ID', 'Review-Count']
    
    return df_user_reviews

In [114]:
df_user_reviews = create_df_user_reviews(df)
df_user_reviews

Unnamed: 0,User-ID,Review-Count
0,11676,6943
1,98391,5691
2,189835,1899
3,153662,1845
4,23902,1180
...,...,...
68086,114767,1
68087,114771,1
68088,114772,1
68089,114777,1


In [115]:
# Create a dataframe with the users that made less then or equal to 20 reviews
def create_df_filtered(df):
    # Count the number of reviews per user
    user_counts = df['User-ID'].value_counts()
    
    # Remove users with 20 or more reviews
    valid_users = user_counts[user_counts >= 2].index
    df_filtered = df[df['User-ID'].isin(valid_users)]
    
    return df_filtered

In [116]:
df = create_df_filtered(df)
len(df)

344617

In [117]:
# **Downsample to 1000 random rows for testing**
df = df.sample(n=1000, random_state=42).reset_index(drop=True)

In [118]:
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,44852,440236703,7
1,104113,517061015,7
2,86720,380717581,7
3,19371,446520802,4
4,258769,575075260,8


In [119]:
train_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,User-Index,Book-Index
29,35826,140317937,10,105,86
535,60251,553107445,5,169,534
695,104125,789723107,7,319,751
557,250333,60194154,4,770,15
836,94744,333918975,2,284,161


In [120]:
# Encode User-ID and ISBN as categorical for indexing
df['User-Index'] = df['User-ID'].astype("category").cat.codes
df['Book-Index'] = df['ISBN'].astype("category").cat.codes

# **Remap indices to contiguous range** (Fixes the IndexError)
df['User-Index'] = df['User-Index'].astype("category").cat.codes
df['Book-Index'] = df['Book-Index'].astype("category").cat.codes

In [121]:
# Compute rating counts per user and book
user_rating_counts = df.groupby('User-Index')['Book-Rating'].count()
book_rating_counts = df.groupby('Book-Index')['Book-Rating'].count()

# Avoid division by zero
user_rating_counts[user_rating_counts == 0] = 1
book_rating_counts[book_rating_counts == 0] = 1

In [122]:
# Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [123]:
# Convert to numpy arrays for modeling
train_user_ids = train_df['User-Index'].values
test_user_ids = test_df['User-Index'].values
train_book_ids = train_df['Book-Index'].values
test_book_ids = test_df['Book-Index'].values
train_ratings = train_df['Book-Rating'].values # Using raw ratings for Poisson
test_ratings = test_df['Book-Rating'].values

In [124]:
# Get updated number of unique users and books
num_users = df['User-Index'].nunique()
num_books = df['Book-Index'].nunique()

print("Number of unique users:", num_users)
print("Number of unique books:", num_books)

Number of unique users: 858
Number of unique books: 972


In [125]:
# Set latent dimension 
latent_dim = 5

# Bayesian Probabilistic Matrix Factorization Model with Gamma-Poisson
with pm.Model() as model:
    # Prior for global mean rating
    mu = pm.Gamma("mu", alpha=2, beta=0.5)
    
    # ----- User and book bias priors (Adjust sigma for best performance) -----
    user_bias = pm.Normal("user_bias", mu=0, sigma=0.5 / np.sqrt(user_rating_counts + 1), shape=num_users)
    book_bias = pm.Normal("book_bias", mu=0, sigma=0.5 / np.sqrt(book_rating_counts + 1), shape=num_books)

    # Hierarchical priors for latent factors (beta=1 worked best)
    sigma_u = pm.HalfCauchy("sigma_u", beta=1)
    sigma_b = pm.HalfCauchy("sigma_b", beta=1)
    
    user_factors = pm.Normal("user_factors", mu=0, sigma=sigma_u, shape=(num_users, latent_dim))
    book_factors = pm.Normal("book_factors", mu=0, sigma=sigma_b, shape=(num_books, latent_dim))

    # Expected rating using Poisson lambda
    lambda_rating = pm.math.exp(
        mu +
        user_bias[train_user_ids] +
        book_bias[train_book_ids] +
        (user_factors[train_user_ids] * book_factors[train_book_ids]).sum(axis=1)
    )

    # Poisson likelihood
    ratings_obs = pm.Poisson("ratings_obs", mu=lambda_rating, observed=train_ratings)
    
    # Use ADVI for fast variational inference instead of NUTS
    print("Running Variational Inference (ADVI)...")
    approx = pm.fit(n=50000, method="advi")
    trace = approx.sample(draws=2000)

# **Extract posterior values manually since PyMC won't sample `ratings_obs`**
with model:
    print("\nManually Generating Predictions Using Posterior Samples...")
    
    # Extract posterior values
    mu_post = trace.posterior["mu"].mean().item()
    user_bias_post = trace.posterior["user_bias"].mean(dim=("chain", "draw")).values
    book_bias_post = trace.posterior["book_bias"].mean(dim=("chain", "draw")).values
    user_factors_post = trace.posterior["user_factors"].mean(dim=("chain", "draw")).values
    book_factors_post = trace.posterior["book_factors"].mean(dim=("chain", "draw")).values

    # Compute expected ratings
    predicted_ratings = np.exp(
        mu_post + 
        user_bias_post[test_user_ids] + 
        book_bias_post[test_book_ids] +
        (user_factors_post[test_user_ids] * book_factors_post[test_book_ids]).sum(axis=1)
    )

    print("\nExample of Predicted Ratings (posterior predictive mean):")
    print(predicted_ratings[:5])
    
# Evaluation Metrics
mae = mean_absolute_error(test_ratings, predicted_ratings)
rmse = np.sqrt(mean_squared_error(test_ratings, predicted_ratings))

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Running Variational Inference (ADVI)...


Output()

Finished [100%]: Average Loss = 2,787



Manually Generating Predictions Using Posterior Samples...

Example of Predicted Ratings (posterior predictive mean):
[5.87789129 5.78640202 5.7603901  5.71184689 5.87700667]
Mean Absolute Error (MAE): 2.2964
Root Mean Squared Error (RMSE): 2.6153


In [126]:
# Evaluation Metrics
mae = mean_absolute_error(test_ratings, predicted_ratings)
rmse = np.sqrt(mean_squared_error(test_ratings, predicted_ratings))

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Mean Absolute Error (MAE): 2.2964
Root Mean Squared Error (RMSE): 2.6153


In [127]:
# Evaluation of Precision, Recall, MAE, and RMSE

def evaluate_predictions(true_ratings, predicted_ratings, threshold=7):
    mae = mean_absolute_error(true_ratings, predicted_ratings)
    rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    
    # Convert to binary relevance (1 if rating >= threshold, else 0)
    true_binary = (true_ratings >= threshold).astype(int)
    predicted_binary = (predicted_ratings >= threshold).astype(int)
    
    precision = precision_score(true_binary, predicted_binary, average='micro')
    recall = recall_score(true_binary, predicted_binary, average='micro')
    
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

# Running evaluation
predicted_train_ratings = np.exp(
    trace.posterior["mu"].mean().item() +
    trace.posterior["user_bias"].mean(dim=("chain", "draw")).values[train_user_ids] +
    trace.posterior["book_bias"].mean(dim=("chain", "draw")).values[train_book_ids]
)
predicted_test_ratings = np.exp(
    trace.posterior["mu"].mean().item() +
    trace.posterior["user_bias"].mean(dim=("chain", "draw")).values[test_user_ids] +
    trace.posterior["book_bias"].mean(dim=("chain", "draw")).values[test_book_ids]
)

evaluate_predictions(train_ratings, predicted_train_ratings)
evaluate_predictions(test_ratings, predicted_test_ratings)

MAE: 1.3195
RMSE: 1.4894
Precision: 0.5850
Recall: 0.5850
MAE: 2.2966
RMSE: 2.6153
Precision: 0.1950
Recall: 0.1950


In [128]:
# ---- Bayes General Multi-Step Lookahead Recommendation ---- #

def bayes_general_recommendation(user_index, book_indices, trace, top_k=5, exploration_factor=0.5, regret_threshold=0.8, max_regret=2.0):
    """
    Multi-step lookahead Bayesian regret minimization for recommending 5 books.
    """
    mu_samples = trace.posterior["mu"].values
    user_bias_samples = trace.posterior["user_bias"].values[:, :, user_index]
    book_bias_samples = trace.posterior["book_bias"].values[:, :, book_indices]
    user_factors_samples = trace.posterior["user_factors"].values[:, :, user_index, :]
    book_factors_samples = trace.posterior["book_factors"].values[:, :, book_indices, :]

    num_samples = mu_samples.shape[1]  # Number of posterior samples
    
    # Compute expected rewards using posterior sampling
    expected_rewards = np.mean(
        np.exp(mu_samples[:, :, None] + user_bias_samples[:, :, None] + book_bias_samples +
               np.sum(user_factors_samples[:, :, None, :] * book_factors_samples, axis=-1)), axis=1
    )

    # Compute variance (uncertainty measure)
    rating_uncertainty = np.var(
        np.exp(mu_samples[:, :, None] + user_bias_samples[:, :, None] + book_bias_samples +
               np.sum(user_factors_samples[:, :, None, :] * book_factors_samples, axis=-1)), axis=1
    )
    
    # Compute Bayesian regret
    best_expected_reward = np.max(expected_rewards, axis=1)
    regrets = best_expected_reward[:, None] - expected_rewards

    # Cap regret to prevent extreme exploration
    regrets = np.clip(regrets, 0, max_regret)

    # Apply regret threshold
    should_explore = regrets > regret_threshold

    # Compute future learning potential
    expected_future_gain = exploration_factor * rating_uncertainty

    # Compute exploration-adjusted score
    exploration_score = expected_rewards + expected_future_gain

    # Rank books
    ranked_books = np.argsort(-exploration_score, axis=1)  # Sort in descending order

    # Select top-k books for recommendation
    selected_books = [book_indices[i] for i in ranked_books[0, :top_k]]

    return selected_books

# Example usage: Recommend 5 books for a user
user_id_example = 42  # Replace with an actual user ID
book_pool = np.arange(num_books)  # Assuming all books are available

recommended_books = bayes_general_recommendation(user_id_example, book_pool, trace, top_k=5)
print("\nTop-5 Recommended Books for User", user_id_example, ":", recommended_books)


Top-5 Recommended Books for User 42 : [12, 386, 60, 242, 926]


In [97]:
# Evaluation of Precision, Recall, MAE, and RMSE in the top 5 recommendations

def evaluate_recommendations(user_ids, book_pool, trace, test_df, top_k=5, rating_threshold=1):
    total_precision = 0
    total_recall = 0
    total_mae = 0
    total_rmse = 0
    user_count = 0

    for user in user_ids:
        # Get actual books the user has rated in the test set
        user_ratings = test_df[test_df['User-Index'] == user]
        
        # Filter books with ratings above threshold (considered "relevant")
        actual_books = set(user_ratings[user_ratings['Book-Rating'] >= rating_threshold]['Book-Index'].values)
        actual_ratings = user_ratings[user_ratings['Book-Index'].isin(actual_books)]['Book-Rating'].values

        # Get top-k recommended books
        recommended_books = set(bayes_general_recommendation(user, book_pool, trace, top_k))

        # Ensure proper indexing and matching
        correct_matches = recommended_books & actual_books

        print(f"User {user} - Actual Books: {actual_books}")
        print(f"User {user} - Recommended Books: {recommended_books}")
        print(f"Correct Matches: {correct_matches}\n")

        # If there are no actual relevant books, we cannot compute recall
        if len(actual_books) == 0:
            continue  # Skip users with no actual relevant books

        # Precision@K: Fraction of recommended books that are actually relevant
        precision = len(correct_matches) / top_k

        # Recall@K: Fraction of relevant books that were successfully recommended
        recall = len(correct_matches) / len(actual_books)

        # Predict ratings for actual books
        predicted_ratings = np.array([
            trace.posterior["mu"].mean().item() +
            trace.posterior["user_bias"].mean(dim=("chain", "draw")).values[user] +
            trace.posterior["book_bias"].mean(dim=("chain", "draw")).values[book]
            for book in actual_books
        ])

       # Compute MAE and RMSE only if there are matching predictions
        if len(actual_ratings) > 0 and len(predicted_ratings) > 0:
            mae = mean_absolute_error(actual_ratings, predicted_ratings)
            rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
        else:
            mae, rmse = 0, 0

        total_precision += precision
        total_recall += recall
        total_mae += mae
        total_rmse += rmse
        user_count += 1
    
    # Compute average metrics across users
    avg_precision = total_precision / user_count if user_count > 0 else 0
    avg_recall = total_recall / user_count if user_count > 0 else 0
    avg_mae = total_mae / user_count if user_count > 0 else 0
    avg_rmse = total_rmse / user_count if user_count > 0 else 0
    
    print(f"Average Precision@{top_k}: {avg_precision:.4f}")
    print(f"Average Recall@{top_k}: {avg_recall:.4f}")
    print(f"Mean Absolute Error (MAE): {avg_mae:.4f}")
    print(f"Root Mean Squared Error (RMSE): {avg_rmse:.4f}")

# Running evaluation
book_pool = np.arange(num_books)
evaluate_recommendations(test_user_ids, book_pool, trace, test_df, top_k=5)


User 471 - Actual Books: {362}
User 471 - Recommended Books: {386, 55, 439, 761, 122}
Correct Matches: set()

User 320 - Actual Books: {742}
User 320 - Recommended Books: {164, 324, 911, 793, 732}
Correct Matches: set()

User 327 - Actual Books: {169}
User 327 - Recommended Books: {742, 74, 882, 949, 476}
Correct Matches: set()

User 9 - Actual Books: {876}
User 9 - Recommended Books: {164, 806, 654, 47, 732}
Correct Matches: set()

User 192 - Actual Books: {121}
User 192 - Recommended Books: {577, 709, 613, 124, 701}
Correct Matches: set()

User 795 - Actual Books: {811}
User 795 - Recommended Books: {964, 207, 15, 306, 509}
Correct Matches: set()

User 200 - Actual Books: {897}
User 200 - Recommended Books: {867, 41, 73, 601, 827}
Correct Matches: set()

User 569 - Actual Books: {213}
User 569 - Recommended Books: {96, 897, 103, 158, 607}
Correct Matches: set()

User 213 - Actual Books: {686}
User 213 - Recommended Books: {135, 941, 625, 700, 221}
Correct Matches: set()

User 533 - A

#### Map the recommended books to the book metadata

In [137]:
# Create a dictionary mapping Book-Index to ISBN
book_index_to_isbn = df.set_index('Book-Index')['ISBN'].to_dict()

# Example recommended books
recommended_books = [644, 28, 785, 250, 900]  # Replace with actual recommended books output

# Map Book-Index to ISBN
recommended_isbns = [book_index_to_isbn[idx] for idx in recommended_books if idx in book_index_to_isbn]

# Print the results
print("\nTop-5 Recommended Books (ISBNs):", recommended_isbns)



Top-5 Recommended Books (ISBNs): ['0671827138', '0060915544', '0836228995', '0375727086', '1594141819']


In [138]:
import xarray as xr

# Get all user indices
all_users = df['User-Index'].unique()

# Create an empty dictionary to store recommendations
recommendations = {}

# Iterate through each user and get book recommendations
for user_id in all_users:
    recommended_books = bayes_general_recommendation(user_id, np.arange(num_books), trace, top_k=5)
    recommendations[user_id] = recommended_books

# Convert to DataFrame
rec_df = pd.DataFrame.from_dict(recommendations, orient='index', columns=[f'Rec_{i+1}' for i in range(5)])

# Reset index for merging
rec_df.reset_index(inplace=True)
rec_df.rename(columns={'index': 'User-Index'}, inplace=True)

# Merge with the original dataframe on "User-Index"
df_combined = df.merge(rec_df, on='User-Index', how='left')

# Convert the dataframe to xarray Dataset for NetCDF format
ds = xr.Dataset.from_dataframe(df_combined)

# Save as a .nc file
ds.to_netcdf("book_recommendations.nc")

print("✅ Recommendation data saved as 'book_recommendations.nc'")


✅ Recommendation data saved as 'book_recommendations.nc'


In [139]:
# Convert the dataset back to a DataFrame for easy previewing
df_preview = ds.to_dataframe()

# Display first few rows
print(df_preview.head())

       User-ID        ISBN  Book-Rating  User-Index  Book-Index  Rec_1  Rec_2  \
index                                                                           
0        44852  0440236703            7         131         398    790     21   
1       104113  0517061015            7         318         522    314      7   
2        86720  0380717581            7         250         259    307    761   
3        19371  0446520802            4          54         432    260    855   
4       258769  0575075260            8         800         582    282    895   

       Rec_3  Rec_4  Rec_5  
index                       
0        909    613    611  
1        946    865    517  
2        499    242    137  
3        533    897    699  
4        501    790     13  


In [143]:
# Create a mapping dictionary of Book-Index to ISBN
book_index_to_isbn = df_combined[['Book-Index', 'ISBN']].drop_duplicates().set_index('Book-Index').sort_index()

# Book index & ISBN
print(book_index_to_isbn.head())

# Convert the dataframe to xarray Dataset for NetCDF format
ds = xr.Dataset.from_dataframe(df_combined)

# Save as a .nc file
ds.to_netcdf("book_info.nc")

print("✅ Recommendation data saved as 'book_info.nc'")

                  ISBN
Book-Index            
0           0002252341
1           0003700933
2           0006384684
3           0006550789
4           0007105665
✅ Recommendation data saved as 'book_info.nc'
