In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load your datasets
books_df = pd.read_csv('./data/Books.csv')
ratings_df = pd.read_csv('./data/Ratings.csv')
users_df = pd.read_csv('./data/Users.csv')

  books_df = pd.read_csv('./data/Books.csv')


In [60]:
ratings_df['ISBN'] = ratings_df['ISBN'].apply(lambda x : str(x))
books_df['ISBN'] = books_df['ISBN'].apply(lambda x : str(x))

combined_df = pd.merge(ratings_df,books_df,on='ISBN')
combined_df = pd.merge(combined_df,users_df,on='User-ID')
combined_df.drop(columns=['Image-URL-S','Image-URL-M','Image-URL-L'],inplace=True)

In [61]:
combined_df['Year-Of-Publication'] = pd.to_numeric(combined_df['Year-Of-Publication'], errors='coerce')
combined_df.dropna(subset=['Year-Of-Publication'], inplace=True)
combined_df['Year-Of-Publication'] = combined_df['Year-Of-Publication'].astype(int)

def is_numeric(val):
    return isinstance(val, (int, float, np.number))

numeric_authors_mask = combined_df['Book-Author'].apply(is_numeric)
combined_df = combined_df[~numeric_authors_mask]

In [62]:
combined_df.dropna(inplace=True)
combined_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Location,Age
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,"cincinnati, ohio, usa",23.0
2,2313,0812533550,9,Ender's Game (Ender Wiggins Saga (Paperback)),Orson Scott Card,1986,Tor Books,"cincinnati, ohio, usa",23.0
3,2313,0679745580,8,In Cold Blood (Vintage International),TRUMAN CAPOTE,1994,Vintage,"cincinnati, ohio, usa",23.0
4,2313,0060173289,9,Divine Secrets of the Ya-Ya Sisterhood : A Novel,Rebecca Wells,1996,HarperCollins,"cincinnati, ohio, usa",23.0
5,2313,0385482388,5,The Mistress of Spices,Chitra Banerjee Divakaruni,1998,Anchor Books/Doubleday,"cincinnati, ohio, usa",23.0


In [63]:
from sklearn.preprocessing import StandardScaler

user_encoder = LabelEncoder()
book_encoder = LabelEncoder()
author_encoder = LabelEncoder()
publisher_encoder = LabelEncoder()
location_encoder = LabelEncoder()

combined_df['User-ID'] = user_encoder.fit_transform(combined_df['User-ID'])
combined_df['ISBN'] = book_encoder.fit_transform(combined_df['ISBN'])
combined_df['Book-Author'] = author_encoder.fit_transform(combined_df['Book-Author'])
combined_df['Publisher'] = publisher_encoder.fit_transform(combined_df['Publisher'])
combined_df['Location'] = location_encoder.fit_transform(combined_df['Location'])

# Split data
train_data, test_data = train_test_split(combined_df, test_size=0.2, random_state=42)

scaler = StandardScaler()
train_data['Age'] = scaler.fit_transform(train_data[['Age']])
test_data['Age'] = scaler.transform(test_data[['Age']])

# Fit on training data and transform both training and testing data for 'Year-Of-Publication'
train_data['Year-Of-Publication'] = scaler.fit_transform(train_data[['Year-Of-Publication']])
test_data['Year-Of-Publication'] = scaler.transform(test_data[['Year-Of-Publication']])

In [64]:
test_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Location,Age
527981,51438,88913,7,The Call of the Wild: And Selected Stories (Si...,35563,0.12972,11806,12041,-1.659782
186300,20704,2767,0,Watermelon,54891,0.146893,9964,6659,0.539582
836300,19142,17733,10,The Further Adventures of Hank the Cowdog (Han...,42532,0.12972,10544,3218,0.823371
567222,26074,32213,9,The Tipping Point: How Little Things Can Make ...,53814,0.138306,7672,2859,-0.737468
383918,43250,111957,9,A Journey in Ladakh: Encounters with Buddhism,3261,0.138306,8101,11302,-1.375993


In [65]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NCF(nn.Module):
    def __init__(self, num_users, num_items, num_locations, num_authors, num_publishers, embed_size):
        super(NCF, self).__init__()
        # Embeddings for user and item IDs
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embed_size)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embed_size)
        
        # Additional embeddings for categorical features
        self.location_embedding = nn.Embedding(num_embeddings=num_locations, embedding_dim=embed_size)
        self.author_embedding = nn.Embedding(num_embeddings=num_authors, embedding_dim=embed_size)
        self.publisher_embedding = nn.Embedding(num_embeddings=num_publishers, embedding_dim=embed_size)
        
        # MLP layers
        self.fc_layers = nn.Sequential(
            nn.Linear(embed_size * 5 + 2, 64),  # 5 embeddings + 2 numerical features
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )
        
    def forward(self, user_ids, item_ids, locations, ages, authors, years, publishers):
        user_emb = self.user_embedding(user_ids)
        item_emb = self.item_embedding(item_ids)
        location_emb = self.location_embedding(locations)
        author_emb = self.author_embedding(authors)
        publisher_emb = self.publisher_embedding(publishers)
        
        # Combine all features
        combined_features = torch.cat([user_emb, item_emb, location_emb, author_emb, publisher_emb, ages.unsqueeze(1), years.unsqueeze(1)], dim=1)
        
        # Pass through MLP
        predictions = self.fc_layers(combined_features)
        return predictions.squeeze()


In [68]:
from torch.utils.data import DataLoader, Dataset
import numpy as np

class RatingsDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['User-ID'].values, dtype=torch.long)
        self.books = torch.tensor(df['ISBN'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['Book-Rating'].values, dtype=torch.float32)
        self.locations = torch.tensor(df['Location'].values, dtype=torch.long)
        self.authors = torch.tensor(df['Book-Author'].values, dtype=torch.long)
        self.publishers = torch.tensor(df['Publisher'].values, dtype=torch.long)
        self.age = torch.tensor(df['Age'].values, dtype=torch.long)
        self.year = torch.tensor(df['Year-Of-Publication'].values, dtype=torch.long)
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        return self.users[idx], self.books[idx], self.ratings[idx], self.locations[idx], self.authors[idx],\
              self.publishers[idx], self.age[idx], self.year[idx]

# Prepare DataLoader
train_dataset = RatingsDataset(train_data)
test_dataset = RatingsDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NCF(num_users=len(user_encoder.classes_), num_items=len(book_encoder.classes_), num_authors=len(author_encoder.classes_), \
            num_locations=len(location_encoder.classes_), num_publishers=len(publisher_encoder.classes_), embed_size=16).to(device)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for user_indices, book_indices, ratings, location, author, publisher, age, year in train_loader:
        user_indices, book_indices, ratings = user_indices.to(device), book_indices.to(device), ratings.to(device)
        location, author, publisher, age, year = location.to(device), author.to(device), publisher.to(device),\
                                                 age.to(device), year.to(device)
        
        # Forward pass
        outputs = model(user_indices, book_indices, location, age, author, year, publisher)
        loss = criterion(outputs, ratings)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/10], Loss: 16.6313
Epoch [2/10], Loss: 18.3730
Epoch [3/10], Loss: 6.3518
Epoch [4/10], Loss: 5.4577
Epoch [5/10], Loss: 13.5837
Epoch [6/10], Loss: 11.4114
Epoch [7/10], Loss: 6.6981
Epoch [8/10], Loss: 8.8492
Epoch [9/10], Loss: 4.1447
Epoch [10/10], Loss: 1.0009


In [73]:
predictions = []
model.eval()  # Set the model to evaluation mode
test_loss = 0
with torch.no_grad():  # No gradients needed
    for user_indices, book_indices, ratings, location, author, publisher, age, year in test_loader:
        user_indices, book_indices, ratings = user_indices.to(device), book_indices.to(device), ratings.to(device)
        location, author, publisher, age, year = location.to(device), author.to(device), publisher.to(device),\
                                                 age.to(device), year.to(device)
        
        # Forward pass
        outputs = model(user_indices, book_indices, location, age, author, year, publisher)
        loss = criterion(outputs, ratings)
        
        test_loss += loss.item()
        predictions.extend(list(zip(user_indices.cpu().numpy(),
                                    book_indices.cpu().numpy(),
                                    outputs.cpu().numpy(),
                                    ratings.cpu().numpy())))

# Average loss
test_loss /= len(test_loader)
print(f'Test MSE: {test_loss}')

Test MSE: 12.535417493184408


In [93]:
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    user_est_true = defaultdict(list)
    for uid, _, est, true_r in predictions:
        user_est_true[uid].append((est, true_r))

    precision = dict()
    recall = dict()

    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])

        precision[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recall[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return np.mean(list(precision.values())), np.mean(list(recall.values()))

In [94]:
precision_recall_at_k(predictions=predictions)

(0.39776720455287323, 0.41106376766512776)

In [79]:
def hit_rate_at_k(predictions, k=10, threshold=4.0):
    predictions_array = np.array(predictions)
    # Unique users
    unique_users = np.unique(predictions_array[:, 0])
    
    hits = 0
    for user in unique_users:
        # Filter predictions for the current user
        user_predictions = predictions_array[predictions_array[:, 0] == user]
        
        # Sort the user's predictions by predicted rating in descending order
        sorted_user_predictions = user_predictions[user_predictions[:, 2].argsort()[::-1]]
        
        # Check if the top-K items contain at least one relevant item
        top_k = sorted_user_predictions[:k]
        if any(top_k[:, 3] >= threshold):  # Check true ratings in top K
            hits += 1
    
    # Calculate hit rate
    hit_rate = hits / len(unique_users)
    return hit_rate

In [80]:
hit_rate_at_k(predictions)

0.6941712436667092

In [81]:
import math 

def ndcg_at_k_per_user(predictions, k=10, threshold=3.5):
    # Convert to numpy array for easier manipulation
    predictions_array = np.array(predictions)
    unique_users = np.unique(predictions_array[:, 0])
    
    ndcg_scores = []
    
    for user_id in unique_users:
        # Filter predictions for the current user
        user_preds = predictions_array[predictions_array[:, 0] == user_id]
        
        # Sort user predictions by predicted rating in descending order
        sorted_preds = user_preds[user_preds[:, 2].argsort()[::-1]]
        
        DCG = 0
        IDCG = 0
        for i, (_, _, est, true_r) in enumerate(sorted_preds[:k]):
            rel = 1 if true_r >= threshold else 0
            DCG += (rel / math.log(i + 2, 2))
            IDCG += (1 / math.log(i + 2, 2))  # Assuming all top K items are relevant
        
        ndcg_score = DCG / IDCG if IDCG > 0 else 0
        ndcg_scores.append(ndcg_score)
    
    # Average NDCG across all users
    avg_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0
    return avg_ndcg

In [82]:
ndcg_at_k_per_user(predictions=predictions)

0.5340392259224052

In [47]:
# def recommend_books(model, user_id_encoded, book_ids_encoded, num_recommendations=5):
#     model.eval()
#     user_indices = torch.tensor([user_id_encoded] * len(book_ids_encoded), dtype=torch.long).to(device)
#     book_indices = torch.tensor(book_ids_encoded, dtype=torch.long).to(device)
    
#     with torch.no_grad():
#         predictions = model(user_indices, book_indices).cpu().numpy()
    
#     # Get the top N recommendations; argsort sorts in ascending order, so use [-num_recommendations:]
#     recommended_indices = np.argsort(predictions)[-num_recommendations:]
    
#     # Decode book IDs if necessary
#     recommended_books = [book_encoder.inverse_transform([idx])[0] for idx in recommended_indices]
    
#     return recommended_books

# # Example usage
# user_id_encoded = user_encoder.transform([809])[0]  # Replace 'user_id_example' with actual user ID
# book_ids_encoded = list(range(len(book_encoder.classes_)))  # All books
# recommended_books = recommend_books(model, user_id_encoded, book_ids_encoded, num_recommendations=5)
# print("Recommended Books:", recommended_books)


Recommended Books: ['0586092269', '0886777399', '0020303750', '0099296810', '0307204030']
