# Content-Based Recommender on Test Set

This approach is using content-based for every recommendation regardless of whether or not it is a cold start user or warm user. 

Going to use SBERT to generate embeddings of Books and User History Embeddings

In [17]:
import pandas as pd
import datetime
import numpy as np
import json
import gzip

book_test_df = pd.read_csv('../data/Books.test.csv.gz', compression='gzip', sep=',', header=0)
book_val_df = pd.read_csv('../data/Books.valid.csv.gz', compression='gzip', sep=',', header=0)
book_train_df = pd.read_csv('../data/Books.train.csv.gz', compression='gzip', sep=',', header=0)

book_test_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,history
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,593235657,5.0,1640629604904,1446304000 1564770672 1442450703 1780671067 16...
1,AGKASBHYZPGTEPO6LWZPVJWB2BVA,803736800,4.0,1454676557000,0811849783 0803729952 0735336296 1508558884 08...
2,AGXFEGMNVCSTSYYA5UWXDV7AFSXA,1542046599,5.0,1605649719611,1578052009 1477493395 1594747350 1594749310
3,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,679450815,5.0,1638987703546,B00INIQVJA 1496407903 1974633225 B07KD27RHM 16...
4,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,1250866448,5.0,1669414969335,0920668372 1589255208 2764322836 2764330898 00...


In [18]:
#Combine into a list of all unique parent_asins for filtering metadata file
unique_test = book_test_df['parent_asin'].unique()
unique_val = book_val_df['parent_asin'].unique()
unique_train = book_train_df['parent_asin'].unique()
unique_items = np.concatenate([unique_train, unique_val, unique_test])
unique_items = list(set(unique_items))
print("Unique Books in train, test and val set: ", len(unique_items))

Unique Books in train, test and val set:  495063


Filter metadata file to only books that are used in train, test and validation sets

In [4]:
#filter down to only books in train, test, val sets
import gzip, json

metadata_file = '../data/meta_Books.jsonl.gz'
output_filtered = '../data/meta_Books_filtered.jsonl'

unique_items_set = set(unique_items)

count = 0
with gzip.open(metadata_file, "rt", encoding="utf-8") as src, \
     open(output_filtered, "w", encoding="utf-8") as dst:
    for line in src:
        item = json.loads(line)
        asin = item.get("parent_asin")
        if asin in unique_items_set:
            dst.write(line)
            count += 1

print("Wrote", count, "filtered book records to:", output_filtered)

Wrote 495063 filtered book records to: ../data/meta_Books_filtered.jsonl


Loading all the Book Metadata safely - the Book metadata compressed file is 4.6 GB. <br>
Generating Book Embeddings while loading in the data for each of the books in the dataset

In [6]:
#gets embedding for a single book
def encode_book(model, text):
    return model.encode(text)

#limits number of words passed to encoder for a given sequence of text
def truncate_words(text, max_words=200):
    words = text.split()
    return " ".join(words[:max_words])

#Builds text that will be passed to encoder to get book embedding
def parse_book_metadata(item):
    if('title' in item and item['title'] and item['title'] != []): #checks key exist, isnt None and is not empty list
        title = item['title']
    else:
        title = ""
    
    if('categories' in item and item['categories'] and item['categories'] != []):
        categories = " ".join(str(x) for x in item['categories'])
    else:
        categories = ""
    
    if('author' in item and item['author']):
        if('name' in item['author'] and item['author']['name'] and item['author']['name'] != []):
            author = item['author']['name']
        else:
            author = ""
    else:
        author = ""
    
    if('features' in item and item['features'] and item['features']!= []):
        features_str = " ".join(str(x) for x in item['features'])
        features = truncate_words(features_str, max_words=50)
    else:
        features = ""

    if('description' in item and item['description'] and item['description']!= []):
        desc_str = " ".join(str(x) for x in item['description'])
        description = truncate_words(desc_str, max_words=50)
    else:
        description = ""
    
    return title, categories, f"{title} {categories} {author} {description}"

In [4]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)
print("Torch version:", torch.__version__)
print("Built with CUDA:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

Using device: cuda
Torch version: 2.9.1+cu128
Built with CUDA: 12.8
CUDA available: True


Note: This code will take a while. We are embedding all the books used across the train, test and val sets which is 495,063 books. 

In [None]:
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os

# Config
filtered_file = '../data/meta_Books_filtered.jsonl'
output_dir = '../data/book_embeddings_chunks'
os.makedirs(output_dir, exist_ok=True)

batch_size = 128
save_every = 100_000

# Load Model
model = SentenceTransformer(
    "sentence-transformers/all-mpnet-base-v2",
    device="cuda"
)

book_rows = []
texts_batch = []
meta_batch = []
chunk_id = 0
processed = 0

with open(filtered_file, "r", encoding="utf-8") as f:
    for line in tqdm(f, total=len(unique_items), desc="Encoding books"):
        item = json.loads(line)
        asin = item["parent_asin"]

        # Build text input
        title, categories, item_str = parse_book_metadata(item)

        # Add to batch
        texts_batch.append(item_str)
        meta_batch.append((asin, title, categories))

        # Encode when batch full
        if len(texts_batch) >= batch_size:
            embeddings = model.encode(
                texts_batch,
                convert_to_numpy=True,
                batch_size=batch_size,
                normalize_embeddings=True,
                show_progress_bar=False
            )

            for i, (asin, title, categories) in enumerate(meta_batch):
                book_rows.append({
                    "parent_asin": asin,
                    "title": title,
                    "categories": categories,
                    "embedding": embeddings[i],
                })

            processed += len(texts_batch)
            texts_batch.clear()
            meta_batch.clear()

            # Save chunks
            if processed % save_every == 0:
                df = pd.DataFrame(book_rows)
                df.to_pickle(f"{output_dir}/chunk_{chunk_id}.pkl")
                print(f"Saved chunk {chunk_id} with {len(book_rows)} rows")
                book_rows.clear()
                chunk_id += 1

# encode trailing batch left
if texts_batch:
    embeddings = model.encode(
        texts_batch,
        convert_to_numpy=True,
        batch_size=batch_size,
        normalize_embeddings=True,
        show_progress_bar=False
    )
    for i, (asin, title, categories) in enumerate(meta_batch):
        book_rows.append({
            "parent_asin": asin,
            "title": title,
            "categories": categories,
            "embedding": embeddings[i],
        })

# save to pickle files
df = pd.DataFrame(book_rows)
df.to_pickle(f"{output_dir}/chunk_{chunk_id}.pkl")
print(f"Saved final chunk {chunk_id} with {len(book_rows)} rows")


Encoding books:  81%|████████  | 400000/495063 [13:17<16:14, 97.56it/s] 

Saved chunk 0 with 400000 rows


Encoding books: 100%|██████████| 495063/495063 [16:27<00:00, 501.40it/s]


Saved final chunk 1 with 95063 rows


In [19]:
book_test_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,history
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,593235657,5.0,1640629604904,1446304000 1564770672 1442450703 1780671067 16...
1,AGKASBHYZPGTEPO6LWZPVJWB2BVA,803736800,4.0,1454676557000,0811849783 0803729952 0735336296 1508558884 08...
2,AGXFEGMNVCSTSYYA5UWXDV7AFSXA,1542046599,5.0,1605649719611,1578052009 1477493395 1594747350 1594749310
3,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,679450815,5.0,1638987703546,B00INIQVJA 1496407903 1974633225 B07KD27RHM 16...
4,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,1250866448,5.0,1669414969335,0920668372 1589255208 2764322836 2764330898 00...


Load Book Embeddings from ../data/book_embeddings_chunks folder. Embeddings include all books in train, test and validation sets. 

In [9]:
import glob

embedding_dir = "../data/book_embeddings_chunks"
files = sorted(glob.glob(f"{embedding_dir}/chunk_*.pkl"))

asin2emb = {}

# Load each chunk and populate dictionary
for f in files:
    df = pd.read_pickle(f)
    for row in df.itertuples():
        asin2emb[row.parent_asin] = row.embedding


Get embeddings for each user in test set that are the average of the embeddings for each book in the user's history

In [20]:
book_test_df["history"] = book_test_df["history"].fillna("")
book_test_df['history'] = book_test_df['history'].apply(lambda x: x.split())

In [None]:
test_user_embeddings = {}

for user, rows in book_test_df.groupby("user_id"):
    history = rows["history"].sum()  # concatenates all lists of books in history
    history_embs = []
    for book_id in history: 
        if book_id in asin2emb: # if we have embedding for this book
            history_embs.append(asin2emb[book_id])

    if len(history_embs) == 0: #no books in user history
        continue

    user_emb = np.mean(history_embs, axis=0)
    user_emb = user_emb / np.linalg.norm(user_emb) #normalize user embds
    test_user_embeddings[user] = user_emb  #gets average of the embeddings of all books in test user's history


Only Test Items

In [None]:
# items in test set
test_asins = set(book_test_df['parent_asin'])
test_asins = [a for a in test_asins if a in asin2emb]  #keep only ASINs with embeddings

#Create test item matrix
test_item_matrix = np.vstack([asin2emb[a] for a in test_asins])  # shape: (N_test_items, embedding_dim)
test_item_matrix = test_item_matrix.T  # transpose for dot product

In [12]:
print("Number of Test User Embeddings: " , len(test_user_embeddings))
print("Item Matrix Shape: ", test_item_matrix.shape)

Number of Test User Embeddings:  776370
Item Matrix Shape:  (768, 256252)


In [13]:
test_purchased = {}

for _, row in book_test_df.iterrows():
    user = row["user_id"]
    purchased_book = row['parent_asin']
    test_purchased[user] = purchased_book

In [14]:
def ndcg_at_k(ranked_items, purchased_item, k):
    """Compute nDCG@k for a single user (binary relevance)"""
    try:
        rank = ranked_items[:k].index(purchased_item) + 1
        return 1.0 / np.log2(rank + 1)
    except ValueError:
        return 0.0

def hit_at_k(ranked_items, purchased_item, k):
    """Hit rate@k for a single user"""
    return int(purchased_item in ranked_items[:k])

def mrr_score(ranked_items, purchased_item):
    """Mean Reciprocal Rank for a single user"""
    if purchased_item in ranked_items:
        rank = ranked_items.index(purchased_item) + 1
        return 1.0 / rank
    else:
        return 0.0

def auc_score(scores, purchased_idx, num_samples=1000):
    """Approximate AUC using random sampled negatives"""
    num_items = len(scores)
    neg_idx = np.random.choice(
        [j for j in range(num_items) if j != purchased_idx],
        size=min(num_samples, num_items-1),
        replace=False
    )
    return np.mean(scores[purchased_idx] > scores[neg_idx])

def get_topk_indices(scores, k):
    """Return indices of top-k scores"""
    idx = np.argpartition(-scores, k)[:k]
    return idx[np.argsort(scores[idx])[::-1]]

In [None]:
from tqdm import tqdm
import random

# Parameters
device = "cuda"
batch_size = 1024
top_k = 100
auc_sample_size = 1000
sample_size = 10000  # number of users to sample

# Sample users
all_users = list(test_user_embeddings.keys())
sampled_users = random.sample(all_users, min(sample_size, len(all_users)))
user_ids = sampled_users
user_matrix = np.vstack([test_user_embeddings[u] for u in sampled_users])

# Convert to PyTorch tensors
user_matrix = torch.tensor(user_matrix, dtype=torch.float32, device=device)
test_item_matrix = torch.tensor(test_item_matrix, dtype=torch.float32, device=device)

num_users = user_matrix.size(0)
num_items = test_item_matrix.size(0)

# Metric accumulators
hit5_list, hit10_list = [], []
ndcg5_list, ndcg10_list = [], []
mrr_list, auc_list = [], []

# Batched evaluation
for start_idx in tqdm(range(0, num_users, batch_size), desc="Evaluating sampled users on GPU"):
    end_idx = min(start_idx + batch_size, num_users)
    user_batch = user_matrix[start_idx:end_idx]
    user_ids_batch = user_ids[start_idx:end_idx]

    # Dot product -> (batch_size, num_items)
    scores_batch = torch.matmul(user_batch, test_item_matrix) #get similarity scores between sampled user and test items
    scores_batch_cpu = scores_batch.cpu().numpy()

    for i, user in enumerate(user_ids_batch):
        scores = scores_batch_cpu[i]
        purchased_item = test_purchased[user]
        purchased_idx = test_asins.index(purchased_item)

        # Top-k indices
        topk_idx = get_topk_indices(scores, top_k)
        topk_asins = [test_asins[j] for j in topk_idx]

        # Compute metrics
        hit10_list.append(hit_at_k(topk_asins, purchased_item, 10))
        hit5_list.append(hit_at_k(topk_asins, purchased_item, 5))
        ndcg10_list.append(ndcg_at_k(topk_asins, purchased_item, 10))
        ndcg5_list.append(ndcg_at_k(topk_asins, purchased_item, 5))
        mrr_list.append(mrr_score(topk_asins, purchased_item))
        auc_list.append(auc_score(scores, purchased_idx, num_samples=auc_sample_size))

# Aggregate metrics
metrics = {
    "HitRate@5": np.mean(hit5_list),
    "HitRate@10": np.mean(hit10_list),
    "nDCG@5": np.mean(ndcg5_list),
    "nDCG@10": np.mean(ndcg10_list),
    "MRR": np.mean(mrr_list),
    "AUC": np.mean(auc_list)
}

print("Evaluation Metrics (sampled users):")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")

Evaluating sampled users on GPU: 100%|██████████| 10/10 [05:02<00:00, 30.24s/it]

Evaluation Metrics (sampled users):
HitRate@5: 0.0116
HitRate@10: 0.0237
nDCG@5: 0.0067
nDCG@10: 0.0106
MRR: 0.0080
AUC: 0.7757



