## RegressionRec

- Load interactions (user, item, rating).
- Load metadata for all items.
- Clean and normalize metadata fields.
- Extract authors, categories, prices, etc.
- Merge metadata onto interactions.
- Encode categorical features.
- (Optional) Generate embeddings for text.
- Split into train/val/test.
- Train LightGBM regressor to predict ratings.
- For each user: score all unseen items.
- Rank scores → get top-K recommendations.
- Evaluate using NDCG@K / HR@K.

In [1]:
import pandas as pd

book_test_df = pd.read_csv('data/Books.test.csv.gz', compression='gzip', sep=',', header=0)
book_val_df = pd.read_csv('data/Books.valid.csv.gz', compression='gzip', sep=',', header=0)
book_train_df = pd.read_csv('data/Books.train.csv.gz', compression='gzip', sep=',', header=0)

book_train_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,history
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1446304000,5.0,1441260345000,
1,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1564770672,5.0,1441260365000,1446304000
2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1442450703,5.0,1523093714024,1446304000 1564770672
3,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1780671067,1.0,1611623223325,1446304000 1564770672 1442450703
4,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1645671127,3.0,1612044209266,1446304000 1564770672 1442450703 1780671067


In [2]:
book_train_df.drop(columns=['history', 'timestamp'])

#keep history for val and test so we can use to filter out recs later
book_val_df.drop(columns=['timestamp'])
book_test_df.drop(columns=['timestamp'])

book_val_df["history"] = book_val_df["history"].fillna("")
book_val_df['history'] = book_val_df['history'].apply(lambda x: x.split())

book_test_df["history"] = book_test_df["history"].fillna("")
book_test_df['history'] = book_test_df['history'].apply(lambda x: x.split())

In [3]:
book_val_df.head()

Unnamed: 0,user_id,parent_asin,rating,timestamp,history
0,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1782490671,5.0,1640383495102,"[1446304000, 1564770672, 1442450703, 178067106..."
1,AGKASBHYZPGTEPO6LWZPVJWB2BVA,802737803,5.0,1454676232000,"[0811849783, 0803729952, 0735336296, 1508558884]"
2,AGXFEGMNVCSTSYYA5UWXDV7AFSXA,1594749310,5.0,1541884305941,"[1578052009, 1477493395, 1594747350]"
3,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,1633573001,5.0,1612225279592,"[B00INIQVJA, 1496407903, 1974633225, B07KD27RHM]"
4,AHXBL3QDWZGJYH7A5CMPFNUPMF7Q,451450523,2.0,1635710722120,"[0920668372, 1589255208, 2764322836, 276433089..."


In [4]:
print("Num Interactions in Train set: ", len(book_train_df))
print("Num Interactions in Val set: ", len(book_val_df))
print("Num Interactions in Test set: ", len(book_test_df))

Num Interactions in Train set:  7935557
Num Interactions in Val set:  776370
Num Interactions in Test set:  776370


In [6]:
import gzip
import json

file_path = "data/meta_Books.jsonl.gz"

N = 5  # number of lines to preview

with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= N:
            break
        try:
            obj = json.loads(line)
            print(json.dumps(obj, indent=2))
            print("-" * 50)
        except json.JSONDecodeError:
            print("Error decoding line:", line)

{
  "main_category": "Books",
  "title": "Chaucer",
  "subtitle": "Hardcover \u2013 Import, January 1, 2004",
  "author": {
    "avatar": "https://m.media-amazon.com/images/I/21Je2zja9pL._SY600_.jpg",
    "name": "Peter Ackroyd",
    "about": [
      "Peter Ackroyd, (born 5 October 1949) is an English biographer, novelist and critic with a particular interest in the history and culture of London. For his novels about English history and culture and his biographies of, among others, William Blake, Charles Dickens, T. S. Eliot and Sir Thomas More, he won the Somerset Maugham Award and two Whitbread Awards. He is noted for the volume of work he has produced, the range of styles therein, his skill at assuming different voices and the depth of his research.",
      "He was elected a fellow of the Royal Society of Literature in 1984 and appointed a Commander of the Order of the British Empire in 2003.",
      "Bio from Wikipedia, the free encyclopedia."
    ]
  },
  "average_rating": 4.5,
  

In [5]:
import json

rows = []
with open('data/meta_Books_filtered.jsonl', 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        rows.append(json.loads(line))

meta_df=pd.DataFrame(rows)
print(meta_df.head())
print(len(meta_df))

  main_category                                              title  \
0         Books                        Service: A Navy SEAL at War   
1         Books  Monstrous Stories #4: The Day the Mice Stood S...   
2  Buy a Kindle                                    Parker & Knight   
3         Books      Make: Electronics: Learning Through Discovery   
4         Books               Four Centuries of American Education   

                       subtitle  \
0       Hardcover – May 8, 2012   
1  Paperback – October 29, 2013   
2                Kindle Edition   
3                   2nd Edition   
4  Paperback – November 8, 2004   

                                              author  average_rating  \
0  {'avatar': 'https://m.media-amazon.com/images/...             4.7   
1                                               None             4.4   
2  {'avatar': 'https://m.media-amazon.com/images/...             4.5   
3  {'avatar': 'https://m.media-amazon.com/images/...             4.7   
4  {'ava

In [6]:
#grab author names and most specific 'sub' category if one exists
def extract_author_name(a):
    if isinstance(a, dict):
        name = a.get("name")
        if isinstance(name, str):
            return name.strip()
    return ""

def extract_main_category(cat):
    if isinstance(cat, list) and len(cat) > 0:
        return cat[-1]  # deepest sub category
    return ""

meta_df['main_subcategory'] = meta_df['categories'].apply(extract_main_category)
meta_df['author_name'] = meta_df['author'].apply(extract_author_name)

In [7]:
meta_df.head(1)

Unnamed: 0,main_category,title,subtitle,author,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,main_subcategory,author_name
0,Books,Service: A Navy SEAL at War,"Hardcover – May 8, 2012",{'avatar': 'https://m.media-amazon.com/images/...,4.7,3421,"[Marcus Luttrell, author of the #1 bestseller,...","[Review, Praise for SERVICE""An action-packed.....",17.17,[{'large': 'https://m.media-amazon.com/images/...,[],"Marcus Luttrell (Author), James D. Hornfischer","[Books, Biographies & Memoirs, Leaders & Notab...","{'Publisher': 'Little, Brown and Company; 1st ...",316185361,,Leaders & Notable People,Marcus Luttrell


In [8]:
meta_df.drop(columns=['main_category', 'title', 'subtitle', 'author', 'features', 'description', 'images', 'videos', 'store', 'categories', 'details', 'bought_together'],inplace=True)

In [9]:
meta_df.head()

Unnamed: 0,average_rating,rating_number,price,parent_asin,main_subcategory,author_name
0,4.7,3421,17.17,0316185361,Leaders & Notable People,Marcus Luttrell
1,4.4,40,7.43,0545425573,Science Fiction & Fantasy,
2,4.5,381,0.0,B00KFOP3RG,Thrillers & Suspense,Donald Wells
3,4.7,1366,13.43,1680450263,Engineering,Charles Platt
4,4.8,133,6.99,1932225323,Schools & Teaching,David Barton


In [48]:
meta_df.to_csv('data/meta_Books_prepped.csv')

In [None]:
#meta_df = pd.read_csv('data/meta_Books_prepped.csv', index_col=0)
#meta_df.head()

Unnamed: 0,average_rating,rating_number,price,parent_asin,main_subcategory,author_name
0,4.7,3421,17.17,0316185361,Leaders & Notable People,Marcus Luttrell
1,4.4,40,7.43,0545425573,Science Fiction & Fantasy,
2,4.5,381,0.0,B00KFOP3RG,Thrillers & Suspense,Donald Wells
3,4.7,1366,13.43,1680450263,Engineering,Charles Platt
4,4.8,133,6.99,1932225323,Schools & Teaching,David Barton


In [10]:
# prep metadata dataframe for merge with main datasets
def prep_meta_df(meta):
    # If average_rating, rating_number, or price are strings, coerce to numeric
    meta["average_rating"] = pd.to_numeric(meta["average_rating"], errors="coerce")
    meta["rating_number"] = pd.to_numeric(meta["rating_number"], errors="coerce")
    meta["price"] = pd.to_numeric(meta["price"], errors="coerce")
    print(f"Metadata rows: {len(meta)}")
    return meta

meta_df = prep_meta_df(meta_df)

Metadata rows: 495063


In [11]:
# Merge metadata df into interactions and create feature sets
def merge_meta(interactions, meta_df):
    df = interactions.merge(meta_df, on="parent_asin", how="left")
    # Fill missing values with defaults
    df["author_name"] = df["author_name"].fillna("").astype("category")
    df["main_subcategory"] = df["main_subcategory"].fillna("").astype("category")
    df["price"] = df["price"].fillna(0.0)
    # average_rating - fallback to global mean if missing
    global_mean = interactions["rating"].mean()
    df["average_rating"] = df["average_rating"].fillna(global_mean)
    df["rating_number"] = df["rating_number"].fillna(0).astype(int)
    return df

train_df = merge_meta(book_train_df, meta_df)
val_df   = merge_meta(book_val_df, meta_df)
test_df  = merge_meta(book_test_df, meta_df)

In [12]:
feature_cols = ["price", "average_rating", "rating_number", "author_name", "main_subcategory"]
cat_cols = ["author_name", "main_subcategory"]  # categorical subset


In [13]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)
print("Torch version:", torch.__version__)
print("Built with CUDA:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

Using device: cuda
Torch version: 2.9.1+cu128
Built with CUDA: 12.8
CUDA available: True


In [14]:
import numpy as np
from tqdm import tqdm
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Prepare training matrices
def prepare_feature_matrix(df, feature_cols, cat_cols):
    X = df[feature_cols].copy()
    # ensure categorical dtypes for LightGBM
    for c in cat_cols:
        if c in X.columns:
            X[c] = X[c].astype("category")
    y = df["rating"].values
    return X, y

# Train LightGBM
def train_lgbm(X_train, y_train, X_val , y_val, categorical_features, params, num_boost_round=1000):
    print("Creating LightGBM dataset objects ...")
    lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features, free_raw_data=False)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train, categorical_feature=categorical_features, free_raw_data=False)
    print("Training LightGBM ...")
    booster = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_val],
        valid_names=["train", "valid"],
        num_boost_round=num_boost_round, 
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )
    return booster

X_train, y_train = prepare_feature_matrix(train_df, feature_cols, cat_cols)
X_val, y_val = prepare_feature_matrix(val_df, feature_cols, cat_cols)

params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "verbosity": -1,
    "seed": 42,
}
booster = train_lgbm(X_train, y_train, X_val, y_val, cat_cols, params)
booster.save_model("lgbm_book_ratings.bin")

Creating LightGBM dataset objects ...
Training LightGBM ...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	train's rmse: 0.92824	valid's rmse: 0.965588


<lightgbm.basic.Booster at 0x2c9a7c86d50>

In [15]:
from sklearn.metrics import roc_auc_score

# Evaluation utilities (ranking)
def hit_at_k(ranked_list, true_item, k):
    return 1 if true_item in ranked_list[:k] else 0

def ndcg_at_k(ranked_list, true_item, k):
    if true_item in ranked_list[:k]:
        idx = ranked_list.index(true_item)
        return 1.0 / np.log2(idx + 2)
    return 0.0

def reciprocal_rank(ranked_list, true_item):
    if true_item in ranked_list:
        idx = ranked_list.index(true_item)
        return 1.0 / (idx + 1)
    return 0.0

# Compute AUC for a single user
def auc_for_user(scores_dict, true_item):
    """
    scores_dict: {item: predicted_score}
    true_item: the item that was actually interacted with
    Returns AUC for this user treating true_item as positive and all others as negative
    """
    y_true = [1 if item == true_item else 0 for item in scores_dict.keys()]
    y_scores = list(scores_dict.values())
    
    # Handle case with only one positive
    if sum(y_true) == 0 or sum(y_true) == len(y_true):
        return np.nan
    
    return roc_auc_score(y_true, y_scores)

In [16]:
from tqdm import tqdm
def recommend_and_eval(model, df_split, items_candidates, meta_df, feature_cols, cat_cols, batch_size=4096):
    # Precompute item static features
    items_meta = meta_df.set_index("parent_asin").loc[items_candidates].reset_index()
    items_meta = items_meta.fillna({
        "author_name": "",
        "main_subcategory": "",
        "price": 0.0,
        "average_rating": 0.0,
        "rating_number": 0
    })

    # Ensure categorical columns are proper
    for c in cat_cols:
        if c in items_meta.columns:
            # convert lists/arrays to strings
            items_meta[c] = items_meta[c].apply(lambda x: str(x) if isinstance(x, (list, tuple)) else x)
            items_meta[c] = items_meta[c].astype("category")

    recommendations = {}
    hr5_list = []
    hr10_list = []
    ndcg5_list= []
    ndcg10_list=[]
    mrr_list=[]
    auc_list=[]

    for idx, row in tqdm(df_split.iterrows(), total=len(df_split), desc="Scoring users"):
        user = row["user_id"]
        true_item = row["parent_asin"]
        # Use history column as set of seen items
        seen = set(row["history"]) if isinstance(row["history"], list) else set()

        items_array = np.asarray(items_candidates).ravel()
        valid_items = items_array[items_array != true_item]
        # Sample negatives (remove seen items)
        negatives = np.random.choice(valid_items, size=min(1000, len(valid_items)), replace=False)
        negatives = [item for item in negatives if item not in seen] # Remove already-seen items
        #candidate set= [true item] + negatives
        candidates = np.array([true_item] + negatives)
        # Extract metadata rows for these candidates
        cand_meta = items_meta[items_meta["parent_asin"].isin(candidates)].copy()
        # Fix categorical columns for the batch
        for c in cat_cols:
            cand_meta[c] = cand_meta[c].astype("category")

        # Predict scores in batches
        preds = []
        for i in range(0, len(cand_meta), batch_size):
            batch = cand_meta.iloc[i:i + batch_size].copy()
            for c in cat_cols:
                if c in batch.columns:
                    batch[c] = batch[c].apply(lambda x: str(x) if isinstance(x, (list, tuple)) else x)
                    batch[c] = batch[c].astype("category")
            pred_batch = model.predict(batch[feature_cols])
            preds.append(pred_batch)
        preds = np.concatenate(preds)
        cand_meta["score"] = preds

        # Full ranked list
        ranked_items = cand_meta.sort_values("score", ascending=False)["parent_asin"].tolist()
        recommendations[user] = ranked_items[:10]

        # Metrics
        top10 = ranked_items[:10]
        top5 = top10[:5]

        hr5_list.append(hit_at_k(top5, true_item, 5))
        ndcg5_list.append(ndcg_at_k(top5, true_item, 5))
        mrr_list.append(reciprocal_rank(ranked_items, true_item))
        auc_list.append(auc_for_user(dict(zip(cand_meta["parent_asin"], cand_meta["score"])), true_item))

        # Metrics for K=10
        top10 = ranked_items[:10]
        hr10_list.append(hit_at_k(top10, true_item, 10))
        ndcg10_list.append(ndcg_at_k(top10, true_item, 10))

    metrics = {
        "HR@5": np.mean(hr5_list),
        "HR@10": np.mean(hr10_list),
        "NDCG@5": np.mean(ndcg5_list),
        "NDCG@10": np.mean(ndcg10_list),
        "MRR": np.mean(mrr_list),
        "AUC": np.nanmean(auc_list),
    }

    return recommendations, metrics


In [17]:
val_candidate_items = val_df["parent_asin"].unique().tolist()
test_candidate_items = test_df["parent_asin"].unique().tolist()
print("Possible Items in Val Set: ", len(val_candidate_items))
print("Possible Items in Test Set: ", len(test_candidate_items))

Possible Items in Val Set:  269493
Possible Items in Test Set:  256252


In [18]:
book_val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776370 entries, 0 to 776369
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      776370 non-null  object 
 1   parent_asin  776370 non-null  object 
 2   rating       776370 non-null  float64
 3   timestamp    776370 non-null  int64  
 4   history      776370 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 29.6+ MB


In [None]:
#sample users for eval and test because running with all users takes forever
sample_val_df = book_val_df.sample(n=1000, random_state=42)
sample_test_df = book_test_df.sample(n=1000, random_state=42)

val_recommendations, val_metrics = recommend_and_eval(
    model=booster,
    df_split=sample_val_df,
    items_candidates=val_candidate_items,  # all candidate items in validation set
    meta_df=meta_df,
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    batch_size=4096
)

print("Validation Metrics:")
print(val_metrics)

test_recs, test_metrics = recommend_and_eval(
    model=booster,
    df_split=sample_test_df,
    items_candidates=test_candidate_items,  # all candidate items in test set
    meta_df=meta_df,
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    batch_size=4096
)

print("Test Metrics:")
print(test_metrics)


Scoring users: 100%|██████████| 5000/5000 [1:25:40<00:00,  1.03s/it]


Validation Metrics:
{'HR@5': np.float64(0.0006), 'HR@10': np.float64(0.0014), 'NDCG@5': np.float64(0.00028969182377587847), 'NDCG@10': np.float64(0.0005530806392398107), 'MRR': np.float64(0.0015685182217979679), 'AUC': np.float64(0.5240516712646771)}


Scoring users: 100%|██████████| 5000/5000 [7:24:36<00:00,  5.34s/it]       

Test Metrics:
{'HR@5': np.float64(0.0008), 'HR@10': np.float64(0.0016), 'NDCG@5': np.float64(0.00038092707360810814), 'NDCG@10': np.float64(0.0006440200489652538), 'MRR': np.float64(0.0016364071885312352), 'AUC': np.float64(0.5201680174534405)}





In [20]:
sample_test_df = book_test_df.sample(n=1000, random_state=1)

test_recs, test_metrics = recommend_and_eval(
    model=booster,
    df_split=sample_test_df,
    items_candidates=test_candidate_items,  # all candidate items in test set
    meta_df=meta_df,
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    batch_size=4096
)

print("Test Metrics:")
print(test_metrics)

Scoring users: 100%|██████████| 1000/1000 [06:15<00:00,  2.67it/s]

Test Metrics:
{'HR@5': np.float64(0.005), 'HR@10': np.float64(0.012), 'NDCG@5': np.float64(0.0034922828697182434), 'NDCG@10': np.float64(0.005647644285015368), 'MRR': np.float64(0.008508306995889801), 'AUC': np.float64(0.5087838804576119)}



