In [1]:
# Connect to Google Drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Setup & imports

In [2]:
# === Imports & config
import os, json, random, math, gc
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ---- Repro
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# ---- Paths
DATA_DIR = "/content/drive/MyDrive/processed_ice"                  # where Books.csv, Users.csv, Ratings.csv live
OUT_DIR  = "/content/drive/MyDrive/processed_ice/processed"        # where we’ll save cleaned files
os.makedirs(OUT_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Device: cuda


### Load, clean, map IDs, and build features

In [3]:
# === Load raw CSVs
books   = pd.read_csv(f"{DATA_DIR}/Books.csv", dtype=str, encoding="latin-1")
ratings = pd.read_csv(f"{DATA_DIR}/Ratings.csv", dtype=str, encoding="latin-1")
users   = pd.read_csv(f"{DATA_DIR}/Users.csv", dtype=str, encoding="latin-1")

# Normalize column names
books.columns   = [c.strip().replace(" ", "-") for c in books.columns]
ratings.columns = [c.strip().replace(" ", "-") for c in ratings.columns]
users.columns   = [c.strip().replace(" ", "-") for c in users.columns]

def to_int_safe(x):
    try: return int(float(str(x).strip()))
    except: return np.nan

def clamp_year(y):
    y = to_int_safe(y)
    return y if (not pd.isna(y) and 1450 <= y <= 2025) else np.nan

def clean_age(a):
    a = to_int_safe(a)
    return a if (not pd.isna(a) and 5 <= a <= 95) else np.nan

def split_location(loc):
    if pd.isna(loc): return pd.Series({"City": np.nan, "State": np.nan, "Country": np.nan})
    parts = [p.strip().lower() for p in str(loc).split(",")]
    parts += [np.nan] * (3 - len(parts))
    return pd.Series({"City": parts[0], "State": parts[1], "Country": parts[2]})

# Clean users
users["Age"] = users["Age"].apply(clean_age)
loc_split = users["Location"].apply(split_location)
users = pd.concat([users.drop(columns=["Location"], errors="ignore"), loc_split], axis=1)

bins = [0, 18, 25, 35, 50, 70, 120]
labels = ["<18","18-24","25-34","35-49","50-69","70+"]
users["AgeBucket"] = pd.cut(users["Age"], bins=bins, labels=labels, include_lowest=True)

# Clean books
books["Year-Of-Publication"] = books["Year-Of-Publication"].apply(clamp_year)
for c in ["Book-Title", "Book-Author", "Publisher"]:
    if c in books.columns:
        books[c] = books[c].fillna("").astype(str).str.strip()
books = books.drop_duplicates(subset=["ISBN"])

# Clean ratings
ratings["User-ID"] = ratings["User-ID"].apply(to_int_safe)
ratings["Book-Rating"] = ratings["Book-Rating"].apply(to_int_safe)
ratings = ratings.dropna(subset=["User-ID","ISBN","Book-Rating"])
ratings["User-ID"] = ratings["User-ID"].astype(int)
ratings["Book-Rating"] = ratings["Book-Rating"].astype(int)

users["User-ID"] = users["User-ID"].astype(int)

# Keep only consistent ids across tables
df = ratings.merge(books[["ISBN"]], on="ISBN", how="inner")
df = df.merge(users[["User-ID"]], on="User-ID", how="inner")



In [4]:
df.shape

(1031136, 3)

In [5]:
# Optional: drop "unnormal raters" (strange behaviour: all high or low)
ustats = df.groupby("User-ID")["Book-Rating"].agg(['count','mean', 'max', 'min']).fillna(0)
unnormal_raters = ustats[(ustats['count']>=5) & ((ustats['min']>=9) | (ustats['max']<=1))].index
df = df[~df["User-ID"].isin(unnormal_raters)].copy()

# Optional: drop "uniform raters" (strange behaviour: very low variance)
# ustats = df.groupby("User-ID")["Book-Rating"].agg(["count","std"]).fillna(0)
# uniform_users = ustats[ustats["std"] < 0.5].index
# df2 = df[~df["User-ID"].isin(uniform_users)].copy()



# Activity filtering for CF (keeps CF stable)
MIN_USER_INTERACTIONS = 4
MIN_ITEM_INTERACTIONS = 4
uc = df["User-ID"].value_counts()
ic = df["ISBN"].value_counts()
keep_u = set(uc[uc >= MIN_USER_INTERACTIONS].index)
keep_i = set(ic[ic >= MIN_ITEM_INTERACTIONS].index)
df = df[df["User-ID"].isin(keep_u) & df["ISBN"].isin(keep_i)].copy()

In [6]:
df.shape

(641240, 3)

In [7]:


# Map to contiguous indices
uid2ix = {u:i for i,u in enumerate(sorted(df["User-ID"].unique()))}
ix2uid = {i:u for u,i in uid2ix.items()}
isbn2ix = {b:i for i,b in enumerate(sorted(df["ISBN"].unique()))}
ix2isbn = {i:b for b,i in isbn2ix.items()}

df["uix"] = df["User-ID"].map(uid2ix)
df["iix"] = df["ISBN"].map(isbn2ix)

# Implicit view
IMPLICIT_THRESH = 5
implicit = df.copy()
implicit["y"] = (implicit["Book-Rating"] >= IMPLICIT_THRESH).astype(int)

# Train/val/test split (leave-one-out per user)
def leave_one_out_split(df_in):
    df_shuf = df_in.sample(frac=1.0, random_state=SEED)
    val_idx, test_idx, seen_v, seen_t = [], [], set(), set()
    for idx, row in df_shuf.iterrows():
        u = row["uix"]
        if u not in seen_v:
            val_idx.append(idx); seen_v.add(u)
        elif u not in seen_t:
            test_idx.append(idx); seen_t.add(u)
    val  = df_in.loc[val_idx]
    test = df_in.loc[test_idx]
    train = df_in.drop(index=set(val_idx) | set(test_idx))
    return train.reset_index(drop=True), val.reset_index(drop=True), test.reset_index(drop=True)

train_imp, val_imp, test_imp = leave_one_out_split(implicit[["uix","iix","y","User-ID","ISBN"]])

# Build sparse user–item matrix for CF retrieval
n_users = len(uid2ix); n_items = len(isbn2ix)
rows = train_imp["uix"].to_numpy(); cols = train_imp["iix"].to_numpy(); vals = train_imp["y"].to_numpy()
ui_matrix = csr_matrix((vals, (rows, cols)), shape=(n_users, n_items))

# Content features for items (TF-IDF title -> SVD(64))
title_series = books.set_index("ISBN").reindex(sorted(isbn2ix.keys()))["Book-Title"].fillna("").astype(str)
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2)
X_tfidf = tfidf.fit_transform(title_series.tolist())
svd = TruncatedSVD(n_components=64, random_state=SEED)
item_title_emb = svd.fit_transform(X_tfidf).astype(np.float32)  # [n_items, 64]

item_content = pd.DataFrame(item_title_emb, columns=[f"title_svd_{k}" for k in range(64)])
item_content["ISBN"] = sorted(isbn2ix.keys())
item_content["iix"]  = item_content["ISBN"].map(isbn2ix)

# User demographic one-hots (country top-K + age bucket)
K_COUNTRIES = 15
users_small = users[users["User-ID"].isin(uid2ix.keys())].copy()
users_small["uix"] = users_small["User-ID"].map(uid2ix)
top_c = users_small["Country"].value_counts().head(K_COUNTRIES).index.tolist()
for c in top_c: users_small[f"country__{c}"] = (users_small["Country"] == c).astype(int)
users_small["country__other"] = (~users_small["Country"].isin(top_c)).astype(int)
for b in labels: users_small[f"age__{b}"] = (users_small["AgeBucket"] == b).astype(int)

user_features = users_small[["uix"] + [c for c in users_small.columns if c.startswith(("country__","age__"))]].copy()

# Dense feature matrices aligned by index
item_feat_cols = [c for c in item_content.columns if c.startswith("title_svd_")]
I_feat = np.zeros((n_items, len(item_feat_cols)), dtype=np.float32)
I_feat[item_content["iix"].values] = item_content[item_feat_cols].values.astype(np.float32)

user_feat_cols = [c for c in user_features.columns if c != "uix"]
U_feat = np.zeros((n_users, len(user_feat_cols)), dtype=np.float32)
U_feat[user_features["uix"].values] = user_features[user_feat_cols].values.astype(np.float32)

# Save features for inference (also load to torch tensors now)
np.save(f"{OUT_DIR}/I_feat.npy", I_feat)
np.save(f"{OUT_DIR}/U_feat.npy", U_feat)
I_feat_t = torch.from_numpy(I_feat).to(device)
U_feat_t = torch.from_numpy(U_feat).to(device)

print("Data ready:",
      "\n users:", n_users,
      "\n items:", n_items,
      "\n train/val/test sizes:", len(train_imp), len(val_imp), len(test_imp))


Data ready: 
 users: 22978 
 items: 52775 
 train/val/test sizes: 595999 22978 22263


In [8]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np
import heapq

# --- Build User-Item sparse matrix (unchanged)
n_users = len(uid2ix)
n_items = len(isbn2ix)

rows = train_imp["uix"].to_numpy()
cols = train_imp["iix"].to_numpy()
vals = train_imp["y"].to_numpy()

ui_matrix = csr_matrix((vals, (rows, cols)), shape=(n_users, n_items))

from sklearn.neighbors import NearestNeighbors
import numpy as np, heapq

# Fit once (items = columns of UI matrix)
knn_items = NearestNeighbors(
    n_neighbors=50,       # tune K
    metric="cosine",
    algorithm="brute",
    n_jobs=-1
).fit(ui_matrix.T)

def recommend_items_for_user(user_id, N=10, K=50):
    """
    On-the-fly item-item recommendations for a user.
    Uses only the items the user interacted with, expands to K neighbors each,
    and aggregates neighbor similarities.
    """
    if user_id not in uid2ix:
        return []

    uix = uid2ix[user_id]
    user_items = ui_matrix[uix].indices  # items this user interacted with
    if len(user_items) == 0:
        return []

    # Query neighbors for just these items
    dist, idx = knn_items.kneighbors(ui_matrix.T[user_items], n_neighbors=K, return_distance=True)
    sim = 1.0 - dist   # cosine similarity

    # Accumulate scores
    scores = np.zeros(n_items, dtype=np.float32)
    for nbrs, sims in zip(idx, sim):
        # skip self (first neighbor is usually the item itself)
        for j, s in zip(nbrs[1:], sims[1:]):
            scores[j] += s

    # filter items already seen
    seen_items = set(user_items.tolist())
    if seen_items:
        scores[list(seen_items)] = -1e9

    top_items = heapq.nlargest(N, range(n_items), key=lambda i: scores[i])
    return [ix2isbn[i] for i in top_items]


from sklearn.neighbors import NearestNeighbors
import numpy as np, heapq

# Fit once (users = rows of UI matrix)
knn_users = NearestNeighbors(
    n_neighbors=50,     # tune K
    metric="cosine",
    algorithm="brute",
    n_jobs=-1
).fit(ui_matrix)

def recommend_by_similar_users(user_id, N=10, K=50):
    """
    User-user collaborative filtering.
    Finds K nearest users to the target user and aggregates their interactions
    weighted by similarity, all in sparse form (no dense matrix conversion).
    """
    if user_id not in uid2ix:
        return []

    uix = uid2ix[user_id]

    # Find K nearest neighbors for this user (includes self at idx 0)
    dist, idx = knn_users.kneighbors(ui_matrix[uix], n_neighbors=K, return_distance=True)
    idx = idx.ravel()
    sim = (1.0 - dist.ravel()).astype(np.float32)

    # drop self if present
    mask = idx != uix
    nbr_idxs = idx[mask]
    nbr_sims = sim[mask]

    # Accumulate scores sparsely: scores[j] += w * interaction(nbr, j)
    scores = np.zeros(n_items, dtype=np.float32)
    for nbr, w in zip(nbr_idxs, nbr_sims):
        row = ui_matrix[nbr]           # sparse row
        if row.nnz:
            scores[row.indices] += w * row.data

    # filter items already seen by target user
    seen_items = set(ui_matrix[uix].indices.tolist())
    if seen_items:
        scores[list(seen_items)] = -1e9

    top_items = heapq.nlargest(N, range(n_items), key=lambda i: scores[i])
    return [ix2isbn[i] for i in top_items]


# Pick any known user_id from your mappings
some_user_id = next(iter(uid2ix.keys()))

dict_book = books[['ISBN', 'Book-Title']].set_index('ISBN').to_dict()['Book-Title']

recommended_item_itembased = recommend_items_for_user(some_user_id, N=5, K=50)
recommended_item_userbased  = recommend_by_similar_users(some_user_id, N=5, K=50)

print("Item-Item recs:", recommended_item_itembased)
print("User-User recs:", recommended_item_userbased)
[dict_book[x] for x in recommended_item_userbased]

Item-Item recs: ['0786863269', '1558744673', '0373169663', '0373225946', '0771095066']
User-User recs: ['0020811853', '0316569321', '038082101X', '0316601950', '0440214009']


['POSTCARDS',
 'White Oleander : A Novel',
 'Daughter of Fortune: A Novel',
 "The Pilot's Wife : A Novel",
 'Treasures']

#### Dual

In [22]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Build once (analogous to knn_items for columns)
K_USER_DEFAULT = 100
knn_users = NearestNeighbors(
    n_neighbors=K_USER_DEFAULT,
    metric="cosine",
    algorithm="brute",
    n_jobs=-1
).fit(ui_matrix)   # users are rows


def item_knn_scores_for_user(uix: int, K: int = 50) -> np.ndarray:
    """Return a dense score vector over items for one user using item-item CF."""
    user_items = ui_matrix[uix].indices
    scores = np.zeros(n_items, dtype=np.float32)
    if len(user_items) == 0:
        return scores

    # neighbors for the items the user interacted with
    dist, idx = knn_items.kneighbors(ui_matrix.T[user_items], n_neighbors=K, return_distance=True)
    sim = 1.0 - dist

    for nbrs, sims in zip(idx, sim):
        for j, s in zip(nbrs[1:], sims[1:]):  # skip self
            scores[j] += s
    return scores

def user_knn_scores_for_user(uix: int, K: int = 100) -> np.ndarray:
    """Return a dense score vector over items for one user using user-user CF."""
    # find K most similar users to uix
    dist, idx = knn_users.kneighbors(ui_matrix[uix], n_neighbors=K, return_distance=True)
    sim_users = 1.0 - dist.ravel()
    nbr_users = idx.ravel()

    # aggregate neighbor preference weighted by similarity
    scores = np.zeros(n_items, dtype=np.float32)
    for s, v in zip(sim_users[1:], nbr_users[1:]):  # skip self
        if s <= 0:
            continue
        # items that neighbor v interacted with
        v_items = ui_matrix[v].indices
        scores[v_items] += s
    return scores


def dual_pool_candidates(
    user_id,
    pool_item: int = 200,
    pool_user: int = 200,
    K_item: int = 50,
    K_user: int = 100,
    w_item: float = 1.0,
    w_user: float = 1.0,
):
    """
    Produce a merged candidate list from item-item CF and user-user CF.
    Returns a list of ISBNs ordered by combined score.
    """
    if user_id not in uid2ix:
        return []
    uix = uid2ix[user_id]

    # 1) get dense score vectors from both CFs
    s_item = item_knn_scores_for_user(uix, K=K_item)   # shape [n_items]
    s_user = user_knn_scores_for_user(uix, K=K_user)   # shape [n_items]

    # 2) mask items the user already saw
    seen = set(ui_matrix[uix].indices.tolist())
    if seen:
        s_item[list(seen)] = -1e9
        s_user[list(seen)] = -1e9

    # 3) take top pools from each scorer
    cand_item_idx = np.argsort(-s_item)[:pool_item]
    cand_user_idx = np.argsort(-s_user)[:pool_user]

    # 4) merge with weighted sum
    merged = {}
    for i in cand_item_idx:
        if s_item[i] > -1e8:     # not masked
            merged[i] = merged.get(i, 0.0) + w_item * float(s_item[i])
    for i in cand_user_idx:
        if s_user[i] > -1e8:
            merged[i] = merged.get(i, 0.0) + w_user * float(s_user[i])

    if not merged:
        return []

    # 5) rank by combined score
    ranked_iix = sorted(merged.keys(), key=lambda j: merged[j], reverse=True)
    return [ix2isbn[i] for i in ranked_iix]


@torch.no_grad()
def rerank_hybrid_dualpool(user_id, N_final=10, pool_item=200, pool_user=200, K_item=50, K_user=100, w_item=1.0, w_user=1.0):
    # 1) dual candidates
    cands_isbn = dual_pool_candidates(
        user_id,
        pool_item=pool_item, pool_user=pool_user,
        K_item=K_item, K_user=K_user,
        w_item=w_item, w_user=w_user
    )
    if not cands_isbn:
        return []

    # 2) re-rank with Hybrid (same as before)
    cands_iix = torch.tensor([isbn2ix[i] for i in cands_isbn], dtype=torch.long, device=device)
    uix = torch.tensor([uid2ix[user_id]], dtype=torch.long, device=device)
    logits = infer_model(uix, cands_iix, U_feat=U_feat_inf, I_feat=I_feat_inf).float().cpu().numpy()
    order = np.argsort(-logits)[:N_final].tolist()
    return [cands_isbn[i] for i in order]


some_user = next(iter(uid2ix.keys()))
print("Dual-pool Hybrid Top-10:", rerank_hybrid_dualpool(some_user, N_final=10))

Dual-pool Hybrid Top-10: ['0971880107', '0142001740', '0316666343', '0060928336', '0671027360', '067976402X', '0553375407', '0312195516', '1400034779', '0140119906']


### CF Retrieval (Item–Item KNN, Method-B on-the-fly)

In [9]:
# === Dataset with on-the-fly negatives (implicit)
class ImplicitPairDataset(Dataset):
    def __init__(self, df_pos, n_items, num_neg=4, seed=42):
        self.df_pos = df_pos[["uix","iix"]].drop_duplicates().reset_index(drop=True)
        self.n_items = n_items
        self.num_neg = num_neg
        self.rng = random.Random(seed)
        self.user_pos = defaultdict(set)
        for u, i in self.df_pos[["uix","iix"]].itertuples(index=False):
            self.user_pos[int(u)].add(int(i))

    def __len__(self): return len(self.df_pos)

    def _sample_neg(self, u):
        while True:
            j = self.rng.randint(0, self.n_items-1)
            if j not in self.user_pos[u]:
                return j

    def __getitem__(self, idx):
        u, i_pos = map(int, self.df_pos.iloc[idx].values)
        items = [i_pos] + [self._sample_neg(u) for _ in range(self.num_neg)]
        labels = [1] + [0]*self.num_neg
        return int(u), torch.tensor(items, dtype=torch.long), torch.tensor(labels, dtype=torch.float32)

# === Model
class HybridRec(nn.Module):
    def __init__(self, n_users, n_items, d_id=64, d_user_feat=0, d_item_feat=64, hidden=128, dropout=0.1):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, d_id)
        self.item_emb = nn.Embedding(n_items, d_id)
        self.use_user_feat = d_user_feat > 0
        self.use_item_feat = d_item_feat > 0
        if self.use_user_feat: self.user_feat_proj = nn.Linear(d_user_feat, d_id)
        if self.use_item_feat: self.item_feat_proj = nn.Linear(d_item_feat, d_id)
        self.mlp = nn.Sequential(
            nn.Linear(d_id*2, hidden), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(hidden, hidden//2), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(hidden//2, 1)
        )
        nn.init.normal_(self.user_emb.weight, std=0.02)
        nn.init.normal_(self.item_emb.weight, std=0.02)

    def forward(self, uix, iix, U_feat=None, I_feat=None):
        # Inference path: single user vs (M,) items
        if iix.ndim == 1:
            uix = uix.repeat(iix.shape[0])     # (M,)
            u = self.user_emb(uix)             # (M,d)
            i = self.item_emb(iix)             # (M,d)
            if self.use_user_feat and U_feat is not None: u = u + self.user_feat_proj(U_feat[uix])
            if self.use_item_feat and I_feat is not None: i = i + self.item_feat_proj(I_feat[iix])
            x = torch.cat([u, i], dim=-1)      # (M,2d)
            return self.mlp(x).squeeze(-1)     # (M,)

        # Training path: (B,M)
        B, M = iix.shape
        u = self.user_emb(uix)                 # (B,d)
        i = self.item_emb(iix)                 # (B,M,d)
        if self.use_user_feat and U_feat is not None: u = u + self.user_feat_proj(U_feat[uix])
        if self.use_item_feat and I_feat is not None: i = i + self.item_feat_proj(I_feat[iix])
        u = u.unsqueeze(1).expand(-1, M, -1)   # (B,M,d)
        x = torch.cat([u, i], dim=-1)          # (B,M,2d)
        return self.mlp(x).squeeze(-1)         # (B,M)

# === Early stopping helper
class EarlyStopping:
    def __init__(self, patience=4, min_delta=1e-4, mode="min"):
        self.patience, self.min_delta, self.mode = patience, min_delta, mode
        self.best, self.bad_epochs, self.should_stop = None, 0, False
    def step(self, value):
        if self.best is None: self.best = value; return False
        improve = (value < self.best - self.min_delta) if self.mode=="min" else (value > self.best + self.min_delta)
        if improve: self.best = value; self.bad_epochs = 0; return False
        self.bad_epochs += 1
        if self.bad_epochs >= self.patience: self.should_stop = True
        return self.should_stop

# === DataLoaders
num_neg = 4; batch_size = 256
train_ds = ImplicitPairDataset(train_imp, n_items, num_neg=num_neg)
val_ds   = ImplicitPairDataset(val_imp,   n_items, num_neg=num_neg*2)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=2, pin_memory=True, drop_last=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True, drop_last=False)

# === Train with BCE + early stopping; save best
model = HybridRec(n_users, n_items, d_id=64, d_user_feat=U_feat_t.shape[1], d_item_feat=I_feat_t.shape[1],
                  hidden=128, dropout=0.1).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
loss_fn = nn.BCEWithLogitsLoss()

CKPT_PATH = f"{OUT_DIR}/hybrid_model.pt"
CFG_PATH  = f"{OUT_DIR}/hybrid_config.json"

def run_epoch(loader, train=True):
    model.train(mode=train)
    total, n = 0.0, 0
    for uix, items, labels in loader:
        uix, items, labels = uix.to(device), items.to(device), labels.to(device)
        logits = model(uix, items, U_feat=U_feat_t, I_feat=I_feat_t)  # (B,1+num_neg)
        loss = loss_fn(logits, labels)
        if train:
            opt.zero_grad(set_to_none=True)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 2.0)
            opt.step()
        total += loss.item() * uix.size(0); n += uix.size(0)
    return total / max(n,1)

epochs = 30
early = EarlyStopping(patience=4, min_delta=1e-4, mode="min")
best_val = float("inf")

for ep in range(1, epochs+1):
    tr = run_epoch(train_loader, train=True)
    vl = run_epoch(val_loader,   train=False)
    print(f"epoch {ep:02d} | train_loss {tr:.4f} | val_loss {vl:.4f}")

    if vl < best_val - 1e-4:
        best_val = vl
        torch.save(model.state_dict(), CKPT_PATH)
        with open(CFG_PATH, "w") as f:
            json.dump({
                "n_users": n_users, "n_items": n_items,
                "d_id": 64, "d_user_feat": int(U_feat_t.shape[1]), "d_item_feat": int(I_feat_t.shape[1]),
                "hidden": 128, "dropout": 0.1
            }, f)
        print(f"  ✓ saved best to {CKPT_PATH}")

    if early.step(vl):
        print(f"Early stopping at epoch {ep} (best val_loss={early.best:.4f})")
        break

print("Training done. Best val_loss:", best_val)


epoch 01 | train_loss 0.4162 | val_loss 0.2866
  ✓ saved best to /content/drive/MyDrive/processed_ice/processed/hybrid_model.pt
epoch 02 | train_loss 0.3607 | val_loss 0.2700
  ✓ saved best to /content/drive/MyDrive/processed_ice/processed/hybrid_model.pt
epoch 03 | train_loss 0.3307 | val_loss 0.2649
  ✓ saved best to /content/drive/MyDrive/processed_ice/processed/hybrid_model.pt
epoch 04 | train_loss 0.3081 | val_loss 0.2646
  ✓ saved best to /content/drive/MyDrive/processed_ice/processed/hybrid_model.pt
epoch 05 | train_loss 0.2885 | val_loss 0.2705
epoch 06 | train_loss 0.2712 | val_loss 0.2773
epoch 07 | train_loss 0.2555 | val_loss 0.2904
epoch 08 | train_loss 0.2414 | val_loss 0.3157
Early stopping at epoch 8 (best val_loss=0.2646)
Training done. Best val_loss: 0.26457403995409906


### Metrics (Precision@K, Recall@K, NDCG@K)

In [10]:
def precision_at_k(pred, gt, k):
    pred_k = pred[:k]
    return len(set(pred_k) & set(gt)) / float(k) if k > 0 else 0.0

def recall_at_k(pred, gt, k):
    if len(gt) == 0: return 0.0
    pred_k = pred[:k]
    return len(set(pred_k) & set(gt)) / float(len(gt))

def ndcg_at_k(pred, gt, k):
    pred_k = pred[:k]
    if not pred_k: return 0.0
    gains = [1.0 if p in gt else 0.0 for p in pred_k]
    dcg = sum(g / np.log2(i + 2) for i, g in enumerate(gains))
    ideal_gains = sorted(gains, reverse=True)
    idcg = sum(g / np.log2(i + 2) for i, g in enumerate(ideal_gains))
    return (dcg / idcg) if idcg > 0 else 0.0

# Ground truth from TEST split (implicit)
user_test_gt = defaultdict(list)
for _, r in test_imp.iterrows():
    user_test_gt[int(r["uix"])].append(int(r["iix"]))

def eval_recommender(fn_recommend, k=10, max_users=None):
    users_eval = list(user_test_gt.keys())
    if max_users: users_eval = users_eval[:max_users]
    P, R, N = [], [], []
    for uix in users_eval:
        uid = ix2uid[uix]
        gt_isbn = [ix2isbn[i] for i in user_test_gt[uix]]
        preds = fn_recommend(uid)
        preds = preds[:k]
        P.append(precision_at_k(preds, gt_isbn, k))
        R.append(recall_at_k(preds, gt_isbn, k))
        N.append(ndcg_at_k(preds, gt_isbn, k))
    return {f"Precision@{k}": float(np.mean(P) if P else 0.0),
            f"Recall@{k}": float(np.mean(R) if R else 0.0),
            f"NDCG@{k}": float(np.mean(N) if N else 0.0),
            "Users_evaluated": len(P)}


### Inference: load best checkpoint and re-rank candidates

In [24]:
# === Load best model for inference
with open(CFG_PATH, "r") as f:
    cfg = json.load(f)

infer_model = HybridRec(
    n_users=cfg["n_users"], n_items=cfg["n_items"],
    d_id=cfg["d_id"], d_user_feat=cfg["d_user_feat"], d_item_feat=cfg["d_item_feat"],
    hidden=cfg["hidden"], dropout=cfg["dropout"]
).to(device).eval()

state = torch.load(CKPT_PATH, map_location=device)
infer_model.load_state_dict(state)

U_feat_inf = torch.from_numpy(np.load(f"{OUT_DIR}/U_feat.npy")).to(device)
I_feat_inf = torch.from_numpy(np.load(f"{OUT_DIR}/I_feat.npy")).to(device)

@torch.no_grad()
def rerank_with_model(user_id, N_final=10, cand_K=50, pool=200):
    """Candidate retrieval (item-KNN) + Hybrid re-ranking."""
    if user_id not in uid2ix: return []
    # 1) candidates
    cands_isbn = recommend_items_for_user(user_id, N=max(pool, N_final*5), K=cand_K)
    if not cands_isbn: return []
    # 2) re-rank
    cands_iix = torch.tensor([isbn2ix[i] for i in cands_isbn], dtype=torch.long, device=device)
    uix = torch.tensor([uid2ix[user_id]], dtype=torch.long, device=device)
    logits = infer_model(uix, cands_iix, U_feat=U_feat_inf, I_feat=I_feat_inf).float().cpu().numpy()
    order = np.argsort(-logits)[:N_final].tolist()
    return [cands_isbn[i] for i in order]

# === Quick smoke test
some_user_id = next(iter(uid2ix.keys()))
print("Collaborative Filtering item base:", recommend_items_for_user(some_user_id, N=5, K=50))
print("Collaborative Filtering user base:", recommend_by_similar_users(some_user_id, N=5, K=50))
print("Hybrid re-ranked:", rerank_with_model(some_user_id, N_final=5, cand_K=50, pool=200))
print("Dual-pool Hybrid:", rerank_hybrid_dualpool(some_user, N_final=5))


Collaborative Filtering item base: ['0786863269', '1558744673', '0373169663', '0373225946', '0771095066']
Collaborative Filtering user base: ['0020811853', '0316569321', '038082101X', '0316601950', '0440214009']
Hybrid re-ranked: ['0020811853', '0609600761', '0553263633', '0375704299', '0006375952']
Dual-pool Hybrid: ['0971880107', '0142001740', '0316666343', '0060928336', '0671027360']


### Evaluate CF vs Hybrid

In [13]:
# Wrap recommenders
rec_item = lambda uid: recommend_items_for_user(uid, N=100, K=50)
rec_user = lambda uid: recommend_by_similar_users(uid, N=100, K=50)
rec_hybrid = lambda uid: rerank_with_model(uid, N_final=50, cand_K=50, pool=200)

print("ItemBased-KNN @10:", eval_recommender(rec_item,   k=10, max_users=5000))
print("UserBased-KNN @10:", eval_recommender(rec_user,   k=10, max_users=5000))
print("Hybrid    @10:", eval_recommender(rec_hybrid, k=10, max_users=5000))


ItemBased-KNN @10: {'Precision@10': 0.00152, 'Recall@10': 0.0152, 'NDCG@10': 0.009296202432033643, 'Users_evaluated': 5000}
UserBased-KNN @10: {'Precision@10': 0.00296, 'Recall@10': 0.0296, 'NDCG@10': 0.018920277996271637, 'Users_evaluated': 5000}
Hybrid    @10: {'Precision@10': 0.00244, 'Recall@10': 0.0244, 'NDCG@10': 0.014179776910649622, 'Users_evaluated': 5000}


In [20]:
# Evaluate @10 (same eval_recommender you already have)
rec_dual_hybrid = lambda uid: rerank_hybrid_dualpool(uid, N_final=50, pool_item=150, pool_user=150, K_item=50, K_user=100, w_item=1.0, w_user=1.0)
print("Dual-pool Hybrid @10:", eval_recommender(rec_dual_hybrid, k=10, max_users=5000))


Dual-pool Hybrid @10: {'Precision@10': 0.00224, 'Recall@10': 0.0224, 'NDCG@10': 0.011551350467791734, 'Users_evaluated': 5000}


In [14]:
!python gradio_app.py


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://6538e19ccce0104bba.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
Keyboard interruption in main thread... closing server.
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 3158, in block_thread
    time.sleep(0.1)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/content/gradio_app.py", line 176, in <module>
    demo.launch(share=True)
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 3055, in launch
    self.block_thread()
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 3162, in block_thread
    self.server.close()
  File "/usr/local/lib/python3.12/dist-

### XGBoost

In [14]:
# 0) Install if needed: pip install xgboost

import xgboost as xgb
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# ---------- Helpers to build features ----------

# (A) Precompute handy stats
user_pos_items = {}  # uix -> set of positive item indices (train)
for u,i in train_imp[["uix","iix"]].itertuples(index=False):
    user_pos_items.setdefault(int(u), set()).add(int(i))

item_pop = train_imp["iix"].value_counts().to_dict()
user_act = train_imp["uix"].value_counts().to_dict()

def user_profile_vector(uix):
    """Mean of item content vectors for user's positive items."""
    items = list(user_pos_items.get(int(uix), []))
    if not items:
        return np.zeros(I_feat.shape[1], dtype=np.float32)
    return I_feat[items].mean(axis=0)

# cache user profiles
U_prof = np.vstack([user_profile_vector(u) for u in range(n_users)])  # [n_users, d]

def cosine(a, b, eps=1e-9):
    denom = (np.linalg.norm(a)+eps) * (np.linalg.norm(b)+eps)
    return float(np.dot(a, b) / denom)

# (B) Item–item KNN score for a candidate list (reuse your knn and ui_matrix)
def knn_item_score_for_user_candidates(uix, cand_iix, K=50):
    """Aggregate neighbor sims for each candidate (like retrieval) but returned as a feature."""
    user_items = ui_matrix[uix].indices
    if len(user_items) == 0:
        return np.zeros(len(cand_iix), dtype=np.float32)
    # neighbors for user's items
    dist, idx = knn_items.kneighbors(ui_matrix.T[user_items], n_neighbors=K, return_distance=True)
    sim = 1.0 - dist
    # accumulate into a dense score vector, then pick only candidate entries
    scores = np.zeros(n_items, dtype=np.float32)
    for nbrs, sims in zip(idx, sim):
        for j, s in zip(nbrs[1:], sims[1:]):
            scores[j] += s
    return scores[cand_iix]

# (C) Build feature rows for pairs
def build_pair_features(uix, iix_list):
    # arrays to fill
    feats = []
    # prefetch
    up = U_prof[uix]
    u_demo = U_feat[uix]  # demographics one-hot vector
    # KNN aggregation for these candidates
    knn_scores = knn_item_score_for_user_candidates(uix, np.array(iix_list, dtype=int), K=50)

    for k, iix in enumerate(iix_list):
        ivec = I_feat[iix]
        feats.append([
            # collaborative stats
            user_act.get(int(uix), 0),
            item_pop.get(int(iix), 0),
            knn_scores[k],
            # content match
            cosine(up, ivec),
            # cheap metadata proxies
            # you can append author/publisher one-hots if you made them
        ] + u_demo.tolist()  # append demographics
        )
    return np.asarray(feats, dtype=np.float32)

# Column names (for debugging/importance)
base_cols = ["user_activity", "item_popularity", "knn_item_score", "cos_userprof_item"]
demo_cols = [c for c in user_features.columns if c != "uix"]
feature_names = base_cols + demo_cols


In [15]:
import random

def sample_negatives(uix, num_neg=4):
    pos = user_pos_items.get(int(uix), set())
    out = []
    while len(out) < num_neg:
        j = random.randint(0, n_items - 1)
        if j not in pos:
            out.append(j)
    return out

X_list, y_list, qid_list = [], [], []  # qid optional for pointwise (ignored)

for uix in range(n_users):
    pos_items = list(user_pos_items.get(uix, []))
    for iix in pos_items:
        # positive
        X_list.append(build_pair_features(uix, [iix])[0]); y_list.append(1); qid_list.append(uix)
        # negatives
        negs = sample_negatives(uix, num_neg=4)
        X_list.append(build_pair_features(uix, negs))  # shape [4, F]
        y_list += [0]*len(negs)
        qid_list += [uix]*len(negs)

X = np.vstack([x if x.ndim==2 else np.expand_dims(x,0) for x in X_list])
y = np.array(y_list, dtype=np.float32)
dtrain = xgb.DMatrix(X, label=y, feature_names=feature_names)

# Validation set (build the same way with val_imp)
# (for brevity, you can skip or do a smaller sample)


KeyboardInterrupt: 

In [21]:
X_rows, y_rows, group = [], [], []
for uix in range(n_users):
    pos_items = list(user_pos_items.get(uix, []))
    if not pos_items:
        continue
    # sample limited positives to keep dataset size reasonable
    pos_items = pos_items[:5]
    for iix in pos_items:
        negs = sample_negatives(uix, num_neg=4)
        items = [iix] + negs
        feats = build_pair_features(uix, items)
        labels = [1] + [0]*len(negs)
        X_rows.append(feats); y_rows.append(labels); group.append(len(items))

X = np.vstack(X_rows)              # [sum_group, F]
y = np.hstack(y_rows).astype(np.float32)
dtrain = xgb.DMatrix(X, label=y, feature_names=feature_names)
dtrain.set_group(group)

# (Optional) Build dvalid in the same way from val_imp and set_group too.


KeyboardInterrupt: 

In [None]:
params = dict(
    max_depth=6,
    eta=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="ndcg@10",  # good for ranking; for logistic use "logloss" or "auc"
)

# For pointwise:
# params["objective"] = "binary:logistic"

# For pairwise:
params["objective"] = "rank:pairwise"

# Fit
# If you have dvalid:
# watchlist = [(dtrain, "train"), (dvalid, "valid")]
watchlist = [(dtrain, "train")]
booster = xgb.train(params, dtrain, num_boost_round=300, evals=watchlist, early_stopping_rounds=30)

# Save
xgb_path = f"{OUT_DIR}/xgb_ranker.json"
booster.save_model(xgb_path)


In [None]:
# Load
xgb_ranker = xgb.Booster()
xgb_ranker.load_model(f"{OUT_DIR}/xgb_ranker.json")

def rerank_with_xgb(user_id, N_final=10, cand_K=50, pool=200):
    if user_id not in uid2ix:
        return []
    uix = uid2ix[user_id]
    # 1) candidate pool
    cands_isbn = recommend_items_for_user(user_id, N=pool, K=cand_K)
    if not cands_isbn:
        return []
    cands_iix = [isbn2ix[i] for i in cands_isbn]
    # 2) features
    X = build_pair_features(uix, cands_iix)
    dtest = xgb.DMatrix(X, feature_names=feature_names)
    scores = xgb_ranker.predict(dtest)
    order = np.argsort(-scores)[:N_final]
    return [cands_isbn[i] for i in order]


In [None]:
rec_xgb = lambda uid: rerank_with_xgb(uid, N_final=50, cand_K=50, pool=200)
print("XGBoost @10:", eval_recommender(rec_xgb, k=10, max_users=5000))
