# User-Based Collaborative Filtering (5-core • TRAIN)

**Goal**
- Build a user-based CF recommender on **5-core / TRAIN** for a given category.
- Use sparse matrices + cosine neighbors for scalability.
- Produce Top-N recommendations for one user or a batch of users.

**What this notebook does**
1. Load 5-core **TRAIN** from `PROCESSED_DIR` (fallback `RAW_DIR`) with schema:
   - `user_id`, `parent_asin`, `rating`, `timestamp`, `history`
2. Build a **user–item** sparse matrix (CSR).
3. Compute **top-K nearest neighbors** per user (cosine).
4. Predict scores for **unseen items** and generate **Top-N** recommendations.
5. (Optional) Save recommendations to disk for UI integration.

> Notes:
> - We compute neighbors via `NearestNeighbors(metric='cosine', algorithm='brute')` on sparse CSR to avoid full dense similarity.
> - Ratings can optionally be mean-centered per user (`MEAN_CENTER`); when they are, the user's mean rating is added back to the predicted score.

### Task: Import modules and libraries

In [4]:
import os, sys, json, pickle
import numpy as np, polars as pl
from pathlib import Path
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.neighbors import NearestNeighbors

# Add utilities to PYTHONPATH
module_path = os.path.abspath(os.path.join('..', '../utilities'))
sys.path.append(module_path)

from logger import Logger
from configurations import Configurations

# --- Logger ---
# Single notebook-wide logger; every cell below writes through it.
m_log_file = Configurations.LOG_PATH
logger = Logger(process_name="user_based", log_file=m_log_file)

# --- Paths ---
# The TRAIN parquet is looked up in PROCESSED_DIR first, then in RAW_DIR.
PROCESSED_DIR = Path(Configurations.DATA_PROCESSED_PATH)
RAW_DIR = Path(Configurations.DATA_RAW_PATH)
logger.log_info(f"[Paths] PROCESSED_DIR={PROCESSED_DIR}")

# --- TRAINING MODELS ---
# Root directory for persisted artifacts; user-based models live under MODELS_DIR / "user" / <category>.
MODELS_DIR = Path(Configurations.MODELS_PATH)

# --- Runtime params (edit here) ---
CATEGORY      = Configurations.CATEGORIES      # category name(s); in the logged run this is a list, e.g. ['Electronics', 'Beauty_and_Personal_Care']
K_NEIGHBORS   = 30                 # user-based top-K neighbors (the NN model is fitted with K+1)
N_RECS        = 10                 # number of recommended items
MEAN_CENTER   = True               # whether to mean-center per user for similarity/prediction
MAX_USERS     = None               # optional cap on distinct users kept (None = all), e.g. 50_000 for quick experiments
MAX_ITEMS     = None               # optional cap on distinct items kept (None = all), e.g. 50_000 for quick experiments

logger.log_info(f"[Params] CATEGORY={CATEGORY} | K_NEIGHBORS={K_NEIGHBORS} | N_RECS={N_RECS} | MEAN_CENTER={MEAN_CENTER}")

2025-09-28 10:13:58,951 - INFO - [Paths] PROCESSED_DIR=/Users/kevin/Documents/GitHub/Python/VESKL/Personal/NEU/NEU/NEU_7275/Prj/Prj_1/APRS_7275_G6/Amazon-Product-Recommendation-System/data/processed
2025-09-28 10:13:58,951 - INFO - [Params] CATEGORY=['Electronics', 'Beauty_and_Personal_Care'] | K_NEIGHBORS=30 | N_RECS=10 | MEAN_CENTER=True


### Task: Define functions

#### Data Loader

In [23]:
def _candidate_user_files(category: str):
    """List the parquet paths to probe for a category's 5-core TRAIN split.

    Slashes in the category name are replaced by dashes so the name is a
    valid file name. The processed directory is listed before the raw one.
    """
    file_name = f"{category.replace('/', '-')}.5core.train.parquet"
    return [base_dir / file_name for base_dir in (PROCESSED_DIR, RAW_DIR)]

def _coerce_ts_seconds(ts: pl.Series) -> pl.Series:
    """Coerce a timestamp column to nullable integer epoch seconds.

    The previous body called pandas (``pd.to_numeric``/``dropna``/``astype``)
    although the notebook only imports polars and the signature is typed for
    ``pl.Series``; this version uses the polars Series API instead.

    Args:
        ts: Raw timestamps (numeric or numeric strings).

    Returns:
        An ``Int64`` Series. Values that cannot be parsed become null. If more
        than half of the non-null values exceed ``10**12`` the column is
        treated as milliseconds and floor-divided by 1000.
    """
    # Cast through Float64 first so numeric strings parse; strict=False turns
    # unparseable values into nulls instead of raising.
    x = ts.cast(pl.Float64, strict=False).cast(pl.Int64, strict=False)

    n_valid = x.len() - x.null_count()
    # Nulls are ignored by sum(), so the ratio is taken over valid values only.
    if n_valid and (x > 10**12).sum() / n_valid > 0.5:
        x = x // 1000
    return x

def load_5core_train_user(category: str) -> pl.DataFrame:
    """Load the 5-core TRAIN split for ``category`` as a polars DataFrame.

    Candidate files come from ``_candidate_user_files``; the first one that
    exists and is non-empty is read.

    Args:
        category: Category name, e.g. ``"Electronics"``.

    Returns:
        A DataFrame restricted to the columns ``user_id``, ``parent_asin``,
        ``rating``, ``timestamp`` and ``history`` (in that order).

    Raises:
        ValueError: The file that was read lacks one of the expected columns.
        FileNotFoundError: No candidate file exists (or all are empty).
    """
    expected = ['user_id', 'parent_asin', 'rating', 'timestamp', 'history']
    for p in _candidate_user_files(category):
        if not (p.exists() and p.stat().st_size > 0):
            continue
        logger.log_info(f"[Load-User] Reading: {p.name}")
        df = pl.read_parquet(p, low_memory=False)
        miss = [c for c in expected if c not in df.columns]
        if miss:
            raise ValueError(f"Missing columns {miss} in {p}")
        df = df.select(expected)
        # Log distinct counts; interpolating Series.unique() would dump the
        # whole Series repr into the log.
        n_users = df['user_id'].n_unique()
        n_items = df['parent_asin'].n_unique()
        logger.log_info(f"[Load-User] shape={df.shape} | users={n_users} | items={n_items}")
        return df
    raise FileNotFoundError(f"5-core TRAIN not found for {category}")


#### Build Sparse Matrix & Indexers

In [27]:
def build_user_matrices(df_train: pl.DataFrame, mean_center: bool = True,
                        max_users: int | None = None, max_items: int | None = None):
    """Build the user-item rating matrix and its index mappings.

    Args:
        df_train: Interactions with columns ``user_id``, ``parent_asin`` and
            ``rating``.
        mean_center: When true, also build a copy of the matrix whose stored
            ratings have the owning user's mean rating subtracted.
        max_users: Keep only the first ``max_users`` distinct users (in order
            of first appearance). ``None`` keeps all.
        max_items: Keep only the first ``max_items`` distinct items (in order
            of first appearance, after the user filter). ``None`` keeps all.

    Returns:
        ``(R, Rc, user_idx, item_idx, user_rev_arr, item_rev_arr, user_means)``:

        * ``R`` - CSR matrix of raw ratings, shape ``(n_users, n_items)``.
        * ``Rc`` - mean-centered CSR matrix, or ``None`` if ``mean_center`` is false.
        * ``user_idx`` / ``item_idx`` - id -> row / column index dicts.
        * ``user_rev_arr`` / ``item_rev_arr`` - object arrays, index -> id.
        * ``user_means`` - float32 per-user mean rating (all zeros when
          ``mean_center`` is false).
    """
    # Keep only relevant columns using polars API
    df = df_train.select(['user_id', 'parent_asin', 'rating']).with_columns(
        pl.col('rating').cast(pl.Float32)
    )

    # Optionally limit number of users/items (take first-seen unique values).
    # maintain_order=True is required: Series.unique() does not keep order by
    # default, which would make the subset and the indices vary between runs.
    if max_users is not None:
        first_users = df['user_id'].unique(maintain_order=True).head(max_users).to_list()
        df = df.filter(pl.col('user_id').is_in(first_users))
    if max_items is not None:
        first_items = df['parent_asin'].unique(maintain_order=True).head(max_items).to_list()
        df = df.filter(pl.col('parent_asin').is_in(first_items))

    # Build reverse lists and index mappings (order of first appearance)
    user_rev = df['user_id'].unique(maintain_order=True).to_list()
    item_rev = df['parent_asin'].unique(maintain_order=True).to_list()
    user_idx = {u_id: idx for idx, u_id in enumerate(user_rev)}
    item_idx = {a_id: idx for idx, a_id in enumerate(item_rev)}

    # Map rows to integer indices and ratings arrays
    u = np.array([user_idx[x] for x in df['user_id'].to_list()], dtype=np.int32)
    i = np.array([item_idx[x] for x in df['parent_asin'].to_list()], dtype=np.int32)
    v = np.array(df['rating'].to_list(), dtype=np.float32)

    nU = len(user_rev)
    nI = len(item_rev)
    # NOTE: the (data, (row, col)) constructor sums duplicate (user, item) pairs.
    R = csr_matrix((v, (u, i)), shape=(nU, nI), dtype=np.float32)

    user_means = np.zeros(nU, dtype=np.float32)
    Rc = None
    if mean_center:
        Rc = R.copy()  # R is already float32; no extra astype copy needed
        row_sums = np.array(R.sum(axis=1)).ravel().astype(np.float32)
        row_cnts = np.diff(R.indptr).astype(np.int32)  # stored entries per user
        with np.errstate(divide='ignore', invalid='ignore'):
            user_means = np.where(row_cnts > 0, row_sums / row_cnts, 0.0).astype(np.float32)
        if Rc.nnz:
            # CSR data is laid out row by row, so repeating each mean by the
            # row's entry count lines up with Rc.data.
            Rc.data -= np.repeat(user_means, row_cnts)

    user_rev_arr = np.array(user_rev, dtype=object)
    item_rev_arr = np.array(item_rev, dtype=object)

    logger.log_info(f"[Matrix-User] R{R.shape} nnz={R.nnz}")
    return R, Rc, user_idx, item_idx, user_rev_arr, item_rev_arr, user_means


#### load_user_artifacts

In [8]:
def _load_user_artifacts(model_dir: str | Path):
    md = Path(model_dir)
    R  = load_npz(md / "R.npz")
    Rc = load_npz(md / "Rc.npz") if (md / "Rc.npz").exists() else None
    user_means = np.load(md / "user_means.npy")
    with open(md / "user_rev.pkl", "rb") as f: user_rev = pickle.load(f)
    with open(md / "item_rev.pkl", "rb") as f: item_rev = pickle.load(f)
    user_idx = {k: int(v) for k, v in json.loads((md / "user_idx.json").read_text()).items()}
    item_idx = {k: int(v) for k, v in json.loads((md / "item_idx.json").read_text()).items()}
    with open(md / "nn_model.pkl", "rb") as f: nn_model = pickle.load(f)
    return dict(R=R, Rc=Rc, user_means=user_means, user_rev=user_rev, item_rev=item_rev,
                user_idx=user_idx, item_idx=item_idx, nn_model=nn_model)

#### Nearest Neighbors (User–User)

In [9]:
def fit_user_neighbors_userbased(X: csr_matrix, k_neighbors: int = 30) -> NearestNeighbors:
    """Fit a brute-force cosine nearest-neighbour index over the rows of ``X``.

    The index is configured for ``k_neighbors + 1`` neighbours, capped at the
    number of rows. The extra slot presumably leaves room for the query row
    itself, which the callers in this notebook filter out.
    """
    neighbor_budget = min(k_neighbors + 1, X.shape[0])
    model = NearestNeighbors(
        n_neighbors=neighbor_budget,
        algorithm='brute',
        metric='cosine',
    )
    model.fit(X)
    logger.log_info(f"[NN-User] fitted on {X.shape} | k={k_neighbors}")
    return model

#### Predict & Recommend

In [11]:
def recommend_user_ui(user_id: str, n_recs: int = 5, k_neighbors: int | None = None,
                      models_dir: str | Path | None = None, category: str | None = None) -> pl.DataFrame:
    """Return Top-N recommendations for ``user_id`` from persisted user-based artifacts.

    Args:
        user_id: User to recommend for; must be a key of the stored ``user_idx``.
        n_recs: Maximum number of items to return.
        k_neighbors: Number of neighbours to use. ``None`` uses the neighbour
            count the stored model was fitted with.
        models_dir: Directory holding the artifacts. When omitted,
            ``MODELS_DIR / "user" / <category>`` is used.
        category: Category used to derive the default directory. When omitted,
            ``CATEGORY`` is used; if that is a list/tuple, its first entry.

    Returns:
        A DataFrame with columns ``parent_asin`` and ``score`` sorted by
        descending score. It is empty when the user is unknown, has no usable
        neighbours, has no unseen items, or ``n_recs`` is not positive.

    Raises:
        ValueError: No directory/category was given and ``CATEGORY`` is an
            empty sequence.
    """
    def _empty() -> pl.DataFrame:
        # Current polars releases take ``schema=``; the ``columns=`` keyword
        # used previously is not accepted by the DataFrame constructor.
        return pl.DataFrame(schema={"parent_asin": pl.Utf8, "score": pl.Float64})

    if models_dir:
        model_dir = Path(models_dir)
    else:
        cat = category or CATEGORY
        # CATEGORY may hold several category names, but artifacts are stored
        # per category, so a single name is needed to build the path.
        if isinstance(cat, (list, tuple)):
            if not cat:
                raise ValueError("category is required when CATEGORY is empty")
            cat = cat[0]
        model_dir = MODELS_DIR / "user" / cat
    art = _load_user_artifacts(model_dir)

    R, Rc, user_means = art["R"], art["Rc"], art["user_means"]
    nn_model, user_idx, item_rev = art["nn_model"], art["user_idx"], art["item_rev"]

    if user_id not in user_idx:
        logger.log_warning(f"[UI-USER] user_id={user_id} not found.")
        return _empty()

    u = user_idx[user_id]
    X = Rc if Rc is not None else R  # same matrix the neighbour model was fitted on

    # Ask the model for exactly k+1 neighbours (the +1 leaves room for the
    # user itself) so a k larger than the fitted default is honoured.
    n_query = None
    if k_neighbors is not None:
        k_neighbors = max(int(k_neighbors), 0)
        n_fit = getattr(nn_model, "n_samples_fit_", R.shape[0])
        n_query = max(1, min(k_neighbors + 1, n_fit))
    distances, indices = nn_model.kneighbors(X.getrow(u), n_neighbors=n_query, return_distance=True)
    d, idx = distances.ravel(), indices.ravel()
    # remove self if present
    mask = idx != u
    idx, d = idx[mask], d[mask]
    if k_neighbors is not None:
        idx, d = idx[:k_neighbors], d[:k_neighbors]
    if idx.size == 0:
        return _empty()

    # Cosine distance -> similarity; negative similarities are clipped to 0.
    sims = np.clip(1.0 - d, 0.0, 1.0)
    denom = np.sum(np.abs(sims)) + 1e-8
    # NOTE: the denominator sums over all neighbours, not only those that
    # rated a given item, so an item no neighbour rated scores exactly the
    # user's mean (or 0 without mean-centering).
    scores = np.asarray(X[idx, :].T.dot(sims)).ravel() / denom
    if Rc is not None:
        scores = scores + user_means[u]

    # Candidates are the items the user has no stored rating for.
    cand_mask = np.ones(R.shape[1], dtype=bool)
    cand_mask[R.getrow(u).indices] = False
    cand_indices = np.flatnonzero(cand_mask)
    if cand_indices.size == 0:
        return _empty()

    cand_scores = scores[cand_mask]
    n_top = min(n_recs, cand_scores.size)
    if n_top <= 0:
        return _empty()

    # Partial selection of the n_top best, then order just those by score.
    top_pos = np.argpartition(-cand_scores, n_top - 1)[:n_top]
    top_pos = top_pos[np.argsort(-cand_scores[top_pos], kind="stable")]

    rec_asins = [item_rev[int(cand_indices[p])] for p in top_pos]
    rec_scores = [float(cand_scores[p]) for p in top_pos]
    return pl.DataFrame({"parent_asin": rec_asins, "score": rec_scores})

#### run_user_base_CF

In [14]:
def run_user_base_CF(
    category: str | None = None,
    k_neighbors: int | None = None,
    n_recs: int | None = None,
    mean_center: bool | None = None,
    max_users: int | None = None,
    max_items: int | None = None,
    target_user=None
):
    """Run user-based CF end to end for one user, without touching disk artifacts.

    Loads the TRAIN split, builds the matrices, fits the neighbour model and
    scores the unseen items of ``target_user``.

    Args:
        category: Category to load. Defaults to ``CATEGORY`` (its first entry
            when that is a list/tuple).
        k_neighbors: Neighbour count; defaults to ``K_NEIGHBORS``.
        n_recs: Number of recommendations; defaults to ``N_RECS``.
        mean_center: Whether to mean-center ratings; defaults to ``MEAN_CENTER``.
        max_users: Optional cap on distinct users, forwarded to ``build_user_matrices``.
        max_items: Optional cap on distinct items, forwarded to ``build_user_matrices``.
        target_user: User id to recommend for. If missing or unknown (also
            after ``str()`` conversion) the first user of the matrix is used.

    Returns:
        ``(recs, artifacts)``. ``recs`` is a DataFrame with columns
        ``parent_asin`` and ``score`` sorted by descending score (possibly
        empty). ``artifacts`` always carries the same keys: ``R``, ``Rc``,
        ``user_idx``, ``item_idx``, ``user_rev``, ``item_rev``,
        ``user_means``, ``nn_model``, ``df_train`` and ``target_user``.

    Raises:
        ValueError: No category was given and ``CATEGORY`` is an empty sequence.
    """
    _cat = category or CATEGORY
    # CATEGORY may hold several names; the loader needs a single one.
    if isinstance(_cat, (list, tuple)):
        if not _cat:
            raise ValueError("category is required when CATEGORY is empty")
        _cat = _cat[0]
    _k = k_neighbors if k_neighbors is not None else K_NEIGHBORS
    _n = n_recs if n_recs is not None else N_RECS
    _mc = MEAN_CENTER if mean_center is None else mean_center

    logger.log_info(f"[Run] cat={_cat} | k={_k} | n={_n} | mean_center={_mc}")

    df_train = load_5core_train_user(_cat)
    R, Rc, user_idx, item_idx, user_rev, item_rev, user_means = build_user_matrices(
        df_train, mean_center=_mc, max_users=max_users, max_items=max_items
    )

    X = Rc if Rc is not None else R
    nn_model = fit_user_neighbors_userbased(X, k_neighbors=_k)

    # pick target user (try raw then str)
    if target_user is None:
        target_user = user_rev[0]
    if target_user not in user_idx:
        tstr = str(target_user)
        if tstr in user_idx:
            target_user = tstr
        else:
            target_user = user_rev[0]
            logger.log_warning("[Run] target_user not found; using first user")

    # Built once so every return path hands back the same artifact keys.
    artifacts = dict(R=R, Rc=Rc, user_idx=user_idx, item_idx=item_idx,
                     user_rev=user_rev, item_rev=item_rev, user_means=user_means,
                     nn_model=nn_model, df_train=df_train, target_user=target_user)

    def _empty() -> pl.DataFrame:
        # Current polars releases take ``schema=``; ``columns=`` is not a
        # valid DataFrame constructor keyword.
        return pl.DataFrame(schema={"parent_asin": pl.Utf8, "score": pl.Float64})

    u = user_idx[target_user]
    dists, inds = nn_model.kneighbors(X.getrow(u), return_distance=True)
    d, inds = dists.ravel(), inds.ravel()
    # remove self if present
    mask = inds != u
    inds, d = inds[mask], d[mask]
    if inds.size == 0:
        return _empty(), artifacts

    # Cosine distance -> similarity; negative similarities are clipped to 0.
    sims = np.clip(1.0 - d, 0.0, 1.0)
    denom = np.sum(np.abs(sims)) + 1e-8
    # NOTE: normalised by the similarity mass of all neighbours, not only of
    # those that rated each item.
    scores = np.asarray(X[inds, :].T.dot(sims)).ravel() / denom
    if Rc is not None:
        scores = scores + user_means[u]

    # Candidates are the items the user has no stored rating for.
    cand_mask = np.ones(R.shape[1], dtype=bool)
    cand_mask[R.getrow(u).indices] = False
    cand_indices = np.flatnonzero(cand_mask)
    cand_scores = scores[cand_mask]

    n_top = min(_n, cand_scores.size)
    if n_top <= 0:
        return _empty(), artifacts

    # Partial selection of the n_top best, then order just those by score.
    top_pos = np.argpartition(-cand_scores, n_top - 1)[:n_top]
    top_pos = top_pos[np.argsort(-cand_scores[top_pos], kind="stable")]

    recs = pl.DataFrame({
        "parent_asin": [item_rev[int(cand_indices[p])] for p in top_pos],
        "score": [float(cand_scores[p]) for p in top_pos],
    })
    logger.log_info(f"[Run] Generated {len(recs)} recs for user={target_user}")
    return recs, artifacts


#### Train models for a list of categories

In [28]:
def _save_user_artifacts(out_dir: Path, R, Rc, user_means, user_rev, item_rev, user_idx, item_idx, nn):
    """Persist all user-based CF artifacts into ``out_dir`` (created if needed).

    Files written: ``R.npz``, ``Rc.npz`` (only when ``Rc`` is not ``None``),
    ``user_means.npy``, ``user_rev.pkl``, ``item_rev.pkl``, ``user_idx.json``,
    ``item_idx.json`` and ``nn_model.pkl``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # Sparse matrices and the dense per-user means.
    save_npz(out_dir / "R.npz", R)
    if Rc is not None:
        save_npz(out_dir / "Rc.npz", Rc)
    np.save(out_dir / "user_means.npy", user_means)

    # Index -> id lookups are pickled as-is.
    for file_name, payload in (("user_rev.pkl", user_rev), ("item_rev.pkl", item_rev)):
        with open(out_dir / file_name, "wb") as fh:
            pickle.dump(payload, fh)

    # Id -> index mappings go to JSON with string keys and int values.
    for file_name, mapping in (("user_idx.json", user_idx), ("item_idx.json", item_idx)):
        encoded = json.dumps({str(key): int(pos) for key, pos in mapping.items()})
        (out_dir / file_name).write_text(encoded)

    with open(out_dir / "nn_model.pkl", "wb") as fh:
        pickle.dump(nn, fh)

def train_user_models_for_categories(categories, k_neighbors=30, mean_center=True, models_dir=None, max_users=None, max_items=None):
    """Train and persist a user-based CF model for each category.

    A failure in one category is logged and recorded in the summary; the
    remaining categories are still processed.

    Args:
        categories: Iterable of category names; a single name given as a
            string is treated as a one-element list.
        k_neighbors: Neighbour count passed to ``fit_user_neighbors_userbased``.
        mean_center: Whether ratings are mean-centered per user.
        models_dir: Root output directory; defaults to ``MODELS_DIR``.
            Artifacts go to ``<root>/user/<category>``.
        max_users: Optional cap on distinct users per category.
        max_items: Optional cap on distinct items per category.

    Returns:
        A pandas DataFrame with one row per category. Successful rows carry
        ``models_dir`` and matrix statistics; failed rows have
        ``models_dir=None`` and an ``error`` message.
    """
    # Imported here because the notebook's import cell only brings in polars,
    # while the summary is (and stays) a pandas DataFrame.
    import pandas as pd

    # Iterating a bare string would train on single characters.
    if isinstance(categories, str):
        categories = [categories]

    base = Path(models_dir) if models_dir else MODELS_DIR
    out_algo = base / "user"
    out_algo.mkdir(parents=True, exist_ok=True)
    rows = []
    for cat in categories:
        try:
            logger.log_info(f"[USER] Training {cat}")
            df = load_5core_train_user(cat)
            R, Rc, user_idx, item_idx, user_rev, item_rev, user_means = build_user_matrices(
                df, mean_center=mean_center, max_users=max_users, max_items=max_items
            )
            X = Rc if Rc is not None else R
            nn = fit_user_neighbors_userbased(X, k_neighbors=k_neighbors)
            out_dir = out_algo / cat
            _save_user_artifacts(out_dir, R, Rc, user_means, user_rev, item_rev, user_idx, item_idx, nn)
            rows.append({"category": cat, "algo": "user", "models_dir": str(out_dir),
                         "k_neighbors": k_neighbors, "mean_center": mean_center,
                         "R_nnz": int(R.nnz), "users": len(user_rev), "items": len(item_rev)})
            logger.log_info(f"[Saved-USER] {out_dir} | nnz={R.nnz}")
        except Exception as e:
            # Deliberate per-category boundary: record the failure and move on.
            logger.log_exception(f"[Error-USER] {cat}: {e}")
            rows.append({"category": cat, "algo": "user", "models_dir": None,
                         "k_neighbors": k_neighbors, "mean_center": mean_center, "error": str(e)})
    summary = pd.DataFrame(rows)
    # Count from the rows so an empty category list does not hit a missing
    # 'models_dir' column on the empty frame.
    n_ok = sum(row["models_dir"] is not None for row in rows)
    logger.log_info(f"[Summary-USER] Trained={len(rows)} OK={n_ok} FAIL={len(rows) - n_ok}")
    return summary

# Train and persist one user-based model per configured category, then show
# the per-category summary table.
summary_user = train_user_models_for_categories(
    CATEGORY,
    k_neighbors=K_NEIGHBORS,
    mean_center=MEAN_CENTER,
    max_users=MAX_USERS,
    max_items=MAX_ITEMS,
)
display(summary_user)


2025-09-28 10:28:45,900 - INFO - [USER] Training Electronics
2025-09-28 10:28:45,901 - INFO - [Load-User] Reading: Electronics.5core.train.parquet
2025-09-28 10:28:48,550 - INFO - [Load-User] shape=(12191484, 5) | users=shape: (1_641_026,)
Series: 'user_id' [str]
[
	"AHX5ZW5MVQGAQ2O7TASJSPDOMEBA"
	"AERHWJJPJQ5WDR6MO3LH4QKXGMZA"
	"AG4ANFICOXX7FIZKP6J7VTBR4NYQ"
	"AGNCKL6EDUA276HX2S2KTSYNICIQ"
	"AGSR6YDU2QK6LS4XAXZZDZGRKM4Q"
	…
	"AHDBI2SJVPLXHIZS5GU23DBGQ2CA"
	"AFCOGU6WR5ZCZRYGFXHRRXPU2CKQ"
	"AEWA4E2RXGCXOGTKRZIUK65XPGXQ"
	"AG4GX65RZE7LKCADBIB6CUJHYCCA"
	"AH4A42FIHMBFVW27PS2RGGESUQJA"
] | items=shape: (367_052,)
Series: 'parent_asin' [str]
[
	"B07QM6NC6G"
	"B073S4QRHD"
	"B0BPX9HLMN"
	"B003D5MZUW"
	"B01LXPPHGA"
	…
	"B007PWMWPC"
	"B0865JTN99"
	"B072V2KZ5K"
	"B07N2RLF6D"
	"B08L785SCF"
]
2025-09-28 10:28:57,775 - INFO - [Matrix-User] R(1641026, 367052) nnz=12191484
2025-09-28 10:28:57,797 - INFO - [NN-User] fitted on (1641026, 367052) | k=30
2025-09-28 10:29:09,501 - INFO - [Saved-USER] /User

Unnamed: 0,category,algo,models_dir,k_neighbors,mean_center,R_nnz,users,items
0,Electronics,user,/Users/kevin/Documents/GitHub/Python/VESKL/Per...,30,True,12191484,1641026,367052
1,Beauty_and_Personal_Care,user,/Users/kevin/Documents/GitHub/Python/VESKL/Per...,30,True,5165289,729576,207385


#### Receive a UI request and reply using the trained models

In [30]:
def unit_test_ui_user_recommend(user_id: str, n_recs: int = 5, k_neighbors: int | None = None,
                                models_dir: str | Path | None = None, category: str | None = None):
    """Simulate a UI call for USER-based CF and sanity-check the reply.

    Args:
        user_id: User to request recommendations for.
        n_recs: Maximum number of recommendations expected back.
        k_neighbors: Neighbour count forwarded to ``recommend_user_ui``.
        models_dir: Artifact directory; defaults to
            ``MODELS_DIR / "user" / <category>``.
        category: Category name; defaults to ``CATEGORY`` (its first entry
            when that is a list/tuple).

    Returns:
        The recommendations returned by ``recommend_user_ui`` (also displayed).

    Raises:
        AssertionError: The reply lacks the ``parent_asin``/``score`` columns
            or has more than ``n_recs`` rows.
        ValueError: No category was given and ``CATEGORY`` is an empty sequence.
    """
    cat = category or CATEGORY
    # CATEGORY may hold several names; a model directory is per category, and
    # joining a list onto a Path would raise TypeError.
    if isinstance(cat, (list, tuple)):
        if not cat:
            raise ValueError("category is required when CATEGORY is empty")
        cat = cat[0]
    md = models_dir if models_dir else (MODELS_DIR / "user" / cat)
    logger.log_info(f"[UnitTest-UI-USER] model_dir={md} | user_id={user_id} | n_recs={n_recs} | k={k_neighbors}")

    recs = recommend_user_ui(user_id=user_id, n_recs=n_recs, k_neighbors=k_neighbors,
                             models_dir=md, category=cat)

    # basic checks (works for polars or pandas DataFrame-like objects)
    cols = set(recs.columns)
    assert {"parent_asin", "score"}.issubset(cols), "recs missing required columns"
    assert len(recs) <= n_recs, f"recs length should be ≤ {n_recs}"

    logger.log_info(f"[UnitTest-UI-USER] returned {len(recs)} items ✅")
    display(recs)
    return recs


### Task: Unit test

#### Receive a UI request and reply using the trained models

In [31]:
# Smoke test: request 5 recommendations for one user, using the artifacts of
# the first category in CATEGORY.
unit_test_ui_user_recommend(user_id="AE222HFVDJ4TJ4V2LDRIAMQM2RPA", n_recs=5, k_neighbors=30, category=CATEGORY[0])

2025-09-28 10:30:05,615 - INFO - [UnitTest-UI-USER] model_dir=/Users/kevin/Documents/GitHub/Python/VESKL/Personal/NEU/NEU/NEU_7275/Prj/Prj_1/APRS_7275_G6/Amazon-Product-Recommendation-System/models/user/Electronics | user_id=AE222HFVDJ4TJ4V2LDRIAMQM2RPA | n_recs=5 | k=30
2025-09-28 10:30:08,536 - INFO - [UnitTest-UI-USER] returned 5 items ✅


parent_asin,score
str,f64
"""B00KQEJZGA""",4.549891
"""B000FCIJAQ""",4.543452
"""B00CDJSIMS""",4.543452
"""B0765YZMVY""",4.543452
"""B0077SZO7W""",4.543452


parent_asin,score
str,f64
"""B00KQEJZGA""",4.549891
"""B000FCIJAQ""",4.543452
"""B00CDJSIMS""",4.543452
"""B0765YZMVY""",4.543452
"""B0077SZO7W""",4.543452
