# Model-Based Collaborative Filtering (Matrix Factorization via Truncated SVD — explicit ratings)

**Goal**
- Learn latent factors (P, Q) from 5-core TRAIN to predict ratings and recommend Top-N.

**Pipeline**
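1) Load 5-core TRAIN (per category) from `PROCESSED_DIR` (fallback `RAW_DIR`; note that the loader below currently checks only `PROCESSED_DIR`).
2) Build the sparse user–item matrix `R` (CSR). (Optional) user mean-centering; note that the `MEAN_CENTER` flag is not applied by the code below.
3) Train MF on `R` (the code below factorizes `R` with scikit-learn's `TruncatedSVD` rather than ALS).
4) Save artifacts (P, Q — stored as `U.npy` / `V.npy` — plus indexers…) for fast UI inference.
5) Load the artifacts and recommend Top-N items for a user.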

**Why MF?**
- Captures hidden tastes/themes.
- Scales better than pure KNN; fast inference via dot products.

### Task: Importing Libraries and Modules

In [1]:
import os, sys, numpy as np, polars as pl, pickle, json
from pathlib import Path
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.decomposition import TruncatedSVD

# Make the shared `utilities` directory (two levels above the working
# directory) importable; it is appended only once.
module_path = os.path.abspath(os.path.join('..', '..', 'utilities'))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from logger import Logger
from configurations import Configurations

# --- logging and data/model locations ---
# NOTE(review): process_name is "user_based" although this notebook trains the
# model-based recommender — confirm the label is intended.
logger = Logger(process_name="user_based", log_file=Configurations.LOG_PATH)
PROCESSED_DIR, RAW_DIR, MODELS_DIR = (
    Path(location)
    for location in (
        Configurations.DATA_PROCESSED_PATH,
        Configurations.DATA_RAW_PATH,
        Configurations.MODELS_PATH,
    )
)

# --- runtime parameters (edit here) ---
CATEGORY: list = Configurations.CATEGORIES  # categories handed to the training call
N_FACTORS: int = 50                         # default number of latent factors for training
N_RECS: int = 10                            # Top-N size (only logged in this cell)
MEAN_CENTER: bool = True                    # logged below; no function visible in this notebook reads it
MAX_USERS = MAX_ITEMS = None                # optional caps on users / items; None means no cap

# Record the effective configuration at the top of the run log.
logger.log_info(f"[Paths] PROCESSED_DIR={PROCESSED_DIR}")
logger.log_info(
    f"[Params] CATEGORY={CATEGORY} | N_RECS={N_RECS} | MEAN_CENTER={MEAN_CENTER}"
)


2025-09-28 10:50:50,648 - INFO - [Paths] PROCESSED_DIR=/Users/kevin/Documents/GitHub/Python/VESKL/Personal/NEU/NEU/NEU_7275/Prj/Prj_1/APRS_7275_G6/Amazon-Product-Recommendation-System/data/processed
2025-09-28 10:50:50,650 - INFO - [Params] CATEGORY=['Electronics', 'Beauty_and_Personal_Care'] | N_RECS=10 | MEAN_CENTER=True


### Task: Define Functions

#### Loader & Matrix Builder

In [4]:
def _candidate_model_files(category: str):
    """Return the candidate parquet paths for a category's 5-core TRAIN split.

    Slashes in the category name are replaced with dashes to form the file
    name ``<category>.5core.train.parquet``. The returned list currently holds
    a single location, inside ``PROCESSED_DIR``.
    """
    file_name = "{}.5core.train.parquet".format(category.replace('/', '-'))
    return [PROCESSED_DIR.joinpath(file_name)]

def load_5core_train_model(category: str) -> pl.DataFrame:
    """Load the 5-core TRAIN interactions of one category.

    The first candidate file that exists and is non-empty is read. The frame
    is reduced to the required columns, ratings are cast to Float32 and
    clamped to the range 1.0–5.0, and both id columns are cast to strings.

    Args:
        category: Category name used to derive the parquet file name.

    Returns:
        A polars DataFrame with the columns ``user_id``, ``parent_asin``,
        ``rating``, ``timestamp`` and ``history``.

    Raises:
        ValueError: If the file that was read lacks a required column.
        FileNotFoundError: If no candidate file exists with content.
    """
    required_cols = ['user_id','parent_asin','rating','timestamp','history']
    # Built once and reused in every branch of the clamp below.
    rating_f32 = pl.col("rating").cast(pl.Float32)

    for path in _candidate_model_files(category):
        if not (path.exists() and path.stat().st_size > 0):
            continue

        logger.log_info(f"[Load-Model] Reading {path.name}")
        frame = pl.read_parquet(path)

        absent = [name for name in required_cols if name not in frame.columns]
        if absent:
            raise ValueError(f"Missing {absent} in {path}")

        # when/then/otherwise is used instead of clip_min/clip_max because
        # those expression methods are not available in every polars version.
        bounded_rating = (
            pl.when(rating_f32 < 1.0).then(1.0)
              .when(rating_f32 > 5.0).then(5.0)
              .otherwise(rating_f32)
              .alias("rating")
        )
        return frame.select(required_cols).with_columns([
            bounded_rating,
            pl.col("user_id").cast(pl.Utf8).alias("user_id"),
            pl.col("parent_asin").cast(pl.Utf8).alias("parent_asin"),
        ])

    raise FileNotFoundError(f"5-core TRAIN not found for {category}")

def build_matrix_model(df_train: pl.DataFrame,
                       max_users: int | None = None,
                       max_items: int | None = None):
    """Build the sparse user-item rating matrix and its index mappings.

    Rows with a null ``user_id``, ``parent_asin`` or ``rating`` are dropped and
    ratings are cast to Float32. Users and items are indexed in order of first
    appearance, so the same input always yields the same indices and the same
    ``max_users`` / ``max_items`` subset. When a (user, item) pair occurs more
    than once, its ratings are averaged.

    Args:
        df_train: Interactions with at least the columns ``user_id``,
            ``parent_asin`` and ``rating``.
        max_users: If given, keep only the first ``max_users`` distinct users.
        max_items: If given, keep only the first ``max_items`` distinct items
            (applied after the user limit).

    Returns:
        ``(R, user_idx, item_idx, user_rev, item_rev)`` where ``R`` is a
        float32 CSR matrix of shape ``(n_users, n_items)``, ``user_idx`` /
        ``item_idx`` map ids to row / column positions, and ``user_rev`` /
        ``item_rev`` are object arrays mapping positions back to ids.
    """
    def _distinct(frame: pl.DataFrame, column: str) -> list:
        # unique() gives no ordering guarantee by default; keeping first-seen
        # order makes the [:max_*] cut and the index assignment reproducible.
        return frame.select(column).unique(maintain_order=True).to_series().to_list()

    df = df_train.drop_nulls(subset=['user_id','parent_asin','rating']).with_columns(
        pl.col("rating").cast(pl.Float32)
    )

    if max_users is not None:
        keep_users = _distinct(df, "user_id")[:max_users]
        df = df.filter(pl.col("user_id").is_in(keep_users))
    if max_items is not None:
        keep_items = _distinct(df, "parent_asin")[:max_items]
        df = df.filter(pl.col("parent_asin").is_in(keep_items))

    user_rev = np.array(_distinct(df, "user_id"), dtype=object)
    item_rev = np.array(_distinct(df, "parent_asin"), dtype=object)
    user_idx = {uid: idx for idx, uid in enumerate(user_rev)}
    item_idx = {iid: idx for idx, iid in enumerate(item_rev)}

    u = np.array([user_idx[x] for x in df["user_id"].to_list()], dtype=np.int32)
    i = np.array([item_idx[x] for x in df["parent_asin"].to_list()], dtype=np.int32)
    v = np.array(df["rating"].to_list(), dtype=np.float32)

    nU, nI = user_rev.size, item_rev.size
    R = csr_matrix((v, (u, i)), shape=(nU, nI), dtype=np.float32)
    R.sum_duplicates()

    # csr_matrix SUMS entries that share the same (row, col). A user with
    # several ratings for one item would otherwise get a value outside the
    # rating scale, so divide each stored sum by its number of occurrences.
    n_duplicates = int(v.size - R.nnz)
    if n_duplicates > 0:
        counts = csr_matrix((np.ones_like(v), (u, i)), shape=(nU, nI), dtype=np.float32)
        counts.sum_duplicates()
        # Both matrices are canonical and built from the same coordinates, so
        # their data arrays line up element by element.
        R.data /= counts.data
        logger.log_info(f"[Matrix-Model] averaged {n_duplicates} duplicate (user,item) ratings")

    logger.log_info(f"[Matrix-Model] R{R.shape} nnz={R.nnz}")
    return R, user_idx, item_idx, user_rev, item_rev


#### Train & Save Model

In [5]:
def train_model_based_for_categories(
    categories: list[str],
    n_factors: int = N_FACTORS,
    max_users: int | None = None,
    max_items: int | None = None,
    models_dir: str | Path | None = None
) -> pl.DataFrame:
    """Train model-based CF (TruncatedSVD) per category and save the artifacts.

    For every category the TRAIN split is loaded, the user-item matrix is
    built and factorized, and the results are written to
    ``<models_dir>/model/<category>/``: ``R.npz``, ``U.npy``, ``V.npy``,
    ``user_rev.pkl``, ``item_rev.pkl``, ``user_idx.json`` and
    ``item_idx.json``. A failure in one category is logged and recorded in the
    summary; the remaining categories are still processed.

    Args:
        categories: Category names to train. A single string is treated as a
            one-element list.
        n_factors: Number of SVD components.
        max_users: Optional cap on users, forwarded to the matrix builder.
        max_items: Optional cap on items, forwarded to the matrix builder.
        models_dir: Base output directory; ``MODELS_DIR`` is used when this is
            not given.

    Returns:
        A polars DataFrame with one row per category (no pandas used).
        Successful rows carry the output directory and matrix statistics;
        failed rows carry ``models_dir=None`` and the error text.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(categories, str):
        categories = [categories]

    base_out = Path(models_dir) if models_dir else MODELS_DIR
    out_algo = base_out / "model"
    out_algo.mkdir(parents=True, exist_ok=True)

    rows = []
    for cat in categories:
        try:
            logger.log_info(f"[MODEL] {cat}")
            df_train = load_5core_train_model(cat)
            R, user_idx, item_idx, user_rev, item_rev = build_matrix_model(
                df_train, max_users=max_users, max_items=max_items
            )

            # U holds one factor row per user, V one factor row per item.
            svd = TruncatedSVD(n_components=n_factors, random_state=42)
            U = svd.fit_transform(R)
            V = svd.components_.T

            out_dir = out_algo / cat
            out_dir.mkdir(parents=True, exist_ok=True)

            save_npz(out_dir / "R.npz", R)
            np.save(out_dir / "U.npy", U)
            np.save(out_dir / "V.npy", V)
            with open(out_dir / "user_rev.pkl", "wb") as f:
                pickle.dump(user_rev, f)
            with open(out_dir / "item_rev.pkl", "wb") as f:
                pickle.dump(item_rev, f)
            # JSON keys must be strings and values plain ints.
            with open(out_dir / "user_idx.json", "w") as f:
                json.dump({str(k): int(v) for k, v in user_idx.items()}, f)
            with open(out_dir / "item_idx.json", "w") as f:
                json.dump({str(k): int(v) for k, v in item_idx.items()}, f)

            rows.append({
                "category": cat,
                "algo": "model",
                "models_dir": str(out_dir),
                "n_factors": n_factors,
                "users": len(user_rev),
                "items": len(item_rev),
                "R_nnz": int(R.nnz)
            })

        except Exception as e:
            # Boundary handler: keep training the other categories and report
            # this one as failed in the summary.
            logger.log_exception(f"[Error-MODEL] {cat}: {e}")
            rows.append({
                "category": cat,
                "algo": "model",
                "models_dir": None,
                "n_factors": n_factors,
                "error": str(e)
            })

    summary = pl.DataFrame(rows)
    # Count from the row dicts rather than filtering the frame: with no
    # categories the frame has no 'models_dir' column to filter on.
    ok = sum(row["models_dir"] is not None for row in rows)
    fail = len(rows) - ok
    logger.log_info(f"[Summary-MODEL] trained={len(rows)} ok={ok} fail={fail}")
    return summary

# Train every configured category with the notebook-level limits, then show
# the per-category summary table.
summary_model = train_model_based_for_categories(
    categories=CATEGORY,
    max_users=MAX_USERS,
    max_items=MAX_ITEMS,
    models_dir=MODELS_DIR,
)
display(summary_model)


2025-09-28 10:55:04,078 - INFO - [MODEL] Electronics
2025-09-28 10:55:04,079 - INFO - [Load-Model] Reading Electronics.5core.train.parquet
2025-09-28 10:55:13,681 - INFO - [Matrix-Model] R(1641026, 367052) nnz=12191484
2025-09-28 10:55:42,429 - INFO - [MODEL] Beauty_and_Personal_Care
2025-09-28 10:55:42,431 - INFO - [Load-Model] Reading Beauty_and_Personal_Care.5core.train.parquet
2025-09-28 10:55:47,707 - INFO - [Matrix-Model] R(729576, 207385) nnz=5165289
2025-09-28 10:56:00,666 - INFO - [Summary-MODEL] trained=2 ok=2 fail=0


category,algo,models_dir,n_factors,users,items,R_nnz
str,str,str,i64,i64,i64,i64
"""Electronics""","""model""","""/Users/kevin/Documents/GitHub/…",50,1641026,367052,12191484
"""Beauty_and_Personal_Care""","""model""","""/Users/kevin/Documents/GitHub/…",50,729576,207385,5165289


#### Recommend with Model-Based

In [6]:
def recommend_model_ui(user_id: str,
                       n_recs: int = 10,
                       category: str = CATEGORY,
                       models_dir: str | Path | None = None):
    """Return the Top-N items for a user from the saved factor matrices.

    Scores are the dot products of the user's row in ``U.npy`` with every item
    row in ``V.npy``; the ``n_recs`` highest-scoring items are returned in
    descending score order.

    Args:
        user_id: User to recommend for.
        n_recs: Maximum number of recommendations; values <= 0 give an empty
            result.
        category: Category whose model is used. The module-level default is a
            list of categories, so a list/tuple is accepted and its first
            entry is used.
        models_dir: Base models directory; ``MODELS_DIR`` when not given.

    Returns:
        A polars DataFrame with the columns ``parent_asin`` (Utf8) and
        ``score`` (Float64). It is empty when the user is unknown to the model
        or nothing can be recommended.

    Raises:
        ValueError: If ``category`` is an empty list/tuple.
    """
    def _empty() -> pl.DataFrame:
        # Float64 matches the dtype of the non-empty result (built from Python
        # floats), so empty and non-empty results share one schema.
        return pl.DataFrame({"parent_asin": pl.Series([], dtype=pl.Utf8),
                             "score": pl.Series([], dtype=pl.Float64)})

    # A list cannot be joined into a path; resolve it to a single name.
    if isinstance(category, (list, tuple)):
        if not category:
            raise ValueError("category must name at least one category")
        category = category[0]

    out_dir = Path(models_dir or MODELS_DIR) / "model" / category

    with open(out_dir / "user_idx.json", "r") as f:
        user_idx = {k: int(v) for k, v in json.load(f).items()}
    if user_id not in user_idx:
        return _empty()
    uidx = user_idx[user_id]

    # Only one row of U is needed, so memory-map it instead of reading the
    # whole user-factor matrix.
    U = np.load(out_dir / "U.npy", mmap_mode="r")
    V = np.load(out_dir / "V.npy")
    # NOTE: pickle executes code on load; only point this at model
    # directories written by the training step of this notebook.
    with open(out_dir / "item_rev.pkl", "rb") as f:
        item_rev = pickle.load(f)

    # NOTE(review): items the user has already rated are not filtered out;
    # the saved R.npz could be used for that if it is wanted.
    scores = np.asarray(U[uidx]) @ V.T
    k = max(0, min(n_recs, scores.size))
    if k == 0:
        return _empty()

    # argpartition selects the k best in O(n); only those k are then sorted.
    top_idx = np.argpartition(-scores, k - 1)[:k]
    top_idx = top_idx[np.argsort(-scores[top_idx])]

    rec_asins = [item_rev[i] for i in top_idx]
    rec_scores = [float(s) for s in scores[top_idx]]
    return pl.DataFrame({"parent_asin": rec_asins, "score": rec_scores})

#### Receive a UI request, then use the model to reply

In [9]:
def unit_test_model_recommend(user_id: str, n_recs: int = 5, category: str = CATEGORY):
    """Smoke-test ``recommend_model_ui`` for one user and display the result.

    Checks that the result is a polars DataFrame containing the columns
    ``parent_asin`` and ``score``, logs the number of recommendations and
    displays the frame.

    Args:
        user_id: User to request recommendations for.
        n_recs: Number of recommendations to request.
        category: Category whose model is queried. The module-level default is
            a list of categories, so a list/tuple is accepted and its first
            entry is used.

    Returns:
        The recommendations DataFrame.

    Raises:
        ValueError: If ``category`` is an empty list/tuple.
        AssertionError: If the result does not have the expected type/columns.
    """
    # The recommender joins the category into a filesystem path, which a list
    # cannot be used for; resolve it to one category name first.
    if isinstance(category, (list, tuple)):
        if not category:
            raise ValueError("category must name at least one category")
        category = category[0]

    recs = recommend_model_ui(user_id=user_id, n_recs=n_recs, category=category)
    assert isinstance(recs, pl.DataFrame)
    assert {"parent_asin","score"}.issubset(recs.columns)
    logger.log_info(f"[UnitTest-Model] user={user_id} → {len(recs)} recs ✅")
    display(recs)
    return recs

### Task: Unit test

#### Reply to the UI using the model

In [10]:
# Smoke-test the saved model of the first configured category for one user.
unit_test_model_recommend(
    user_id="AE222HFVDJ4TJ4V2LDRIAMQM2RPA",
    n_recs=5,
    category=CATEGORY[0],
)

2025-09-28 10:57:23,342 - INFO - [UnitTest-Model] user=AE222HFVDJ4TJ4V2LDRIAMQM2RPA → 5 recs ✅


parent_asin,score
str,f64
"""B075KP9XHS""",0.00475
"""B00AJHDZSI""",0.003947
"""B06ZZCZS7R""",0.003537
"""B09SXP5VB5""",0.003281
"""B00HEEOQBO""",0.003177


parent_asin,score
str,f64
"""B075KP9XHS""",0.00475
"""B00AJHDZSI""",0.003947
"""B06ZZCZS7R""",0.003537
"""B09SXP5VB5""",0.003281
"""B00HEEOQBO""",0.003177
