# Model-Based Collaborative Filtering (Matrix Factorization, Truncated SVD)

**Goal**
- Learn latent factors (U, V) from 5-core TRAIN to predict ratings and recommend Top-N.

**Pipeline**
1. Load 5-core TRAIN from `PROCESSED_DIR`
2. Build sparse user-item matrix R (CSR)
3. Train the matrix-factorization model with truncated SVD (`scipy.sparse.linalg.svds`; scikit-learn's `TruncatedSVD` in the batch helper)
4. Save artifacts (U, V, indexers) for fast UI inference
5. Evaluate using TEST/VALID sets: Accuracy, RMSE, Recall@K, NDCG@K, MAP@K
6. Load & recommend Top-N for users

**Why MF?**
- Captures hidden tastes/themes
- Scales better than KNN
- Fast inference via dot products

### Task: Import modules and libraries

In [None]:
import os, sys, numpy as np, polars as pl, pickle, json, time
from pathlib import Path
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import time

module_path = os.path.abspath(os.path.join('..', '../utilities'))
if module_path not in sys.path:
    sys.path.append(module_path)

from logger import Logger
from configurations import Configurations
from visualization_helpers import (
    visualize_hyperparameter_tuning,
    visualize_final_results,
    visualize_val_test_comparison
)
from evaluation_metrics import (
       compute_rmse_accuracy,
       recall_at_k,
       ndcg_at_k,
       map_at_k
)

# Shared logger for every cell in this notebook.
logger = Logger(process_name="model_based", log_file=Configurations.LOG_PATH)

# Project directories, all resolved from the shared configuration object.
PROCESSED_DIR = Path(Configurations.DATA_PROCESSED_PATH)
RAW_DIR = Path(Configurations.DATA_RAW_PATH)
MODELS_DIR = Path(Configurations.MODELS_PATH)

CATEGORY = Configurations.CATEGORIES

# The phase is inferred from the FIRST category only: if it already has tuning
# results the notebook runs in 'final' mode, otherwise in 'train_tune' mode.
has_tuning = Configurations.has_tuning_results_model(CATEGORY[0])
FACTORS_VALUES = Configurations.FACTORS_VALUES  # [10, 20, 30, 50, 75, 100]

_RULE = "=" * 70
PHASE = 'final' if has_tuning else 'train_tune'

logger.log_info(_RULE)
if PHASE == 'final':
    logger.log_info("PHASE: FINAL EVALUATION (MODEL-BASED)")
    logger.log_info(_RULE)

    for cat in CATEGORY:
        best_factors = Configurations.load_best_factors(cat)
        logger.log_info(f"  {cat}: Best n_factors = {best_factors}")
else:
    logger.log_info("PHASE: TRAINING + TUNING (MODEL-BASED)")
    logger.log_info(_RULE)
    logger.log_info(f"n_factors values: {FACTORS_VALUES}")

logger.log_info(_RULE)
logger.log_info(f"Categories: {CATEGORY}")
logger.log_info(f"Sample size: {Configurations.DEV_SAMPLE_SIZE}")
logger.log_info(_RULE + "\n")

# Notebook-wide settings.
N_RECS = 10          # default length of a recommendation list
MEAN_CENTER = True
MAX_USERS = None     # None = keep every user when building the matrix
MAX_ITEMS = None     # None = keep every item when building the matrix
N_FACTORS = 50       # default number of latent factors

### Task: Define functions for CF recommendation

#### Data Loader

In [2]:
import polars as pl

def _candidate_files(category: str, split: str = "train") -> Path:
    """Return the parquet path of one 5-core split for ``category``.

    The file name depends on ``Configurations.DEV_SAMPLE_SIZE``:

    * ``'full'``      -> ``<category>.5core.<split>.parquet``
    * any other value -> ``<category>.5core.<split>.<size>.parquet``; the
      value must be a key of ``Configurations.SAMPLE_SIZES``.

    Slashes in ``category`` are replaced with dashes so the name is a valid
    file name.

    Raises:
        ValueError: if ``DEV_SAMPLE_SIZE`` is neither ``'full'`` nor a known
            sample size. (Previously the function fell through and returned
            ``None``, which only failed later inside ``pl.read_parquet``.)
    """
    stem = category.replace('/', '-')
    dev_sample_size = Configurations.DEV_SAMPLE_SIZE

    if dev_sample_size == 'full':
        return PROCESSED_DIR / f"{stem}.5core.{split}.parquet"

    sample_sizes = Configurations.SAMPLE_SIZES
    if dev_sample_size not in sample_sizes:
        raise ValueError(
            f"Unknown DEV_SAMPLE_SIZE={dev_sample_size!r}; expected 'full' or one of "
            f"{list(sample_sizes)}"
        )
    return PROCESSED_DIR / f"{stem}.5core.{split}.{dev_sample_size}.parquet"

def load_5core_data(category: str, split: str = "train") -> pl.DataFrame:
    """Read one 5-core split for ``category`` and clamp ratings to [1, 5].

    The ``rating`` column is cast to Float32; values below 1.0 become 1.0 and
    values above 5.0 become 5.0. The resulting shape and the number of distinct
    users/items are logged.
    """
    parquet_path = _candidate_files(category, split)
    frame = pl.read_parquet(parquet_path, low_memory=False)

    # Build the cast expression once and reuse it in every branch of the clamp.
    rating_f32 = pl.col("rating").cast(pl.Float32)
    clamped_rating = (
        pl.when(rating_f32 < 1.0).then(1.0)
        .when(rating_f32 > 5.0).then(5.0)
        .otherwise(rating_f32)
        .alias("rating")
    )
    frame = frame.with_columns([clamped_rating])

    n_users = frame['user_id'].n_unique()
    n_items = frame['parent_asin'].n_unique()
    logger.log_info(f"[Load-{split.upper()}] shape={frame.shape} | users={n_users} | items={n_items}")
    return frame

#### Build Matrix, train model

In [3]:
def build_matrix_model(df_train: pl.DataFrame, max_users: int | None = None, 
                      max_items: int | None = None):
    """Build the sparse user-item rating matrix and its index mappings.

    Args:
        df_train: interactions with ``user_id``, ``parent_asin`` and ``rating``
            columns; rows with a null in any of them are dropped.
        max_users: if given, keep only the first ``max_users`` distinct users
            (in order of first appearance).
        max_items: if given, keep only the first ``max_items`` distinct items
            (in order of first appearance, after the user filter).

    Returns:
        ``(R, user_idx, item_idx, user_rev, item_rev)`` where ``R`` is a
        float32 CSR matrix of shape (n_users, n_items), ``user_idx`` /
        ``item_idx`` map id -> row/column, and ``user_rev`` / ``item_rev`` are
        object arrays mapping row/column -> id.
    """
    df = df_train.drop_nulls(subset=['user_id', 'parent_asin', 'rating'])

    # csr_matrix((v, (u, i))) SUMS duplicate coordinates, so a user who rated
    # the same item twice would end up with a rating above 5. Keep one rating
    # per (user, item) pair: the last one in file order.
    df = df.unique(subset=['user_id', 'parent_asin'], keep='last', maintain_order=True)

    # maintain_order=True makes the truncation and the index assignment
    # reproducible; the order of a plain unique() is unspecified.
    if max_users is not None:
        keep_users = df['user_id'].unique(maintain_order=True).to_list()[:max_users]
        df = df.filter(pl.col('user_id').is_in(keep_users))
    if max_items is not None:
        keep_items = df['parent_asin'].unique(maintain_order=True).to_list()[:max_items]
        df = df.filter(pl.col('parent_asin').is_in(keep_items))

    user_rev = np.array(df['user_id'].unique(maintain_order=True).to_list(), dtype=object)
    item_rev = np.array(df['parent_asin'].unique(maintain_order=True).to_list(), dtype=object)
    user_idx = {uid: idx for idx, uid in enumerate(user_rev)}
    item_idx = {iid: idx for idx, iid in enumerate(item_rev)}

    u = np.array([user_idx[x] for x in df['user_id'].to_list()], dtype=np.int32)
    i = np.array([item_idx[x] for x in df['parent_asin'].to_list()], dtype=np.int32)
    v = np.array(df['rating'].to_list(), dtype=np.float32)

    nU, nI = user_rev.size, item_rev.size
    R = csr_matrix((v, (u, i)), shape=(nU, nI), dtype=np.float32)

    logger.log_info(f"[Matrix-Model] R{R.shape} nnz={R.nnz}")
    return R, user_idx, item_idx, user_rev, item_rev


def train_svd_model(R_train, n_factors=50):
    """Factorize the rating matrix with a truncated sparse SVD.

    Args:
        R_train: sparse (n_users, n_items) rating matrix.
        n_factors: requested number of latent factors. Values larger than
            ``min(n_users, n_items) - 1`` are reduced to that maximum, because
            ``svds`` requires ``k < min(R_train.shape)``.

    Returns:
        ``(U, Vt, sigma_diag)``:

        * ``U``  - (n_users, k) user factors with the singular values already
          folded in (``U_svd * sigma``), so ``U[u] @ Vt`` is the rank-k
          reconstruction of row ``u``. Every caller in this notebook scores
          with ``U[u] @ V`` and discards the third value; returning the raw
          orthonormal ``U`` made those scores ignore sigma entirely.
        * ``Vt`` - (k, n_items) item factors.
        * ``sigma_diag`` - (k, k) diagonal matrix of singular values, largest
          first. It is informational only: do NOT multiply it into ``U`` again.

    Raises:
        ValueError: if the matrix is too small for any factorization
            (``min(shape) < 2``) or ``n_factors < 1``.
    """
    n_users, n_items = R_train.shape
    max_factors = min(n_users, n_items) - 1  # SVD constraint: k < min(m,n)

    # Reject degenerate matrices up front. The original check ran after
    # max(1, max_factors) had already forced n_factors to 1, so it could
    # never fire and svds failed with a less helpful error instead.
    if max_factors < 1:
        raise ValueError(
            f"Matrix too small {R_train.shape}! Cannot perform SVD. "
            f"Need larger dataset (use 'small' or 'medium' size)."
        )
    if n_factors < 1:
        raise ValueError(f"n_factors must be >= 1, got {n_factors}")

    # Auto-adjust only when the request really exceeds the limit
    # (n_factors == max_factors is valid and needs no warning).
    if n_factors > max_factors:
        logger.log_warning(f"n_factors={n_factors} too large for matrix {R_train.shape}")
        n_factors = max_factors
        logger.log_warning(f"Auto-adjusted to n_factors={n_factors}")

    logger.log_info(f"Training SVD: matrix {R_train.shape} with {n_factors} factors...")

    start_time = time.time()

    try:
        # Perform truncated SVD
        U, sigma, Vt = svds(R_train, k=n_factors)

        # svds does not guarantee the order of the singular values, so sort
        # explicitly (largest first) instead of assuming ascending output.
        order = np.argsort(sigma)[::-1]
        U = U[:, order]
        sigma = sigma[order]
        Vt = Vt[order, :]

        # Fold the singular values into the user factors (see docstring).
        U = U * sigma

        sigma_diag = np.diag(sigma)
        train_time = time.time() - start_time

        logger.log_info(f"SVD completed in {train_time:.2f}s")
        logger.log_info(f"  U shape:  {U.shape}")
        logger.log_info(f"  Vt shape: {Vt.shape}")
        logger.log_info(f"  Singular values: [{sigma.min():.4f}, {sigma.max():.4f}]")

        return U, Vt, sigma_diag

    except Exception as e:
        logger.log_error(f"SVD training failed: {e}")
        logger.log_error(f"Matrix info: shape={R_train.shape}, nnz={R_train.nnz}, n_factors={n_factors}")
        raise

#### Save/Load Artifacts

In [4]:
def save_model_artifacts(out_dir: Path, R, U, V, user_rev, item_rev, user_idx, item_idx):
    """Persist a trained model into ``out_dir`` (created if missing).

    Files written: ``R.npz`` (sparse matrix), ``U.npy`` / ``V.npy`` (factor
    arrays), ``user_rev.pkl`` / ``item_rev.pkl`` (pickled reverse lookups) and
    ``user_idx.json`` / ``item_idx.json`` (id -> index; keys are stringified).
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    save_npz(out_dir / "R.npz", R)

    for file_name, factors in (("U.npy", U), ("V.npy", V)):
        np.save(out_dir / file_name, factors)

    for file_name, lookup in (("user_rev.pkl", user_rev), ("item_rev.pkl", item_rev)):
        with open(out_dir / file_name, "wb") as fh:
            pickle.dump(lookup, fh)

    for file_name, mapping in (("user_idx.json", user_idx), ("item_idx.json", item_idx)):
        # JSON object keys must be strings and values plain ints.
        serializable = {str(key): int(pos) for key, pos in mapping.items()}
        (out_dir / file_name).write_text(json.dumps(serializable))

    logger.log_info(f"[Saved-Model] {out_dir}")

def load_model_artifacts(model_dir: str | Path):
    md = Path(model_dir)
    R = load_npz(md / "R.npz")
    U = np.load(md / "U.npy")
    V = np.load(md / "V.npy")
    with open(md / "user_rev.pkl", "rb") as f: user_rev = pickle.load(f)
    with open(md / "item_rev.pkl", "rb") as f: item_rev = pickle.load(f)
    user_idx = {k: int(v) for k, v in json.loads((md / "user_idx.json").read_text()).items()}
    item_idx = {k: int(v) for k, v in json.loads((md / "item_idx.json").read_text()).items()}
    return dict(R=R, U=U, V=V, user_rev=user_rev, item_rev=item_rev, user_idx=user_idx, item_idx=item_idx)

#### Training Pipeline

In [5]:
def train_model_based_for_categories(categories, n_factors=50, models_dir=None, max_users=None, max_items=None):
    """Train and save one TruncatedSVD model per category.

    Args:
        categories: iterable of category names.
        n_factors: number of latent components passed to ``TruncatedSVD``.
        models_dir: base output directory; defaults to ``MODELS_DIR``. Models
            are written to ``<base>/model/<category>``.
        max_users, max_items: forwarded to ``build_matrix_model``.

    Returns:
        A polars DataFrame with one row per category (trained, skipped because
        ``U.npy`` already exists, or failed with the error message). A failure
        in one category is logged and does not stop the others.
    """
    base = Path(models_dir) if models_dir else MODELS_DIR
    out_algo = base / "model"
    out_algo.mkdir(parents=True, exist_ok=True)
    rows = []
    for cat in categories:
        out_dir = out_algo / cat
        if out_dir.exists() and (out_dir / "U.npy").exists():
            logger.log_info(f"[Skip] Model based model exists for {cat}")
            rows.append({"category": cat, "algo": "model", "models_dir": str(out_dir), "n_factors": n_factors, "status": "skipped"})
            continue
        try:
            logger.log_info(f"[MODEL] Training {cat}")
            df_train = load_5core_data(cat, split="train")
            R, user_idx, item_idx, user_rev, item_rev = build_matrix_model(df_train, max_users=max_users, max_items=max_items)
            svd = TruncatedSVD(n_components=n_factors, random_state=42)
            # fit_transform returns the user factors already scaled by the
            # singular values: shape (n_users, n_factors).
            U = svd.fit_transform(R)
            # Store the item factors as (n_factors, n_items). predict_model_based
            # computes U[u] @ V, so the transposed (n_items, n_factors) layout
            # saved before could not be multiplied and broke inference.
            V = svd.components_
            save_model_artifacts(out_dir, R, U, V, user_rev, item_rev, user_idx, item_idx)
            rows.append({"category": cat, "algo": "model", "models_dir": str(out_dir), "n_factors": n_factors, "users": len(user_rev), "items": len(item_rev), "R_nnz": int(R.nnz)})
        except Exception as e:
            logger.log_exception(f"[Error-MODEL] {cat}: {e}")
            rows.append({"category": cat, "algo": "model", "models_dir": None, "n_factors": n_factors, "error": str(e)})
    summary = pl.DataFrame(rows)
    # Rows with a models_dir (trained or skipped) count as OK.
    ok_count = sum(1 for r in rows if r.get("models_dir"))
    logger.log_info(f"[Summary-MODEL] Total={len(rows)} OK={ok_count} FAIL={len(rows)-ok_count}")
    return summary

#### Prediction & Recommendation

In [6]:
def predict_model_based(user_idx_val: int, U: np.ndarray, V: np.ndarray, R: csr_matrix) -> np.ndarray:
    """Return the predicted score of every item for one user.

    Takes row ``user_idx_val`` of ``U`` and multiplies it with ``V``; ``V``
    must therefore have the latent dimension as its first axis. ``R`` is
    accepted for signature compatibility and is not used.
    """
    user_vector = U[user_idx_val]
    return user_vector @ V

def recommend_model_based(user_id: str, n_recs: int, artifacts: dict) -> pl.DataFrame:
    """Recommend the top ``n_recs`` unseen items for ``user_id``.

    Args:
        user_id: user identifier; must be a key of ``artifacts['user_idx']``.
        n_recs: maximum number of recommendations.
        artifacts: dict holding ``U``, ``V``, ``R``, ``user_idx`` and
            ``item_rev`` (as returned by ``load_model_artifacts``).

    Returns:
        DataFrame with columns ``parent_asin`` and ``score``, best score first.
        Items the user already has an entry for in ``R`` are excluded. An empty
        frame with the same two columns is returned when the user is unknown,
        ``n_recs`` is not positive, or no candidate item is left.
    """
    def _empty() -> pl.DataFrame:
        # The constructor keyword is `schema`; the former `columns=` keyword
        # raises TypeError on current polars.
        return pl.DataFrame(schema={"parent_asin": pl.Utf8, "score": pl.Float64})

    U, V = artifacts['U'], artifacts['V']
    R = artifacts['R']
    user_idx = artifacts['user_idx']
    item_rev = artifacts['item_rev']

    if user_id not in user_idx:
        logger.log_warning(f"[Recommend] user_id={user_id} not found.")
        return _empty()
    if n_recs <= 0:
        # A negative n_recs would otherwise produce a nonsensical slice below.
        return _empty()

    u = user_idx[user_id]
    scores = predict_model_based(u, U, V, R)

    # Mask out everything the user already rated in the training matrix.
    cand_mask = np.ones(len(scores), dtype=bool)
    cand_mask[R.getrow(u).indices] = False

    cand_indices = np.nonzero(cand_mask)[0]
    cand_scores = scores[cand_mask]
    if cand_scores.size == 0:
        return _empty()

    n_top = min(n_recs, cand_scores.size)
    # argpartition selects the n_top best in O(n); only those are sorted.
    top_pos = np.argpartition(-cand_scores, n_top - 1)[:n_top]
    top_pos = top_pos[np.argsort(-cand_scores[top_pos], kind="stable")]

    rec_asins = [item_rev[int(cand_indices[p])] for p in top_pos]
    rec_scores = [float(cand_scores[p]) for p in top_pos]
    return pl.DataFrame({"parent_asin": rec_asins, "score": rec_scores})

### Task: Evaluation

#### Evaluation Pipeline

In [7]:
def evaluate_model_based(category: str, artifacts: dict, k_values: list | None = None, 
                        split: str = "test", sample_users: int = 3000):
    """Evaluate a trained model on the ``split`` data of ``category``.

    Only users present in the training index are evaluated; at most
    ``sample_users`` of them are sampled (fixed seed 42). For each user:

    * RMSE / accuracy are computed via ``compute_rmse_accuracy`` on the held-out
      items that exist in the training item index;
    * ranking metrics (recall, NDCG, MAP at every ``k``) compare the top
      recommendations - excluding items the user rated in training - against
      those same held-out items.

    Args:
        category: category name passed to ``load_5core_data``.
        artifacts: dict with ``U``, ``V``, ``R``, ``user_idx``, ``item_idx``
            and ``item_rev``.
        k_values: ranking cutoffs; defaults to ``[10, 20, 50]``.
        split: which split to load (e.g. ``"test"`` or ``"valid"``).
        sample_users: maximum number of users to evaluate.

    Returns:
        Dict with ``category``, ``split``, ``n_users``, ``rmse``, ``accuracy``
        and ``recall@k`` / ``ndcg@k`` / ``map@k`` for each ``k``; ``None`` when
        no evaluation rows remain after filtering to training users.

    Raises:
        ValueError: if ``k_values`` is empty.
    """
    # Fresh list per call instead of a shared mutable default argument.
    k_values = [10, 20, 50] if k_values is None else list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one cutoff")

    logger.log_info(f"[Eval-Model] {category} on {split.upper()}")

    df_eval = load_5core_data(category, split=split)
    U, V, R = artifacts['U'], artifacts['V'], artifacts['R']
    user_idx, item_idx = artifacts['user_idx'], artifacts['item_idx']
    item_rev = artifacts['item_rev']
    n_items = R.shape[1]
    max_k = max(k_values)

    # Filter to train users only
    train_user_list = list(user_idx.keys())
    df_eval = df_eval.filter(pl.col('user_id').is_in(train_user_list))

    if len(df_eval) == 0:
        logger.log_warning(f"[Eval-Model] No data after filtering")
        return None

    logger.log_info(f"[Eval-Model] After filtering: {len(df_eval):,} ratings, {df_eval['user_id'].n_unique():,} users")

    # Sample users for evaluation. maintain_order=True keeps the candidate list
    # (and therefore the seeded sample) reproducible between runs.
    eval_users = df_eval['user_id'].unique(maintain_order=True).to_list()
    if len(eval_users) > sample_users:
        # NOTE: seeds the GLOBAL NumPy RNG, exactly as before; later code in
        # the notebook may rely on that state, so it is intentionally kept.
        np.random.seed(42)
        eval_users = np.random.choice(eval_users, sample_users, replace=False).tolist()

    logger.log_info(f"[Eval-Model] Evaluating {len(eval_users)} users...")

    # Group the held-out ratings per user in ONE pass. Filtering the frame
    # inside the user loop rescanned every row for every user.
    ratings_by_user = {}
    eval_rows = (
        df_eval.filter(pl.col('user_id').is_in(eval_users))
        .select(['user_id', 'parent_asin', 'rating'])
    )
    for uid, asin, rating in eval_rows.iter_rows():
        ratings_by_user.setdefault(uid, {})[asin] = rating

    # Initialize accumulators
    metrics_acc = {
        'rmse': [], 'accuracy': [],
        **{f'recall@{k}': [] for k in k_values},
        **{f'ndcg@{k}': [] for k in k_values},
        **{f'map@{k}': [] for k in k_values}
    }

    evaluated_users = 0

    for user_id in eval_users:
        if user_id not in user_idx:
            continue

        actual_ratings = ratings_by_user.get(user_id, {})
        # Items never seen in training have no column and cannot be scored.
        known_items = {asin for asin in actual_ratings if asin in item_idx}
        if not known_items:
            continue

        u = user_idx[user_id]
        evaluated_users += 1
        scores = predict_model_based(u, U, V, R)

        # RMSE & Accuracy - only for known items; all other slots stay NaN.
        predictions = np.full(n_items, np.nan)
        actuals = np.full(n_items, np.nan)
        for asin in known_items:
            idx = item_idx[asin]
            predictions[idx] = scores[idx]
            actuals[idx] = actual_ratings[asin]

        rmse, acc = compute_rmse_accuracy(predictions, actuals)
        if not np.isnan(rmse):
            metrics_acc['rmse'].append(rmse)
            metrics_acc['accuracy'].append(acc)

        # Ranking metrics: candidates are the items NOT rated in training.
        cand_mask = np.ones(n_items, dtype=bool)
        cand_mask[R.getrow(u).indices] = False

        cand_scores = scores[cand_mask]
        if cand_scores.size == 0:
            continue

        cand_indices = np.nonzero(cand_mask)[0]
        n_top = min(max_k, cand_scores.size)
        top_pos = np.argpartition(-cand_scores, n_top - 1)[:n_top]
        sorted_idx = top_pos[np.argsort(-cand_scores[top_pos])]

        recommended = [item_rev[cand_indices[i]] for i in sorted_idx]

        # Use known_items for ranking metrics
        for k in k_values:
            metrics_acc[f'recall@{k}'].append(recall_at_k(recommended, known_items, k))
            metrics_acc[f'ndcg@{k}'].append(ndcg_at_k(recommended, known_items, k))
            metrics_acc[f'map@{k}'].append(map_at_k(recommended, known_items, k))

    logger.log_info(f"[Eval-Model] Actually evaluated: {evaluated_users} users")

    # Aggregate results
    results = {
        'category': category,
        'split': split,
        'n_users': evaluated_users,
        'rmse': np.mean(metrics_acc['rmse']) if metrics_acc['rmse'] else np.nan,
        'accuracy': np.mean(metrics_acc['accuracy']) if metrics_acc['accuracy'] else np.nan
    }

    for k in k_values:
        for metric in ['recall', 'ndcg', 'map']:
            key = f'{metric}@{k}'
            results[key] = np.mean(metrics_acc[key]) if metrics_acc[key] else 0.0

    logger.log_info(f"[Eval-Model] RMSE={results['rmse']:.4f}, Acc={results['accuracy']:.4f}")

    # Log the k=10 figures when available; otherwise fall back to the first
    # requested cutoff instead of raising KeyError on 'ndcg@10'.
    k_log = 10 if 10 in k_values else k_values[0]
    ndcg_log = results[f'ndcg@{k_log}']
    recall_log = results[f'recall@{k_log}']
    logger.log_info(f"[Eval-Model] NDCG@{k_log}={ndcg_log:.4f}, Recall@{k_log}={recall_log:.4f}")

    return results

#### Hyperparameter Tuning

In [8]:
def tune_n_factors_single_category(category: str, factors_values: list, n_eval_users: int):
    """Grid-search ``n_factors`` for one category on the validation split.

    The training matrix is built once; for every value in ``factors_values`` an
    SVD model is trained with ``train_svd_model`` and evaluated with
    ``evaluate_model_based`` on ``split="valid"``.

    Args:
        category: category name.
        factors_values: candidate numbers of latent factors.
        n_eval_users: maximum number of validation users per evaluation.

    Returns:
        A polars DataFrame with one row per evaluated ``n_factors`` (ranking
        metrics, RMSE, accuracy, training and evaluation time in seconds). The
        same table is written to ``MODELS_DIR/model/tuning_<category>.csv``.
    """
    logger.log_info(f"\n{'='*70}")
    logger.log_info(f"TUNING N_FACTORS: {category}")
    logger.log_info(f"{'='*70}")
    logger.log_info(f"n_factors values: {factors_values}")
    logger.log_info(f"Validation users: {n_eval_users}\n")

    # Load training data ONCE
    logger.log_info("Loading training data...")
    df_train = load_5core_data(category, split="train")

    # Build base matrix ONCE
    logger.log_info("Building rating matrix...")
    R, user_idx, item_idx, user_rev, item_rev = build_matrix_model(
        df_train, max_users=MAX_USERS, max_items=MAX_ITEMS
    )

    # Prepare base artifacts for evaluation
    base_artifacts = {
        'R': R,
        'user_rev': user_rev,
        'item_rev': item_rev,
        'user_idx': user_idx,
        'item_idx': item_idx
    }

    results = []

    # Test each n_factors value
    for i, n_factors in enumerate(factors_values, 1):
        logger.log_info(f"\n[{i}/{len(factors_values)}] Testing n_factors={n_factors}")
        logger.log_info("-"*70)

        # Train SVD with this n_factors and time it here. train_svd_model's
        # third return value is the singular-value matrix, NOT a duration;
        # binding it to train_time made the `:.1f` log lines below raise and
        # stored an array in the Train_Time column.
        train_start = time.time()
        U, V, _ = train_svd_model(R, n_factors=n_factors)
        train_time = time.time() - train_start

        # Create artifacts for this configuration
        eval_artifacts = base_artifacts.copy()
        eval_artifacts['U'] = U
        eval_artifacts['V'] = V
        eval_artifacts['n_factors'] = n_factors

        # Evaluate on validation
        eval_start = time.time()
        metrics = evaluate_model_based(
            category, eval_artifacts,
            k_values=[10, 20, 50],
            split="valid",
            sample_users=n_eval_users
        )
        eval_time = time.time() - eval_start

        if metrics:
            # Store all results
            result = {
                'n_factors': n_factors,
                'NDCG@10': metrics['ndcg@10'],
                'NDCG@20': metrics['ndcg@20'],
                'NDCG@50': metrics['ndcg@50'],
                'Recall@10': metrics['recall@10'],
                'Recall@20': metrics['recall@20'],
                'Recall@50': metrics['recall@50'],
                'MAP@10': metrics['map@10'],
                'MAP@20': metrics['map@20'],
                'MAP@50': metrics['map@50'],
                'RMSE': metrics['rmse'],
                'Accuracy': metrics['accuracy'],
                'Train_Time': train_time,
                'Eval_Time': eval_time
            }
            results.append(result)

            # Print summary
            logger.log_info(f"Results:")
            logger.log_info(f"  NDCG@10:   {metrics['ndcg@10']:.4f}")
            logger.log_info(f"  Recall@10: {metrics['recall@10']:.4f}")
            logger.log_info(f"  MAP@10:    {metrics['map@10']:.4f}")
            logger.log_info(f"  RMSE:      {metrics['rmse']:.4f}")
            logger.log_info(f"  Train:     {train_time:.1f}s")
            logger.log_info(f"  Eval:      {eval_time:.1f}s")

    df_results = pl.DataFrame(results)

    # Save results (create the directory first: nothing else in this function
    # guarantees that MODELS_DIR/model exists).
    out_csv = MODELS_DIR / 'model' / f'tuning_{category}.csv'
    out_csv.parent.mkdir(parents=True, exist_ok=True)
    df_results.write_csv(out_csv)
    logger.log_info(f"\nSaved tuning results: {out_csv}")

    return df_results


def select_best_factors(df_results: pl.DataFrame):
    """Pick ``n_factors`` with NDCG@10 as the primary criterion.

    The row with the highest NDCG@10 wins, unless other rows reach at least
    98% of that score; in that case the smallest ``n_factors`` among those
    near-best rows is returned (cheaper model, less prone to overfitting).
    """
    ndcg_scores = df_results['NDCG@10']
    top_factors = df_results['n_factors'][ndcg_scores.arg_max()]
    top_score = ndcg_scores.max()

    logger.log_info(f"\nPrimary metric (NDCG@10): n_factors={top_factors}, score={top_score:.4f}")

    # Every configuration within 2% of the best score counts as equivalent.
    cutoff = top_score * 0.98
    near_best = df_results.filter(pl.col('NDCG@10') >= cutoff)['n_factors'].to_list()

    if len(near_best) <= 1:
        return top_factors

    chosen = min(near_best)
    logger.log_info(f"Selected n_factors={chosen} (smallest among similar)")
    return chosen


def analyze_and_select_factors(df_results: pl.DataFrame, category: str):
    """Log a per-metric summary of the tuning table, choose and persist the winner.

    Logs the best ``n_factors`` for NDCG@10, Recall@10, MAP@10 (maximum) and
    RMSE (minimum), selects the final value with ``select_best_factors``, logs
    that row's metrics and stores the choice through
    ``Configurations.save_best_factors``.

    Returns:
        ``(best_factors, df_results)``.
    """
    rule = '=' * 70
    logger.log_info(f"\n{rule}")
    logger.log_info("N_FACTORS SELECTION ANALYSIS")
    logger.log_info(f"{rule}")

    # (column, higher_is_better) for each metric shown in the summary.
    metric_specs = (
        ('NDCG@10', True),
        ('Recall@10', True),
        ('MAP@10', True),
        ('RMSE', False),
    )
    factor_column = df_results['n_factors']
    winners = []
    for column, higher_is_better in metric_specs:
        series = df_results[column]
        position = series.arg_max() if higher_is_better else series.arg_min()
        winners.append(factor_column[position])

    logger.log_info("\nBest n_factors by metric:")
    for (column, higher_is_better), winner in zip(metric_specs, winners):
        series = df_results[column]
        score = series.max() if higher_is_better else series.min()
        tag = f"{column}:"
        # Tags are left-aligned in an 11-character field so the values line up.
        logger.log_info(f"  {tag:<11}n_factors={winner:3d} (score={score:.4f})")

    # Select best
    best_factors = select_best_factors(df_results)

    # Show final selection details
    final_row = df_results.filter(pl.col('n_factors') == best_factors).row(0, named=True)

    logger.log_info(f"\n{rule}")
    logger.log_info(f"FINAL SELECTION: n_factors={best_factors}")
    logger.log_info(f"{rule}")
    for column in ('NDCG@10', 'Recall@10', 'MAP@10', 'RMSE', 'Accuracy'):
        tag = f"{column}:"
        logger.log_info(f"  {tag:<11}{final_row[column]:.4f}")
    train_tag = "Train:"
    logger.log_info(f"  {train_tag:<11}{final_row['Train_Time']:.1f}s")
    logger.log_info(f"{rule}\n")

    # Save to configuration
    Configurations.save_best_factors(category, best_factors)
    logger.log_info("Saved best n_factors to file\n")

    return best_factors, df_results

### Pipeline and execution

#### Training pipeline

In [9]:
def _train_single_category(cat, model_dir, FACTORS_VALUES, n_eval_tune):  
    """Build, tune and persist the SVD model for a single category.

    Step 1 - if ``model_dir / "R.npz"`` does not exist, load the train split,
    build the rating matrix, train a base model with ``min(50, max_factors)``
    factors and save all artifacts to ``model_dir``.

    Step 2 - if ``Configurations.has_tuning_results_model(cat)`` is false,
    reload the artifacts, train and evaluate (validation split) one model per
    candidate in ``FACTORS_VALUES`` that does not exceed ``max_factors``, write
    the table to ``MODELS_DIR/model/tuning_<cat>.csv``, pick the winner with
    ``select_best_factors``, store it via ``Configurations.save_best_factors``,
    plot the tuning curve, then re-train with the winner and overwrite the
    saved artifacts. Otherwise the stored best value is simply loaded.

    Args:
        cat: category name.
        model_dir: directory holding this category's artifacts.
        FACTORS_VALUES: candidate ``n_factors`` values (parameter shadows the
            module-level constant of the same name).
        n_eval_tune: maximum number of validation users per evaluation.

    Returns:
        ``{'tuned_now': bool, 'best_factors': ...}``.
    """
    # ========================================================================
    # STEP 1: BUILD BASE MATRIX (if not exists)
    # ========================================================================
    
    base_exists = (model_dir / "R.npz").exists()
    
    if not base_exists:
        logger.log_info("STEP 1: BUILDING BASE MATRIX")
        logger.log_info("-"*70)
        
        # Load training data
        df_train = load_5core_data(cat, split="train")
        
        # Build rating matrix
        R, user_idx, item_idx, user_rev, item_rev = build_matrix_model(
            df_train, max_users=MAX_USERS, max_items=MAX_ITEMS
        )
        
        # Check matrix dimensions
        n_users, n_items = R.shape
        max_factors = min(n_users, n_items) - 1  # SVD constraint
        
        logger.log_info(f"Matrix shape: {R.shape} (users × items)")
        logger.log_info(f"Max allowed factors: {max_factors}")
        
        # Warning if dataset too small
        if max_factors < 10:
            logger.log_warning("="*70)
            logger.log_warning(f"WARNING: Matrix too small ({R.shape})!")
            logger.log_warning(f"Max factors: {max_factors} (very limited)")
            logger.log_warning("Recommendation: Use 'small' or 'medium' dataset size")
            logger.log_warning("="*70)
        
        # Adjust initial n_factors
        # NOTE(review): if tuning results already exist but the artifacts were
        # deleted, this base model (at most 50 factors) is what gets saved and
        # step 2 below does not re-train it with the stored best value -
        # confirm whether that is intended.
        initial_factors = min(50, max_factors)
        logger.log_info(f"Using n_factors={initial_factors} for base model\n")
        
        # Train with adjusted n_factors
        # The third return value of train_svd_model is discarded here and in
        # every other call in this function.
        U, V, _ = train_svd_model(R, n_factors=initial_factors)
        
        # Save artifacts
        save_model_artifacts(model_dir, R, U, V, user_rev, item_rev, user_idx, item_idx)
        logger.log_info(f"Base model saved to {model_dir}\n")
    
    else:
        logger.log_info("STEP 1: BASE MATRIX EXISTS")
        logger.log_info("-"*70)
        logger.log_info(f"Loading from {model_dir}\n")
    
    # ========================================================================
    # STEP 2: HYPERPARAMETER TUNING (if not done)
    # ========================================================================
    if not Configurations.has_tuning_results_model(cat):
        logger.log_info("STEP 2: HYPERPARAMETER TUNING (VALIDATION)")
        logger.log_info("-"*70)
        
        # Load base artifacts
        # (Always re-read from disk, even when step 1 just created them, so the
        # index mappings have the string keys produced by the JSON round trip.)
        artifacts = load_model_artifacts(model_dir)
        R = artifacts['R']
        n_users, n_items = R.shape
        max_factors = min(n_users, n_items) - 1
        
        # Filter out invalid factor values
        valid_factors = [f for f in FACTORS_VALUES if f <= max_factors]
        
        if len(valid_factors) == 0:
            logger.log_warning("No valid n_factors values! Using max allowed.")
            valid_factors = [max_factors]
        
        if len(valid_factors) < len(FACTORS_VALUES):
            skipped = [f for f in FACTORS_VALUES if f > max_factors]
            logger.log_warning(f"Skipped invalid n_factors: {skipped} (max={max_factors})")
        
        logger.log_info(f"Valid n_factors values: {valid_factors}")
        logger.log_info(f"Validation users: {n_eval_tune}\n")
        
        # Prepare base artifacts
        base_artifacts = {
            'R': R,
            'user_rev': artifacts['user_rev'],
            'item_rev': artifacts['item_rev'],
            'user_idx': artifacts['user_idx'],
            'item_idx': artifacts['item_idx']
        }
        
        results = []
        
        # Test each valid n_factors value
        for i, n_factors in enumerate(valid_factors, 1):
            logger.log_info(f"\n[{i}/{len(valid_factors)}] Testing n_factors={n_factors}")
            logger.log_info("-"*70)
            
            # Train SVD
            train_start = time.time()
            U, V, _ = train_svd_model(R, n_factors=n_factors)
            train_time = time.time() - train_start
            
            # Create artifacts for evaluation
            # (shallow copy: R and the index structures are shared, only the
            # factor matrices differ per configuration)
            eval_artifacts = base_artifacts.copy()
            eval_artifacts['U'] = U
            eval_artifacts['V'] = V
            eval_artifacts['n_factors'] = n_factors
            
            # Evaluate on validation set
            eval_start = time.time()
            metrics = evaluate_model_based(
                cat, eval_artifacts,
                k_values=[10, 20, 50],
                split="valid",
                sample_users=n_eval_tune
            )
            eval_time = time.time() - eval_start
            
            # evaluate_model_based returns None when no validation rows remain;
            # such configurations are left out of the results table.
            if metrics:
                results.append({
                    'n_factors': n_factors,
                    'NDCG@10': metrics['ndcg@10'],
                    'NDCG@20': metrics['ndcg@20'],
                    'NDCG@50': metrics['ndcg@50'],
                    'Recall@10': metrics['recall@10'],
                    'Recall@20': metrics['recall@20'],
                    'Recall@50': metrics['recall@50'],
                    'MAP@10': metrics['map@10'],
                    'MAP@20': metrics['map@20'],
                    'MAP@50': metrics['map@50'],
                    'RMSE': metrics['rmse'],
                    'Accuracy': metrics['accuracy'],
                    'Train_Time': train_time,
                    'Eval_Time': eval_time
                })
                
                logger.log_info(f"NDCG@10: {metrics['ndcg@10']:.4f}, "
                              f"Recall@10: {metrics['recall@10']:.4f}, "
                              f"Train: {train_time:.1f}s")
        
        # Check if we got any results
        # Fallback: nothing could be evaluated, so train with the largest
        # allowed factor count and record that as the best value.
        if len(results) == 0:
            logger.log_error("No tuning results! Using max_factors as best.")
            best_factors = max_factors
            
            # Train final model with max_factors
            U_best, V_best, _ = train_svd_model(R, n_factors=max_factors)
            save_model_artifacts(model_dir, R, U_best, V_best,
                               artifacts['user_rev'], artifacts['item_rev'],
                               artifacts['user_idx'], artifacts['item_idx'])
            
            Configurations.save_best_factors(cat, best_factors)
            
            return {'tuned_now': True, 'best_factors': best_factors}
        
        # Save tuning results
        df_results = pl.DataFrame(results)
        df_results.write_csv(MODELS_DIR / 'model' / f'tuning_{cat}.csv')
        logger.log_info(f"\nSaved tuning results to: tuning_{cat}.csv")
        
        # Select best n_factors
        best_factors = select_best_factors(df_results)
        
        # Log final selection
        final_row = df_results.filter(pl.col('n_factors') == best_factors).row(0, named=True)
        
        logger.log_info(f"\n{'='*70}")
        logger.log_info(f"BEST N_FACTORS SELECTED: {best_factors}")
        logger.log_info(f"{'='*70}")
        logger.log_info(f"  NDCG@10:   {final_row['NDCG@10']:.4f}")
        logger.log_info(f"  Recall@10: {final_row['Recall@10']:.4f}")
        logger.log_info(f"  MAP@10:    {final_row['MAP@10']:.4f}")
        logger.log_info(f"  RMSE:      {final_row['RMSE']:.4f}")
        logger.log_info(f"{'='*70}\n")
        
        # Save best n_factors to configuration
        Configurations.save_best_factors(cat, best_factors)
        logger.log_info(f"Saved best n_factors to configuration\n")
        
        # Generate visualization
        logger.log_info("Generating hyperparameter tuning plot...")
        visualize_hyperparameter_tuning(
            df_results,
            category=cat,
            param_col='n_factors',
            param_name='n_factors (Latent Dimensions)',
            save_dir=MODELS_DIR / 'model',
            algo_name='Model-Based'
        )
        # NOTE(review): the file name in this message is assumed, not returned
        # by visualize_hyperparameter_tuning - verify against that helper.
        logger.log_info(f"Saved: factors_tuning_{cat}.png\n")
        
        # Re-train with best n_factors and save
        # (overwrites the base model written in step 1)
        logger.log_info(f"Re-training final model with n_factors={best_factors}...")
        U_best, V_best, _ = train_svd_model(R, n_factors=best_factors)
        save_model_artifacts(model_dir, R, U_best, V_best,
                           artifacts['user_rev'], artifacts['item_rev'],
                           artifacts['user_idx'], artifacts['item_idx'])
        logger.log_info(f"Final model saved to {model_dir}\n")
        
        return {'tuned_now': True, 'best_factors': best_factors}
    
    else:
        # Tuning already done - load from configuration
        best_factors = Configurations.load_best_factors(cat)
        logger.log_info("STEP 2: TUNING ALREADY DONE")
        logger.log_info("-"*70)
        logger.log_info(f"Best n_factors (loaded): {best_factors}\n")
        
        return {'tuned_now': False, 'best_factors': best_factors}

#### Phase 1: Training + Tuning

In [None]:
# ============================================================================
# PHASE 1: TRAINING + TUNING ALL CATEGORIES
# ============================================================================

# Phase 1 driver: train (and, where still missing, tune) every category.
_PHASE1_RULE = "=" * 70

logger.log_info("\n" + _PHASE1_RULE)
logger.log_info("PHASE 1: TRAINING + TUNING ALL CATEGORIES (MODEL-BASED)")
logger.log_info(_PHASE1_RULE + "\n")

# The candidate grid is only (re)read and announced when the first category
# has not been tuned yet; otherwise the value from the setup cell is reused.
_first_category_tuned = Configurations.has_tuning_results_model(CATEGORY[0])
if not _first_category_tuned:
    FACTORS_VALUES = Configurations.FACTORS_VALUES  # [10, 20, 30, 50, 75, 100]
    logger.log_info(f"n_factors to test: {FACTORS_VALUES}\n")

# category -> {'tuned_now': bool, 'best_factors': ...}
workflow_results = {}

for cat in CATEGORY:
    logger.log_info(f"\n{'='*70}\nCATEGORY: {cat}\n{'='*70}\n")

    model_dir = MODELS_DIR / "model" / cat
    workflow_results[cat] = _train_single_category(
        cat,
        model_dir,
        FACTORS_VALUES,
        Configurations.get_eval_samples_tuning(),
    )

logger.log_info("\n" + _PHASE1_RULE)
logger.log_info("PHASE 1 COMPLETE: ALL MODELS TRAINED AND TUNED")
logger.log_info(_PHASE1_RULE + "\n")

logger.log_info("Tuning Summary:")
for cat in CATEGORY:
    if workflow_results[cat]['tuned_now']:
        status = 'newly tuned'
    else:
        status = 'loaded from cache'
    logger.log_info(f"  {cat}: n_factors={workflow_results[cat]['best_factors']} ({status})")

logger.log_info("\n" + _PHASE1_RULE)
logger.log_info("Ready for Phase 2: Final Evaluation")
logger.log_info(_PHASE1_RULE + "\n")

#### Phase 2: Final Evaluation

In [None]:
# ============================================================================
# PHASE 2: FINAL EVALUATION ON TEST SET (ALL CATEGORIES)
# ============================================================================
# For every category: load the saved model artifacts, evaluate them on the
# test split, then persist the combined results, draw the plots and log a
# summary.

logger.log_info("\n" + "="*70)
logger.log_info("PHASE 2: FINAL EVALUATION ON TEST SET (MODEL-BASED)")
logger.log_info("="*70 + "\n")

# ============================================================================
# LOAD WORKFLOW RESULTS
# ============================================================================

# When the Phase 1 cell has not been run in this session there is no
# `workflow_results` yet; rebuild the per-category entries from the stored
# best n_factors instead.
if 'workflow_results' not in locals():
    workflow_results = {}
    for cat in CATEGORY:
        best_factors = Configurations.load_best_factors(cat)
        workflow_results[cat] = {'best_factors': best_factors, 'tuned_now': False}
    logger.log_info("Loaded best n_factors from configuration\n")

n_eval_final = Configurations.get_eval_samples_final()
logger.log_info(f"Test users per category: {n_eval_final}\n")

# ============================================================================
# RUN TEST EVALUATION FOR ALL CATEGORIES
# ============================================================================

for cat in CATEGORY:
    logger.log_info(f"\n{'='*70}\nTESTING: {cat}\n{'='*70}\n")

    model_dir = MODELS_DIR / "model" / cat
    best_factors = workflow_results[cat]['best_factors']
    logger.log_info(f"Using n_factors: {best_factors}")

    # Load model
    final_artifacts = load_model_artifacts(model_dir)

    logger.log_info("Evaluating on test set...\n")
    results = evaluate_model_based(cat, final_artifacts, k_values=[10, 20, 50],
                                  split="test", sample_users=n_eval_final)

    # A falsy result leaves the category without 'test_results'; every later
    # section checks for that key before using it.
    if results:
        workflow_results[cat]['test_results'] = results
        logger.log_info(f"\nTest Results (n_factors={best_factors}):")
        logger.log_info(f"  NDCG@10: {results['ndcg@10']:.4f}, "
                       f"Recall@10: {results['recall@10']:.4f}, "
                       f"MAP@10: {results['map@10']:.4f}")
        logger.log_info(f"  RMSE: {results['rmse']:.4f}, "
                       f"Accuracy: {results['accuracy']:.4f}\n")

# ============================================================================
# SAVE FINAL RESULTS
# ============================================================================

logger.log_info("\n" + "="*70)
logger.log_info("SAVING FINAL RESULTS")
logger.log_info("="*70 + "\n")

test_results_list = [workflow_results[cat]['test_results']
                     for cat in CATEGORY
                     if 'test_results' in workflow_results[cat]]

# `df_final_results` only exists when at least one category was evaluated;
# the post-analysis loop below relies on the same 'test_results' check.
if test_results_list:
    df_final_results = pl.DataFrame(test_results_list)

    logger.log_info("Final Test Results:")
    display(df_final_results)

    out_csv = MODELS_DIR / 'model' / 'final_test_results.csv'
    df_final_results.write_csv(out_csv)
    logger.log_info(f"\nSaved: {out_csv}")

    logger.log_info("Generating final evaluation plot...")
    visualize_final_results(
                            test_results_list,
                            save_dir=MODELS_DIR / 'model',
                            algo_name='Model-Based',
                            k_values=[10, 20, 50]
                        )
    logger.log_info("Saved: evaluation_results.png\n")

# ============================================================================
# POST-ANALYSIS VISUALIZATION
# ============================================================================

logger.log_info("\n" + "="*70)
logger.log_info("POST-ANALYSIS VISUALIZATION")
logger.log_info("="*70 + "\n")

for cat in CATEGORY:
    tuning_csv = MODELS_DIR / 'model' / f'tuning_{cat}.csv'

    if not tuning_csv.exists():
        logger.log_info(f"No tuning results for {cat}, skipping\n")
        continue

    if 'test_results' not in workflow_results[cat]:
        continue

    logger.log_info(f"Generating Val vs Test comparison for {cat}...")

    df_tuning = pl.read_csv(tuning_csv)
    best_factors = workflow_results[cat]['best_factors']

    tuning_match = df_tuning.filter(pl.col('n_factors') == best_factors)
    final_match = df_final_results.filter(pl.col('category') == cat)

    # `.row(0)` raises on an empty frame. A tuning CSV that has no row for the
    # current best n_factors (or a category label that does not match) must
    # not abort the remaining plots and the workflow summary.
    if tuning_match.height == 0 or final_match.height == 0:
        logger.log_warning(
            f"No matching tuning/test row for {cat} "
            f"(n_factors={best_factors}), skipping comparison\n"
        )
        continue

    tuning_row = tuning_match.row(0, named=True)
    final_row = final_match.row(0, named=True)

    visualize_val_test_comparison(
                                    cat=cat,
                                    param_val=best_factors,
                                    tuning_row=tuning_row,
                                    final_row=final_row,
                                    save_dir=MODELS_DIR / 'model',
                                    param_name='n_factors',
                                    algo_name='Model-Based'
                                )
    logger.log_info(f"  Saved: val_vs_test_{cat}.png\n")

# ============================================================================
# FINAL WORKFLOW SUMMARY
# ============================================================================

logger.log_info("\n" + "="*70)
logger.log_info("COMPLETE WORKFLOW SUMMARY (MODEL-BASED)")
logger.log_info("="*70)

for cat in CATEGORY:
    logger.log_info(f"\n{cat}: n_factors={workflow_results[cat]['best_factors']}")
    if 'test_results' in workflow_results[cat]:
        test = workflow_results[cat]['test_results']
        logger.log_info(f"  NDCG@10: {test['ndcg@10']:.4f}, "
                       f"Recall@10: {test['recall@10']:.4f}, "
                       f"MAP@10: {test['map@10']:.4f}")

logger.log_info("\n" + "="*70)
logger.log_info("ALL PHASES COMPLETE")
logger.log_info("="*70)
logger.log_info("\nGenerated files:")
logger.log_info("  Phase 1: tuning_[category].csv, factors_tuning_[category].png")
logger.log_info("  Phase 2: final_test_results.csv, evaluation_results.png, val_vs_test_[category].png")
logger.log_info("="*70 + "\n")

#### Debug info

In [None]:
def check_model_quality(category: str):
    """Log dataset statistics and saved-model diagnostics for one category.

    Loads the train/valid/test splits through `load_5core_data` and logs
    rating, user and item counts plus sparsity for each. If the model
    directory ``MODELS_DIR / 'model' / category`` exists, the saved artifacts
    are loaded and the factor shapes, factor magnitudes, reconstruction error
    on the stored entries of ``R`` and coverage figures are logged as well.

    Args:
        category: Category name used to locate both the data splits and the
            model directory.

    Returns:
        None. All output goes to the logger.
    """
    logger.log_info(f"\n{'='*70}")
    logger.log_info(f"MODEL QUALITY CHECK: {category}")
    logger.log_info(f"{'='*70}\n")

    # Load data
    df_train = load_5core_data(category, split='train')
    df_valid = load_5core_data(category, split='valid')
    df_test = load_5core_data(category, split='test')

    def get_stats(df, name):
        """Log and return rating/user/item counts and sparsity of one split."""
        n_ratings = len(df)
        n_users = df['user_id'].n_unique()
        n_items = df['parent_asin'].n_unique()
        n_cells = n_users * n_items
        # An empty split has no user-item cells; report it as fully sparse
        # instead of dividing by zero.
        sparsity = 1 - (n_ratings / n_cells) if n_cells else 1.0

        logger.log_info(f"{name}:")
        logger.log_info(f"  Ratings: {n_ratings:7,}")
        logger.log_info(f"  Users:   {n_users:7,}")
        logger.log_info(f"  Items:   {n_items:7,}")
        logger.log_info(f"  Sparsity: {sparsity:6.2%}\n")

        return {'n_ratings': n_ratings, 'n_users': n_users,
                'n_items': n_items, 'sparsity': sparsity}

    logger.log_info("Dataset Statistics:")
    logger.log_info("-" * 70)
    train_stats = get_stats(df_train, "TRAIN")
    # VALID/TEST are logged only; their stats are not used further.
    get_stats(df_valid, "VALID")
    get_stats(df_test, "TEST")

    # Load model
    model_dir = MODELS_DIR / 'model' / category
    if model_dir.exists():
        logger.log_info("-" * 70)
        logger.log_info("Model Analysis:")

        artifacts = load_model_artifacts(model_dir)
        U, V = artifacts['U'], artifacts['V']
        R = artifacts['R']

        logger.log_info(f"  U (user factors): {U.shape}")
        logger.log_info(f"  V (item factors): {V.shape}")
        logger.log_info(f"  n_factors: {U.shape[1]}")

        # Analyze factor magnitudes
        logger.log_info("\nFactor Statistics:")
        logger.log_info(f"  U magnitude: mean={np.abs(U).mean():.4f}, std={np.abs(U).std():.4f}")
        logger.log_info(f"  V magnitude: mean={np.abs(V).mean():.4f}, std={np.abs(V).std():.4f}")

        # Reconstruction error on the training matrix. Coordinates and values
        # come from the same COO view so they stay aligned even when R stores
        # explicit zeros (those are dropped, matching `R.nonzero()`).
        coo = R.tocoo()
        observed = coo.data != 0
        rows, cols = coo.row[observed], coo.col[observed]
        actual_vals = coo.data[observed]

        if actual_vals.size == 0:
            logger.log_warning("  No observed ratings in R; skipping reconstruction error")
        else:
            # Predict only the observed cells (row-wise dot products) rather
            # than building the dense users x items product; work in chunks
            # so the gathered factor rows stay small.
            n_observed = actual_vals.shape[0]
            pred_vals = np.empty(n_observed, dtype=np.result_type(U.dtype, V.dtype))
            chunk_size = 500_000
            for start in range(0, n_observed, chunk_size):
                stop = start + chunk_size
                pred_vals[start:stop] = np.einsum(
                    'ij,ij->i', U[rows[start:stop]], V[cols[start:stop]]
                )

            residuals = actual_vals - pred_vals
            train_rmse = np.sqrt(np.mean(residuals ** 2))
            logger.log_info("\nTraining Set Reconstruction:")
            logger.log_info(f"  RMSE: {train_rmse:.4f}")
            logger.log_info(f"  MAE:  {np.mean(np.abs(residuals)):.4f}")

        # Sparsity info
        logger.log_info("\nCoverage:")
        logger.log_info(f"  Model users: {len(artifacts['user_idx']):,}")
        logger.log_info(f"  Model items: {len(artifacts['item_idx']):,}")
        # NOTE(review): this is the sparsity of the TRAIN dataframe, not of R
        # itself; presumably R was built from the same split. Confirm.
        logger.log_info(f"  Matrix sparsity: {train_stats['sparsity']:.2%}")

    logger.log_info(f"\n{'='*70}\n")

# Run the quality check for the first configured category only
check_model_quality(CATEGORY[0])

### Task: Unit test

#### UI Recommendation Test

In [29]:
def recommend_model_ui(user_id: str, n_recs: int = 5, models_dir: str | Path | None = None, category: str | None = None) -> pl.DataFrame:
    """Load a category's saved artifacts and return recommendations for a user.

    Args:
        user_id: User to recommend for.
        n_recs: Number of recommendations requested.
        models_dir: Directory holding the model artifacts. When falsy, the
            default ``MODELS_DIR / "model" / <category>`` is used.
        category: Category name; falls back to ``CATEGORY[0]`` when falsy.

    Returns:
        Whatever `recommend_model_based` returns for the loaded artifacts.
    """
    resolved_category = category or CATEGORY[0]

    # An explicit directory wins over the per-category default location.
    if models_dir:
        artifact_dir = Path(models_dir)
    else:
        artifact_dir = MODELS_DIR / "model" / resolved_category

    loaded = load_model_artifacts(artifact_dir)
    return recommend_model_based(user_id, n_recs, loaded)

def unit_test_ui_model_recommend(user_id: str, n_recs: int = 5, models_dir: str | Path | None = None, category: str | None = None):
    """Call `recommend_model_ui` for one user and sanity-check the result.

    Checks that the result exposes the ``parent_asin`` and ``score`` columns
    and holds at most ``n_recs`` rows, then displays and returns it.

    Args:
        user_id: User to recommend for.
        n_recs: Upper bound on the number of returned rows.
        models_dir: Model directory; defaults to
            ``MODELS_DIR / "model" / <category>`` when falsy.
        category: Category name; falls back to ``CATEGORY[0]`` when falsy.

    Returns:
        The recommendations returned by `recommend_model_ui`.

    Raises:
        AssertionError: If a required column is missing or too many rows
            come back.
    """
    resolved_category = category or CATEGORY[0]
    target_dir = models_dir or (MODELS_DIR / "model" / resolved_category)
    logger.log_info(f"[UnitTest-UI-MODEL] model_dir={target_dir} | user_id={user_id} | n_recs={n_recs}")

    recs = recommend_model_ui(
        user_id=user_id,
        n_recs=n_recs,
        models_dir=target_dir,
        category=resolved_category,
    )

    # Both required columns must be present in the returned frame.
    absent = {"parent_asin", "score"} - set(recs.columns)
    assert not absent, "recs missing required columns"

    n_returned = len(recs)
    assert n_returned <= n_recs, f"recs length should be <= {n_recs}"

    logger.log_info(f"[UnitTest-UI-MODEL] returned {n_returned} items")
    display(recs)
    return recs

#### Test All Categories

In [None]:
def test_all_categories():
    """Unit test: Verify recommendation function works for all categories.

    For each entry of ``CATEGORY`` the saved artifacts are loaded, the user
    at position/key 0 of ``user_rev`` is picked, and `recommend_model_ui` is
    asked for ``N_RECS`` recommendations. A category passes when the result
    has the ``parent_asin`` and ``score`` columns, holds at most ``N_RECS``
    rows and contains at least one non-null score. Any exception raised while
    testing a category is logged and recorded as a failure for that category
    only, so the remaining categories still run.

    Returns:
        None. Per-category outcomes are logged and displayed as a table.
    """
    logger.log_info("\n" + "="*70)
    logger.log_info("[UNIT TEST] Testing Recommendation Function (Model-Based)")
    logger.log_info("="*70 + "\n")

    test_summary = []

    for cat in CATEGORY:
        logger.log_info(f"\n[Test] {cat}")
        logger.log_info("-"*70)

        try:
            model_dir = MODELS_DIR / "model" / cat

            if not model_dir.exists():
                logger.log_warning("  Model not found")
                test_summary.append({'category': cat, 'status': 'FAIL', 'reason': 'Model not found'})
                continue

            # Load artifacts
            artifacts = load_model_artifacts(model_dir)
            best_factors = Configurations.load_best_factors(cat)

            user_rev = artifacts['user_rev']
            item_rev = artifacts['item_rev']
            U, V = artifacts['U'], artifacts['V']

            logger.log_info(f"  Model loaded: {len(user_rev):,} users, {len(item_rev):,} items")
            logger.log_info(f"  n_factors: {U.shape[1]}")
            logger.log_info(f"  Best n_factors: {best_factors}")

            if len(user_rev) == 0:
                logger.log_warning("  No users in model")
                test_summary.append({'category': cat, 'status': 'FAIL', 'reason': 'No users'})
                continue

            # Test recommendation
            sample_user = user_rev[0]
            logger.log_info(f"  Testing user: {sample_user}")

            recs = recommend_model_ui(sample_user, n_recs=N_RECS, category=cat)

            # Validate output
            assert set(recs.columns) >= {"parent_asin", "score"}, "Missing required columns"
            assert len(recs) <= N_RECS, f"Too many recommendations: {len(recs)}"

            logger.log_info(f"  Generated {len(recs)} recommendations")

            # min()/max() are None when there are no (non-null) scores; the
            # ':.4f' format and float() below would then raise a TypeError
            # that hides the real cause, so report it explicitly instead.
            score_min = recs['score'].min()
            score_max = recs['score'].max()
            if score_min is None or score_max is None:
                logger.log_warning("  No scored recommendations returned")
                test_summary.append({'category': cat, 'status': 'FAIL',
                                     'reason': 'No scored recommendations returned'})
                continue

            logger.log_info(f"  Score range: [{score_min:.4f}, {score_max:.4f}]")

            test_summary.append({
                'category': cat,
                'status': 'PASS',
                'n_recs': len(recs),
                'n_factors': U.shape[1],
                'score_min': float(score_min),
                'score_max': float(score_max)
            })

            display(recs.head(5))

        except Exception as e:
            # Per-category boundary: log the traceback and keep testing the
            # remaining categories.
            logger.log_exception(f"  Error: {e}")
            test_summary.append({'category': cat, 'status': 'FAIL', 'reason': str(e)})

    # Summary
    logger.log_info("\n" + "="*70)
    logger.log_info("UNIT TEST SUMMARY")
    logger.log_info("="*70)

    df_summary = pl.DataFrame(test_summary)
    display(df_summary)

    passed = sum(1 for r in test_summary if r['status'] == 'PASS')
    total = len(test_summary)

    logger.log_info(f"\nResults: {passed}/{total} categories passed")

    if passed == total:
        logger.log_info("ALL TESTS PASSED")
    else:
        logger.log_warning(f"{total - passed} tests failed")

    logger.log_info("="*70 + "\n")

# Run the recommendation smoke test for every category in CATEGORY
test_all_categories()