# Item-Based Collaborative Filtering (5-core • TRAIN)

**Goal**
- Build an item-based CF recommender on **5-core / TRAIN** for a given category.
- Use sparse matrices + cosine similarity for scalability.
- Produce Top-N recommendations for one user or a batch of users.

**What this notebook does**
1. Load 5-core **TRAIN** from `PROCESSED_DIR` with schema: `user_id`, `parent_asin`, `rating`, `timestamp`, `history`
2. Build a **user-item** sparse matrix (CSR).
3. Compute **item-item similarity** matrix using cosine similarity.
4. Predict scores for **unseen items** and generate **Top-N** recommendations.
5. Evaluate using TEST/VALID sets: Accuracy, RMSE, Recall@K, NDCG@K, MAP@K.
6. (Optional) Save recommendations to disk for UI integration.

> Notes:
> - We compute item-item similarity on the item-user matrix (transpose of R).
> - Ratings may be mean-centered per user (optional).
> - Predictions are a similarity-weighted average of the user's own ratings on the top-K most similar items (only positive similarities are used).

### Task: Import modules and libraries

In [None]:
import os, sys, json, pickle, time
import numpy as np, polars as pl
from pathlib import Path
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

module_path = os.path.abspath(os.path.join('..', '../utilities'))
sys.path.append(module_path)

from logger import Logger
from configurations import Configurations
from visualization_helpers import (
    visualize_hyperparameter_tuning,
    visualize_final_results,
    visualize_val_test_comparison
)
from evaluation_metrics import (
       compute_rmse_accuracy,
       recall_at_k,
       ndcg_at_k,
       map_at_k
)

# Notebook-wide setup: logger, data/model directories, workflow phase
# detection, and the model settings used by the cells below.
m_log_file = Configurations.LOG_PATH
logger = Logger(process_name="item_based", log_file=m_log_file)

# Directory roots, all taken from the shared Configurations object.
PROCESSED_DIR = Path(Configurations.DATA_PROCESSED_PATH)
RAW_DIR = Path(Configurations.DATA_RAW_PATH)
MODELS_DIR = Path(Configurations.MODELS_PATH)

# ============================================================================
# WORKFLOW CONFIGURATION
# ============================================================================

# CATEGORY is indexed and iterated below, so it is a sequence of category names.
CATEGORY = Configurations.CATEGORIES

# Auto-detect phase
# NOTE(review): only the first category is inspected; presumably all
# categories are tuned together — confirm.
has_tuning = Configurations.has_tuning_results_item(CATEGORY[0]) 

# Candidate values for the number of similar items (K) tried during tuning.
K_VALUES = Configurations.K_VALUES_ITEM
if has_tuning:
    # Tuning results already exist: report the stored best K per category.
    PHASE = 'final'
    logger.log_info("="*70)
    logger.log_info("PHASE: FINAL EVALUATION (ITEM-BASED)")
    logger.log_info("="*70)
    
    for cat in CATEGORY:
        best_k = Configurations.load_best_k_item(cat)
        logger.log_info(f"  {cat}: Best K = {best_k}")
else:
    # No tuning results yet: the notebook will train and tune.
    PHASE = 'train_tune'
    logger.log_info("="*70)
    logger.log_info("PHASE: TRAINING + TUNING (ITEM-BASED)")
    logger.log_info("="*70)
    logger.log_info(f"K values: {K_VALUES}")

logger.log_info("="*70)
logger.log_info(f"Categories: {CATEGORY}")
logger.log_info(f"Sample size: {Configurations.DEV_SAMPLE_SIZE}")
logger.log_info("="*70 + "\n")

# Settings
# MEAN_CENTER, MAX_USERS and MAX_ITEMS are forwarded to build_item_model by
# the training code in this notebook (None means "no cap").
# NOTE(review): N_RECS and TOP_K_SIMILAR are not referenced by the code
# visible in this section — presumably used by later cells; confirm.
N_RECS = 10
MEAN_CENTER = True
MAX_USERS = None
MAX_ITEMS = None
TOP_K_SIMILAR = 30

### Task: Define functions for CF recommendation

#### Data Loader

In [2]:
def _candidate_files(category: str, split: str = "train"):
    """Return the parquet path of the 5-core ``split`` file for ``category``.

    Any '/' in the category name is replaced by '-' to form the file name.
    When ``Configurations.DEV_SAMPLE_SIZE`` is ``'full'`` the un-sampled file
    is used; otherwise the sample-size name is inserted before the
    ``.parquet`` suffix.

    Args:
        category: Category name (may contain '/').
        split: Data split name used in the file name (default ``"train"``).

    Returns:
        Path under ``PROCESSED_DIR`` of the requested file.

    Raises:
        ValueError: If ``DEV_SAMPLE_SIZE`` is neither ``'full'`` nor a key of
            ``Configurations.SAMPLE_SIZES``. Previously this case fell through
            and returned ``None``, which only failed later inside the reader.
    """
    safe_category = category.replace('/', '-')
    dev_sample_size = Configurations.DEV_SAMPLE_SIZE

    if dev_sample_size == 'full':
        return PROCESSED_DIR / f"{safe_category}.5core.{split}.parquet"

    sample_sizes = Configurations.SAMPLE_SIZES
    if dev_sample_size not in sample_sizes:
        raise ValueError(
            f"Unknown DEV_SAMPLE_SIZE {dev_sample_size!r}; "
            f"expected 'full' or one of {list(sample_sizes)}"
        )
    return PROCESSED_DIR / f"{safe_category}.5core.{split}.{dev_sample_size}.parquet"

def load_5core_data(category: str, split: str = "train") -> pl.DataFrame:
    """Read the 5-core parquet file for ``category``/``split`` and log its size.

    The file location comes from ``_candidate_files``. The returned frame is a
    clone of the one that was read. Shape, distinct users and distinct items
    are written to the log.
    """
    parquet_path = _candidate_files(category, split)
    frame = pl.read_parquet(parquet_path, low_memory=False).clone()
    n_users = frame['user_id'].n_unique()
    n_items = frame['parent_asin'].n_unique()
    logger.log_info(f"[Load-{split.upper()}] shape={frame.shape} | users={n_users} | items={n_items}")
    return frame

#### Build Item-Based Model

In [3]:
def build_item_model(df_train: pl.DataFrame, mean_center: bool = True, 
                    max_users: int | None = None, max_items: int | None = None,
                    min_similarity: float = 0.01):
    """Build the user-item rating matrix and the item-item cosine similarity.

    Args:
        df_train: Frame with at least ``user_id``, ``parent_asin``, ``rating``.
        mean_center: When True, subtract each user's mean rating from that
            user's stored ratings before computing similarities.
        max_users: Optional cap on the number of distinct users kept.
        max_items: Optional cap on the number of distinct items kept.
        min_similarity: Similarities below this value are dropped (only when
            the threshold is > 0).

    Returns:
        Tuple ``(R, Rc, user_idx, item_idx, user_rev, item_rev, user_means,
        item_similarity)`` where ``R`` is the CSR rating matrix, ``Rc`` the
        mean-centered copy (``None`` if ``mean_center`` is False), the two
        ``*_idx`` dicts map ids to row/column positions, the two ``*_rev``
        object arrays map positions back to ids, ``user_means`` holds the
        per-user mean (zeros if not centered) and ``item_similarity`` is the
        sparse item-item cosine similarity.
    """
    ratings = df_train.select(['user_id', 'parent_asin', 'rating']).with_columns(
        pl.col('rating').cast(pl.Float32)
    )

    # Optional caps: keep only the first `max_*` distinct ids that unique() yields.
    if max_users is not None:
        kept_users = ratings['user_id'].unique()[:max_users].to_list()
        ratings = ratings.filter(pl.col('user_id').is_in(kept_users))
    if max_items is not None:
        kept_items = ratings['parent_asin'].unique()[:max_items].to_list()
        ratings = ratings.filter(pl.col('parent_asin').is_in(kept_items))

    # Position <-> id lookups for users (rows) and items (columns).
    user_rev = ratings['user_id'].unique().to_list()
    item_rev = ratings['parent_asin'].unique().to_list()
    user_idx = {uid: pos for pos, uid in enumerate(user_rev)}
    item_idx = {asin: pos for pos, asin in enumerate(item_rev)}
    n_users, n_items = len(user_rev), len(item_rev)

    # COO triplets -> CSR rating matrix.
    rows = np.array([user_idx[uid] for uid in ratings['user_id'].to_list()], dtype=np.int32)
    cols = np.array([item_idx[asin] for asin in ratings['parent_asin'].to_list()], dtype=np.int32)
    vals = np.array(ratings['rating'].to_list(), dtype=np.float32)
    R = csr_matrix((vals, (rows, cols)), shape=(n_users, n_items), dtype=np.float32)

    user_means = np.zeros(n_users, dtype=np.float32)
    Rc = None
    if mean_center:
        Rc = R.copy().astype(np.float32)
        rating_totals = np.array(R.sum(axis=1)).ravel().astype(np.float32)
        ratings_per_user = np.diff(R.indptr).astype(np.int32)
        # Users without stored ratings get a mean of 0 instead of a 0/0 warning.
        with np.errstate(divide='ignore', invalid='ignore'):
            user_means = np.where(
                ratings_per_user > 0, rating_totals / ratings_per_user, 0.0
            ).astype(np.float32)
        if Rc.nnz:
            # CSR data is laid out row by row, so repeating each mean by the
            # row's entry count lines it up with that row's stored values.
            Rc.data -= np.repeat(user_means, ratings_per_user)

    X = R if Rc is None else Rc
    logger.log_info(f"[Item-Similarity] Computing cosine similarity for {n_items} items...")
    # Items are the rows of X.T, so this yields an item-item matrix.
    item_similarity = cosine_similarity(X.T, dense_output=False)

    if min_similarity > 0:
        too_small = item_similarity.data < min_similarity
        item_similarity.data[too_small] = 0
        item_similarity.eliminate_zeros()
        logger.log_info(f"[Item-Similarity] Applied threshold {min_similarity}, "
                        f"kept {item_similarity.nnz:,} similarities")

    user_rev_arr = np.array(user_rev, dtype=object)
    item_rev_arr = np.array(item_rev, dtype=object)
    logger.log_info(f"[Item-Model] R{R.shape} nnz={R.nnz} | Similarity{item_similarity.shape}")
    return R, Rc, user_idx, item_idx, user_rev_arr, item_rev_arr, user_means, item_similarity

#### Prediction & Recommendation

In [4]:
def predict_item_based(user_idx_val: int, R: csr_matrix, Rc: csr_matrix | None, 
                      item_similarity: csr_matrix, user_means: np.ndarray, 
                      top_k: int = 30) -> np.ndarray:
    X = Rc if Rc is not None else R
    user_ratings = X.getrow(user_idx_val).toarray().ravel()
    rated_items = np.nonzero(R.getrow(user_idx_val).toarray().ravel())[0]
    
    if len(rated_items) == 0:
        return np.zeros(R.shape[1], dtype=np.float32)
    
    rated_ratings = user_ratings[rated_items]
    n_items = R.shape[1]
    scores = np.zeros(n_items, dtype=np.float32)
    
    # Get similarity matrix: all items vs rated items
    sim_matrix = item_similarity[:, rated_items].toarray()
    
    for i in range(n_items):
        sims = sim_matrix[i, :]
        
        # Only use POSITIVE similarities
        positive_mask = sims > 0
        n_positive = positive_mask.sum()
        
        # SAFETY CHECK 1: Must have positive similarities
        if n_positive == 0:
            continue
        
        sims_positive = sims[positive_mask]
        ratings_positive = rated_ratings[positive_mask]
        
        # SAFETY CHECK 2: Validate arrays match
        assert len(sims_positive) == len(ratings_positive), \
            f"Length mismatch: sims={len(sims_positive)}, ratings={len(ratings_positive)}"
        
        # Select top-K
        k_use = min(top_k, n_positive)
        
        # 🔥 SAFETY CHECK 3: k_use must be valid
        if k_use <= 0:
            continue
        
        if k_use < n_positive:
            # Need to select top-K
            # FIX: Use k_use-1 as kth parameter for argpartition
            if k_use == 1:
                top_idx = np.array([np.argmax(sims_positive)])
            else:
                top_idx = np.argpartition(-sims_positive, min(k_use-1, len(sims_positive)-1))[:k_use]
        else:
            # Use all positive similarities
            top_idx = np.arange(n_positive)
        
        # SAFETY CHECK 4: Validate indices
        if len(top_idx) == 0:
            continue
        
        # Get final similarities and ratings
        final_sims = sims_positive[top_idx]
        final_ratings = ratings_positive[top_idx]
        
        # SAFETY CHECK 5: Must have valid data
        if len(final_sims) == 0 or len(final_ratings) == 0:
            continue
        
        # Weighted average
        sim_sum = np.sum(final_sims)
        if sim_sum > 1e-8:
            scores[i] = np.dot(final_sims, final_ratings) / sim_sum
    
    # Add back user mean if centered
    if Rc is not None:
        scores = scores + user_means[user_idx_val]
    
    return scores


def recommend_item_based(user_id: str, n_recs: int, artifacts: dict) -> pl.DataFrame:
    """Return the top-``n_recs`` unseen items for ``user_id``.

    ``artifacts`` must provide ``R``, ``user_idx``, ``item_rev``,
    ``user_means`` and ``item_similarity``; ``Rc`` and ``top_k_similar``
    (default 30) are optional. Items the user already has an entry for in
    ``R`` are excluded. The result has columns ``parent_asin`` and ``score``,
    ordered by descending score, and is empty (with a warning logged) when the
    user is unknown or has no candidate items.
    """
    R = artifacts['R']
    Rc = artifacts.get('Rc')
    user_idx = artifacts['user_idx']
    item_rev = artifacts['item_rev']
    user_means = artifacts['user_means']
    item_similarity = artifacts['item_similarity']
    top_k = artifacts.get('top_k_similar', 30)

    if user_id not in user_idx:
        logger.log_warning(f"[Recommend] user_id={user_id} not found.")
        return pl.DataFrame({"parent_asin": [], "score": []})

    row = user_idx[user_id]
    all_scores = predict_item_based(row, R, Rc, item_similarity, user_means, top_k=top_k)

    # Candidate items = every column without a stored rating for this user.
    is_candidate = np.ones(R.shape[1], dtype=bool)
    is_candidate[R.getrow(row).indices] = False
    cand_indices = np.nonzero(is_candidate)[0]
    cand_scores = all_scores[is_candidate]

    if cand_scores.size == 0:
        logger.log_warning(f"[Recommend] No candidate items for user {user_id}")
        return pl.DataFrame({"parent_asin": [], "score": []})

    # Partial selection of the best n_top candidates, then a full sort of
    # just those by descending score.
    n_top = min(n_recs, cand_scores.size)
    best_positions = np.argpartition(-cand_scores, n_top - 1)[:n_top]
    ranked = sorted(
        ((int(cand_indices[pos]), float(cand_scores[pos])) for pos in best_positions),
        key=lambda pair: -pair[1],
    )

    return pl.DataFrame({
        "parent_asin": [item_rev[col] for col, _ in ranked],
        "score": [score for _, score in ranked],
    })

#### Save/Load Artifacts

In [5]:
def save_item_artifacts(out_dir: Path, R, Rc, user_rev, item_rev, user_idx, item_idx, user_means, item_similarity):
    """Persist all item-based model artifacts into ``out_dir``.

    Files written: ``R.npz``, ``Rc.npz`` (only when ``Rc`` is not None),
    ``item_similarity.npz``, ``user_means.npy``, ``user_rev.pkl``,
    ``item_rev.pkl``, ``user_idx.json`` and ``item_idx.json``. The directory
    is created if missing. JSON keys are stringified ids, values are ints.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # Sparse matrices and the dense mean vector.
    save_npz(out_dir / "R.npz", R)
    if Rc is not None:
        save_npz(out_dir / "Rc.npz", Rc)
    save_npz(out_dir / "item_similarity.npz", item_similarity)
    np.save(out_dir / "user_means.npy", user_means)

    # Position -> id arrays.
    for file_name, payload in (("user_rev.pkl", user_rev), ("item_rev.pkl", item_rev)):
        with open(out_dir / file_name, "wb") as fh:
            pickle.dump(payload, fh)

    # Id -> position lookups.
    for file_name, mapping in (("user_idx.json", user_idx), ("item_idx.json", item_idx)):
        encoded = {str(key): int(pos) for key, pos in mapping.items()}
        (out_dir / file_name).write_text(json.dumps(encoded))

    logger.log_info(f"[Saved-Item] {out_dir}")

def load_item_artifacts(model_dir: str | Path):
    md = Path(model_dir)
    R = load_npz(md / "R.npz")
    Rc = load_npz(md / "Rc.npz") if (md / "Rc.npz").exists() else None
    item_similarity = load_npz(md / "item_similarity.npz")
    user_means = np.load(md / "user_means.npy")
    with open(md / "user_rev.pkl", "rb") as f: user_rev = pickle.load(f)
    with open(md / "item_rev.pkl", "rb") as f: item_rev = pickle.load(f)
    user_idx = {k: int(v) for k, v in json.loads((md / "user_idx.json").read_text()).items()}
    item_idx = {k: int(v) for k, v in json.loads((md / "item_idx.json").read_text()).items()}
    return dict(R=R, Rc=Rc, item_similarity=item_similarity, user_means=user_means, user_rev=user_rev, item_rev=item_rev, user_idx=user_idx, item_idx=item_idx)

### Task: Evaluation

#### Evaluation Pipeline

In [6]:
def evaluate_item_based(category: str, artifacts: dict, k_values: list | None = None, 
                       split: str = "test", sample_users: int = 1000):
    """Evaluate the item-based model on one split of a category.

    Only users present in the training index are evaluated, and at most
    ``sample_users`` of them (sampled with a fixed seed). For each user the
    rating metrics (RMSE / accuracy) are computed on the evaluation items that
    exist in the training item index, and the ranking metrics (Recall, NDCG,
    MAP at every cutoff in ``k_values``) are computed on the ranked list of
    items the user has no training entry for.

    Args:
        category: Category name, forwarded to ``load_5core_data``.
        artifacts: Model dict with ``R``, ``user_idx``, ``item_idx``,
            ``item_rev``, ``user_means``, ``item_similarity`` and optionally
            ``Rc`` and ``top_k_similar`` (default 30).
        k_values: Ranking cutoffs; defaults to ``[10, 20, 50]``.
        split: Split to load (e.g. ``"test"`` or ``"valid"``).
        sample_users: Maximum number of users to evaluate.

    Returns:
        Dict with ``category``, ``split``, ``n_users``, ``rmse``,
        ``accuracy`` and one ``recall@k`` / ``ndcg@k`` / ``map@k`` entry per
        cutoff, or ``None`` when no evaluation rows belong to training users.

    Raises:
        ValueError: If ``k_values`` is empty.
    """
    # A None default avoids sharing one mutable list between calls.
    k_values = [10, 20, 50] if k_values is None else list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one cutoff")

    logger.log_info(f"[Eval-Item] {category} on {split.upper()}")

    df_eval = load_5core_data(category, split=split)
    R, Rc = artifacts['R'], artifacts.get('Rc')
    user_idx, item_idx = artifacts['user_idx'], artifacts['item_idx']
    item_rev = artifacts['item_rev']
    user_means = artifacts['user_means']
    item_similarity = artifacts['item_similarity']
    top_k = artifacts.get('top_k_similar', 30)

    # Loop invariants.
    n_items = R.shape[1]
    max_k = max(k_values)

    # Filter to train users only
    train_user_list = list(user_idx.keys())
    df_eval = df_eval.filter(pl.col('user_id').is_in(train_user_list))

    if len(df_eval) == 0:
        logger.log_warning(f"[Eval-Item] No data after filtering")
        return None

    logger.log_info(f"[Eval-Item] After filtering: {len(df_eval):,} ratings, {df_eval['user_id'].n_unique():,} users")

    # Sample users for evaluation (fixed seed keeps the sample reproducible).
    eval_users = df_eval['user_id'].unique().to_list()
    if len(eval_users) > sample_users:
        np.random.seed(42)
        eval_users = np.random.choice(eval_users, sample_users, replace=False).tolist()

    logger.log_info(f"[Eval-Item] Evaluating {len(eval_users)} users with top_k={top_k}...")

    # Per-user metric values, averaged at the end.
    metrics_acc = {
        'rmse': [], 'accuracy': [],
        **{f'recall@{k}': [] for k in k_values},
        **{f'ndcg@{k}': [] for k in k_values},
        **{f'map@{k}': [] for k in k_values}
    }

    evaluated_users = 0

    for user_id in eval_users:
        if user_id not in user_idx:
            continue

        u = user_idx[user_id]
        user_eval = df_eval.filter(pl.col('user_id') == user_id)
        actual_ratings = dict(zip(user_eval['parent_asin'].to_list(),
                                  user_eval['rating'].to_list()))
        if not actual_ratings:
            continue

        # Items never seen in training cannot be scored or recommended.
        known_items = {asin for asin in actual_ratings if asin in item_idx}
        if not known_items:
            continue

        evaluated_users += 1
        scores = predict_item_based(u, R, Rc, item_similarity, user_means, top_k=top_k)

        # RMSE & Accuracy - only for known items; every other slot stays NaN.
        predictions = np.full(n_items, np.nan)
        actuals = np.full(n_items, np.nan)
        for asin in known_items:
            idx = item_idx[asin]
            predictions[idx] = scores[idx]
            actuals[idx] = actual_ratings[asin]

        rmse, acc = compute_rmse_accuracy(predictions, actuals)
        if not np.isnan(rmse):
            metrics_acc['rmse'].append(rmse)
            metrics_acc['accuracy'].append(acc)

        # Ranking metrics: rank only items without a training entry.
        cand_mask = np.ones(n_items, dtype=bool)
        cand_mask[R.getrow(u).indices] = False

        cand_scores = scores[cand_mask]
        if cand_scores.size == 0:
            continue

        cand_indices = np.nonzero(cand_mask)[0]
        n_top = min(max_k, cand_scores.size)
        top_pos = np.argpartition(-cand_scores, n_top - 1)[:n_top]
        sorted_idx = top_pos[np.argsort(-cand_scores[top_pos])]
        recommended = [item_rev[cand_indices[i]] for i in sorted_idx]

        for k in k_values:
            metrics_acc[f'recall@{k}'].append(recall_at_k(recommended, known_items, k))
            metrics_acc[f'ndcg@{k}'].append(ndcg_at_k(recommended, known_items, k))
            metrics_acc[f'map@{k}'].append(map_at_k(recommended, known_items, k))

    logger.log_info(f"[Eval-Item] Actually evaluated: {evaluated_users} users")

    # Aggregate results
    results = {
        'category': category,
        'split': split,
        'n_users': evaluated_users,
        'rmse': np.mean(metrics_acc['rmse']) if metrics_acc['rmse'] else np.nan,
        'accuracy': np.mean(metrics_acc['accuracy']) if metrics_acc['accuracy'] else np.nan
    }

    for k in k_values:
        for metric in ['recall', 'ndcg', 'map']:
            key = f'{metric}@{k}'
            results[key] = np.mean(metrics_acc[key]) if metrics_acc[key] else 0.0

    # Report the cutoff 10 when it was evaluated, otherwise the smallest one;
    # a hard-coded '@10' key would raise KeyError for other k_values.
    k_ref = 10 if 10 in k_values else min(k_values)
    logger.log_info(f"[Eval-Item] RMSE={results['rmse']:.4f}, Acc={results['accuracy']:.4f}")
    logger.log_info(f"[Eval-Item] NDCG@{k_ref}={results[f'ndcg@{k_ref}']:.4f}, "
                    f"Recall@{k_ref}={results[f'recall@{k_ref}']:.4f}")

    return results

#### Hyperparameter Tuning Functions

In [7]:
def tune_k_similar_single_category(category: str, k_values: list, n_eval_users: int):
    """Tune ``top_k_similar`` for one category on the validation split.

    The model is built once from the training split; every candidate K is
    then evaluated with ``evaluate_item_based`` on ``split="valid"``. The
    per-K metrics are written to ``MODELS_DIR/item/tuning_<category>.csv``.

    Args:
        category: Category name.
        k_values: Candidate numbers of similar items to try.
        n_eval_users: Maximum number of validation users per evaluation.

    Returns:
        ``pl.DataFrame`` with one row per K that produced metrics (columns:
        ``K``, ``NDCG@k`` / ``Recall@k`` / ``MAP@k`` for k in 10, 20, 50,
        ``RMSE``, ``Accuracy``, ``Eval_Time``).
    """
    logger.log_info(f"\n{'='*70}")
    logger.log_info(f"TUNING TOP_K_SIMILAR: {category}")
    logger.log_info(f"{'='*70}")
    logger.log_info(f"K values: {k_values}")
    logger.log_info(f"Validation users: {n_eval_users}\n")

    # Load training data ONCE
    logger.log_info("Loading training data...")
    df_train = load_5core_data(category, split="train")

    # Build base model ONCE (item similarity matrix)
    logger.log_info("Building item-based model...")
    R, Rc, user_idx, item_idx, user_rev, item_rev, user_means, item_similarity = build_item_model(
        df_train, mean_center=MEAN_CENTER, max_users=MAX_USERS, max_items=MAX_ITEMS
    )

    # Shared artifacts; only 'top_k_similar' changes between evaluations.
    base_artifacts = {
        'R': R, 'Rc': Rc,
        'user_means': user_means,
        'user_rev': user_rev,
        'item_rev': item_rev,
        'user_idx': user_idx,
        'item_idx': item_idx,
        'item_similarity': item_similarity
    }

    results = []

    for i, k in enumerate(k_values, 1):
        logger.log_info(f"\n[{i}/{len(k_values)}] Testing K={k}")
        logger.log_info("-"*70)

        # Shallow copy: the matrices are shared, only the K entry differs.
        eval_artifacts = base_artifacts.copy()
        eval_artifacts['top_k_similar'] = k

        start_time = time.time()
        metrics = evaluate_item_based(
            category, eval_artifacts,
            k_values=[10, 20, 50],
            split="valid",
            sample_users=n_eval_users
        )
        eval_time = time.time() - start_time

        if metrics:
            result = {
                'K': k,
                'NDCG@10': metrics['ndcg@10'],
                'NDCG@20': metrics['ndcg@20'],
                'NDCG@50': metrics['ndcg@50'],
                'Recall@10': metrics['recall@10'],
                'Recall@20': metrics['recall@20'],
                'Recall@50': metrics['recall@50'],
                'MAP@10': metrics['map@10'],
                'MAP@20': metrics['map@20'],
                'MAP@50': metrics['map@50'],
                'RMSE': metrics['rmse'],
                'Accuracy': metrics['accuracy'],
                'Eval_Time': eval_time
            }
            results.append(result)

            logger.log_info(f"Results:")
            logger.log_info(f"  NDCG@10:   {metrics['ndcg@10']:.4f}")
            logger.log_info(f"  Recall@10: {metrics['recall@10']:.4f}")
            logger.log_info(f"  MAP@10:    {metrics['map@10']:.4f}")
            logger.log_info(f"  RMSE:      {metrics['rmse']:.4f}")
            logger.log_info(f"  Eval:      {eval_time:.1f}s")

    df_results = pl.DataFrame(results)

    # Save results. Nothing in this function has created the output
    # directory, so make sure it exists before writing.
    out_csv = MODELS_DIR / 'item' / f'tuning_{category}.csv'
    out_csv.parent.mkdir(parents=True, exist_ok=True)
    df_results.write_csv(out_csv)
    logger.log_info(f"\nSaved tuning results: {out_csv}")

    return df_results


def select_best_k(df_results: pl.DataFrame):
    """Pick the best K from tuning results, NDCG@10 first, Recall@10 second.

    The K with the highest ``NDCG@10`` wins outright unless other rows reach
    at least 98% of that score; in that case the row with the highest
    ``Recall@10`` among those near-ties is chosen. Each step is logged.

    Returns:
        The selected value from the ``K`` column.
    """
    ndcg_col = df_results['NDCG@10']
    top_ndcg = ndcg_col.max()
    k_by_ndcg = df_results['K'][ndcg_col.arg_max()]

    logger.log_info(f"\nPrimary metric (NDCG@10): K={k_by_ndcg}, score={top_ndcg:.4f}")

    # Rows whose NDCG@10 is within 2% of the best are treated as ties.
    near_best = df_results.filter(pl.col('NDCG@10') >= top_ndcg * 0.98)
    tied_k = near_best['K'].to_list()

    if len(tied_k) <= 1:
        logger.log_info(f"Clear winner based on NDCG@10: K={k_by_ndcg}")
        return k_by_ndcg

    logger.log_info(f"Multiple K with similar NDCG (within 2%): {tied_k}")

    # Tie-break on Recall@10.
    recall_col = near_best['Recall@10']
    chosen_k = near_best['K'][recall_col.arg_max()]
    top_recall = recall_col.max()
    logger.log_info(f"Secondary metric (Recall@10): K={chosen_k}, score={top_recall:.4f}")
    return chosen_k


def analyze_and_select_k(df_results: pl.DataFrame, category: str):
    """Log a per-metric analysis of the tuning results and persist the best K.

    The best K per metric (NDCG@10, Recall@10, MAP@10, RMSE) is logged, the
    final K is chosen by ``select_best_k``, its metrics are logged, and the
    choice is stored through ``Configurations.save_best_k_item``.

    Returns:
        Tuple ``(best_k, df_results)``.
    """
    banner = '=' * 70
    k_col = df_results['K']
    ndcg, recall = df_results['NDCG@10'], df_results['Recall@10']
    map10, rmse = df_results['MAP@10'], df_results['RMSE']

    logger.log_info(f"\n{banner}")
    logger.log_info("K SELECTION ANALYSIS")
    logger.log_info(f"{banner}")

    # Winner per individual metric (RMSE is minimised, the rest maximised).
    best_k_ndcg = k_col[ndcg.arg_max()]
    best_k_recall = k_col[recall.arg_max()]
    best_k_map = k_col[map10.arg_max()]
    best_k_rmse = k_col[rmse.arg_min()]

    logger.log_info(f"\nBest K by metric:")
    logger.log_info(f"  NDCG@10:   K={best_k_ndcg:3d} (score={ndcg.max():.4f})")
    logger.log_info(f"  Recall@10: K={best_k_recall:3d} (score={recall.max():.4f})")
    logger.log_info(f"  MAP@10:    K={best_k_map:3d} (score={map10.max():.4f})")
    logger.log_info(f"  RMSE:      K={best_k_rmse:3d} (score={rmse.min():.4f})")

    # Final decision uses the NDCG-primary strategy.
    best_k = select_best_k(df_results)
    chosen = df_results.filter(pl.col('K') == best_k).row(0, named=True)

    logger.log_info(f"\n{banner}")
    logger.log_info(f"FINAL SELECTION: K={best_k}")
    logger.log_info(f"{banner}")
    logger.log_info(f"  NDCG@10:   {chosen['NDCG@10']:.4f}")
    logger.log_info(f"  Recall@10: {chosen['Recall@10']:.4f}")
    logger.log_info(f"  MAP@10:    {chosen['MAP@10']:.4f}")
    logger.log_info(f"  RMSE:      {chosen['RMSE']:.4f}")
    logger.log_info(f"  Accuracy:  {chosen['Accuracy']:.4f}")
    logger.log_info(f"  Eval:      {chosen['Eval_Time']:.1f}s")
    logger.log_info(f"{banner}\n")

    # Update configuration - use the separate config for item-based CF
    Configurations.save_best_k_item(category, best_k)
    logger.log_info(f"Saved best K to file\n")

    return best_k, df_results

### Task: Pipeline and execution

#### Training pipeline

In [8]:
def _train_single_category(cat, model_dir, K_VALUES, n_eval_tune):
    """Train (if needed) and tune (if needed) the item-based model for one category.

    Step 1 builds and saves the base model only when ``model_dir / "R.npz"``
    does not exist. Step 2 runs the K sweep on the validation split only when
    ``Configurations.has_tuning_results_item(cat)`` is falsy; otherwise the
    stored best K is loaded.

    Args:
        cat: Category name.
        model_dir: Directory (``Path``) holding this category's artifacts.
        K_VALUES: Candidate values for ``top_k_similar``.
        n_eval_tune: Maximum number of validation users per evaluation.

    Returns:
        Dict ``{'tuned_now': bool, 'best_k': ...}`` where ``tuned_now`` tells
        whether tuning ran in this call.
    """
    
    # ========================================================================
    # STEP 1: BUILD BASE MODEL (if not exists)
    # ========================================================================
    
    # The presence of R.npz is used as the "model already trained" marker.
    if not (model_dir / "R.npz").exists():
        logger.log_info("STEP 1: TRAINING BASE MODEL")
        logger.log_info("-"*70)
        logger.log_info("Building item-item similarity matrix (done once)\n")
        
        # Load training data
        df_train = load_5core_data(cat, split="train")
        
        # Build item-based model
        logger.log_info("Computing item-item similarities...")
        R, Rc, user_idx, item_idx, user_rev, item_rev, user_means, item_similarity = build_item_model(
            df_train, mean_center=MEAN_CENTER, max_users=MAX_USERS, max_items=MAX_ITEMS
        )
        
        # Save artifacts
        save_item_artifacts(model_dir, R, Rc, user_rev, item_rev,
                          user_idx, item_idx, user_means, item_similarity)
        
        logger.log_info(f"Base model saved to {model_dir}\n")
    
    else:
        # Nothing is loaded here; the artifacts are read from disk in step 2.
        logger.log_info("STEP 1: BASE MODEL EXISTS")
        logger.log_info("-"*70)
        logger.log_info(f"Loading from {model_dir}\n")
    
    # ========================================================================
    # STEP 2: HYPERPARAMETER TUNING (if not done)
    # ========================================================================
    
    if not Configurations.has_tuning_results_item(cat):
        logger.log_info("STEP 2: HYPERPARAMETER TUNING (VALIDATION)")
        logger.log_info("-"*70)
        logger.log_info(f"K values: {K_VALUES}")
        logger.log_info(f"Validation users: {n_eval_tune}\n")
        
        # Load base artifacts (similarity matrix already computed!)
        artifacts = load_item_artifacts(model_dir)
        
        results = []
        
        # Test each K value
        for i, k in enumerate(K_VALUES, 1):
            logger.log_info(f"\n[{i}/{len(K_VALUES)}] Testing K={k}")
            logger.log_info("-"*70)
            
            # Create artifacts for this K (shallow copy: matrices are shared)
            eval_artifacts = artifacts.copy()
            eval_artifacts['top_k_similar'] = k
            
            # Evaluate on validation
            start_time = time.time()
            metrics = evaluate_item_based(
                cat, eval_artifacts,
                k_values=[10, 20, 50],
                split="valid",
                sample_users=n_eval_tune
            )
            eval_time = time.time() - start_time
            
            # evaluate_item_based returns None when there is nothing to evaluate.
            if metrics:
                results.append({
                    'K': k,
                'NDCG@10': metrics['ndcg@10'],
                'NDCG@20': metrics['ndcg@20'],
                'NDCG@50': metrics['ndcg@50'],
                'Recall@10': metrics['recall@10'],
                'Recall@20': metrics['recall@20'],
                'Recall@50': metrics['recall@50'],
                'MAP@10': metrics['map@10'],
                'MAP@20': metrics['map@20'],
                'MAP@50': metrics['map@50'],
                'RMSE': metrics['rmse'],
                'Accuracy': metrics['accuracy'],
                'Eval_Time': eval_time
                })
                
                logger.log_info(f"Results:")
                logger.log_info(f"  NDCG@10:   {metrics['ndcg@10']:.4f}")
                logger.log_info(f"  Recall@10: {metrics['recall@10']:.4f}")
                logger.log_info(f"  Eval:      {eval_time:.1f}s")
        
        # Save tuning results
        # NOTE(review): if every evaluation returned None, `results` is empty
        # and the selection below presumably fails — confirm.
        df_results = pl.DataFrame(results)
        out_csv = MODELS_DIR / 'item' / f'tuning_{cat}.csv'
        df_results.write_csv(out_csv)
        logger.log_info(f"\nSaved tuning results: {out_csv}")
        
        # Select best K
        best_k = select_best_k(df_results)
        
        # Log selection
        final_row = df_results.filter(pl.col('K') == best_k).row(0, named=True)
        
        logger.log_info(f"\n{'='*70}")
        logger.log_info(f"BEST K SELECTED: {best_k}")
        logger.log_info(f"{'='*70}")
        logger.log_info(f"  NDCG@10:   {final_row['NDCG@10']:.4f}")
        logger.log_info(f"  Recall@10: {final_row['Recall@10']:.4f}")
        logger.log_info(f"  MAP@10:    {final_row['MAP@10']:.4f}")
        logger.log_info(f"{'='*70}\n")
        
        # Save best K
        Configurations.save_best_k_item(cat, best_k)
        logger.log_info(f"Saved best K to file\n")
        
        # Visualize tuning
        logger.log_info("Generating K tuning visualization...")
        visualize_hyperparameter_tuning(
                                        df_results,
                                        category=cat,
                                        param_col='K',
                                        param_name='K (Similar Items - Co-rating)',
                                        save_dir=MODELS_DIR / 'item',
                                        algo_name='Item-Based'
                                    )
        logger.log_info(f"Saved: k_tuning_{cat}.png\n")
        
        return {'tuned_now': True, 'best_k': best_k}
    
    else:
        # Tuning results already exist: reuse the stored best K.
        best_k = Configurations.load_best_k_item(cat)
        logger.log_info("STEP 2: TUNING ALREADY DONE")
        logger.log_info("-"*70)
        logger.log_info(f"Best K (loaded): {best_k}\n")
        
        return {'tuned_now': False, 'best_k': best_k}

#### Phase 1: Training + Tuning

In [None]:
# ============================================================================
# PHASE 1: TRAINING + TUNING ALL CATEGORIES
# ============================================================================
# Runs _train_single_category for every category and collects, per category,
# the selected K and whether tuning ran in this session.

logger.log_info("\n" + "="*70)
logger.log_info("PHASE 1: TRAINING + TUNING ALL CATEGORIES (ITEM-BASED)")
logger.log_info("="*70 + "\n")

# Configuration
# Only the first category is checked here; K_VALUES is re-read from the
# configuration (it is also set in the setup cell) so the log shows the
# values that will be tested.
if not Configurations.has_tuning_results_item(CATEGORY[0]):
    K_VALUES = Configurations.K_VALUES_ITEM
    logger.log_info(f"K values to test: {K_VALUES}\n")

# category -> {'tuned_now': bool, 'best_k': ...}
workflow_results = {}

for cat in CATEGORY:
    logger.log_info(f"\n{'='*70}\nCATEGORY: {cat}\n{'='*70}\n")
    
    # Each category keeps its artifacts in its own sub-directory.
    model_dir = MODELS_DIR / "item" / cat
    workflow_results[cat] = _train_single_category(
        cat, model_dir, K_VALUES, Configurations.get_eval_samples_tuning()
    )

# ============================================================================
# PHASE 1 SUMMARY
# ============================================================================

logger.log_info("\n" + "="*70)
logger.log_info("PHASE 1 COMPLETE: ALL MODELS TRAINED AND TUNED")
logger.log_info("="*70 + "\n")

logger.log_info("Tuning Summary:")
for cat in CATEGORY:
    status = 'newly tuned' if workflow_results[cat]['tuned_now'] else 'loaded from cache'
    logger.log_info(f"  {cat}: K={workflow_results[cat]['best_k']} ({status})")

logger.log_info("\n" + "="*70)
logger.log_info("Ready for Phase 2: Final Evaluation")
logger.log_info("="*70 + "\n")

#### Phase 2: Final Evaluation

In [None]:
# ============================================================================
# PHASE 2: FINAL EVALUATION ON TEST SET (ALL CATEGORIES)
# ============================================================================

logger.log_info("\n" + "="*70)
logger.log_info("PHASE 2: FINAL EVALUATION ON TEST SET (ITEM-BASED)")
logger.log_info("="*70 + "\n")

# ============================================================================
# LOAD WORKFLOW RESULTS
# ============================================================================

if 'workflow_results' not in locals():
    workflow_results = {}
    for cat in CATEGORY:
        best_k = Configurations.load_best_k_item(cat)
        workflow_results[cat] = {'best_k': best_k, 'tuned_now': False}
    logger.log_info("Loaded best K values from configuration\n")

n_eval_final = Configurations.get_eval_samples_final()
logger.log_info(f"Test users per category: {n_eval_final}\n")

# ============================================================================
# RUN TEST EVALUATION FOR ALL CATEGORIES
# ============================================================================

# Evaluate each category's saved model on the test split with its best K.
for cat in CATEGORY:
    logger.log_info(f"\n{'='*70}\nTESTING: {cat}\n{'='*70}\n")

    best_k = workflow_results[cat]['best_k']
    model_dir = MODELS_DIR / "item" / cat
    logger.log_info(f"Using K: {best_k}")

    # Load the stored artifacts and set the neighbourhood size on them.
    final_artifacts = load_item_artifacts(model_dir)
    final_artifacts['top_k_similar'] = best_k

    logger.log_info("Evaluating on test set...\n")
    results = evaluate_item_based(
        cat,
        final_artifacts,
        k_values=[10, 20, 50],
        split="test",
        sample_users=n_eval_final,
    )

    # A falsy result means there is nothing to record for this category.
    if not results:
        continue

    workflow_results[cat]['test_results'] = results
    logger.log_info(f"\nTest Results (K={best_k}):")
    logger.log_info(f"  NDCG@10: {results['ndcg@10']:.4f}, "
                   f"Recall@10: {results['recall@10']:.4f}, "
                   f"MAP@10: {results['map@10']:.4f}")
    logger.log_info(f"  RMSE: {results['rmse']:.4f}, "
                   f"Accuracy: {results['accuracy']:.4f}\n")

# ============================================================================
# SAVE FINAL RESULTS
# ============================================================================

for _msg in ("\n" + "="*70, "SAVING FINAL RESULTS", "="*70 + "\n"):
    logger.log_info(_msg)

# Collect the test metrics of every category that produced results,
# in CATEGORY order.
test_results_list = []
for cat in CATEGORY:
    if 'test_results' in workflow_results[cat]:
        test_results_list.append(workflow_results[cat]['test_results'])

if test_results_list:
    # One row per evaluated category.
    df_final_results = pl.DataFrame(test_results_list)

    logger.log_info("Final Test Results:")
    display(df_final_results)

    # The CSV and the plot both go to the shared item-model directory.
    item_models_dir = MODELS_DIR / 'item'
    out_csv = item_models_dir / 'final_test_results.csv'
    df_final_results.write_csv(out_csv)
    logger.log_info(f"\nSaved: {out_csv}")

    logger.log_info("Generating final evaluation plot...")
    visualize_final_results(
        test_results_list,
        save_dir=item_models_dir,
        algo_name='Item-Based',
        k_values=[10, 20, 50],
    )
    logger.log_info("Saved: evaluation_results.png\n")

# ============================================================================
# POST-ANALYSIS VISUALIZATION
# ============================================================================

logger.log_info("\n" + "="*70)
logger.log_info("POST-ANALYSIS VISUALIZATION")
logger.log_info("="*70 + "\n")

# For each category with both a tuning CSV and test results, plot the
# validation metrics at the chosen K against the final test metrics.
for cat in CATEGORY:
    tuning_csv = MODELS_DIR / 'item' / f'tuning_{cat}.csv'

    if not tuning_csv.exists():
        logger.log_info(f"No tuning results for {cat}, skipping\n")
        continue

    # Categories whose test evaluation produced nothing have no row to compare.
    if 'test_results' not in workflow_results[cat]:
        continue

    logger.log_info(f"Generating Val vs Test comparison for {cat}...")

    df_tuning = pl.read_csv(tuning_csv)
    best_k = workflow_results[cat]['best_k']

    tuning_match = df_tuning.filter(pl.col('K') == best_k)
    final_match = df_final_results.filter(pl.col('category') == cat)

    # `.row(0)` raises on an empty frame. The tuning CSV may not contain the
    # K in use (for example a stale CSV, or a K loaded from configuration), so
    # skip this category instead of aborting the remaining plots and summary.
    if tuning_match.height == 0 or final_match.height == 0:
        logger.log_warning(
            f"  No row for K={best_k} in {tuning_csv.name} "
            f"or no final results row for {cat}, skipping\n"
        )
        continue

    tuning_row = tuning_match.row(0, named=True)
    final_row = final_match.row(0, named=True)

    visualize_val_test_comparison(
                                    cat=cat,
                                    param_val=best_k,
                                    tuning_row=tuning_row,
                                    final_row=final_row,
                                    save_dir=MODELS_DIR / 'item',
                                    param_name='K',
                                    algo_name='Item-Based'
                                )
    logger.log_info(f"  Saved: val_vs_test_{cat}.png\n")

# ============================================================================
# FINAL WORKFLOW SUMMARY
# ============================================================================

# Closing recap: chosen K per category plus headline test metrics, if any.
for _msg in (
    "\n" + "="*70,
    "COMPLETE WORKFLOW SUMMARY (ITEM-BASED)",
    "="*70,
):
    logger.log_info(_msg)

for cat in CATEGORY:
    logger.log_info(f"\n{cat}: K={workflow_results[cat]['best_k']}")
    # Categories without test results only get the K line.
    if 'test_results' not in workflow_results[cat]:
        continue
    test = workflow_results[cat]['test_results']
    logger.log_info(f"  NDCG@10: {test['ndcg@10']:.4f}, "
                   f"Recall@10: {test['recall@10']:.4f}, "
                   f"MAP@10: {test['map@10']:.4f}")

# List the files written by both phases.
for _msg in (
    "\n" + "="*70,
    "ALL PHASES COMPLETE",
    "="*70,
    "\nGenerated files:",
    "  Phase 1: tuning_[category].csv, k_tuning_[category].png",
    "  Phase 2: final_test_results.csv, evaluation_results.png, val_vs_test_[category].png",
    "="*70 + "\n",
):
    logger.log_info(_msg)

#### Debug info

In [None]:
def check_data_quality(category: str):
    """Log size, sparsity and overlap diagnostics for one category's splits.

    Loads the train/valid/test splits through ``load_5core_data`` and logs
    per-split counts, how many valid/test users and items also occur in
    train, and, when the model directory exists, statistics of the stored
    item-item similarity matrix.

    Any ratio whose denominator is zero (for example an empty split) is
    reported as ``nan`` instead of raising ``ZeroDivisionError``.

    Args:
        category: Category name passed to ``load_5core_data`` and used as the
            model sub-directory under ``MODELS_DIR / 'item'``.
    """
    def ratio(numerator, denominator):
        """Return numerator / denominator, or nan when the denominator is 0."""
        return numerator / denominator if denominator else float('nan')

    logger.log_info(f"\n{'='*70}")
    logger.log_info(f"DATA QUALITY CHECK (ITEM-BASED): {category}")
    logger.log_info(f"{'='*70}\n")

    # Load all splits
    df_train = load_5core_data(category, split='train')
    df_valid = load_5core_data(category, split='valid')
    df_test = load_5core_data(category, split='test')

    def get_stats(df, name):
        """Log and return rating/user/item counts and density of one split."""
        n_ratings = len(df)
        n_users = df['user_id'].n_unique()
        n_items = df['parent_asin'].n_unique()

        # Fraction of the user x item grid that has no rating.
        sparsity = 1 - ratio(n_ratings, n_users * n_items)
        avg_items_per_user = ratio(n_ratings, n_users)
        avg_users_per_item = ratio(n_ratings, n_items)

        logger.log_info(f"{name}:")
        logger.log_info(f"  Size:           {n_ratings:7,} ratings")
        logger.log_info(f"  Users:          {n_users:7,}")
        logger.log_info(f"  Items:          {n_items:7,}")
        logger.log_info(f"  Sparsity:       {sparsity:7.2%}")
        logger.log_info(f"  Avg items/user: {avg_items_per_user:7.2f}")
        logger.log_info(f"  Avg users/item: {avg_users_per_item:7.2f}")
        logger.log_info("")

        return {
            'n_ratings': n_ratings, 'n_users': n_users, 'n_items': n_items,
            'sparsity': sparsity, 'avg_items_per_user': avg_items_per_user,
            'avg_users_per_item': avg_users_per_item
        }

    logger.log_info("Dataset Statistics:")
    logger.log_info("-" * 70)
    train_stats = get_stats(df_train, "TRAIN")
    # Valid/test statistics are only logged; their values are not reused.
    get_stats(df_valid, "VALID")
    get_stats(df_test, "TEST")

    # Check overlaps
    train_users = set(df_train['user_id'].unique())
    valid_users = set(df_valid['user_id'].unique())
    test_users = set(df_test['user_id'].unique())
    n_users_train_valid = len(train_users & valid_users)
    n_users_train_test = len(train_users & test_users)

    logger.log_info("-" * 70)
    logger.log_info("User Overlap:")
    logger.log_info(f"  Train ∩ Valid: {n_users_train_valid:6,} ({ratio(n_users_train_valid, len(valid_users))*100:5.1f}% of valid)")
    logger.log_info(f"  Train ∩ Test:  {n_users_train_test:6,} ({ratio(n_users_train_test, len(test_users))*100:5.1f}% of test)")

    train_items = set(df_train['parent_asin'].unique())
    valid_items = set(df_valid['parent_asin'].unique())
    test_items = set(df_test['parent_asin'].unique())
    n_items_train_valid = len(train_items & valid_items)
    n_items_train_test = len(train_items & test_items)

    logger.log_info("\nItem Overlap:")
    logger.log_info(f"  Train ∩ Valid: {n_items_train_valid:6,} ({ratio(n_items_train_valid, len(valid_items))*100:5.1f}% of valid)")
    logger.log_info(f"  Train ∩ Test:  {n_items_train_test:6,} ({ratio(n_items_train_test, len(test_items))*100:5.1f}% of test)")

    # Load model
    model_dir = MODELS_DIR / 'item' / category
    if model_dir.exists():
        logger.log_info(f"\n{'-' * 70}")
        logger.log_info("Model Statistics:")

        artifacts = load_item_artifacts(model_dir)
        user_idx = artifacts['user_idx']
        item_idx = artifacts['item_idx']
        item_similarity = artifacts['item_similarity']

        n_cells = item_similarity.shape[0] * item_similarity.shape[1]
        similarity_sparsity = 1 - ratio(item_similarity.nnz, n_cells)

        logger.log_info(f"  Model users:       {len(user_idx):7,}")
        logger.log_info(f"  Model items:       {len(item_idx):7,}")
        logger.log_info(f"  Similarity matrix: {item_similarity.shape}")
        logger.log_info(f"  Similarity nnz:    {item_similarity.nnz:7,}")
        logger.log_info(f"  Similarity sparsity: {similarity_sparsity:7.2%}")
        # The three figures below are the TRAIN split statistics from above.
        logger.log_info(f"  Model sparsity:    {train_stats['sparsity']:7.2%}")
        logger.log_info(f"  Avg items/user:    {train_stats['avg_items_per_user']:7.2f}")
        logger.log_info(f"  Avg users/item:    {train_stats['avg_users_per_item']:7.2f}")

    logger.log_info(f"\n{'='*70}\n")

# Run check
check_data_quality(CATEGORY[0])

#### Debug functions

In [None]:
def debug_item_based_predictions(category: str):
    """Log diagnostics for the saved item-based model of one category.

    Loads the artifacts from ``MODELS_DIR / 'item' / category`` and logs
    (1) shape and value statistics of the item similarity matrix, (2) the
    stored ratings of the first user in ``user_idx``, (3) summary statistics
    of the scores returned by ``predict_item_based`` for that user with
    ``top_k=30``, and (4) the ten highest-scoring items.

    Empty inputs (no users, no stored similarities, no ratings, no scores)
    are reported in the log instead of raising from ``min()``/``max()`` on an
    empty array.

    Args:
        category: Category name used as the model sub-directory.
    """
    logger.log_info(f"\n{'='*70}")
    logger.log_info(f"DEBUG ITEM-BASED: {category}")
    logger.log_info(f"{'='*70}\n")

    model_dir = MODELS_DIR / 'item' / category
    artifacts = load_item_artifacts(model_dir)

    R = artifacts['R']
    Rc = artifacts['Rc']
    item_similarity = artifacts['item_similarity']
    user_idx = artifacts['user_idx']
    user_means = artifacts['user_means']
    item_rev = artifacts['item_rev']

    # Check 1: Similarity matrix
    n_cells = item_similarity.shape[0] * item_similarity.shape[1]
    sim_sparsity = (1 - item_similarity.nnz / n_cells) if n_cells else float('nan')
    sim_values = item_similarity.data

    logger.log_info("1. Item Similarity Matrix:")
    logger.log_info(f"   Shape: {item_similarity.shape}")
    logger.log_info(f"   nnz: {item_similarity.nnz:,}")
    logger.log_info(f"   Sparsity: {sim_sparsity:7.2%}")
    if len(sim_values) > 0:
        logger.log_info(f"   Min: {sim_values.min():.6f}")
        logger.log_info(f"   Max: {sim_values.max():.6f}")
        logger.log_info(f"   Mean: {sim_values.mean():.6f}\n")
    else:
        logger.log_info("   No stored similarity values\n")

    # Check 2: Test prediction for one user
    if len(user_idx) == 0:
        logger.log_warning("No users in model, skipping prediction checks")
        logger.log_info(f"\n{'='*70}\n")
        return

    # First key in iteration order, without building the full key list.
    test_user_id = next(iter(user_idx))
    u = user_idx[test_user_id]

    logger.log_info(f"2. Test User: {test_user_id}")
    logger.log_info(f"   User index: {u}")

    # Get rated items (extract the user's row once and reuse it)
    user_row = R.getrow(u)
    rated_items = user_row.indices
    rated_values = user_row.data

    logger.log_info(f"   Rated items: {len(rated_items)}")
    if len(rated_values) > 0:
        logger.log_info(f"   Rating range: [{rated_values.min():.1f}, {rated_values.max():.1f}]")
        logger.log_info(f"   Mean rating: {rated_values.mean():.2f}\n")
    else:
        logger.log_info("   No ratings stored for this user\n")

    # Check 3: Predict scores
    logger.log_info("3. Prediction Test:")
    scores = predict_item_based(u, R, Rc, item_similarity, user_means, top_k=30)

    logger.log_info(f"   Scores computed: {len(scores)}")
    if len(scores) == 0:
        logger.log_warning("   No scores returned, skipping score statistics")
        logger.log_info(f"\n{'='*70}\n")
        return

    logger.log_info(f"   Non-zero scores: {np.count_nonzero(scores)}")
    logger.log_info(f"   Score range: [{scores.min():.6f}, {scores.max():.6f}]")
    logger.log_info(f"   Score mean: {scores.mean():.6f}")
    logger.log_info(f"   Score std: {scores.std():.6f}\n")

    # Check 4: Top recommendations
    logger.log_info("4. Top 10 Predictions:")
    # Negate so that argsort's ascending order yields highest scores first.
    top_10_idx = np.argsort(-scores)[:10]
    for rank, idx in enumerate(top_10_idx, 1):
        item_id = item_rev[idx]
        score = scores[idx]
        logger.log_info(f"   {rank}. Item {item_id}: score={score:.6f}")

    logger.log_info(f"\n{'='*70}\n")

# RUN DEBUG
debug_item_based_predictions(CATEGORY[0])

### Task: Unit test

In [13]:
def recommend_item_ui(user_id: str, n_recs: int = 5, models_dir: str | Path | None = None, category: str | None = None) -> pl.DataFrame:
    """Return item-based recommendations for one user.

    Args:
        user_id: User to recommend for; forwarded to ``recommend_item_based``.
        n_recs: Number of recommendations requested.
        models_dir: Directory holding the saved artifacts. When falsy,
            ``MODELS_DIR / "item" / <category>`` is used.
        category: Category name. When falsy, ``CATEGORY[0]`` is used.

    Returns:
        Whatever ``recommend_item_based`` returns for the loaded artifacts.
    """
    cat = category if category else CATEGORY[0]

    if models_dir:
        model_dir = Path(models_dir)
    else:
        model_dir = MODELS_DIR / "item" / cat

    artifacts = load_item_artifacts(model_dir)
    # Use the best K stored in the configuration for this category.
    artifacts['top_k_similar'] = Configurations.load_best_k_item(cat)

    return recommend_item_based(user_id, n_recs, artifacts)

#### Test All Categories

In [None]:
def test_all_categories():
    """Smoke-test ``recommend_item_ui`` for every category in ``CATEGORY``.

    For each category the saved model is loaded, the user at position 0 of
    ``user_rev`` is picked, and recommendations are requested. The output is
    checked for the required columns, at most ``N_RECS`` rows, and
    non-negative scores. Each category yields one PASS/FAIL entry; the
    entries are displayed as a table and summarised in the log.

    An empty recommendation frame is reported as an explicit failure. Without
    this check, ``min()`` on the empty score column yields ``None`` and the
    ``:.4f`` formatting fails with an unrelated-looking TypeError.
    """
    logger.log_info("\n" + "="*70)
    logger.log_info("[UNIT TEST] Testing Recommendation Function")
    logger.log_info("="*70 + "\n")

    test_summary = []

    for cat in CATEGORY:
        logger.log_info(f"\n[Test] {cat}")
        logger.log_info("-"*70)

        try:
            model_dir = MODELS_DIR / "item" / cat

            # Check model exists
            if not model_dir.exists():
                logger.log_warning("  ✗ Model not found")
                test_summary.append({'category': cat, 'status': 'FAIL', 'reason': 'Model not found'})
                continue

            # Load artifacts
            artifacts = load_item_artifacts(model_dir)
            best_k = Configurations.load_best_k_item(cat)
            artifacts['top_k_similar'] = best_k

            user_rev = artifacts['user_rev']
            item_rev = artifacts['item_rev']

            logger.log_info(f"  Model loaded: {len(user_rev):,} users, {len(item_rev):,} items")
            logger.log_info(f"  Using top_k_similar: {best_k}")

            if len(user_rev) == 0:
                logger.log_warning("  ✗ No users in model")
                test_summary.append({'category': cat, 'status': 'FAIL', 'reason': 'No users'})
                continue

            # Test recommendation
            sample_user = user_rev[0]
            logger.log_info(f"  Testing user: {sample_user}")

            recs = recommend_item_ui(sample_user, n_recs=N_RECS, category=cat)

            # Validate output
            assert set(recs.columns) >= {"parent_asin", "score"}, "Missing required columns"
            assert len(recs) <= N_RECS, f"Too many recommendations: {len(recs)}"

            # An empty result has no score range to report; fail explicitly.
            if len(recs) == 0:
                logger.log_warning("  ✗ No recommendations returned")
                test_summary.append({'category': cat, 'status': 'FAIL', 'reason': 'No recommendations returned'})
                continue

            assert all(recs['score'] >= 0), "Negative scores detected"

            score_min = recs['score'].min()
            score_max = recs['score'].max()

            logger.log_info(f"  Generated {len(recs)} recommendations")
            logger.log_info(f"  Score range: [{score_min:.4f}, {score_max:.4f}]")

            test_summary.append({
                'category': cat,
                'status': 'PASS',
                'n_recs': len(recs),
                'score_min': float(score_min),
                'score_max': float(score_max)
            })

            # Display sample recommendations
            display(recs.head(5))

        except Exception as e:
            # Record the failure and keep testing the remaining categories.
            logger.log_exception(f"  Error: {e}")
            test_summary.append({'category': cat, 'status': 'FAIL', 'reason': str(e)})

    # Summary
    logger.log_info("\n" + "="*70)
    logger.log_info("UNIT TEST SUMMARY")
    logger.log_info("="*70)

    df_summary = pl.DataFrame(test_summary)
    display(df_summary)

    passed = sum(1 for r in test_summary if r['status'] == 'PASS')
    total = len(test_summary)

    logger.log_info(f"\nResults: {passed}/{total} categories passed")

    if passed == total:
        logger.log_info("ALL TESTS PASSED")
    else:
        logger.log_warning(f"{total - passed} tests failed")

    logger.log_info("="*70 + "\n")

test_all_categories()