# Content-Based Collaborative Filtering (5-core • TRAIN)

**Goal**
- Build a content-based CF recommender on **5-core / TRAIN** for a given category.
- Use TF-IDF vectorization on product metadata (title, description, features, categories, details) + cosine similarity.
- Produce Top-N recommendations for one user or a batch of users.

**What this notebook does**
1. Load 5-core **TRAIN** from `PROCESSED_DIR` with schema: `user_id`, `parent_asin`, `rating`, `timestamp`, `history`
2. Load **METADATA** (`<category>.meta.parquet`) from `PROCESSED_DIR` with product information.
3. Combine multiple text fields, weighting each by repetition (title 3x, features 2x, description 1x, categories 1x). The `details` field is currently disabled in the code and is not part of the combined text.
4. Build **TF-IDF item-item** similarity matrix based on combined text.
5. Predict scores for **unseen items** based on user's rated items and generate **Top-N** recommendations.
6. Evaluate using TEST/VALID sets: Accuracy, RMSE, Recall@K, NDCG@K, MAP@K.
7. (Optional) Save recommendations to disk for UI integration.

> Notes:
> - Metadata columns: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together']
> - Description is LIST, Features is LIST, Categories is FLAT LIST, Details is DICT
> - Item-item similarity computed via cosine similarity on TF-IDF vectors.

### Task: Import modules and libraries

In [None]:
import os, sys, json, pickle, re, time
import numpy as np, polars as pl
from pathlib import Path
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords

# Probe the NLTK English stop-word corpus; it is downloaded (quietly) only
# when the lookup raises LookupError, i.e. when it is not available locally.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)

module_path = os.path.abspath(os.path.join('..', '../utilities'))
sys.path.append(module_path)

from logger import Logger
from configurations import Configurations
from visualization_helpers import (
    visualize_hyperparameter_tuning,
    visualize_final_results,
    visualize_val_test_comparison
)
from evaluation_metrics import (
       compute_rmse_accuracy,
       recall_at_k,
       ndcg_at_k,
       map_at_k
)

# Shared logger for every cell of this notebook.
logger = Logger(process_name="content_based", log_file=Configurations.LOG_PATH)

# Project directories, resolved from the shared Configurations object.
PROCESSED_DIR = Path(Configurations.DATA_PROCESSED_PATH)
RAW_DIR = Path(Configurations.DATA_RAW_PATH)
MODELS_DIR = Path(Configurations.MODELS_PATH)

# Categories to process; indexed and iterated below, so a sequence is expected.
CATEGORY = Configurations.CATEGORIES

# Auto-detect phase
# NOTE(review): the phase is decided from the FIRST category only; the other
# categories are presumably expected to be in the same state - confirm.
has_tuning = Configurations.has_tuning_results_content(CATEGORY[0])
K_VALUES = Configurations.K_VALUES_CONTENT

if has_tuning:
    # Tuning results already exist: report the stored best K per category.
    PHASE = 'final'
    logger.log_info("="*70)
    logger.log_info("PHASE: FINAL EVALUATION (CONTENT-BASED)")
    logger.log_info("="*70)
    
    for cat in CATEGORY:
        best_k = Configurations.load_best_k_content(cat)
        logger.log_info(f"  {cat}: Best K = {best_k}")
else:
    # No tuning results yet: the run trains the model and tunes over K_VALUES.
    PHASE = 'train_tune'
    logger.log_info("="*70)
    logger.log_info("PHASE: TRAINING + TUNING (CONTENT-BASED)")
    logger.log_info("="*70)
    logger.log_info(f"K values: {K_VALUES}")

logger.log_info("="*70)
logger.log_info(f"Categories: {CATEGORY}")
logger.log_info(f"Sample size: {Configurations.DEV_SAMPLE_SIZE}")
logger.log_info("="*70 + "\n")

# Settings
# N_RECS: presumably the default Top-N list length - not referenced in the
# functions defined below; verify against the execution cells.
N_RECS = 10
# MAX_USERS / MAX_ITEMS: optional caps forwarded to build_content_model as
# max_users / max_items; None disables the cap.
MAX_USERS = None
MAX_ITEMS = None
# Fallback neighbourhood size used by recommend_content_based when the
# artifacts dict carries no 'top_k_similar' entry.
TOP_K_SIMILAR = 30  # Default, will be tuned

# TF-IDF Settings (can be tuned later)
# Forwarded to build_content_model as max_features / min_df / ngram_range.
TFIDF_MAX_FEATURES = 5000
TFIDF_MIN_DF = 2
TFIDF_NGRAM_RANGE = (1, 2)

### Task: Define functions for CF recommendation

#### Data Loader

In [2]:
def _candidate_files(category: str, split: str = "train"):
    """Return the parquet path of one 5-core split for ``category``.

    The file name depends on ``Configurations.DEV_SAMPLE_SIZE``:

    * ``'full'``  -> ``<category>.5core.<split>.parquet``
    * otherwise   -> ``<category>.5core.<split>.<sample size>.parquet``

    Slashes in ``category`` are replaced by dashes so the name is a valid
    file name.

    Args:
        category: Category name.
        split: Split label embedded in the file name (e.g. ``"train"``).

    Returns:
        A ``Path`` below ``PROCESSED_DIR``. The file is not checked for
        existence here.

    Raises:
        ValueError: If ``DEV_SAMPLE_SIZE`` is neither ``'full'`` nor a key of
            ``Configurations.SAMPLE_SIZES``. Previously this case silently
            returned ``None`` and failed later inside ``read_parquet``.
    """
    safe_category = category.replace('/', '-')
    dev_sample_size = Configurations.DEV_SAMPLE_SIZE

    if dev_sample_size == 'full':
        return PROCESSED_DIR / f"{safe_category}.5core.{split}.parquet"

    if dev_sample_size not in Configurations.SAMPLE_SIZES:
        raise ValueError(
            f"Unknown DEV_SAMPLE_SIZE {dev_sample_size!r}; expected 'full' or one of "
            f"{list(Configurations.SAMPLE_SIZES)}"
        )
    return PROCESSED_DIR / f"{safe_category}.5core.{split}.{dev_sample_size}.parquet"

def _candidate_meta_files(category: str):
    """List the candidate metadata parquet paths for ``category``.

    Only one candidate is produced: ``<category>.meta.parquet`` inside
    ``PROCESSED_DIR``, with slashes in the category replaced by dashes.
    """
    return [PROCESSED_DIR / f"{category.replace('/', '-')}.meta.parquet"]

def load_5core_data(category: str, split: str = "train") -> pl.DataFrame:
    """Read one 5-core split of ``category`` and log its size.

    Args:
        category: Category name, resolved to a file by ``_candidate_files``.
        split: Split label passed through to ``_candidate_files``.

    Returns:
        A copy of the DataFrame read from the parquet file. The log line
        reads the ``user_id`` and ``parent_asin`` columns, so both must exist.
    """
    parquet_path = _candidate_files(category, split)
    frame = pl.read_parquet(parquet_path, low_memory=False).__copy__()

    n_users = frame['user_id'].n_unique()
    n_items = frame['parent_asin'].n_unique()
    logger.log_info(f"[Load-{split.upper()}] shape={frame.shape} | users={n_users} | items={n_items}")
    return frame

#### Inspect & Visualize Data Quality

In [3]:
# Patterns for the per-description statistics. In a raw string the escapes are
# written with a SINGLE backslash: r'\s' is whitespace and r'\d' is a digit.
_SPECIAL_CHAR_RE = re.compile(r'[^a-zA-Z0-9\s]')
_DIGIT_RE = re.compile(r'\d')


def _description_to_text(raw) -> str:
    """Flatten one raw description cell (list, scalar or None) into a string."""
    if isinstance(raw, list):
        return ' '.join(str(part) for part in raw if part)
    return '' if raw is None else str(raw)


def _draw_histogram(ax, values, *, color, xlabel, title, mean_fmt='.0f', show_median=False):
    """Draw a 50-bin histogram of ``values`` with a mean (and optional median) marker."""
    ax.hist(values, bins=50, color=color, alpha=0.7, edgecolor='black')
    mean_val = np.mean(values)
    ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_val:{mean_fmt}}')
    if show_median:
        median_val = np.median(values)
        ax.axvline(median_val, color='green', linestyle='--', linewidth=2, label=f'Median: {median_val:.0f}')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)


def inspect_and_visualize_descriptions(df: pl.DataFrame, category: str):
    """Log statistics about the ``description`` column and save a 2x3 overview figure.

    Every row falls into exactly one of three buckets:

    * None/Null/Empty List - the cell is ``None`` or ``[]``;
    * Empty String         - the cell is present but flattens to blank text;
    * Valid                - the cell flattens to non-blank text.

    Args:
        df: DataFrame with a ``description`` column (list cells are joined
            with spaces, other non-null cells are passed through ``str``).
        category: Category name, used in the figure title and output file name.

    Returns:
        A dict of statistics (counts plus pre-formatted averages), or ``None``
        when there is no valid description at all. In the latter case nothing
        is plotted.

    Side effects:
        Writes ``<category>_description_analysis.png`` below
        ``MODELS_DIR / 'content' / 'inspection'`` and logs the statistics and
        up to three sample descriptions.
    """
    logger.log_info(f"[Inspect] Analyzing description data for {category}...")
    descriptions = df['description'].to_list()
    desc_strings = [_description_to_text(d) for d in descriptions]

    total_count = len(desc_strings)
    is_missing = [d is None or d == [] for d in descriptions]
    none_count = sum(is_missing)
    # Blank text coming from a PRESENT cell only. Missing cells also flatten
    # to '' and must not be counted a second time, otherwise the three buckets
    # no longer add up to the total and the valid count can even go negative.
    empty_count = sum(
        1 for missing, text in zip(is_missing, desc_strings)
        if not missing and text.strip() == ''
    )
    valid_descs = [d for d in desc_strings if d.strip() != '']
    valid_count = len(valid_descs)

    if not valid_descs:
        logger.log_warning(f"[Inspect] No valid descriptions found!")
        return None

    lengths = [len(d) for d in valid_descs]
    word_counts = [len(d.split()) for d in valid_descs]
    special_char_counts = [len(_SPECIAL_CHAR_RE.findall(d)) for d in valid_descs]
    digit_counts = [len(_DIGIT_RE.findall(d)) for d in valid_descs]
    uppercase_ratios = [sum(1 for c in d if c.isupper()) / len(d) if len(d) > 0 else 0 for d in valid_descs]

    stats = {
        'Total': total_count,
        'None/Null/Empty List': none_count,
        'Empty String': empty_count,
        'Valid': valid_count,
        'Valid %': f"{valid_count/total_count*100:.2f}%",
        'Avg Length': f"{np.mean(lengths):.1f}",
        'Median Length': f"{np.median(lengths):.1f}",
        'Avg Words': f"{np.mean(word_counts):.1f}",
        'Avg Special Chars': f"{np.mean(special_char_counts):.1f}",
        'Avg Digits': f"{np.mean(digit_counts):.1f}",
        'Avg Uppercase Ratio': f"{np.mean(uppercase_ratios):.3f}",
    }
    logger.log_info(f"[Inspect] Description Statistics:")
    for key, val in stats.items():
        logger.log_info(f"  {key}: {val}")

    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    try:
        fig.suptitle(f'Description Data Quality Analysis - {category}', fontsize=16, fontweight='bold')

        # Panel 1: completeness pie chart.
        ax_pie = axes[0, 0]
        _, _, autotexts = ax_pie.pie(
            [valid_count, none_count, empty_count],
            labels=['Valid', 'None/Null/Empty', 'Empty String'],
            autopct='%1.1f%%',
            colors=['#2ecc71', '#e74c3c', '#f39c12'],
            startangle=90,
        )
        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontweight('bold')
        ax_pie.set_title('Data Completeness')

        # Panels 2-6: one histogram per statistic.
        _draw_histogram(axes[0, 1], lengths, color='steelblue', xlabel='Character Length',
                        title='Description Length Distribution', show_median=True)
        _draw_histogram(axes[0, 2], word_counts, color='coral', xlabel='Word Count',
                        title='Word Count Distribution')
        _draw_histogram(axes[1, 0], special_char_counts, color='purple', xlabel='Special Character Count',
                        title='Special Characters Distribution')
        _draw_histogram(axes[1, 1], digit_counts, color='orange', xlabel='Digit Count',
                        title='Digit Distribution')
        _draw_histogram(axes[1, 2], uppercase_ratios, color='teal', xlabel='Uppercase Ratio',
                        title='Uppercase Character Ratio', mean_fmt='.3f')

        plt.tight_layout()
        out_dir = MODELS_DIR / 'content' / 'inspection'
        out_dir.mkdir(parents=True, exist_ok=True)
        out_file = out_dir / f'{category.replace("/", "-")}_description_analysis.png'
        plt.savefig(out_file, dpi=300, bbox_inches='tight')
        logger.log_info(f"[Inspect] Saved visualization to {out_file}")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)

    logger.log_info(f"[Inspect] Sample descriptions (first 3):")
    for i, desc in enumerate(valid_descs[:3], 1):
        preview = desc[:200] + '...' if len(desc) > 200 else desc
        logger.log_info(f"  Sample {i}: {preview}")
    return stats

#### Metadata Loader with Multi-Field Combination

In [4]:
def load_metadata(category: str) -> pl.DataFrame:
    """Load product metadata and reduce it to one cleaned text per item.

    For the first non-empty candidate file the function:

    1. fills in any missing text column with empty values;
    2. logs and plots description quality via
       ``inspect_and_visualize_descriptions``;
    3. concatenates title (3x), features (2x), description (1x) and
       categories (1x) into one string per row - repetition acts as a
       weight. The ``details`` field is intentionally NOT part of the text;
    4. lower-cases the text, replaces every character other than
       ``a-z``, ``0-9`` and whitespace with a space, and drops English stop
       words and tokens of length <= 2;
    5. drops rows whose cleaned text is empty.

    Args:
        category: Category name, resolved by ``_candidate_meta_files``.

    Returns:
        DataFrame with columns ``parent_asin`` and ``description``, where
        ``description`` holds the cleaned, combined text (it overwrites the
        raw description column).

    Raises:
        ValueError: If the metadata file has no ``parent_asin`` column.
        FileNotFoundError: If no candidate file exists with non-zero size.
    """
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords', quiet=True)
        stop_words = set(stopwords.words('english'))

    # Single backslash inside raw strings: r'\s' is the whitespace class.
    # The doubled form r'\\s' matches a literal backslash instead, which left
    # backslashes in the text and never collapsed whitespace.
    non_alnum_re = re.compile(r'[^a-z0-9\s]')
    whitespace_re = re.compile(r'\s+')

    def preprocess_text(text: str) -> str:
        """Lower-case, strip punctuation, drop stop words and very short tokens."""
        if not text or not isinstance(text, str) or text.strip() == '':
            return ""
        text = non_alnum_re.sub(' ', text.lower())
        text = whitespace_re.sub(' ', text)
        words = [w for w in text.split() if w not in stop_words and len(w) > 2]
        return ' '.join(words).strip()

    def join_field(value) -> str:
        """Flatten a list-valued (or scalar / null) metadata cell to text."""
        if isinstance(value, list):
            return ' '.join(str(part) for part in value if part)
        return str(value) if value else ''

    def combine_text_fields(row: dict) -> str:
        """Build the weighted text of one metadata row (weight = repetition)."""
        title = str(row.get('title', '') or '')
        description_text = join_field(row.get('description', []))
        features_text = join_field(row.get('features', []))
        categories_text = join_field(row.get('categories', []))
        # NOTE: the 'details' dict is deliberately left out of the combined text.
        combined = (
            f"{title} {title} {title} "
            + f"{features_text} {features_text} "
            + f"{description_text} "
            + f"{categories_text} "
        )
        return combined.strip()

    for p in _candidate_meta_files(category):
        if not (p.exists() and p.stat().st_size > 0):
            continue
        logger.log_info(f"[Load-META] Reading: {p.name}")
        df = pl.read_parquet(p, low_memory=False)
        if 'parent_asin' not in df.columns:
            raise ValueError(f"Missing 'parent_asin' column in {p}")
        logger.log_info(f"[Load-META] Available columns: {df.columns}")

        text_cols = ['title', 'description', 'features', 'categories']
        for col in text_cols:
            if col not in df.columns:
                logger.log_warning(f"[Load-META] '{col}' column not found, using empty values")
                if col in ['features', 'categories']:
                    df = df.with_columns(pl.lit([]).alias(col))
                else:
                    df = df.with_columns(pl.lit("").alias(col))

        logger.log_info(f"[Load-META] Raw shape={df.shape}")
        logger.log_info(f"[Load-META] Data structure:")
        logger.log_info(f"  - title: string")
        logger.log_info(f"  - description: LIST containing text")
        logger.log_info(f"  - features: LIST of feature strings")
        logger.log_info(f"  - categories: FLAT LIST of category strings")
        logger.log_info(f"  - details: DICT with Brand, Material, Color, etc.")

        if 'description' in df.columns:
            inspect_df = df.select(['parent_asin', 'description'])
            inspect_and_visualize_descriptions(inspect_df, category)

        logger.log_info(f"[Load-META] Combining text fields with weights:")
        logger.log_info(f"  - Title: 3x (most important)")
        logger.log_info(f"  - Features: 2x (technical specs)")
        logger.log_info(f"  - Description: 1x (detailed info)")
        logger.log_info(f"  - Categories: 1x (classification)")
        # The log used to claim "Details: 1x" although the field is not combined.
        logger.log_info(f"  - Details: not included (disabled)")

        combined_texts = [combine_text_fields(row) for row in df.iter_rows(named=True)]
        df = df.with_columns(pl.Series("combined_text", combined_texts))

        logger.log_info(f"[Load-META] Preprocessing combined text (lowercase, remove special chars, remove stopwords)...")
        processed_texts = [preprocess_text(text) for text in combined_texts]
        # The cleaned combined text replaces the raw 'description' column.
        df = df.with_columns(pl.Series("description", processed_texts))

        original_len = len(df)
        df = df.filter(pl.col('description').str.len_chars() > 0)
        removed = original_len - len(df)
        # Guard the percentage against a metadata file with zero rows.
        removed_pct = removed / original_len * 100 if original_len else 0.0
        logger.log_info(f"[Load-META] Processed shape={df.shape} (removed {removed} empty descriptions)")
        logger.log_info(f"[Load-META] Preprocessing Impact:")
        logger.log_info(f"  Before: {original_len} items")
        logger.log_info(f"  After: {len(df)} items")
        logger.log_info(f"  Removed: {removed} ({removed_pct:.2f}%)")

        if len(df) > 0:
            sample = df['description'].to_list()[0]
            logger.log_info(f"[Load-META] Sample processed text (first 300 chars):")
            logger.log_info(f"  {sample[:300]}...")
        return df.select(['parent_asin', 'description'])

    raise FileNotFoundError(f"Metadata not found for {category}")

#### Build Content-Based Model

In [5]:
def build_content_model(df_train: pl.DataFrame, df_meta: pl.DataFrame, 
                       max_users: int | None = None, max_items: int | None = None,
                       max_features: int = 5000, min_df: int = 2, 
                       ngram_range: tuple = (1, 2)):
    """Build the rating matrix and the TF-IDF item-item similarity matrix.

    Args:
        df_train: Interactions with ``user_id``, ``parent_asin`` and ``rating``.
        df_meta: Item texts with ``parent_asin`` and ``description``.
        max_users: Optional cap on the number of users kept.
        max_items: Optional cap on the number of items kept.
        max_features, min_df, ngram_range: Passed to ``TfidfVectorizer``.

    Returns:
        ``(R, user_idx, item_idx, user_rev_arr, item_rev_arr, item_similarity,
        vectorizer)``: the sparse user x item rating matrix, the id -> index
        maps, the index -> id object arrays, the sparse item x item cosine
        similarity (entries below 0.01 removed) and the fitted vectorizer.
    """
    interactions = df_train.select(['user_id', 'parent_asin', 'rating']).with_columns(
        pl.col('rating').cast(pl.Float32)
    )

    # Optional down-sampling: users first, then items of the remaining rows.
    if max_users is not None:
        kept_users = interactions['user_id'].unique()[:max_users].to_list()
        interactions = interactions.filter(pl.col('user_id').is_in(kept_users))
    if max_items is not None:
        kept_items = interactions['parent_asin'].unique()[:max_items].to_list()
        interactions = interactions.filter(pl.col('parent_asin').is_in(kept_items))

    # Item catalogue = items that occur in the (possibly reduced) train data.
    item_rev = interactions['parent_asin'].unique().to_list()
    meta_for_items = df_meta.filter(pl.col('parent_asin').is_in(item_rev))

    logger.log_info(f"[Content] Items in train: {len(item_rev):,}")
    logger.log_info(f"[Content] Items with metadata: {len(meta_for_items):,}")

    # Position lookups in both directions.
    item_idx = dict(zip(item_rev, range(len(item_rev))))
    user_rev = interactions['user_id'].unique().to_list()
    user_idx = dict(zip(user_rev, range(len(user_rev))))

    # Sparse rating matrix in COO-style triplets.
    row_pos = np.array([user_idx[uid] for uid in interactions['user_id'].to_list()], dtype=np.int32)
    col_pos = np.array([item_idx[asin] for asin in interactions['parent_asin'].to_list()], dtype=np.int32)
    values = np.array(interactions['rating'].to_list(), dtype=np.float32)

    nU, nI = len(user_rev), len(item_rev)
    R = csr_matrix((values, (row_pos, col_pos)), shape=(nU, nI), dtype=np.float32)

    # One text per catalogue item, in index order; items without metadata get "".
    text_by_asin = dict(zip(meta_for_items['parent_asin'].to_list(),
                            meta_for_items['description'].to_list()))
    descriptions = [text_by_asin.get(asin, "") for asin in item_rev]

    missing_count = sum(1 for text in descriptions if not text or text.strip() == "")
    n_texts = len(descriptions)
    logger.log_info(f"[Content] Descriptions: {n_texts - missing_count:,} valid, "
                   f"{missing_count:,} missing ({missing_count/n_texts*100:.1f}%)")

    logger.log_info(f"[TF-IDF] Vectorizing with max_features={max_features}, "
                   f"min_df={min_df}, ngram_range={ngram_range}...")

    # The timer covers vectorization and the similarity computation.
    start_time = time.time()
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        stop_words='english',
        min_df=min_df,
        ngram_range=ngram_range
    )
    tfidf_matrix = vectorizer.fit_transform(descriptions)

    tfidf_rows, tfidf_cols = tfidf_matrix.shape[0], tfidf_matrix.shape[1]
    logger.log_info(f"[TF-IDF] Matrix shape: {tfidf_matrix.shape}, "
                   f"nnz: {tfidf_matrix.nnz:,}, "
                   f"sparsity: {(1 - tfidf_matrix.nnz/(tfidf_rows*tfidf_cols)):.2%}")
    logger.log_info(f"[TF-IDF] Vocabulary size: {len(vectorizer.vocabulary_):,}")

    logger.log_info(f"[Similarity] Computing cosine similarity for {nI} items...")
    item_similarity = cosine_similarity(tfidf_matrix, dense_output=False)

    # Zero out and drop near-zero similarities to keep the matrix sparse.
    min_sim_threshold = 0.01
    weak = item_similarity.data < min_sim_threshold
    item_similarity.data[weak] = 0
    item_similarity.eliminate_zeros()

    elapsed = time.time() - start_time

    logger.log_info(f"[Similarity] Shape: {item_similarity.shape}, "
                   f"nnz: {item_similarity.nnz:,}, "
                   f"sparsity: {(1 - item_similarity.nnz/(nI*nI)):.2%}")
    logger.log_info(f"[Similarity] Range: [{item_similarity.data.min():.4f}, "
                   f"{item_similarity.data.max():.4f}]")
    logger.log_info(f"[Content] Built in {elapsed:.1f}s")

    user_rev_arr = np.array(user_rev, dtype=object)
    item_rev_arr = np.array(item_rev, dtype=object)

    logger.log_info(f"[Content-Model] R{R.shape} nnz={R.nnz} | Similarity{item_similarity.shape}")

    return R, user_idx, item_idx, user_rev_arr, item_rev_arr, item_similarity, vectorizer

#### Prediction & Recommendation

In [6]:
def predict_content_based(user_idx_val: int, R: csr_matrix, item_similarity: csr_matrix, 
                         top_k: int = 30) -> np.ndarray:
    """
    Score every item for one user from the items that user has rated.

    For each item, the rated items with a strictly positive similarity are
    its neighbours; the ``top_k`` most similar ones are kept and the score is
    their similarity-weighted average rating. Items without any positive
    neighbour keep a score of 0. Items are processed one at a time in a
    Python loop over a dense (items x rated items) similarity slice.

    Args:
        user_idx_val: Row index of the user in ``R``
        R: Rating matrix (users x items)
        item_similarity: Item-item similarity matrix
        top_k: Maximum number of neighbours per item

    Returns:
        float32 array of length ``R.shape[1]`` with one score per item
        (all zeros when the user has no ratings)
    """
    n_items = R.shape[1]
    predicted = np.zeros(n_items, dtype=np.float32)

    profile = R.getrow(user_idx_val).toarray().ravel()
    rated_cols = np.nonzero(profile)[0]
    if len(rated_cols) == 0:
        return predicted

    rated_vals = profile[rated_cols]
    # Dense slice: similarity of every item (rows) to each rated item (columns).
    sims_to_rated = item_similarity[:, rated_cols].toarray()

    for item in range(n_items):
        row = sims_to_rated[item, :]

        # Neighbours are restricted to strictly positive similarities.
        keep = row > 0
        n_pos = keep.sum()
        if n_pos == 0:
            continue

        pos_sims = row[keep]
        pos_ratings = rated_vals[keep]

        k_use = min(top_k, n_pos)
        if k_use <= 0:
            # Only reachable with a non-positive top_k.
            continue

        if k_use >= n_pos:
            # Every positive neighbour is used.
            chosen_sims, chosen_ratings = pos_sims, pos_ratings
        else:
            if k_use == 1:
                picks = np.array([np.argmax(pos_sims)])
            else:
                picks = np.argpartition(-pos_sims, min(k_use - 1, len(pos_sims) - 1))[:k_use]
            chosen_sims = pos_sims[picks]
            chosen_ratings = pos_ratings[picks]

        # Similarity-weighted mean of the neighbours' ratings.
        weight_total = np.sum(chosen_sims)
        if weight_total > 1e-8:
            predicted[item] = np.dot(chosen_sims, chosen_ratings) / weight_total

    return predicted

def recommend_content_based(user_id: str, n_recs: int, artifacts: dict) -> pl.DataFrame:
    """Return the Top-N unseen items for one user.

    Args:
        user_id: External user id; must be a key of ``artifacts['user_idx']``.
        n_recs: Maximum number of recommendations.
        artifacts: Dict with ``R``, ``user_idx``, ``item_rev`` and
            ``item_similarity``; an optional ``top_k_similar`` entry overrides
            the module default ``TOP_K_SIMILAR``.

    Returns:
        DataFrame with columns ``parent_asin`` and ``score``, sorted by
        descending score. It is empty (same two columns) when the user is
        unknown, when every item is already rated, or when ``n_recs <= 0``.
        Items the user has already rated are never returned. Unseen items
        with a score of 0 can appear when fewer than ``n_recs`` items have a
        positive score.
    """
    R = artifacts['R']
    user_idx = artifacts['user_idx']
    item_rev = artifacts['item_rev']
    item_similarity = artifacts['item_similarity']
    top_k = artifacts.get('top_k_similar', TOP_K_SIMILAR)

    def _empty() -> pl.DataFrame:
        # polars takes `schema=`; `columns=` is the pandas keyword and is rejected.
        return pl.DataFrame(schema={"parent_asin": pl.Utf8, "score": pl.Float64})

    if user_id not in user_idx:
        logger.log_warning(f"[Recommend] user_id={user_id} not found.")
        return _empty()

    u = user_idx[user_id]
    scores = predict_content_based(u, R, item_similarity, top_k=top_k)

    # Candidates = every item the user has not rated in the training matrix.
    cand_mask = np.ones(R.shape[1], dtype=bool)
    cand_mask[R.getrow(u).indices] = False
    cand_scores = scores[cand_mask]

    n_top = min(n_recs, cand_scores.size)
    if n_top <= 0:
        return _empty()

    cand_indices = np.nonzero(cand_mask)[0]
    # argpartition isolates the n_top best candidates; only those are sorted.
    top_pos = np.argpartition(-cand_scores, n_top - 1)[:n_top]
    picked = sorted(
        ((int(cand_indices[p]), float(cand_scores[p])) for p in top_pos),
        key=lambda pair: -pair[1],
    )
    return pl.DataFrame({
        "parent_asin": [item_rev[i] for i, _ in picked],
        "score": [s for _, s in picked],
    })

#### Save/Load Artifacts

In [7]:
def save_content_artifacts(out_dir: Path, R, user_rev, item_rev, user_idx, item_idx, item_similarity, vectorizer):
    """Write all model artifacts into ``out_dir`` (created if necessary).

    Files written: ``R.npz`` and ``item_similarity.npz`` (sparse matrices),
    ``user_rev.pkl``, ``item_rev.pkl`` and ``vectorizer.pkl`` (pickle), and
    ``user_idx.json`` / ``item_idx.json`` (keys stringified, values as int).
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    for file_name, matrix in (("R.npz", R), ("item_similarity.npz", item_similarity)):
        save_npz(out_dir / file_name, matrix)

    for file_name, payload in (("user_rev.pkl", user_rev), ("item_rev.pkl", item_rev)):
        with open(out_dir / file_name, "wb") as fh:
            pickle.dump(payload, fh)

    for file_name, mapping in (("user_idx.json", user_idx), ("item_idx.json", item_idx)):
        as_json = json.dumps({str(key): int(pos) for key, pos in mapping.items()})
        (out_dir / file_name).write_text(as_json)

    with open(out_dir / "vectorizer.pkl", "wb") as fh:
        pickle.dump(vectorizer, fh)

    logger.log_info(f"[Saved-Content] {out_dir}")

def load_content_artifacts(model_dir: str | Path):
    md = Path(model_dir)
    R = load_npz(md / "R.npz")
    item_similarity = load_npz(md / "item_similarity.npz")
    with open(md / "user_rev.pkl", "rb") as f: user_rev = pickle.load(f)
    with open(md / "item_rev.pkl", "rb") as f: item_rev = pickle.load(f)
    user_idx = {k: int(v) for k, v in json.loads((md / "user_idx.json").read_text()).items()}
    item_idx = {k: int(v) for k, v in json.loads((md / "item_idx.json").read_text()).items()}
    with open(md / "vectorizer.pkl", "rb") as f: vectorizer = pickle.load(f)
    return dict(R=R, item_similarity=item_similarity, user_rev=user_rev, item_rev=item_rev, user_idx=user_idx, item_idx=item_idx, vectorizer=vectorizer)

In [8]:
def train_content_models_for_categories(categories, top_k_similar=30, models_dir=None, max_users=None, max_items=None):
    """Train (or skip) the content-based model of every category and summarise.

    A category is skipped when ``<models_dir>/content/<category>/R.npz``
    already exists. Failures are logged and recorded in the summary instead
    of aborting the remaining categories.

    Args:
        categories: Iterable of category names.
        top_k_similar: Only recorded in the summary; it is not used while
            building the model.
        models_dir: Base directory for models; defaults to ``MODELS_DIR``.
        max_users, max_items: Optional caps forwarded to ``build_content_model``.

    Returns:
        One summary row per category with at least ``category``, ``algo``,
        ``models_dir`` (null on failure) and ``top_k_similar``.
    """
    base = Path(models_dir) if models_dir else MODELS_DIR
    out_algo = base / "content"
    out_algo.mkdir(parents=True, exist_ok=True)

    rows = []
    for cat in categories:
        try:
            out_dir = out_algo / cat
            if out_dir.exists() and (out_dir / "R.npz").exists():
                logger.log_info(f"[Skip] Content based model exists for {cat}")
                # Was labelled "user" (copy-paste from the user-based notebook).
                rows.append({"category": cat, "algo": "content", "models_dir": str(out_dir), "top_k_similar": top_k_similar})
                continue

            logger.log_info(f"[Content] Training {cat}")
            df_train = load_5core_data(cat, split="train")
            df_meta = load_metadata(cat)
            R, user_idx, item_idx, user_rev, item_rev, item_similarity, vectorizer = build_content_model(
                df_train, df_meta, max_users=max_users, max_items=max_items
            )

            save_content_artifacts(out_dir, R, user_rev, item_rev, user_idx, item_idx, item_similarity, vectorizer)
            rows.append({
                "category": cat, "algo": "content", "models_dir": str(out_dir),
                "top_k_similar": top_k_similar, "R_nnz": int(R.nnz),
                "users": len(user_rev), "items": len(item_rev),
            })
        except Exception as e:
            # Best effort by design: one failing category must not stop the rest.
            logger.log_exception(f"[Error-Content] {cat}: {e}")
            rows.append({"category": cat, "algo": "content", "models_dir": None, "top_k_similar": top_k_similar, "error": str(e)})

    if rows:
        summary = pl.DataFrame(rows)
    else:
        # No categories: keep the core columns so the lookups below still work.
        summary = pl.DataFrame(schema={
            "category": pl.Utf8, "algo": pl.Utf8,
            "models_dir": pl.Utf8, "top_k_similar": pl.Int64,
        })
    logger.log_info(f"[Summary-Content] Trained={len(rows)} OK={summary['models_dir'].is_not_null().sum()} FAIL={summary['models_dir'].is_null().sum()}")
    return summary

### Task: Evaluation

#### Evaluation Pipeline

In [9]:
def evaluate_content_based(category: str, artifacts: dict, k_values: list | None = None,
                          split: str = "test", sample_users: int = 3000):
    """Evaluate the content-based model on a held-out split.

    Only users present in the training index are evaluated, and per user only
    the held-out items that exist in the training item index ("known items").

    * RMSE / accuracy compare the predicted score with the held-out rating on
      the known items (via ``compute_rmse_accuracy``).
    * Recall@K / NDCG@K / MAP@K rank all items the user has NOT rated in
      training and treat the known items as relevant.

    Args:
        category: Category name (used to load the split and in the result).
        artifacts: Dict with ``R``, ``user_idx``, ``item_idx``, ``item_rev``
            and ``item_similarity``; optional ``top_k_similar`` (default 30).
        k_values: Ranking cut-offs; defaults to ``[10, 20, 50]``.
        split: Split to evaluate on, e.g. ``"test"`` or ``"valid"``.
        sample_users: Maximum number of users; a seeded random sample is
            drawn when more are available.

    Returns:
        Dict with ``category``, ``split``, ``n_users``, ``rmse``, ``accuracy``
        and one ``recall@k`` / ``ndcg@k`` / ``map@k`` entry per cut-off, or
        ``None`` when no evaluation row belongs to a training user.

    Raises:
        ValueError: If ``k_values`` is empty.
    """
    # None instead of a mutable list as the default argument.
    k_values = [10, 20, 50] if k_values is None else list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one cutoff")

    logger.log_info(f"[Eval-Content] {category} on {split.upper()}")

    df_eval = load_5core_data(category, split=split)
    R = artifacts['R']
    user_idx = artifacts['user_idx']
    item_idx = artifacts['item_idx']
    item_rev = artifacts['item_rev']
    item_similarity = artifacts['item_similarity']
    top_k = artifacts.get('top_k_similar', 30)

    # Filter to train users only
    train_user_list = list(user_idx.keys())
    df_eval = df_eval.filter(pl.col('user_id').is_in(train_user_list))

    if len(df_eval) == 0:
        logger.log_warning(f"[Eval-Content] No data after filtering")
        return None

    logger.log_info(f"[Eval-Content] After filtering: {len(df_eval):,} ratings, {df_eval['user_id'].n_unique():,} users")

    # Sample users. maintain_order=True fixes the order the seeded sample is
    # drawn from; without it the order of unique() is not guaranteed, so the
    # seed alone would not make the sample reproducible.
    eval_users = df_eval['user_id'].unique(maintain_order=True).to_list()
    if len(eval_users) > sample_users:
        # NOTE: this reseeds NumPy's global RNG (kept for result continuity).
        np.random.seed(42)
        eval_users = np.random.choice(eval_users, sample_users, replace=False).tolist()

    logger.log_info(f"[Eval-Content] Evaluating {len(eval_users)} users with top_k={top_k}...")

    # Group the held-out ratings per user in a single pass instead of
    # filtering the whole frame once per user.
    selected = set(eval_users)
    ratings_by_user = {}
    for uid, asin, rating in df_eval.select(['user_id', 'parent_asin', 'rating']).iter_rows():
        if uid in selected:
            ratings_by_user.setdefault(uid, {})[asin] = rating

    # Initialize accumulators
    metrics_acc = {
        'rmse': [], 'accuracy': [],
        **{f'recall@{k}': [] for k in k_values},
        **{f'ndcg@{k}': [] for k in k_values},
        **{f'map@{k}': [] for k in k_values}
    }

    max_k = max(k_values)
    evaluated_users = 0

    for user_id in eval_users:
        if user_id not in user_idx:
            continue

        u = user_idx[user_id]
        actual_ratings = ratings_by_user.get(user_id, {})
        if not actual_ratings:
            continue

        # Held-out items that the model can score at all.
        known_items = {asin for asin in actual_ratings if asin in item_idx}
        if not known_items:
            continue

        evaluated_users += 1
        scores = predict_content_based(u, R, item_similarity, top_k=top_k)

        # RMSE & Accuracy: NaN everywhere except at the known items.
        predictions = np.full(R.shape[1], np.nan)
        actuals = np.full(R.shape[1], np.nan)
        for asin in known_items:
            idx = item_idx[asin]
            predictions[idx] = scores[idx]
            actuals[idx] = actual_ratings[asin]

        rmse, acc = compute_rmse_accuracy(predictions, actuals)
        if not np.isnan(rmse):
            metrics_acc['rmse'].append(rmse)
            metrics_acc['accuracy'].append(acc)

        # Ranking metrics: candidates are the items not rated in training.
        cand_mask = np.ones(R.shape[1], dtype=bool)
        cand_mask[R.getrow(u).indices] = False

        cand_scores = scores[cand_mask]
        if cand_scores.size == 0:
            continue

        cand_indices = np.nonzero(cand_mask)[0]
        n_top = min(max_k, cand_scores.size)
        top_pos = np.argpartition(-cand_scores, n_top - 1)[:n_top]
        sorted_idx = top_pos[np.argsort(-cand_scores[top_pos])]
        recommended = [item_rev[cand_indices[i]] for i in sorted_idx]

        for k in k_values:
            metrics_acc[f'recall@{k}'].append(recall_at_k(recommended, known_items, k))
            metrics_acc[f'ndcg@{k}'].append(ndcg_at_k(recommended, known_items, k))
            metrics_acc[f'map@{k}'].append(map_at_k(recommended, known_items, k))

    logger.log_info(f"[Eval-Content] Actually evaluated: {evaluated_users} users")

    # Aggregate
    results = {
        'category': category,
        'split': split,
        'n_users': evaluated_users,
        'rmse': np.mean(metrics_acc['rmse']) if metrics_acc['rmse'] else np.nan,
        'accuracy': np.mean(metrics_acc['accuracy']) if metrics_acc['accuracy'] else np.nan
    }

    for k in k_values:
        for metric in ['recall', 'ndcg', 'map']:
            key = f'{metric}@{k}'
            results[key] = np.mean(metrics_acc[key]) if metrics_acc[key] else 0.0

    # Report @10 when it was requested; otherwise fall back to the first
    # cut-off instead of failing with a KeyError on 'ndcg@10'.
    k_report = 10 if 10 in k_values else k_values[0]
    logger.log_info(f"[Eval-Content] RMSE={results['rmse']:.4f}, Acc={results['accuracy']:.4f}")
    logger.log_info(f"[Eval-Content] NDCG@{k_report}={results[f'ndcg@{k_report}']:.4f}, Recall@{k_report}={results[f'recall@{k_report}']:.4f}")

    return results

#### Hyperparameter Tuning

In [10]:
def select_best_k(df_results: pl.DataFrame):
    """Pick the best K: highest NDCG@10, ties (within 2%) broken by Recall@10.

    ``df_results`` must provide the columns ``K``, ``NDCG@10`` and
    ``Recall@10``. When more than one row reaches at least 98% of the best
    NDCG@10, the row with the highest Recall@10 among them wins; otherwise
    the NDCG@10 winner is returned.
    """
    ndcg_col = df_results['NDCG@10']
    best_k_ndcg = df_results['K'][ndcg_col.arg_max()]
    best_ndcg = ndcg_col.max()

    logger.log_info(f"\nPrimary metric (NDCG@10): K={best_k_ndcg}, score={best_ndcg:.4f}")

    # Rows whose NDCG@10 is within 2% of the best one.
    contenders = df_results.filter(pl.col('NDCG@10') >= best_ndcg * 0.98)
    contender_ks = contenders['K'].to_list()

    if len(contender_ks) <= 1:
        logger.log_info(f"Clear winner: K={best_k_ndcg}")
        return best_k_ndcg

    logger.log_info(f"Multiple K with similar NDCG (within 2%): {contender_ks}")
    chosen_k = contenders['K'][contenders['Recall@10'].arg_max()]
    logger.log_info(f"Selected K={chosen_k} based on Recall@10")
    return chosen_k

### Pipeline and execution

#### Training pipeline

In [11]:
def _train_single_category(cat, model_dir, K_VALUES, n_eval_tune):
    """Build (if needed) and tune the content-based model for one category.

    Step 1 builds the TF-IDF item-similarity model and saves its artifacts,
    unless ``model_dir / "R.npz"`` already exists.

    Step 2 is skipped when ``Configurations.has_tuning_results_content(cat)``
    is true. Otherwise it evaluates each candidate in ``K_VALUES`` on the
    validation split, writes the tuning table to
    ``MODELS_DIR / 'content' / f'tuning_{cat}.csv'``, stores the selected K
    through ``Configurations.save_best_k_content`` and plots the sweep.

    Args:
        cat: Category name.
        model_dir: Path-like directory holding this category's artifacts
            (must support the ``/`` operator).
        K_VALUES: Candidate values assigned to ``top_k_similar``.
        n_eval_tune: Forwarded to the evaluator as ``sample_users``.

    Returns:
        dict with keys ``'tuned_now'`` (bool) and ``'best_k'``.

    Raises:
        RuntimeError: If tuning ran but no candidate K produced metrics.
    """

    # ========================================================================
    # STEP 1: BUILD BASE MODEL (if not exists)
    # ========================================================================

    if not (model_dir / "R.npz").exists():
        logger.log_info("STEP 1: TRAINING BASE MODEL")
        logger.log_info("-"*70)
        logger.log_info("Building TF-IDF similarity matrix (done once)\n")

        # Load data
        df_train = load_5core_data(cat, split="train")
        df_meta = load_metadata(cat)

        # Build content-based model
        logger.log_info("Computing TF-IDF and item similarities...")
        R, user_idx, item_idx, user_rev, item_rev, item_similarity, vectorizer = build_content_model(
            df_train, df_meta, max_users=MAX_USERS, max_items=MAX_ITEMS,
            max_features=TFIDF_MAX_FEATURES, min_df=TFIDF_MIN_DF,
            ngram_range=TFIDF_NGRAM_RANGE
        )

        # Save artifacts
        save_content_artifacts(model_dir, R, user_rev, item_rev,
                             user_idx, item_idx, item_similarity, vectorizer)

        logger.log_info(f"Base model saved to {model_dir}\n")
    else:
        logger.log_info("STEP 1: BASE MODEL EXISTS")
        logger.log_info("-"*70)
        logger.log_info(f"Loading from {model_dir}\n")

    # ========================================================================
    # STEP 2: HYPERPARAMETER TUNING (if not done)
    # ========================================================================

    # Cached tuning: nothing to evaluate, just report the stored K.
    if Configurations.has_tuning_results_content(cat):
        best_k = Configurations.load_best_k_content(cat)
        logger.log_info(f"STEP 2: TUNING ALREADY DONE (K={best_k})")
        return {'tuned_now': False, 'best_k': best_k}

    logger.log_info("STEP 2: HYPERPARAMETER TUNING (VALIDATION)")
    logger.log_info("-"*70)
    logger.log_info(f"K values: {K_VALUES}")
    logger.log_info(f"Validation users: {n_eval_tune}\n")

    # Load base artifacts
    artifacts = load_content_artifacts(model_dir)

    results = []

    # Test each K value
    for i, k in enumerate(K_VALUES, 1):
        logger.log_info(f"\n[{i}/{len(K_VALUES)}] Testing K={k}")
        logger.log_info("-"*70)

        # Shallow copy so each candidate K gets its own 'top_k_similar' entry
        # without touching the loaded artifacts.
        eval_artifacts = artifacts.copy()
        eval_artifacts['top_k_similar'] = k

        # Evaluate
        start_time = time.time()
        metrics = evaluate_content_based(
            cat, eval_artifacts,
            k_values=[10, 20, 50],
            split="valid",
            sample_users=n_eval_tune
        )
        eval_time = time.time() - start_time

        # A falsy result means the evaluator produced nothing for this K.
        if not metrics:
            continue

        results.append({
            'K': k,
            'NDCG@10': metrics['ndcg@10'],
            'NDCG@20': metrics['ndcg@20'],
            'NDCG@50': metrics['ndcg@50'],
            'Recall@10': metrics['recall@10'],
            'Recall@20': metrics['recall@20'],
            'Recall@50': metrics['recall@50'],
            'MAP@10': metrics['map@10'],
            'MAP@20': metrics['map@20'],
            'MAP@50': metrics['map@50'],
            'RMSE': metrics['rmse'],
            'Accuracy': metrics['accuracy'],
            'Eval_Time': eval_time
        })

        logger.log_info("Results:")
        logger.log_info(f"  NDCG@10:   {metrics['ndcg@10']:.4f}")
        logger.log_info(f"  Recall@10: {metrics['recall@10']:.4f}")
        logger.log_info(f"  Eval:      {eval_time:.1f}s")

    # Without any metrics there is nothing to rank. Fail here with a clear
    # message instead of writing an empty tuning CSV and crashing later on a
    # missing 'NDCG@10' column.
    if not results:
        raise RuntimeError(
            f"Hyperparameter tuning for category '{cat}' produced no validation "
            f"metrics for K values {K_VALUES}; cannot select a best K"
        )

    # Save and select best
    df_results = pl.DataFrame(results)
    df_results.write_csv(MODELS_DIR / 'content' / f'tuning_{cat}.csv')

    best_k = select_best_k(df_results)
    Configurations.save_best_k_content(cat, best_k)

    visualize_hyperparameter_tuning(
        df_results,
        category=cat,
        param_col='K',
        param_name='K (Similar Items - Content)',
        save_dir=MODELS_DIR / 'content',
        algo_name='Content-Based'
    )

    logger.log_info(f"Final model uses K={best_k}\n")

    return {'tuned_now': True, 'best_k': best_k}

#### Phase 1: Training + Tuning

In [None]:
# ============================================================================
# PHASE 1: TRAINING + TUNING ALL CATEGORIES
# ============================================================================
# Runs _train_single_category for every entry of CATEGORY and collects the
# returned {'tuned_now', 'best_k'} dicts in workflow_results, which the
# Phase 2 cell reads.

logger.log_info("\n" + "="*70)
logger.log_info("PHASE 1: TRAINING + TUNING ALL CATEGORIES (CONTENT-BASED)")
logger.log_info("="*70 + "\n")

# Announce the K grid when at least one category still needs tuning.
# Checking every category (instead of only CATEGORY[0]) avoids an IndexError
# on an empty CATEGORY list and no longer hides the grid when only a later
# category is untuned.
if any(not Configurations.has_tuning_results_content(c) for c in CATEGORY):
    logger.log_info(f"K values to test: {K_VALUES}\n")

workflow_results = {}

for cat in CATEGORY:
    logger.log_info(f"\n{'='*70}\nCATEGORY: {cat}\n{'='*70}\n")

    model_dir = MODELS_DIR / "content" / cat
    workflow_results[cat] = _train_single_category(
        cat, model_dir, K_VALUES, Configurations.get_eval_samples_tuning()
    )

# Summary
logger.log_info("\n" + "="*70)
logger.log_info("PHASE 1 COMPLETE: ALL MODELS TRAINED AND TUNED")
logger.log_info("="*70 + "\n")

logger.log_info("Tuning Summary:")
for cat in CATEGORY:
    status = 'newly tuned' if workflow_results[cat]['tuned_now'] else 'loaded from cache'
    logger.log_info(f"  {cat}: K={workflow_results[cat]['best_k']} ({status})")

logger.log_info("\n" + "="*70)
logger.log_info("Ready for Phase 2: Final Evaluation")
logger.log_info("="*70 + "\n")

#### Phase 2: Final Evaluation

In [None]:
# ============================================================================
# PHASE 2 -- final evaluation on the TEST split, every category
# ============================================================================

# Opening banner for the Phase 2 log section.
logger.log_info(f"\n{'=' * 70}")
logger.log_info("PHASE 2: FINAL EVALUATION ON TEST SET (CONTENT-BASED)")
logger.log_info(f"{'=' * 70}\n")

# Helper function
def _create_val_test_comparison(cat, best_k, tuning_row, final_row):
    """Plot validation-vs-test scores for one category and save the figure.

    Left panel: grouped bars for NDCG@10, Recall@10 and MAP@10.
    Right panel: a table of those metrics plus RMSE, with the relative change
    from validation to test. The change cell is shaded green when the metric
    improved (higher for the ranking metrics, lower for RMSE) and red
    otherwise, including "N/A".

    The figure is written to
    ``MODELS_DIR / 'content' / f'val_vs_test_{cat}.png'``, shown, then closed.

    Args:
        cat: Category name, used in the title and the output file name.
        best_k: K value shown in the title.
        tuning_row: Mapping with keys ``'NDCG@10'``, ``'Recall@10'``,
            ``'MAP@10'`` and ``'RMSE'`` (validation scores).
        final_row: Mapping with keys ``'ndcg@10'``, ``'recall@10'``,
            ``'map@10'`` and ``'rmse'`` (test scores).
    """
    def _relative_change(val_score, test_score):
        # One guarded formula for every metric: a zero, negative or NaN
        # baseline (or a NaN test score) yields "N/A" instead of raising
        # ZeroDivisionError or printing "+nan%".
        if not val_score > 0 or np.isnan(test_score):
            return "N/A"
        return f"{(test_score/val_score - 1)*100:+.1f}%"

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    metrics_names = ['NDCG@10', 'Recall@10', 'MAP@10']
    val_scores = [tuning_row['NDCG@10'], tuning_row['Recall@10'], tuning_row['MAP@10']]
    test_scores = [final_row['ndcg@10'], final_row['recall@10'], final_row['map@10']]

    x = np.arange(len(metrics_names))
    width = 0.35

    axes[0].bar(x - width/2, val_scores, width, label='Validation', alpha=0.8, color='#3498DB')
    axes[0].bar(x + width/2, test_scores, width, label='Test', alpha=0.8, color='#2ECC71')
    axes[0].set_xlabel('Metrics', fontsize=11)
    axes[0].set_ylabel('Score', fontsize=11)
    axes[0].set_title(f'Validation vs Test (Content-Based) - {cat} (K={best_k})',
                    fontsize=12, fontweight='bold')
    axes[0].set_xticks(x)
    axes[0].set_xticklabels(metrics_names)
    axes[0].legend(fontsize=10)
    axes[0].grid(axis='y', alpha=0.3)

    # Value labels on top of each bar.
    for i, (v, t) in enumerate(zip(val_scores, test_scores)):
        axes[0].text(i - width/2, v, f'{v:.4f}', ha='center', va='bottom', fontsize=9)
        axes[0].text(i + width/2, t, f'{t:.4f}', ha='center', va='bottom', fontsize=9)

    # Table: the three ranking metrics followed by RMSE as the last row.
    axes[1].axis('off')
    metrics_names_full = metrics_names + ['RMSE']
    val_scores_full = val_scores + [tuning_row['RMSE']]
    test_scores_full = test_scores + [final_row['rmse']]
    improvements_full = [_relative_change(v, t)
                         for v, t in zip(val_scores_full, test_scores_full)]

    table_data = [
        ['Metric', 'Validation', 'Test', 'Change'],
        *[[name, f"{v:.4f}", f"{t:.4f}", imp]
          for name, v, t, imp in zip(metrics_names_full, val_scores_full,
                                     test_scores_full, improvements_full)]
    ]

    table = axes[1].table(cellText=table_data, cellLoc='center', loc='center',
                         colWidths=[0.25, 0.25, 0.25, 0.25])
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 2.5)

    # Header row styling.
    for j in range(4):
        table[(0, j)].set_facecolor('#34495E')
        table[(0, j)].set_text_props(weight='bold', color='white')

    # RMSE (last row) improves when it goes down; the other rows when they go up.
    rmse_row = len(table_data) - 1
    for i in range(1, len(table_data)):
        change_val = table_data[i][3]
        improved_sign = '-' if i == rmse_row else '+'
        color = '#D5F4E6' if change_val.startswith(improved_sign) else '#FADBD8'
        table[(i, 3)].set_facecolor(color)

    plt.tight_layout()
    out_path = MODELS_DIR / 'content' / f'val_vs_test_{cat}.png'
    plt.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# Load workflow results
# When the Phase 1 cell was not run in this session, rebuild the minimal
# per-category state from the persisted best-K configuration.
if 'workflow_results' not in locals():
    workflow_results = {}
    for cat in CATEGORY:
        best_k = Configurations.load_best_k_content(cat)
        workflow_results[cat] = {'best_k': best_k, 'tuned_now': False}
    logger.log_info("Loaded best K from configuration\n")

n_eval_final = Configurations.get_eval_samples_final()
logger.log_info(f"Test users per category: {n_eval_final}\n")

# Run test evaluation
for cat in CATEGORY:
    logger.log_info(f"\n{'='*70}\nTESTING: {cat}\n{'='*70}\n")

    model_dir = MODELS_DIR / "content" / cat
    best_k = workflow_results[cat]['best_k']
    logger.log_info(f"Using K: {best_k}")

    final_artifacts = load_content_artifacts(model_dir)
    final_artifacts['top_k_similar'] = best_k

    logger.log_info("Evaluating on test set...\n")
    results = evaluate_content_based(cat, final_artifacts, k_values=[10, 20, 50],
                                    split="test", sample_users=n_eval_final)

    # A falsy result leaves the category without 'test_results'; the later
    # sections skip such categories.
    if results:
        workflow_results[cat]['test_results'] = results
        logger.log_info(f"\nTest Results (K={best_k}):")
        logger.log_info(f"  NDCG@10: {results['ndcg@10']:.4f}, "
                       f"Recall@10: {results['recall@10']:.4f}, "
                       f"MAP@10: {results['map@10']:.4f}")
        logger.log_info(f"  RMSE: {results['rmse']:.4f}, "
                       f"Accuracy: {results['accuracy']:.4f}\n")

# Save final results
logger.log_info("\n" + "="*70)
logger.log_info("SAVING FINAL RESULTS")
logger.log_info("="*70 + "\n")

test_results_list = [workflow_results[cat]['test_results']
                     for cat in CATEGORY
                     if 'test_results' in workflow_results[cat]]

if test_results_list:
    df_final_results = pl.DataFrame(test_results_list)

    logger.log_info("Final Test Results:")
    display(df_final_results)

    out_csv = MODELS_DIR / 'content' / 'final_test_results.csv'
    df_final_results.write_csv(out_csv)
    logger.log_info(f"\nSaved: {out_csv}")

    logger.log_info("Generating final evaluation plot...")
    visualize_final_results(
        test_results_list,
        save_dir=MODELS_DIR / 'content',
        algo_name='Content-Based',
        k_values=[10, 20, 50]
    )
    logger.log_info(f"Saved: evaluation_results.png\n")

# Post-analysis visualization
logger.log_info("\n" + "="*70)
logger.log_info("POST-ANALYSIS VISUALIZATION")
logger.log_info("="*70 + "\n")

for cat in CATEGORY:
    tuning_csv = MODELS_DIR / 'content' / f'tuning_{cat}.csv'

    # Both a tuning table and test results are needed. df_final_results is
    # only read for categories that have 'test_results', in which case it was
    # built above.
    if not tuning_csv.exists() or 'test_results' not in workflow_results[cat]:
        continue

    logger.log_info(f"Generating Val vs Test comparison for {cat}...")

    df_tuning = pl.read_csv(tuning_csv)
    best_k = workflow_results[cat]['best_k']

    # The cached tuning CSV may not contain the configured K (for example a
    # table from an older K grid). Skip this category rather than letting
    # .row(0) raise and abort the remaining categories and the summary.
    tuning_match = df_tuning.filter(pl.col('K') == best_k)
    if tuning_match.height == 0:
        logger.log_warning(f"  No tuning row for K={best_k} in {tuning_csv}; skipping comparison\n")
        continue

    tuning_row = tuning_match.row(0, named=True)
    final_row = df_final_results.filter(pl.col('category') == cat).row(0, named=True)

    visualize_val_test_comparison(
        cat=cat,
        param_val=best_k,
        tuning_row=tuning_row,
        final_row=final_row,
        save_dir=MODELS_DIR / 'content',
        param_name='K',
        algo_name='Content-Based'
    )
    logger.log_info(f"  Saved: val_vs_test_{cat}.png\n")

# Summary
logger.log_info("\n" + "="*70)
logger.log_info("COMPLETE WORKFLOW SUMMARY (CONTENT-BASED)")
logger.log_info("="*70)

for cat in CATEGORY:
    logger.log_info(f"\n{cat}: K={workflow_results[cat]['best_k']}")
    if 'test_results' in workflow_results[cat]:
        test = workflow_results[cat]['test_results']
        logger.log_info(f"  NDCG@10: {test['ndcg@10']:.4f}, "
                       f"Recall@10: {test['recall@10']:.4f}")

logger.log_info("\n" + "="*70)
logger.log_info("ALL PHASES COMPLETE")
logger.log_info("="*70 + "\n")

#### Debug info

In [None]:
def check_content_quality(category: str):
    """Log description statistics for a category's metadata and, when a model
    directory exists, statistics of the saved content-based artifacts.

    Metadata part: item count, share of non-blank descriptions, and
    character / word statistics over those descriptions.

    Model part (only when ``MODELS_DIR / 'content' / category`` exists):
    vocabulary size, TF-IDF matrix shape if the artifacts carry a
    ``'tfidf_matrix'`` entry, and shape / stored-entry count / sparsity /
    value range of the item-similarity matrix, followed by the first 20
    feature names.

    Args:
        category: Category name passed to ``load_metadata`` and used to locate
            the model directory.
    """
    def _as_text(desc):
        # Normalize one description cell to a string.
        # NOTE(review): load_metadata is not visible here; the notebook header
        # says raw descriptions are lists, so both plain strings and lists of
        # strings are accepted instead of assuming `.strip()` exists.
        if desc is None:
            return ""
        if isinstance(desc, (list, tuple)):
            return " ".join(str(part) for part in desc if part)
        return str(desc)

    logger.log_info(f"\n{'='*70}")
    logger.log_info(f"CONTENT QUALITY CHECK: {category}")
    logger.log_info(f"{'='*70}\n")

    # Load metadata
    df_meta = load_metadata(category)
    n_items = len(df_meta)

    logger.log_info("Metadata Statistics:")
    logger.log_info("-" * 70)
    logger.log_info(f"  Total items: {n_items:,}")

    # Analyze descriptions
    texts = (_as_text(d) for d in df_meta['description'].to_list())
    valid_descs = [t for t in texts if t.strip()]

    # Guard the percentage against an empty metadata table.
    valid_pct = len(valid_descs) / n_items * 100 if n_items else 0.0
    logger.log_info(f"  Valid descriptions: {len(valid_descs):,} ({valid_pct:.1f}%)")

    if valid_descs:
        lengths = [len(d) for d in valid_descs]
        word_counts = [len(d.split()) for d in valid_descs]

        logger.log_info(f"\nText Statistics:")
        logger.log_info(f"  Avg length: {np.mean(lengths):.1f} chars")
        logger.log_info(f"  Avg words:  {np.mean(word_counts):.1f}")
        logger.log_info(f"  Median words: {np.median(word_counts):.0f}")

    # Load model
    model_dir = MODELS_DIR / 'content' / category
    if model_dir.exists():
        logger.log_info(f"\n{'-' * 70}")
        logger.log_info("Model Statistics:")

        artifacts = load_content_artifacts(model_dir)
        vectorizer = artifacts['vectorizer']
        item_similarity = artifacts['item_similarity']

        # Look the matrix up in the loaded artifacts. The previous
        # `'tfidf_matrix' in locals()` test could never be true inside this
        # function, so the shape was always reported as N/A.
        tfidf_shape = artifacts['tfidf_matrix'].shape if 'tfidf_matrix' in artifacts else 'N/A'

        logger.log_info(f"  Vocabulary size: {len(vectorizer.vocabulary_):,}")
        logger.log_info(f"  TF-IDF matrix: {tfidf_shape}")
        logger.log_info(f"  Similarity matrix: {item_similarity.shape}")
        logger.log_info(f"  Similarity nnz: {item_similarity.nnz:,}")

        # Sparsity is undefined for a 0-cell matrix; min/max are undefined
        # when nothing is stored. Report N/A instead of raising.
        n_cells = item_similarity.shape[0] * item_similarity.shape[1]
        if n_cells:
            logger.log_info(f"  Similarity sparsity: {(1 - item_similarity.nnz/n_cells):.2%}")
        else:
            logger.log_info("  Similarity sparsity: N/A")

        sim_values = item_similarity.data
        if sim_values.size:
            logger.log_info(f"  Similarity range: [{sim_values.min():.4f}, {sim_values.max():.4f}]")
        else:
            logger.log_info("  Similarity range: N/A")

        # Top features
        # NOTE(review): these are the first 20 names returned by
        # get_feature_names_out(), i.e. in feature-index order; they are not
        # ranked by TF-IDF weight despite the heading.
        logger.log_info(f"\nTop 20 TF-IDF Features:")
        feature_names = vectorizer.get_feature_names_out()
        for i, feat in enumerate(feature_names[:20], 1):
            logger.log_info(f"  {i}. {feat}")

    logger.log_info(f"\n{'='*70}\n")

check_content_quality(CATEGORY[0])

### Task: Unit test

#### UI Recommendation Test

In [15]:
def recommend_content_ui(user_id: str, n_recs: int = 5, models_dir: str | Path | None = None, category: str | None = None) -> pl.DataFrame:
    """Return content-based recommendations for one user from saved artifacts.

    Loads the artifacts for the category, sets ``top_k_similar`` to the K
    stored in the configuration, and delegates to ``recommend_content_based``.

    Args:
        user_id: User to recommend for.
        n_recs: Number of recommendations requested.
        models_dir: Artifact directory; when falsy,
            ``MODELS_DIR / "content" / <category>`` is used.
        category: Category name; when falsy, ``CATEGORY[0]`` is used.

    Returns:
        Whatever ``recommend_content_based`` returns for these arguments.
    """
    cat = category if category else CATEGORY[0]

    if models_dir:
        model_dir = Path(models_dir)
    else:
        model_dir = MODELS_DIR / "content" / cat

    artifacts = load_content_artifacts(model_dir)
    artifacts['top_k_similar'] = Configurations.load_best_k_content(cat)
    return recommend_content_based(user_id, n_recs, artifacts)

def unit_test_ui_content_recommend(user_id: str, n_recs: int = 5, models_dir: str | Path | None = None, category: str | None = None):
    """Smoke-test ``recommend_content_ui`` for one user.

    Calls the recommender, asserts that the result has the ``parent_asin``
    and ``score`` columns and at most ``n_recs`` rows, displays it and
    returns it.

    Args:
        user_id: User to recommend for.
        n_recs: Upper bound on the number of rows expected.
        models_dir: Artifact directory; when falsy,
            ``MODELS_DIR / "content" / <category>`` is used.
        category: Category name; when falsy, ``CATEGORY[0]`` is used.

    Returns:
        The recommendation frame returned by ``recommend_content_ui``.

    Raises:
        AssertionError: If a required column is missing or too many rows
            come back.
    """
    cat = category or CATEGORY[0]
    md = models_dir or (MODELS_DIR / "content" / cat)
    logger.log_info(f"[UnitTest-UI-CONTENT] model_dir={md} | user_id={user_id} | n_recs={n_recs}")

    recs = recommend_content_ui(user_id=user_id, n_recs=n_recs, models_dir=md, category=cat)

    required = {"parent_asin", "score"}
    assert required <= set(recs.columns), "recs missing required columns"
    n_returned = len(recs)
    assert n_returned <= n_recs, f"recs length should be ≤ {n_recs}"

    logger.log_info(f"[UnitTest-UI-CONTENT] returned {n_returned} items ✅")
    display(recs)
    return recs

#### Test All Categories

In [None]:
def test_all_categories():
    """Smoke-test the recommendation function for every category.

    For each entry of ``CATEGORY``: load the saved artifacts, take the user
    stored under index 0 of ``user_rev``, request ``N_RECS`` recommendations
    through ``recommend_content_ui`` and check the result's columns and
    length. Any exception for a category is logged and recorded as a FAIL
    without stopping the remaining categories. A summary table and the pass
    count are displayed / logged at the end.

    An empty recommendation frame satisfies the checks and is recorded as
    PASS with no score range.
    """
    def _summary_row(category, status, reason=None, n_recs=None,
                     score_min=None, score_max=None):
        # Every row carries the same keys so the summary frame has one schema
        # regardless of the mix of PASS and FAIL rows.
        return {
            'category': category,
            'status': status,
            'reason': reason,
            'n_recs': n_recs,
            'score_min': score_min,
            'score_max': score_max,
        }

    logger.log_info("\n" + "="*70)
    logger.log_info("[UNIT TEST] Testing Recommendation Function (Content-Based)")
    logger.log_info("="*70 + "\n")

    test_summary = []

    for cat in CATEGORY:
        logger.log_info(f"\n[Test] {cat}")
        logger.log_info("-"*70)

        try:
            model_dir = MODELS_DIR / "content" / cat

            if not model_dir.exists():
                logger.log_warning(f"  ✗ Model not found")
                test_summary.append(_summary_row(cat, 'FAIL', reason='Model not found'))
                continue

            artifacts = load_content_artifacts(model_dir)
            best_k = Configurations.load_best_k_content(cat)
            artifacts['top_k_similar'] = best_k

            user_rev = artifacts['user_rev']
            item_rev = artifacts['item_rev']

            logger.log_info(f"  Model loaded: {len(user_rev):,} users, {len(item_rev):,} items")
            logger.log_info(f"  Using top_k_similar: {best_k}")

            if len(user_rev) == 0:
                logger.log_warning(f"  ✗ No users")
                test_summary.append(_summary_row(cat, 'FAIL', reason='No users'))
                continue

            sample_user = user_rev[0]
            logger.log_info(f"  Testing user: {sample_user}")

            recs = recommend_content_ui(sample_user, n_recs=N_RECS, category=cat)

            assert set(recs.columns) >= {"parent_asin", "score"}, "Missing columns"
            assert len(recs) <= N_RECS, f"Too many recs: {len(recs)}"

            logger.log_info(f"  ✓ Generated {len(recs)} recommendations")

            # min()/max() are None for an empty frame. Formatting or float()
            # on None used to raise TypeError and turn a valid empty result
            # into a FAIL with a cryptic reason.
            score_min = recs['score'].min()
            score_max = recs['score'].max()
            if score_min is None or score_max is None:
                logger.log_warning("  Score range: N/A (no scores returned)")
                score_min = score_max = None
            else:
                logger.log_info(f"  Score range: [{score_min:.4f}, {score_max:.4f}]")
                score_min, score_max = float(score_min), float(score_max)

            test_summary.append(_summary_row(
                cat, 'PASS',
                n_recs=len(recs),
                score_min=score_min,
                score_max=score_max,
            ))

            display(recs.head(5))

        except Exception as e:
            # Deliberately broad: one broken category must not stop the rest.
            logger.log_exception(f"  ✗ Error: {e}")
            test_summary.append(_summary_row(cat, 'FAIL', reason=str(e)))

    # Summary
    logger.log_info("\n" + "="*70)
    logger.log_info("UNIT TEST SUMMARY")
    logger.log_info("="*70)

    df_summary = pl.DataFrame(test_summary)
    display(df_summary)

    passed = sum(1 for r in test_summary if r['status'] == 'PASS')
    logger.log_info(f"\nResults: {passed}/{len(test_summary)} passed")

    logger.log_info("="*70 + "\n")

test_all_categories()