## Trending-Based Recommendation (Baseline + Cold-Start Handler)
**Goal**
- Build trending-based recommender for cold-start scenarios
- Compute time-aware popularity scores with recency boost
- Evaluate as baseline and cold-start fallback
- Save artifacts for hybrid integration and API

**What this notebook does**
- 1. Load 5-core TRAIN from PROCESSED_DIR
- 2. Compute trending scores: log(count) * avg_rating * recency_weight
- 3. Build sparse matrix for consistency with other models
- 4. Generate Top-N trending recommendations
- 5. Evaluate on TEST/VALID sets
- 6. Save artifacts for API integration

### Task: Import libraries, define constants

In [None]:
import os, sys, json, pickle
import numpy as np, polars as pl
from pathlib import Path
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import time
from datetime import datetime, timedelta

module_path = os.path.abspath(os.path.join('..', '../utilities'))
sys.path.append(module_path)

from logger import Logger
from configurations import Configurations
from visualization_helpers import visualize_final_results

# Shared logger for this notebook; log destination comes from project configuration.
m_log_file = Configurations.LOG_PATH
logger = Logger(process_name="trending", log_file=m_log_file)

# Input (processed parquet splits) and output (model artifacts) roots.
PROCESSED_DIR = Path(Configurations.DATA_PROCESSED_PATH)
MODELS_DIR = Path(Configurations.MODELS_PATH)
# Iterated as `for cat in CATEGORY` and indexed as CATEGORY[0] by later cells,
# so despite the singular name it holds a sequence of category names.
CATEGORY = Configurations.CATEGORIES

# Run banner: records which categories / sample size this execution used.
logger.log_info("="*70)
logger.log_info("TRENDING-BASED RECOMMENDATION")
logger.log_info("="*70)
logger.log_info(f"Categories: {CATEGORY}")
logger.log_info(f"Sample size: {Configurations.DEV_SAMPLE_SIZE}")
logger.log_info("="*70 + "\n")

# Number of recommendations requested by the unit-test cell.
N_RECS = 10
# Width (in days) of the "recent" window, measured back from the newest
# interaction in TRAIN; passed to compute_trending_scores by the pipeline.
RECENCY_DAYS = 90  # Consider last 90 days as "recent"

### Task: Load data

In [2]:
def _candidate_files(category: str, split: str = "train"):
    """Return the parquet path of one 5-core split for ``category``.

    The file name depends on ``Configurations.DEV_SAMPLE_SIZE``:
    ``'full'`` selects ``<cat>.5core.<split>.parquet``; any key of
    ``Configurations.SAMPLE_SIZES`` selects
    ``<cat>.5core.<split>.<size>.parquet``.

    Args:
        category: Category name; ``/`` is replaced by ``-`` for the file name.
        split: Split label embedded in the file name (e.g. ``"train"``).

    Returns:
        Path under ``PROCESSED_DIR``. The file is not checked for existence.

    Raises:
        ValueError: If ``DEV_SAMPLE_SIZE`` is neither ``'full'`` nor a known
            sample size. Previously this case silently returned ``None`` and
            failed later inside the parquet reader.
    """
    dev_sample_size = Configurations.DEV_SAMPLE_SIZE
    safe_cat = category.replace('/', '-')

    if dev_sample_size == 'full':
        return PROCESSED_DIR / f"{safe_cat}.5core.{split}.parquet"

    if dev_sample_size in Configurations.SAMPLE_SIZES.keys():
        return PROCESSED_DIR / f"{safe_cat}.5core.{split}.{dev_sample_size}.parquet"

    raise ValueError(
        f"Unknown DEV_SAMPLE_SIZE {dev_sample_size!r}; expected 'full' or one of "
        f"{list(Configurations.SAMPLE_SIZES.keys())}"
    )

def load_5core_data(category: str, split: str = "train") -> pl.DataFrame:
    """Read one 5-core split of ``category`` from parquet and log its size.

    The path is resolved by ``_candidate_files``; the shape and the number of
    distinct ``user_id`` / ``parent_asin`` values are logged before returning.
    """
    source = _candidate_files(category, split)
    logger.log_info(f"[Load-{split.upper()}] {source}")

    frame = pl.read_parquet(source, low_memory=False)

    user_count = frame['user_id'].n_unique()
    item_count = frame['parent_asin'].n_unique()
    logger.log_info(
        f"[Load-{split.upper()}] shape={frame.shape} | "
        f"users={user_count} | "
        f"items={item_count}"
    )
    return frame

### Task: Build Trending Scores

In [3]:
def _infer_epoch_unit(max_ts) -> str:
    """Guess the epoch unit ('s', 'ms', 'us', 'ns') from a timestamp's magnitude.

    Each threshold is 1000x the previous one; 1e11 seconds is far in the
    future while 1e11 milliseconds is 1973, so present-day data falls cleanly
    into one band. Anything that is not a finite number keeps the historical
    default of 'ms'.
    """
    if isinstance(max_ts, bool) or not isinstance(max_ts, (int, float)):
        return 'ms'
    magnitude = abs(max_ts)
    if magnitude != magnitude:  # NaN compares unequal to itself
        return 'ms'
    if magnitude < 1e11:
        return 's'
    if magnitude < 1e14:
        return 'ms'
    if magnitude < 1e17:
        return 'us'
    return 'ns'


def compute_trending_scores(df_train: pl.DataFrame, recency_days: int = 90,
                            recency_boost: float = 0.5):
    """
    Compute per-item trending scores with a recency boost.

    Score = log(count) * avg_rating * recency_weight
    recency_weight = 1.0 + recency_boost * (recent_count / total_count)

    An interaction is "recent" when it falls within ``recency_days`` of the
    newest interaction in ``df_train``. If there is no ``timestamp`` column,
    or the converted dates look implausible, no interaction is marked recent
    and the score reduces to log(count) * avg_rating.

    Args:
        df_train: Interactions with ``parent_asin`` and ``rating`` columns and
            optionally an epoch ``timestamp`` column.
        recency_days: Size of the recent window in days.
        recency_boost: Maximum extra weight for an item whose interactions are
            all recent (0.5 gives weights in [1.0, 1.5]).

    Returns:
        One row per ``parent_asin`` with ``n_ratings``, ``avg_rating``,
        ``n_recent``, ``recency_weight`` and ``trending_score``, sorted by
        ``trending_score`` descending.
    """
    logger.log_info("[Trending] Computing scores...")

    # Handle timestamp conversion
    if 'timestamp' in df_train.columns:
        logger.log_info("[Trending] Checking timestamp format...")

        sample_ts = df_train['timestamp'].head(5).to_list()
        logger.log_info(f"[Trending] Sample timestamps: {sample_ts}")

        # Detect the unit instead of assuming milliseconds: seconds read as
        # ms collapse the whole history into a few days of 1970, which passes
        # the year check below and marks nearly everything as recent.
        time_unit = _infer_epoch_unit(df_train['timestamp'].max())
        logger.log_info(f"[Trending] Detected time unit: {time_unit}")

        # Convert with the detected unit
        df_train = df_train.with_columns([
            pl.from_epoch(pl.col('timestamp'), time_unit=time_unit).alias('datetime')
        ])

        # Verify conversion
        sample_dt = df_train['datetime'].head(3).to_list()
        logger.log_info(f"[Trending] Converted sample: {sample_dt}")

        # Get date range
        max_date = df_train['datetime'].max()
        min_date = df_train['datetime'].min()

        logger.log_info(f"[Trending] Date range: {min_date} to {max_date}")

        # Sanity check: years should be 1970-2030. max_date is None for an
        # empty/all-null column, which maps to year 0 and fails the check.
        max_year = max_date.year if hasattr(max_date, 'year') else 0

        if max_year < 1970 or max_year > 2030:
            logger.log_warning(f"[Trending] Suspicious year: {max_year}, using popularity only")
            df_train = df_train.with_columns([pl.lit(False).alias('is_recent')])
        else:
            # Calculate cutoff relative to the newest interaction, not "now",
            # so results are reproducible for a fixed dataset.
            cutoff_date = max_date - timedelta(days=recency_days)
            logger.log_info(f"[Trending] Recent cutoff: {cutoff_date} ({recency_days} days)")

            # Mark recent
            df_train = df_train.with_columns([
                (pl.col('datetime') >= cutoff_date).alias('is_recent')
            ])

            n_recent = df_train['is_recent'].sum()
            logger.log_info(f"[Trending] Recent interactions: {n_recent:,} "
                           f"({n_recent/len(df_train)*100:.1f}%)")

    else:
        logger.log_warning("[Trending] No timestamp, using simple popularity")
        df_train = df_train.with_columns([pl.lit(False).alias('is_recent')])

    # Compute statistics per item
    item_stats = df_train.group_by('parent_asin').agg([
        pl.count('rating').alias('n_ratings'),
        pl.mean('rating').alias('avg_rating'),
        pl.sum('is_recent').cast(pl.Int32).alias('n_recent')
    ])

    # Calculate recency weight
    item_stats = item_stats.with_columns([
        (1.0 + recency_boost * (pl.col('n_recent') / pl.col('n_ratings'))).alias('recency_weight')
    ])

    # Calculate trending score; note log(1) == 0, so single-rating items score 0
    item_stats = item_stats.with_columns([
        (pl.col('n_ratings').log() * pl.col('avg_rating') * pl.col('recency_weight')).alias('trending_score')
    ]).sort('trending_score', descending=True)

    logger.log_info(f"[Trending] Computed for {len(item_stats):,} items")
    if len(item_stats) > 0:
        logger.log_info(f"[Trending] Top item: {item_stats['parent_asin'][0]}, "
                       f"score: {item_stats['trending_score'][0]:.2f}")
        logger.log_info(f"[Trending] Avg recency weight: {item_stats['recency_weight'].mean():.2f}")
    else:
        # Indexing [0] / formatting a None mean would raise on an empty frame.
        logger.log_warning("[Trending] No items to score")

    return item_stats

### Task: Build Sparse Matrix & Artifacts

In [4]:
def build_trending_model(df_train: pl.DataFrame, item_stats: pl.DataFrame):
    """Build the user x item rating matrix and its id <-> index lookups.

    Columns follow the row order of ``item_stats`` (i.e. trending order);
    rows follow the order returned by ``unique()`` on ``user_id``.

    Returns:
        ``(R, user_idx, item_idx, user_rev, item_rev)`` where ``R`` is a
        float32 CSR matrix, ``*_idx`` map id -> position and ``*_rev`` are
        object arrays mapping position -> id.
    """
    logger.log_info("[Trending] Building sparse matrix...")

    # Axis labels: items in trending order, users in unique() order
    item_ids = item_stats['parent_asin'].to_list()
    user_ids = df_train['user_id'].unique().to_list()

    user_idx = dict(zip(user_ids, range(len(user_ids))))
    item_idx = dict(zip(item_ids, range(len(item_ids))))

    # One (row, col, value) triple per interaction
    row_pos = np.asarray(
        list(map(user_idx.__getitem__, df_train['user_id'].to_list())),
        dtype=np.int32,
    )
    col_pos = np.asarray(
        list(map(item_idx.__getitem__, df_train['parent_asin'].to_list())),
        dtype=np.int32,
    )
    ratings = np.asarray(df_train['rating'].to_list(), dtype=np.float32)

    nU = len(user_ids)
    nI = len(item_ids)
    R = csr_matrix((ratings, (row_pos, col_pos)), shape=(nU, nI), dtype=np.float32)

    logger.log_info(f"[Trending] Matrix: {R.shape}, nnz={R.nnz:,}, "
                   f"sparsity={(1 - R.nnz/(nU*nI)):.2%}")

    # Reverse lookups (position -> original id)
    item_rev = np.array(item_ids, dtype=object)
    user_rev = np.array(user_ids, dtype=object)

    return R, user_idx, item_idx, user_rev, item_rev

### Task: Recommendation

In [5]:
def recommend_trending(user_id: str, n_recs: int, artifacts: dict) -> pl.DataFrame:
    """
    Return the top ``n_recs`` trending items the user has not rated yet.

    Args:
        user_id: User to recommend for. Users absent from
            ``artifacts['user_idx']`` (cold start) get the unfiltered list.
        n_recs: Maximum number of rows to return; values <= 0 yield an empty
            frame.
        artifacts: Dict with ``item_stats`` (rows in trending order), ``R``
            (user x item CSR matrix) and ``user_idx``. ``item_rev`` or
            ``item_idx`` is used, when present, to translate matrix columns
            back to item ids.

    Returns:
        DataFrame with columns ``parent_asin`` and ``score`` in trending order.
    """
    item_stats = artifacts['item_stats']
    R = artifacts['R']
    user_idx = artifacts['user_idx']

    # Zero-row frame that inherits the column dtypes of item_stats.
    empty = item_stats.head(0).select([
        pl.col('parent_asin'),
        pl.col('trending_score').alias('score')
    ])

    if n_recs <= 0:
        return empty

    # Get user's rated items. R stores integer COLUMN positions, while the
    # trending list holds item ids, so positions must be translated before the
    # membership test (comparing ids to raw positions never matches).
    rated_items = set()
    if user_id in user_idx:
        u = int(user_idx[user_id])
        rated_cols = R.getrow(u).indices.tolist()

        item_rev = artifacts.get('item_rev')
        if item_rev is not None:
            rated_items = {item_rev[c] for c in rated_cols}
        else:
            # Fall back to inverting item_idx; with neither lookup available
            # nothing can be excluded.
            col_to_item = {int(c): a for a, c in (artifacts.get('item_idx') or {}).items()}
            rated_items = {col_to_item[c] for c in rated_cols if c in col_to_item}

    # Get trending items
    trending_items = item_stats['parent_asin'].to_list()
    trending_scores = item_stats['trending_score'].to_list()

    # Walk the trending list in order, skipping rated items, until full
    recommendations = []
    for item, score in zip(trending_items, trending_scores):
        if item in rated_items:
            continue
        recommendations.append((item, score))
        if len(recommendations) >= n_recs:
            break

    if not recommendations:
        return empty

    rec_items, rec_scores = zip(*recommendations)
    return pl.DataFrame({
        "parent_asin": list(rec_items),
        "score": list(rec_scores)
    })


### Task: Save/Load Artifacts

In [6]:
def save_trending_artifacts(out_dir: Path, item_stats, R, user_rev, item_rev,
                           user_idx, item_idx):
    """Write all trending-model artifacts into ``out_dir`` (created if missing).

    Files: ``item_stats.parquet``, ``R.npz``, ``user_rev.pkl``,
    ``item_rev.pkl``, ``user_idx.json``, ``item_idx.json``. The JSON lookups
    are stored with string keys and integer values.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    # Tabular stats and the sparse matrix use their native formats
    item_stats.write_parquet(out_dir / "item_stats.parquet")
    save_npz(out_dir / "R.npz", R)

    # Reverse lookups are pickled as-is
    for file_name, payload in (("user_rev.pkl", user_rev), ("item_rev.pkl", item_rev)):
        with open(out_dir / file_name, "wb") as handle:
            pickle.dump(payload, handle)

    # Forward lookups go to JSON; keys/values are coerced to str/int
    for file_name, lookup in (("user_idx.json", user_idx), ("item_idx.json", item_idx)):
        as_json = json.dumps({str(key): int(pos) for key, pos in lookup.items()})
        (out_dir / file_name).write_text(as_json)

    logger.log_info(f"[Saved-Trending] {out_dir}")


def load_trending_artifacts(model_dir: Path):
    """Load the files written by ``save_trending_artifacts`` into a dict.

    Returns:
        Dict with keys ``item_stats``, ``R``, ``user_rev``, ``item_rev``,
        ``user_idx`` and ``item_idx``. The ``*_idx`` lookups come from JSON,
        so their keys are strings.
    """
    artifacts = {
        'item_stats': pl.read_parquet(model_dir / "item_stats.parquet"),
        'R': load_npz(model_dir / "R.npz"),
    }

    # Pickled reverse lookups
    for key, file_name in (('user_rev', "user_rev.pkl"), ('item_rev', "item_rev.pkl")):
        with open(model_dir / file_name, "rb") as handle:
            artifacts[key] = pickle.load(handle)

    # JSON forward lookups
    for key, file_name in (('user_idx', "user_idx.json"), ('item_idx', "item_idx.json")):
        artifacts[key] = json.loads((model_dir / file_name).read_text())

    return artifacts

### Task: Evaluation

In [7]:
def compute_rmse_accuracy(predictions: np.ndarray, actuals: np.ndarray, threshold: float = 3.5):
    """Return ``(rmse, accuracy)`` over the positions where ``actuals`` is not NaN.

    Accuracy is the share of positions where prediction and actual fall on the
    same side of ``threshold`` (``>= threshold`` counts as positive). When no
    actual value is available the result is ``(nan, nan)``.
    """
    observed = ~np.isnan(actuals)
    if not observed.any():
        return np.nan, np.nan

    y_pred = predictions[observed]
    y_true = actuals[observed]

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    predicted_positive = y_pred >= threshold
    actual_positive = y_true >= threshold
    accuracy = np.mean(predicted_positive == actual_positive)

    return rmse, accuracy

def recall_at_k(recommended: list, relevant: set, k: int):
    """Fraction of ``relevant`` items found among the first ``k`` recommendations.

    Returns 0.0 when ``relevant`` is empty. Duplicate recommendations count once.
    """
    if not len(relevant):
        return 0.0
    distinct_top = set(recommended[:k])
    hits = sum(1 for candidate in distinct_top if candidate in relevant)
    return hits / len(relevant)

def ndcg_at_k(recommended: list, relevant: set, k: int):
    """Binary-relevance NDCG over the first ``k`` recommendations.

    Each hit at 0-based rank ``r`` contributes ``1 / log2(r + 2)``; the ideal
    ranking places ``min(len(relevant), k)`` hits first. Returns 0.0 when the
    ideal gain is zero.
    """
    gain = 0.0
    for rank, candidate in enumerate(recommended[:k]):
        if candidate in relevant:
            gain += 1.0 / np.log2(rank + 2)

    ideal = 0.0
    for rank in range(min(len(relevant), k)):
        ideal += 1.0 / np.log2(rank + 2)

    if ideal > 0:
        return gain / ideal
    return 0.0

def map_at_k(recommended: list, relevant: set, k: int):
    """Average precision over the first ``k`` recommendations.

    Precision is accumulated at every rank that holds a relevant item and the
    sum is normalised by ``min(len(relevant), k)``. Returns 0.0 when
    ``relevant`` is empty.
    """
    top_k = recommended[:k]
    if not len(relevant):
        return 0.0

    hit_count = 0.0
    precision_sum = 0.0
    for rank, candidate in enumerate(top_k, start=1):
        if candidate not in relevant:
            continue
        hit_count += 1.0
        precision_sum += hit_count / rank

    return precision_sum / min(len(relevant), k)


In [8]:
def evaluate_trending(category: str, artifacts: dict, k_values: list = [10, 20, 50],
                     split: str = "test", sample_users: int = 1000):
    """Evaluate the trending recommender on one held-out split.

    For each sampled user that also appears in TRAIN, the trending list minus
    the items the user rated in TRAIN is compared against the items the user
    rated >= 4 in the evaluation split.

    Args:
        category: Category whose split is loaded via ``load_5core_data``.
        artifacts: Dict with ``item_stats``, ``R``, ``user_idx``, ``item_idx``.
        k_values: Cut-offs for recall/NDCG/MAP. Only read, never mutated.
        split: Split name to evaluate on.
        sample_users: Maximum number of users evaluated (sampled with a fixed
            seed when more are available).

    Returns:
        Dict with ``category``, ``split``, ``n_users``, NaN ``rmse`` /
        ``accuracy`` placeholders and one ``<metric>@<k>`` mean per cut-off,
        or ``None`` when no evaluation user exists in TRAIN.
    """
    from itertools import islice

    logger.log_info(f"[Eval-Trending] {category} on {split.upper()}")

    df_eval = load_5core_data(category, split=split)
    item_stats = artifacts['item_stats']
    R = artifacts['R']
    user_idx = artifacts['user_idx']
    item_idx = artifacts['item_idx']

    # Filter to train users
    train_users = list(user_idx.keys())
    df_eval = df_eval.filter(pl.col('user_id').is_in(train_users))

    if len(df_eval) == 0:
        logger.log_warning("[Eval-Trending] No data after filtering")
        return None

    logger.log_info(f"[Eval-Trending] {len(df_eval):,} ratings, "
                   f"{df_eval['user_id'].n_unique():,} users")

    # Sample users
    eval_users = df_eval['user_id'].unique().to_list()
    if len(eval_users) > sample_users:
        np.random.seed(42)
        eval_users = np.random.choice(eval_users, sample_users, replace=False).tolist()

    logger.log_info(f"[Eval-Trending] Evaluating {len(eval_users)} users...")

    # Get trending order
    trending_order = item_stats['parent_asin'].to_list()

    # R stores integer column positions; translate them back to item ids so
    # the "already rated" filter compares like with like.
    col_to_item = {int(col): asin for asin, col in item_idx.items()}

    # Metrics only inspect the first k entries, so max(k) items suffice.
    max_k = max(k_values) if k_values else 0

    # Collect every user's relevant items in one pass instead of filtering
    # the whole frame once per user.
    wanted = set(eval_users)
    relevant_by_user = {}
    liked = df_eval.filter(pl.col('rating') >= 4).select(['user_id', 'parent_asin'])
    for uid, asin in liked.iter_rows():
        if uid in wanted:
            relevant_by_user.setdefault(uid, set()).add(asin)

    # Initialize metrics
    metrics_acc = {
        **{f'recall@{k}': [] for k in k_values},
        **{f'ndcg@{k}': [] for k in k_values},
        **{f'map@{k}': [] for k in k_values}
    }

    evaluated = 0

    for user_id in eval_users:
        if user_id not in user_idx:
            continue

        # Get relevant items (rated >= 4 in the evaluation split)
        relevant = relevant_by_user.get(user_id)
        if not relevant:
            continue

        # Get user's rated items from TRAIN, as item ids
        u = int(user_idx[user_id])
        rated = {col_to_item[c] for c in R.getrow(u).indices.tolist() if c in col_to_item}

        # Recommend trending items (exclude rated), truncated to max(k)
        unseen = (item for item in trending_order if item not in rated)
        recommended = list(islice(unseen, max_k))

        evaluated += 1

        # Calculate metrics
        for k in k_values:
            metrics_acc[f'recall@{k}'].append(recall_at_k(recommended, relevant, k))
            metrics_acc[f'ndcg@{k}'].append(ndcg_at_k(recommended, relevant, k))
            metrics_acc[f'map@{k}'].append(map_at_k(recommended, relevant, k))

    logger.log_info(f"[Eval-Trending] Actually evaluated: {evaluated} users")

    # Aggregate
    results = {
        'category': category,
        'split': split,
        'n_users': evaluated,
        'rmse': np.nan,  # Trending doesn't predict ratings
        'accuracy': np.nan
    }

    for k in k_values:
        for metric in ['recall', 'ndcg', 'map']:
            key = f'{metric}@{k}'
            results[key] = np.mean(metrics_acc[key]) if metrics_acc[key] else 0.0

    # Report @10 when it was computed, otherwise the first requested cut-off;
    # hard-coding @10 raised KeyError for k_values without 10.
    if k_values:
        k_report = 10 if 10 in k_values else k_values[0]
        ndcg_value = results[f'ndcg@{k_report}']
        recall_value = results[f'recall@{k_report}']
        logger.log_info(f"[Eval-Trending] NDCG@{k_report}={ndcg_value:.4f}, "
                       f"Recall@{k_report}={recall_value:.4f}")

    return results


### Task: Main Pipeline

In [9]:
def train_trending_model(category: str):
    """Run the full trending pipeline for one category and persist the result.

    Steps: load TRAIN, compute trending scores, build the sparse matrix, save
    artifacts under ``MODELS_DIR / "trending" / category``. Does nothing
    (besides logging) when ``R.npz`` already exists in that directory.
    """
    logger.log_info(f"\n{'='*70}\nCATEGORY: {category}\n{'='*70}\n")

    target_dir = MODELS_DIR / "trending" / category

    # A saved matrix is treated as proof of a finished earlier run
    already_trained = (target_dir / "R.npz").exists()
    if already_trained:
        logger.log_info(f"[Skip] Model exists for {category}\n")
        return

    logger.log_info("[STEP 1] Loading data...")
    train_frame = load_5core_data(category, split="train")

    logger.log_info("\n[STEP 2] Computing trending scores...")
    scores = compute_trending_scores(train_frame, recency_days=RECENCY_DAYS)

    logger.log_info("\n[STEP 3] Building sparse matrix...")
    matrix_parts = build_trending_model(train_frame, scores)
    R, user_idx, item_idx, user_rev, item_rev = matrix_parts

    logger.log_info("\n[STEP 4] Saving artifacts...")
    save_trending_artifacts(
        out_dir=target_dir,
        item_stats=scores,
        R=R,
        user_rev=user_rev,
        item_rev=item_rev,
        user_idx=user_idx,
        item_idx=item_idx,
    )

    logger.log_info(f"\n Model saved to: {target_dir}\n")

### Training all categories

In [None]:
# Train (or skip, if artifacts already exist) the trending model for every
# configured category.
logger.log_info("\n" + "="*70)
logger.log_info("TRAINING ALL CATEGORIES")
logger.log_info("="*70 + "\n")

for cat in CATEGORY:
    try:
        train_trending_model(cat)
    except Exception as e:
        # Best-effort batch: a failure in one category is logged and the loop
        # moves on to the next one.
        logger.log_exception(f"[Error] {cat}: {e}")

logger.log_info("\n" + "="*70)
logger.log_info("TRAINING COMPLETE")
logger.log_info("="*70 + "\n")

### Task: Evaluate all categories on the TEST set

In [None]:
# Evaluate every trained category on the TEST split, then persist and plot
# the collected metrics.
logger.log_info("\n" + "="*70)
logger.log_info("EVALUATION ON TEST SET")
logger.log_info("="*70 + "\n")

test_results = []

for cat in CATEGORY:
    logger.log_info(f"\n[Eval] {cat}")
    logger.log_info("-"*70)

    try:
        model_dir = MODELS_DIR / "trending" / cat

        if not model_dir.exists():
            logger.log_warning("  Model not found")
            continue

        # Load artifacts
        artifacts = load_trending_artifacts(model_dir)

        # Evaluate
        results = evaluate_trending(
            cat, artifacts,
            k_values=[10, 20, 50],
            split="test",
            sample_users=Configurations.get_eval_samples_final()
        )

        if results:
            test_results.append(results)
            # The key must match what evaluate_trending produces for k=50; the
            # previous split literal resolved to 'ndc0' and raised KeyError.
            logger.log_info(f"  NDCG@50: {results['ndcg@50']:.4f}, "
                          f"Recall@50: {results['recall@50']:.4f}\n")

    except Exception as e:
        # Best-effort batch: log and continue with the next category.
        logger.log_exception(f"  Error: {e}")

# Save results
if test_results:
    df_results = pl.DataFrame(test_results)

    logger.log_info("\n" + "="*70)
    logger.log_info("FINAL RESULTS")
    logger.log_info("="*70)
    display(df_results)

    out_csv = MODELS_DIR / 'trending' / 'final_test_results.csv'
    df_results.write_csv(out_csv)
    logger.log_info(f"\n Saved: {out_csv}")

    # Visualize
    visualize_final_results(
        test_results,
        save_dir=MODELS_DIR / 'trending',
        algo_name='Trending-Based',
        k_values=[10, 20, 50]
    )
    logger.log_info(" Saved: evaluation_results.png\n")

### Task: Unit test

In [12]:
def recommend_trending_ui(user_id: str, n_recs: int = 5,
                         models_dir: Path = None, category: str = None) -> pl.DataFrame:
    """Load a saved trending model from disk and recommend for ``user_id``.

    ``models_dir``, when given, is used directly as the artifact directory;
    otherwise artifacts are read from ``MODELS_DIR / "trending" / <category>``,
    with the first configured category as the default.
    """
    # Resolved unconditionally, even when models_dir overrides the location
    if category:
        chosen = category
    else:
        chosen = CATEGORY[0]

    if models_dir:
        source_dir = models_dir
    else:
        source_dir = MODELS_DIR / "trending" / chosen

    loaded = load_trending_artifacts(source_dir)
    return recommend_trending(user_id, n_recs, loaded)

In [None]:
def test_all_categories():
    """Smoke-test ``recommend_trending_ui`` once per configured category.

    For each category the saved artifacts are loaded, recommendations are
    requested for the first stored user, and the result is checked for the
    expected columns and size. Outcomes are collected into a PASS/FAIL
    summary that is displayed and logged; nothing is returned and failures
    do not raise.
    """
    logger.log_info("\n" + "="*70)
    logger.log_info("[UNIT TEST] Testing Recommendation Function")
    logger.log_info("="*70 + "\n")

    test_summary = []

    for cat in CATEGORY:
        logger.log_info(f"\n[Test] {cat}")
        logger.log_info("-"*70)

        try:
            model_dir = MODELS_DIR / "trending" / cat

            if not model_dir.exists():
                logger.log_warning("  Model not found")
                test_summary.append({'category': cat, 'status': 'FAIL', 'reason': 'Not found'})
                continue

            # Load artifacts
            artifacts = load_trending_artifacts(model_dir)
            user_rev = artifacts['user_rev']
            item_stats = artifacts['item_stats']

            logger.log_info(f"  Loaded: {len(user_rev):,} users, {len(item_stats):,} items")

            if len(user_rev) == 0:
                logger.log_warning("  No users")
                test_summary.append({'category': cat, 'status': 'FAIL', 'reason': 'No users'})
                continue

            # Test with sample user
            sample_user = user_rev[0]
            logger.log_info(f"  Testing user: {sample_user}")

            recs = recommend_trending_ui(sample_user, n_recs=N_RECS, category=cat)

            assert set(recs.columns) >= {"parent_asin", "score"}, "Missing columns"
            assert len(recs) <= N_RECS, f"Too many recs: {len(recs)}"

            # min()/max() of an empty column are None and cannot be formatted
            # with :.2f, so report the empty case explicitly instead of
            # failing with an unrelated TypeError.
            if len(recs) == 0:
                logger.log_warning("  No recommendations generated")
                test_summary.append({'category': cat, 'status': 'FAIL',
                                     'reason': 'No recommendations'})
                continue

            score_min = recs['score'].min()
            score_max = recs['score'].max()

            logger.log_info(f"  Generated {len(recs)} recommendations")
            logger.log_info(f"  Score range: [{score_min:.2f}, {score_max:.2f}]")

            test_summary.append({
                'category': cat,
                'status': 'PASS',
                'n_recs': len(recs),
                'score_min': float(score_min),
                'score_max': float(score_max)
            })

            display(recs.head(5))

        except Exception as e:
            # Any failure (including the asserts above) becomes a FAIL row so
            # the remaining categories are still tested.
            logger.log_exception(f"  Error: {e}")
            test_summary.append({'category': cat, 'status': 'FAIL', 'reason': str(e)})

    # Summary
    logger.log_info("\n" + "="*70)
    logger.log_info("UNIT TEST SUMMARY")
    logger.log_info("="*70)

    df_summary = pl.DataFrame(test_summary)
    display(df_summary)

    passed = sum(1 for r in test_summary if r['status'] == 'PASS')
    logger.log_info(f"\nResults: {passed}/{len(test_summary)} passed")
    logger.log_info("="*70 + "\n")

test_all_categories()

### Analysis

In [None]:
def analyze_trending_distribution(category: str):
    """Log summary statistics of a saved trending model.

    Reports the score range/mean/median, the mean and max recency weight, and
    the ten highest-ranked items. Logs a warning and returns when the model
    directory for ``category`` does not exist.
    """
    logger.log_info(f"\n{'='*70}")
    logger.log_info(f"TRENDING ANALYSIS: {category}")
    logger.log_info(f"{'='*70}\n")

    model_dir = MODELS_DIR / "trending" / category
    if not model_dir.exists():
        logger.log_warning("Model not found")
        return

    item_stats = load_trending_artifacts(model_dir)['item_stats']
    scores = item_stats['trending_score']
    weights = item_stats['recency_weight']

    logger.log_info("Score Distribution:")
    logger.log_info(f"  Total items: {len(item_stats):,}")
    logger.log_info(f"  Score range: [{scores.min():.2f}, "
                   f"{scores.max():.2f}]")
    logger.log_info(f"  Mean score: {scores.mean():.2f}")
    logger.log_info(f"  Median score: {scores.median():.2f}")

    logger.log_info("\nRecency Impact:")
    logger.log_info(f"  Avg recency weight: {weights.mean():.2f}")
    logger.log_info(f"  Max recency weight: {weights.max():.2f}")

    # Stored rows are already in trending order, so the head is the top 10
    logger.log_info("\nTop 10 Trending Items:")
    leaders = item_stats.head(10).to_dicts()
    for rank, entry in enumerate(leaders, start=1):
        logger.log_info(f"  {rank}. {entry['parent_asin']}: score={entry['trending_score']:.2f}, "
                       f"ratings={entry['n_ratings']}, recent={entry['n_recent']}")

    logger.log_info(f"\n{'='*70}\n")

analyze_trending_distribution(CATEGORY[0])