## 1. Setup and Data Loading

In [None]:
# Import libraries
# Standard-library modules first; the warning filter is installed before the
# heavier third-party imports below.
import os
import sys
import warnings
# NOTE(review): this suppresses every Python warning for the rest of the kernel
# session, so deprecation / numerical warnings raised by later cells are not shown.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import joblib
from pathlib import Path

# ML libraries
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

# Add src to path
# The relative path is resolved against the kernel's working directory —
# presumably the notebook's own folder; confirm when running from elsewhere.
sys.path.append('../src')
# NOTE(review): star import — which names `utils` exports, and whether any of them
# shadow the imports above, cannot be determined from this notebook.
from utils import *

# Plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("âœ“ Libraries imported successfully")

In [None]:
# Read the pre-split feature and target CSVs from the processed-data folder.
# Targets are flattened to 1-D arrays.
data_dir = Path('../data/processed')
X_train = pd.read_csv(data_dir / 'X_train.csv')
X_test = pd.read_csv(data_dir / 'X_test.csv')
y_train = pd.read_csv(data_dir / 'y_train.csv').to_numpy().ravel()
y_test = pd.read_csv(data_dir / 'y_test.csv').to_numpy().ravel()

# Stack train and test (train rows first) so the data can be re-split later
X_full = pd.concat((X_train, X_test), axis=0, ignore_index=True)
y_full = np.concatenate((y_train, y_test))

target_counts = pd.Series(y_full).value_counts().sort_index()
print(f"Full dataset shape: {X_full.shape}")
print(f"Features: {X_full.columns.tolist()}")
print(f"\nTarget distribution:\n{target_counts}")

# Integer label -> human-readable class name
class_names = dict(enumerate([
    'Offer Received',
    'Offer Viewed',
    'Transaction',
    'Offer Completed',
]))

print(f"\nâœ“ Loaded {X_full.shape[0]} total samples")

## 2. Temporal Data Analysis

In [None]:
# Figure output directory, created up front rather than inside the branch below:
# later plotting cells save into `figures_dir` whether or not this plot is drawn,
# so defining it only when 'reg_month' exists would leave them with a NameError.
figures_dir = Path('../results/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# Analyze reg_month distribution
if 'reg_month' in X_full.columns:
    reg_month_stats = X_full['reg_month'].describe()
    print("\nðŸ“Š Registration Month Statistics:")
    print(reg_month_stats)
    
    # Visualize distribution: histogram (left) and empirical CDF (right)
    fig, axes = plt.subplots(1, 2, figsize=(16, 5))
    
    # Histogram, with vertical markers at the 60th and 80th percentiles of reg_month
    axes[0].hist(X_full['reg_month'], bins=50, edgecolor='black', alpha=0.7)
    axes[0].axvline(X_full['reg_month'].quantile(0.6), color='red', linestyle='--', linewidth=2, label='60% (Train cutoff)')
    axes[0].axvline(X_full['reg_month'].quantile(0.8), color='orange', linestyle='--', linewidth=2, label='80% (Val cutoff)')
    axes[0].set_xlabel('Registration Month (Normalized)', fontsize=12)
    axes[0].set_ylabel('Frequency', fontsize=12)
    axes[0].set_title('Registration Month Distribution', fontsize=14, fontweight='bold')
    axes[0].legend()
    axes[0].grid(alpha=0.3)
    
    # Cumulative distribution: sorted values against their rank expressed in percent
    sorted_months = np.sort(X_full['reg_month'])
    cumulative = np.arange(1, len(sorted_months) + 1) / len(sorted_months) * 100
    axes[1].plot(sorted_months, cumulative, linewidth=2)
    axes[1].axhline(60, color='red', linestyle='--', linewidth=2, label='60% Train')
    axes[1].axhline(80, color='orange', linestyle='--', linewidth=2, label='80% Val')
    axes[1].set_xlabel('Registration Month (Normalized)', fontsize=12)
    axes[1].set_ylabel('Cumulative %', fontsize=12)
    axes[1].set_title('Cumulative Distribution', fontsize=14, fontweight='bold')
    axes[1].legend()
    axes[1].grid(alpha=0.3)
    
    plt.tight_layout()
    
    # Save into the directory prepared at the top of this cell
    plt.savefig(figures_dir / 'temporal_reg_month_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print("\nâœ“ Saved to results/figures/temporal_reg_month_distribution.png")
else:
    print("\nâš  Warning: 'reg_month' column not found in data")

## 3. Create Temporal Splits

In [None]:
# Create temporal splits based on reg_month percentiles.
# Produces X_/y_{train,val,test}_temporal. When 'reg_month' is missing, falls back
# to the existing train set plus the two halves of the existing test set.
if 'reg_month' in X_full.columns:
    reg_month = X_full['reg_month']

    # Calculate split points (60% train, 20% val, 20% test)
    train_cutoff = reg_month.quantile(0.6)
    val_cutoff = reg_month.quantile(0.8)
    
    print(f"\nðŸ“Š TEMPORAL SPLIT STRATEGY")
    print(f"{'='*80}")
    print(f"Train cutoff (60%): reg_month <= {train_cutoff:.4f}")
    print(f"Val cutoff (80%): {train_cutoff:.4f} < reg_month <= {val_cutoff:.4f}")
    print(f"Test: reg_month > {val_cutoff:.4f}")
    
    # Create splits (mutually exclusive boolean masks over X_full's rows)
    train_mask = reg_month <= train_cutoff
    val_mask = (reg_month > train_cutoff) & (reg_month <= val_cutoff)
    test_mask = reg_month > val_cutoff

    # NaN compares False against every cutoff, so rows with a missing reg_month
    # would silently land in no split at all; surface that instead of hiding it.
    n_unassigned = int(reg_month.isna().sum())
    if n_unassigned:
        print(f"\nWarning: {n_unassigned:,} rows have a missing 'reg_month' "
              f"and are excluded from every split")
    
    # Split data (features get a fresh 0-based index; targets are positional arrays)
    X_train_temporal = X_full[train_mask].reset_index(drop=True)
    y_train_temporal = y_full[train_mask.to_numpy()]
    
    X_val_temporal = X_full[val_mask].reset_index(drop=True)
    y_val_temporal = y_full[val_mask.to_numpy()]
    
    X_test_temporal = X_full[test_mask].reset_index(drop=True)
    y_test_temporal = y_full[test_mask.to_numpy()]

    # With few distinct reg_month values the two quantile cutoffs can coincide (or
    # equal the maximum), leaving a split empty. Fail here with a clear message
    # rather than later with NaN percentages or an opaque estimator error.
    split_sizes = {
        'Training': len(X_train_temporal),
        'Validation': len(X_val_temporal),
        'Test': len(X_test_temporal),
    }
    empty_splits = [name for name, size in split_sizes.items() if size == 0]
    if empty_splits:
        raise ValueError(
            f"Temporal split produced empty split(s): {', '.join(empty_splits)}. "
            f"'reg_month' cutoffs were {train_cutoff:.4f} / {val_cutoff:.4f}; "
            f"the column may have too few distinct values for a 60/20/20 split."
        )
    
    print(f"\nðŸ“Š Split Sizes:")
    print(f"Training: {len(X_train_temporal):,} samples ({len(X_train_temporal)/len(X_full)*100:.1f}%)")
    print(f"Validation: {len(X_val_temporal):,} samples ({len(X_val_temporal)/len(X_full)*100:.1f}%)")
    print(f"Test: {len(X_test_temporal):,} samples ({len(X_test_temporal)/len(X_full)*100:.1f}%)")
    
    # Check class distributions: share of each class within each split
    print(f"\nðŸ“Š Class Distribution Comparison:")
    print(f"{'='*80}")
    
    for class_id, class_name in class_names.items():
        train_pct = (y_train_temporal == class_id).sum() / len(y_train_temporal) * 100
        val_pct = (y_val_temporal == class_id).sum() / len(y_val_temporal) * 100
        test_pct = (y_test_temporal == class_id).sum() / len(y_test_temporal) * 100
        
        print(f"{class_name:20s}: Train={train_pct:5.2f}%, Val={val_pct:5.2f}%, Test={test_pct:5.2f}%")
    
    print("\nâœ“ Temporal splits created successfully")
else:
    print("\nâš  Cannot create temporal splits without 'reg_month' column")
    # Fallback: keep the original train set, halve the original test set into
    # validation / test. One shared midpoint keeps features and targets aligned,
    # and the index is reset to match the main branch.
    half = len(X_test) // 2
    X_train_temporal = X_train
    y_train_temporal = y_train
    X_val_temporal = X_test.iloc[:half].reset_index(drop=True)
    y_val_temporal = y_test[:half]
    X_test_temporal = X_test.iloc[half:].reset_index(drop=True)
    y_test_temporal = y_test[half:]

## 4. Train Models on Temporal Splits

In [None]:
# Fit an XGBoost classifier on the temporal training split; the validation
# split is passed as the evaluation set.
print("\n" + "=" * 80)
print("ðŸŽ¯ Training XGBoost on Temporal Split")
print("=" * 80)

xgb_params = {
    'n_estimators': 200,
    'max_depth': 7,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
}
xgb_temporal = xgb.XGBClassifier(**xgb_params)

print("Training XGBoost...")
validation_set = [(X_val_temporal, y_val_temporal)]
xgb_temporal.fit(X_train_temporal, y_train_temporal, eval_set=validation_set, verbose=False)
print("âœ“ XGBoost training complete")

In [None]:
# Fit a Random Forest classifier on the temporal training split
print("\n" + "=" * 80)
print("ðŸŽ¯ Training Random Forest on Temporal Split")
print("=" * 80)

rf_params = {
    'n_estimators': 150,
    'max_depth': 15,
    'min_samples_split': 10,
    'min_samples_leaf': 5,
    'class_weight': 'balanced_subsample',
    'random_state': 42,
    'n_jobs': -1,
}
rf_temporal = RandomForestClassifier(**rf_params)

print("Training Random Forest...")
rf_temporal.fit(X_train_temporal, y_train_temporal)
print("âœ“ Random Forest training complete")

## 5. Evaluate Temporal Performance

In [None]:
# Score both temporal models on every split, collecting one record per
# (model, split) pair and echoing the metrics as they are computed.
models = {
    'XGBoost Temporal': xgb_temporal,
    'Random Forest Temporal': rf_temporal
}

splits = {
    'Train': (X_train_temporal, y_train_temporal),
    'Validation': (X_val_temporal, y_val_temporal),
    'Test (Future)': (X_test_temporal, y_test_temporal)
}

# (result column, label used in the printed report)
metric_labels = [
    ('Accuracy', 'Accuracy'),
    ('F1_Micro', 'F1 (Micro)'),
    ('F1_Macro', 'F1 (Macro)'),
    ('F1_Weighted', 'F1 (Weighted)'),
]

temporal_results = []

for model_name, model in models.items():
    print("\n" + "=" * 80)
    print(f"ðŸ“Š {model_name} - Temporal Evaluation")
    print("=" * 80)

    for split_name, (X_split, y_split) in splits.items():
        y_pred = model.predict(X_split)
        n_samples = len(X_split)

        scores = {
            'Accuracy': accuracy_score(y_split, y_pred),
            'F1_Micro': f1_score(y_split, y_pred, average='micro'),
            'F1_Macro': f1_score(y_split, y_pred, average='macro'),
            'F1_Weighted': f1_score(y_split, y_pred, average='weighted'),
        }

        temporal_results.append({
            'Model': model_name,
            'Split': split_name,
            'Samples': n_samples,
            **scores,
        })

        print(f"\n{split_name} ({n_samples:,} samples):")
        for column, label in metric_labels:
            print(f"  {label}: {scores[column]:.4f}")

# Tabulate the collected records
temporal_results_df = pd.DataFrame(temporal_results)
print("\n" + "=" * 80)
print("ðŸ“Š TEMPORAL VALIDATION RESULTS")
print("=" * 80)
display(temporal_results_df.round(4))

# Persist the table (the metrics folder is created on demand)
output_dir = Path('../results/metrics')
output_dir.mkdir(parents=True, exist_ok=True)
temporal_results_df.to_csv(output_dir / 'temporal_validation_results.csv', index=False)
print("\nâœ“ Saved to results/metrics/temporal_validation_results.csv")

In [None]:
# Side-by-side bar charts: one panel per metric, one bar group per split,
# one bar per model.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

metrics_to_plot = ['F1_Weighted', 'F1_Macro']
split_order = ['Train', 'Validation', 'Test (Future)']

for panel, metric in zip(axes, metrics_to_plot):
    # Rows = splits (in chronological order), columns = models
    pivot_data = (
        temporal_results_df
        .pivot(index='Split', columns='Model', values=metric)
        .reindex(split_order)
    )

    pivot_data.plot.bar(ax=panel, width=0.7, edgecolor='black')
    panel.set_title(f'{metric} Across Temporal Splits', fontsize=14, fontweight='bold')
    panel.set_xlabel('Data Split', fontsize=12)
    panel.set_ylabel(metric, fontsize=12)
    panel.legend(title='Model', loc='best')
    panel.grid(axis='y', alpha=0.3)
    panel.set_ylim(0, 1.0)
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(figures_dir / 'temporal_performance_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("âœ“ Saved to results/figures/temporal_performance_comparison.png")

## 6. Analyze Performance Degradation

In [None]:
# Quantify, per model and metric, how far the score falls between the training
# split and the future test split (absolute and relative to the train score).
print("\n" + "=" * 80)
print("ðŸ“‰ PERFORMANCE DEGRADATION ANALYSIS")
print("=" * 80)

degradation_results = []
tracked_metrics = ('Accuracy', 'F1_Micro', 'F1_Macro', 'F1_Weighted')

for model_name in temporal_results_df['Model'].unique():
    model_data = temporal_results_df.loc[temporal_results_df['Model'] == model_name]

    train_row = model_data.loc[model_data['Split'] == 'Train'].iloc[0]
    test_row = model_data.loc[model_data['Split'] == 'Test (Future)'].iloc[0]

    print(f"\n{model_name}:")

    for metric in tracked_metrics:
        train_val, test_val = train_row[metric], test_row[metric]
        degradation = train_val - test_val

        # Relative drop as a percentage of the train score; reported as 0 when
        # the train score is not positive (avoids dividing by zero).
        if train_val > 0:
            degradation_pct = degradation / train_val * 100
        else:
            degradation_pct = 0

        print(f"  {metric:15s}: Train={train_val:.4f}, Test={test_val:.4f}, Î”={degradation:.4f} ({degradation_pct:.2f}%)")

        record = {
            'Model': model_name,
            'Metric': metric,
            'Train': train_val,
            'Test': test_val,
            'Degradation': degradation,
            'Degradation_%': degradation_pct,
        }
        degradation_results.append(record)

# Persist the per-metric degradation table
degradation_df = pd.DataFrame(degradation_results)
degradation_df.to_csv(output_dir / 'temporal_degradation_analysis.csv', index=False)
print("\nâœ“ Saved to results/metrics/temporal_degradation_analysis.csv")

In [None]:
# Visualize degradation: paired Train / Test bars per model for weighted F1,
# annotated with the signed relative change from train to test.
fig, ax = plt.subplots(figsize=(12, 6))

# Filter for F1_Weighted metric
f1_degradation = degradation_df[degradation_df['Metric'] == 'F1_Weighted']

x = np.arange(len(f1_degradation))
width = 0.35

bars1 = ax.bar(x - width/2, f1_degradation['Train'], width, label='Train', color='green', edgecolor='black')
bars2 = ax.bar(x + width/2, f1_degradation['Test'], width, label='Test (Future)', color='red', edgecolor='black')

ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('F1-Score (Weighted)', fontsize=12)
ax.set_title('Train vs Test Performance (Temporal Validation)', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(f1_degradation['Model'], rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)
ax.set_ylim(0, 1.0)

# Add the train -> test change above each bar pair.
# 'Degradation_%' is positive when the test score is lower than the train score,
# so it is negated to show a signed change. The previous label forced a leading
# "-" via abs(), which mislabelled an improvement on the test split as a drop.
for position, (_, row) in enumerate(f1_degradation.iterrows()):
    change_pct = -row['Degradation_%']
    ax.text(position, max(row['Train'], row['Test']) + 0.02,
            f"{change_pct:+.1f}%",
            ha='center', fontsize=10,
            color='green' if change_pct > 0 else 'red',
            fontweight='bold')

plt.tight_layout()
plt.savefig(figures_dir / 'temporal_degradation_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

print("âœ“ Saved to results/figures/temporal_degradation_visualization.png")

## 7. Compare with Original Models

In [None]:
# Load original models for comparison.
# Loading is best-effort: a model that cannot be loaded is reported and skipped,
# and `original_models` then simply contains fewer entries (possibly none).
models_dir = Path('../models')
original_models = {}

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
model_files = [
    ('XGBoost', 'xgboost_model.pkl'),
    ('Random Forest', 'random_forest_model.pkl'),
]

for label, filename in model_files:
    try:
        original_models[f'{label} Original'] = joblib.load(models_dir / filename)
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate, and report the reason
        # so a missing file can be told apart from a version/unpickling problem.
        print(f"âœ— Could not load original {label} ({type(exc).__name__}: {exc})")
    else:
        print(f"âœ“ Loaded original {label}")

In [None]:
# Score the temporal models and any loaded original models on the same
# future test split, then tabulate, save and plot the results.
# NOTE(review): when the reg_month split is used, X_test_temporal is carved out of
# X_full (original train + test rows combined). If the original models were fit on
# X_train, some of these rows may have been seen during their training — confirm
# before reading this as a like-for-like comparison.
if original_models:
    print("\n" + "=" * 80)
    print("ðŸ“Š ORIGINAL vs TEMPORAL MODELS ON FUTURE DATA")
    print("=" * 80)

    # (display name, fitted model, group) — temporal models first, then originals
    candidates = [
        ('XGBoost Temporal', xgb_temporal, 'Temporal'),
        ('Random Forest Temporal', rf_temporal, 'Temporal'),
    ]
    candidates += [(name, fitted, 'Original') for name, fitted in original_models.items()]

    comparison_results = []
    for model_name, model, model_kind in candidates:
        y_pred = model.predict(X_test_temporal)
        comparison_results.append({
            'Model': model_name,
            'Type': model_kind,
            'Accuracy': accuracy_score(y_test_temporal, y_pred),
            'F1_Weighted': f1_score(y_test_temporal, y_pred, average='weighted'),
            'F1_Macro': f1_score(y_test_temporal, y_pred, average='macro'),
        })

    comparison_df = pd.DataFrame(comparison_results)
    display(comparison_df.round(4))

    # Persist the comparison table
    comparison_df.to_csv(output_dir / 'temporal_vs_original_comparison.csv', index=False)
    print("\nâœ“ Saved to results/metrics/temporal_vs_original_comparison.csv")

    # Grouped bar chart: three metric bars per model
    fig, ax = plt.subplots(figsize=(12, 6))

    x = np.arange(len(comparison_df))
    width = 0.25

    # (horizontal offset, result column, legend label) — drawn left to right
    bar_specs = [
        (-width, 'Accuracy', 'Accuracy'),
        (0, 'F1_Weighted', 'F1 Weighted'),
        (width, 'F1_Macro', 'F1 Macro'),
    ]
    for offset, column, legend_label in bar_specs:
        ax.bar(x + offset, comparison_df[column], width, label=legend_label, edgecolor='black')

    tick_labels = [
        f"{name}\n({kind})"
        for name, kind in zip(comparison_df['Model'], comparison_df['Type'])
    ]

    ax.set_xlabel('Model', fontsize=12)
    ax.set_ylabel('Score', fontsize=12)
    ax.set_title('Original vs Temporal Models on Future Test Data', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(tick_labels, rotation=45, ha='right', fontsize=9)
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    ax.set_ylim(0, 1.0)

    plt.tight_layout()
    plt.savefig(figures_dir / 'original_vs_temporal_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("âœ“ Saved to results/figures/original_vs_temporal_comparison.png")

## 8. Summary & Recommendations

In [None]:
# Generate summary report: assemble the text line by line, print it, and write
# it to results/metrics/temporal_validation_summary.txt.
summary_lines = [
    "\n" + "="*80,
    "ðŸ“Š TEMPORAL VALIDATION SUMMARY",
    "="*80,
    "",
    "1. DATA SPLITS:",
    f"   Training: {len(X_train_temporal):,} samples (early customers)",
    f"   Validation: {len(X_val_temporal):,} samples (mid-period customers)",
    f"   Test: {len(X_test_temporal):,} samples (recent/future customers)",
    "",
    "2. TEMPORAL MODEL PERFORMANCE:",
]

temporal_model_names = ['XGBoost Temporal', 'Random Forest Temporal']

# Section 2: test-split scores of each temporal model
for model_name in temporal_model_names:
    model_results = temporal_results_df[temporal_results_df['Model'] == model_name]
    test_row = model_results[model_results['Split'] == 'Test (Future)'].iloc[0]
    
    summary_lines.append(f"\n   {model_name}:")
    summary_lines.append(f"     Accuracy: {test_row['Accuracy']:.4f}")
    summary_lines.append(f"     F1 (Weighted): {test_row['F1_Weighted']:.4f}")
    summary_lines.append(f"     F1 (Macro): {test_row['F1_Macro']:.4f}")

summary_lines.extend([
    "",
    "3. PERFORMANCE DEGRADATION:",
])

# Section 3: relative weighted-F1 drop from train to test
for model_name in temporal_model_names:
    model_deg = degradation_df[
        (degradation_df['Model'] == model_name) & 
        (degradation_df['Metric'] == 'F1_Weighted')
    ].iloc[0]
    
    summary_lines.append(
        f"   {model_name}: {model_deg['Degradation_%']:.2f}% drop from train to test"
    )

# Section 4 is only written when at least one original model was loaded, because
# `comparison_df` is only created in that case.
if original_models:
    summary_lines.extend([
        "",
        "4. ORIGINAL vs TEMPORAL COMPARISON:",
    ])
    
    for model_type in ['XGBoost', 'Random Forest']:
        temporal_name = f"{model_type} Temporal"
        original_name = f"{model_type} Original"
        
        if temporal_name in comparison_df['Model'].values and original_name in comparison_df['Model'].values:
            temporal_f1 = comparison_df[comparison_df['Model'] == temporal_name]['F1_Weighted'].iloc[0]
            original_f1 = comparison_df[comparison_df['Model'] == original_name]['F1_Weighted'].iloc[0]
            diff = temporal_f1 - original_f1
            
            summary_lines.append(
                f"   {model_type}: Temporal F1={temporal_f1:.4f}, Original F1={original_f1:.4f}, "
                f"Î”={diff:+.4f}"
            )

# Section 5: key findings
key_findings = [
    "   - Models trained on temporal splits show realistic future performance",
    f"   - Performance degradation ranges from {degradation_df['Degradation_%'].min():.2f}% to {degradation_df['Degradation_%'].max():.2f}%",
]

# The temporal-vs-original verdict needs `comparison_df`, which does not exist
# when no original model could be loaded. Previously this line was built
# unconditionally and raised NameError in that case.
if original_models:
    mean_f1_by_type = comparison_df.groupby('Type')['F1_Weighted'].mean()
    if mean_f1_by_type['Temporal'] > mean_f1_by_type['Original']:
        verdict = 'Temporal models perform better'
    else:
        verdict = 'Original models perform better'
    key_findings.append(f"   - {verdict} on future data")

summary_lines.extend(["", "5. KEY FINDINGS:"])
summary_lines.extend(key_findings)

summary_lines.extend([
    "",
    "6. RECOMMENDATIONS:",
    "   âœ“ Use temporal validation for more realistic performance estimates",
    "   âœ“ Retrain models periodically as new data becomes available",
    "   âœ“ Monitor performance degradation in production",
    "   âœ“ Consider online learning or incremental updates",
    "   âœ“ Implement model retraining triggers based on performance thresholds",
    "",
    "="*80,
    "âœ“ TEMPORAL VALIDATION COMPLETE",
    "="*80
])

summary_text = "\n".join(summary_lines)
print(summary_text)

# Save summary. The text contains non-ASCII symbols, so the encoding is fixed to
# UTF-8 rather than left to the platform default, which can fail to encode them.
with open(output_dir / 'temporal_validation_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary_text)

print("\nâœ“ Saved summary to results/metrics/temporal_validation_summary.txt")

## Conclusion

This notebook performed temporal validation to assess model performance on future data:

✅ **Temporal Splits**: Created time-based train/validation/test splits using reg_month

✅ **Model Training**: Trained XGBoost and Random Forest on temporal data

✅ **Performance Analysis**: Evaluated degradation from training to future test data

✅ **Comparison**: Compared temporal models with original random-split models

**Key Insight**: Temporal validation provides more realistic estimates of production performance compared to random splits.

**Next Steps**:
1. Implement continuous monitoring in production
2. Set up automated retraining pipelines
3. Use A/B testing to validate improvements