# 05 - Counterfactual Validation

This notebook validates our causal models using:
1. **Temporal Validation**: Test predictions on campaigns that changed strategy
2. **Placebo Test**: Model should find no effect when there is none
3. **Manski Bounds**: Check if estimates are within theoretical bounds
4. **Temporal Cross-Validation**: Test stability of causal effects over time

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import pickle
import warnings

# Suppress every warning category for the rest of the session (keeps cell output
# compact, but also hides deprecation and numerical warnings from the libraries used below).
warnings.filterwarnings('ignore')
# Plot styling shared by all figures in this notebook: seaborn dark-grid theme
# and a 12x6 default figure size (individual cells may override figsize).
sns.set_theme(style='darkgrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

In [None]:
# Load data and models
# Campaign-level feature table; launch_date is parsed to datetime because the
# temporal cross-validation section splits on its year.
df = pd.read_csv('../data/processed/kickstarter_causal_features.csv')
df['launch_date'] = pd.to_datetime(df['launch_date'])

# Load trained models
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only load an artifact that was produced by this project's own training step.
with open('../src/models/causal_models.pkl', 'rb') as f:
    models = pickle.load(f)

# Fail fast with a clear message if the artifact lacks the entries that the
# validation cells below index directly (models['scaler'], models['ols']).
missing_models = [key for key in ('scaler', 'ols') if key not in models]
if missing_models:
    raise KeyError(f"causal_models.pkl is missing required entries: {missing_models}")

print(f"Loaded {len(df)} campaigns")
print(f"Models available: {list(models.keys())}")

In [None]:
# Prepare features
# The duration column may appear under either of two names (presumably depending
# on how the processed file was generated); use whichever is present.
if 'campaign_duration_days' in df.columns:
    duration_col = 'campaign_duration_days'
else:
    duration_col = 'duration_days'

# Ordered model inputs; every later cell selects these columns, in this order,
# before passing them to a scaler or model.
feature_cols = [
    'avg_reward_price',
    'goal_ambition',
    duration_col,
    'trend_index',
    'concurrent_campaigns',
]

# Store validation results
# Filled in section by section and consumed by the report / CSV export cells.
validation_results = dict()

---
## Validation 1: Temporal Validation

Identify campaigns with price variation (proxy for strategy experimentation) and validate predictions.

In [None]:
# Identify campaigns with high price spread (proxy for price experimentation)
# A zero average price is swapped for 1 so the ratio below never divides by zero.
safe_avg_price = df['avg_reward_price'].replace(0, 1)
df['price_variation'] = df['price_spread'] / safe_avg_price

# Select top 10 campaigns with most price variation as "strategy changers"
validation_campaigns = df.nlargest(n=10, columns='price_variation').copy()

variation_low = validation_campaigns['price_variation'].min()
variation_high = validation_campaigns['price_variation'].max()
print(f"Selected {len(validation_campaigns)} campaigns for temporal validation")
print(f"Price variation range: {variation_low:.2f} - {variation_high:.2f}")

In [None]:
def validate_campaign(campaign, model, scaler, feature_cols):
    """
    Score one campaign factually and under a min-price counterfactual.

    The counterfactual asks: "what would the model predict if the campaign's
    average reward price were replaced by its minimum reward price?"

    Args:
        campaign: Row-like object (e.g. a pandas Series) indexable by every
            name in ``feature_cols`` plus 'min_reward_price' and
            'funding_ratio'. Its ``.name`` is reported as the campaign id.
        model: Object exposing ``predict(X)``.
        scaler: Object exposing ``transform(X)``.
        feature_cols: Ordered list of feature names; must include
            'avg_reward_price'.

    Returns:
        dict with keys 'campaign_id', 'actual', 'predicted', 'error'
        (absolute factual error), 'cf_predicted', 'predicted_effect'
        (counterfactual minus factual prediction) and 'price_change'
        (min price minus average price).
    """
    # Factual feature row, shaped (1, n_features) for the scaler and model.
    factual_row = campaign[feature_cols].values.reshape(1, -1)

    # Counterfactual row: identical except the price slot holds the min price.
    min_price = campaign['min_reward_price']
    counterfactual_row = factual_row.copy()
    counterfactual_row[0, feature_cols.index('avg_reward_price')] = min_price

    # Transform both rows first, then predict both, factual before counterfactual.
    scaled_rows = [scaler.transform(r) for r in (factual_row, counterfactual_row)]
    factual_pred, counterfactual_pred = [model.predict(s)[0] for s in scaled_rows]

    observed = campaign['funding_ratio']
    summary = {}
    summary['campaign_id'] = campaign.name
    summary['actual'] = observed
    summary['predicted'] = factual_pred
    summary['error'] = abs(factual_pred - observed)
    summary['cf_predicted'] = counterfactual_pred
    summary['predicted_effect'] = counterfactual_pred - factual_pred
    summary['price_change'] = min_price - campaign['avg_reward_price']
    return summary

In [None]:
# Run validation
scaler = models['scaler']
ols_model = models['ols']

# One result dict per selected campaign, in the frame's row order.
temporal_results = [
    validate_campaign(row, ols_model, scaler, feature_cols)
    for _, row in validation_campaigns.iterrows()
]
temporal_df = pd.DataFrame(temporal_results)

# Summarise the absolute errors once; the same numbers are printed and stored.
abs_errors = temporal_df['error']
error_mean = abs_errors.mean()
error_median = abs_errors.median()
error_p90 = abs_errors.quantile(0.9)

print("\nTEMPORAL VALIDATION RESULTS")
print("=" * 50)
print(f"Mean Absolute Error: {error_mean:.4f}")
print(f"Median Absolute Error: {error_median:.4f}")
print(f"90th percentile error: {error_p90:.4f}")

validation_results['temporal'] = dict(
    mae=error_mean,
    median_ae=error_median,
    p90_error=error_p90,
)

In [None]:
# Visualization: Predicted vs Actual
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
scatter_ax, error_ax = axes[0], axes[1]

# Scatter plot
# Points on the dashed diagonal are campaigns the model predicted exactly.
actual_vals = temporal_df['actual']
predicted_vals = temporal_df['predicted']
scatter_ax.scatter(actual_vals, predicted_vals, s=100, c='steelblue', edgecolors='black')
max_val = max(actual_vals.max(), predicted_vals.max())
diagonal = [0, max_val]
scatter_ax.plot(diagonal, diagonal, 'r--', linewidth=2, label='Perfect Prediction')
scatter_ax.set(
    xlabel='Actual Funding Ratio',
    ylabel='Predicted Funding Ratio',
    title='Temporal Validation: Predicted vs Actual',
)
scatter_ax.legend()

# Error distribution
# One bar per validated campaign, with the mean error as a reference line.
campaign_errors = temporal_df['error']
mean_error = campaign_errors.mean()
error_ax.bar(range(len(temporal_df)), campaign_errors, color='coral')
error_ax.axhline(y=mean_error, color='red', linestyle='--', label=f'Mean: {mean_error:.3f}')
error_ax.set(
    xlabel='Campaign',
    ylabel='Absolute Error',
    title='Prediction Errors',
)
error_ax.legend()

plt.tight_layout()
plt.show()

---
## Validation 2: Placebo Test

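For campaigns that did NOT change strategy, the model should predict NO CHANGE. We approximate this by taking campaigns with the lowest price variation, perturbing each one's average reward price with ~1% random noise, and checking that the predicted funding ratio barely moves.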

In [None]:
def placebo_test(campaign, model, scaler, feature_cols):
    """
    Measure how much the prediction moves under a negligible price change.

    The campaign's average reward price is perturbed by Gaussian noise with a
    standard deviation of 1% of the price (a stand-in for "nothing really
    changed"); the absolute difference between the perturbed and the original
    prediction is the phantom effect. Noise is drawn from NumPy's global
    random state, so callers control reproducibility via ``np.random.seed``.

    Args:
        campaign: Row-like object (e.g. a pandas Series) indexable by every
            name in ``feature_cols``.
        model: Object exposing ``predict(X)``.
        scaler: Object exposing ``transform(X)``.
        feature_cols: Ordered list of feature names; must include
            'avg_reward_price'.

    Returns:
        Absolute change in the model's prediction caused by the perturbation.
    """
    # Baseline feature row, shaped (1, n_features).
    baseline_row = campaign[feature_cols].values.reshape(1, -1)
    price_slot = feature_cols.index('avg_reward_price')

    # Placebo row: same features with ~1% relative noise added to the price.
    jitter = np.random.normal(0, 0.01) * baseline_row[0, price_slot]
    perturbed_row = baseline_row.copy()
    perturbed_row[0, price_slot] += jitter

    # Transform both rows first, then predict both, baseline before placebo.
    scaled_rows = [scaler.transform(r) for r in (baseline_row, perturbed_row)]
    baseline_pred, perturbed_pred = [model.predict(s)[0] for s in scaled_rows]

    return abs(perturbed_pred - baseline_pred)

In [None]:
# Select 50 random campaigns with LOW price variation (didn't experiment)
low_variation_pool = df.nsmallest(n=100, columns='price_variation')
no_change_campaigns = low_variation_pool.sample(n=50, random_state=42)

# Seed NumPy's global RNG so the noise drawn inside placebo_test is repeatable.
np.random.seed(42)
placebo_effects = [
    placebo_test(row, ols_model, scaler, feature_cols)
    for _, row in no_change_campaigns.iterrows()
]

phantom_mean = np.mean(placebo_effects)
phantom_p95 = np.percentile(placebo_effects, 95)

print("\nPLACEBO TEST RESULTS")
print("=" * 50)
print(f"Mean phantom effect: {phantom_mean:.6f}")
print(f"Median phantom effect: {np.median(placebo_effects):.6f}")
print(f"95th percentile: {phantom_p95:.6f}")
print(f"Max phantom effect: {np.max(placebo_effects):.6f}")

# Check if placebo effects are small
threshold = 0.01
passed = phantom_mean < threshold
verdict = 'PASSED' if passed else 'FAILED'
relation = '<' if passed else '>='
print(f"\nTest {verdict}: Mean phantom effect {relation} {threshold}")

validation_results['placebo'] = dict(
    mean_phantom_effect=phantom_mean,
    p95_phantom_effect=phantom_p95,
    passed=passed,
)

In [None]:
# Visualize placebo effects
fig, ax = plt.subplots(figsize=(10, 5))

ax.hist(placebo_effects, bins=30, color='teal', edgecolor='black', alpha=0.7)

# Reference lines: mean first, then 95th percentile (this order sets the legend order).
effects_mean = np.mean(placebo_effects)
effects_p95 = np.percentile(placebo_effects, 95)
reference_lines = (
    (effects_mean, 'red', f'Mean: {effects_mean:.6f}'),
    (effects_p95, 'orange', f'95th pct: {effects_p95:.6f}'),
)
for position, colour, caption in reference_lines:
    ax.axvline(x=position, color=colour, linestyle='--', linewidth=2, label=caption)

ax.set(
    xlabel='Phantom Effect (should be ~0)',
    ylabel='Count',
    title='Placebo Test: Distribution of Phantom Effects',
)
ax.legend()

plt.tight_layout()
plt.show()

---
## Validation 3: Manski Bounds

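Check whether each model point estimate falls inside a plausibility band: the observed funding ratio plus the estimated treatment effect, ± a fixed uncertainty margin (0.3) that stands in for unobserved confounding.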

In [None]:
def manski_bounds(observed_outcome, treatment_effect, uncertainty=0.3):
    """
    Compute worst-case and best-case bounds assuming unobserved confounding.

    The band is symmetric: it is centred on ``observed_outcome +
    treatment_effect`` and extends ``uncertainty`` in each direction.
    NOTE(review): this is a fixed-width sensitivity band rather than bounds
    derived from the outcome's support; confirm that is the intended reading
    of "Manski bounds" here.

    Scalars and array-likes that support ``+``/``-`` (e.g. pandas Series) are
    both accepted; the result has the same kind as the inputs.

    Args:
        observed_outcome: Actual funding ratio
        treatment_effect: Estimated treatment effect
        uncertainty: Non-negative half-width of the band (range of
            unobserved confounding)

    Returns:
        (lower_bound, upper_bound)

    Raises:
        ValueError: If ``uncertainty`` is negative, which would otherwise
            silently produce a lower bound above the upper bound.
    """
    if np.any(np.asarray(uncertainty) < 0):
        raise ValueError(f"uncertainty must be non-negative, got {uncertainty!r}")

    centre = observed_outcome + treatment_effect
    return centre - uncertainty, centre + uncertainty

In [None]:
# Sample 100 campaigns for bounds analysis
# Only rows with complete features and outcome can be scored, so they are
# filtered first (same rule as the temporal cross-validation split). The
# sample size is capped at the number of usable rows so small datasets do not
# make DataFrame.sample raise.
bounds_required_cols = feature_cols + ['funding_ratio']
bounds_pool = df.dropna(subset=bounds_required_cols)
if bounds_pool.empty:
    raise ValueError("No campaigns with complete features and funding_ratio for bounds analysis")
sample_df = bounds_pool.sample(min(100, len(bounds_pool)), random_state=42).copy()

# Get predictions
sample_features = sample_df[feature_cols].values
sample_scaled = scaler.transform(sample_features)
sample_df['predicted'] = ols_model.predict(sample_scaled)

# Compute treatment effects (difference between predicted and counterfactual)
# The 2SLS coefficient is read from the model artifact; if it is absent the
# hard-coded default is used and that fact is reported instead of hidden.
if 'tsls_coef' not in models:
    print("NOTE: 'tsls_coef' not found in models; falling back to default 0.00002")
tsls_coef = models.get('tsls_coef', 0.00002)  # Use 2SLS coefficient
sample_df['treatment_effect'] = tsls_coef * sample_df['avg_reward_price']

# Compute bounds
# manski_bounds works element-wise on Series, so no per-row loop is needed.
lower_bounds, upper_bounds = manski_bounds(
    sample_df['funding_ratio'], sample_df['treatment_effect'], uncertainty=0.3
)
point_estimates = sample_df['predicted']
is_within = (lower_bounds <= point_estimates) & (point_estimates <= upper_bounds)

# Positional index (0..n-1) so downstream cells can slice/plot by position.
bounds_df = pd.DataFrame({
    'lower_bound': lower_bounds,
    'upper_bound': upper_bounds,
    'point_estimate': point_estimates,
    'within_bounds': is_within,
}).reset_index(drop=True)
bounds_results = bounds_df.to_dict('records')

within_bounds = int(is_within.sum())
pct_within = within_bounds / len(sample_df) * 100
avg_bound_width = (bounds_df['upper_bound'] - bounds_df['lower_bound']).mean()

print("\nMANSKI BOUNDS RESULTS")
print("="*50)
print(f"Campaigns within bounds: {within_bounds}/{len(sample_df)} ({pct_within:.1f}%)")
print(f"Average bound width: {avg_bound_width:.3f}")

validation_results['manski'] = {
    'pct_within_bounds': pct_within,
    'avg_bound_width': avg_bound_width
}

In [None]:
# Visualize bounds
fig, ax = plt.subplots(figsize=(12, 6))

# Show first 20 (or fewer if the analysis produced fewer rows); x positions are
# derived from the slice so the arrays passed to matplotlib always match in length.
bounds_sample = bounds_df.iloc[:20]
sample_idx = np.arange(len(bounds_sample))
point_estimates_shown = bounds_sample['point_estimate'].to_numpy()

# Plot bounds
ax.vlines(sample_idx, bounds_sample['lower_bound'], bounds_sample['upper_bound'],
          color='blue', linewidth=3, alpha=0.5, label='Manski Bounds')
ax.scatter(sample_idx, point_estimates_shown, color='red', s=80, zorder=5, label='Point Estimate')

# Color by within/outside bounds
# Estimates outside their band are over-plotted in orange and named in the legend.
outside_mask = ~bounds_sample['within_bounds'].to_numpy(dtype=bool)
if outside_mask.any():
    ax.scatter(sample_idx[outside_mask], point_estimates_shown[outside_mask],
               color='orange', s=100, edgecolors='black', zorder=6, label='Outside Bounds')

ax.set_xlabel('Campaign')
ax.set_ylabel('Funding Ratio')
ax.set_title('Manski Bounds: Point Estimates vs Theoretical Bounds')
ax.legend()

plt.tight_layout()
plt.show()

---
## Validation 4: Temporal Cross-Validation

Train on 2020-2022, test on 2023-2024 to verify causal stability.

In [None]:
# Split by year
df['year'] = df['launch_date'].dt.year

# Rows missing any model feature or the outcome are dropped from both splits.
required_cols = feature_cols + ['funding_ratio']
in_train_period = df['year'] <= 2022
in_test_period = df['year'] >= 2023

train_df = df.loc[in_train_period].dropna(subset=required_cols)
test_df = df.loc[in_test_period].dropna(subset=required_cols)

print(f"Training set (2020-2022): {len(train_df)} campaigns")
print(f"Test set (2023-2024): {len(test_df)} campaigns")

In [None]:
# Train on historical data
if len(train_df) == 0:
    raise ValueError("Temporal cross-validation needs at least one training campaign (year <= 2022)")

X_train_temporal = train_df[feature_cols]
y_train_temporal = train_df['funding_ratio']

X_test_temporal = test_df[feature_cols]
y_test_temporal = test_df['funding_ratio']
has_test_data = len(test_df) > 0

# Scale
# The scaler is fitted on the training period only so no test-period
# statistics leak into the model.
scaler_temporal = StandardScaler()
X_train_scaled = scaler_temporal.fit_transform(X_train_temporal)

# Train model
model_temporal = LinearRegression().fit(X_train_scaled, y_train_temporal)

# Evaluate
train_pred = model_temporal.predict(X_train_scaled)
train_r2 = r2_score(y_train_temporal, train_pred)

# Transforming / predicting on an empty test set raises inside scikit-learn,
# so the whole test-side evaluation is skipped when there is no test data.
if has_test_data:
    X_test_scaled = scaler_temporal.transform(X_test_temporal)
    test_pred = model_temporal.predict(X_test_scaled)
    test_r2 = r2_score(y_test_temporal, test_pred)
    test_mae = mean_absolute_error(y_test_temporal, test_pred)
else:
    test_r2 = np.nan
    test_mae = np.nan

# Compare coefficients
# Look the price feature up by name instead of assuming it is column 0.
# NOTE(review): the comparison is only meaningful if ols_model was fitted on
# the same feature order with standardized inputs - confirm against training code.
price_idx = feature_cols.index('avg_reward_price')
train_price_coef = np.ravel(model_temporal.coef_)[price_idx]
full_price_coef = np.ravel(ols_model.coef_)[price_idx]
coef_change = abs(train_price_coef - full_price_coef)

print("\nTEMPORAL CROSS-VALIDATION RESULTS")
print("="*50)
print(f"Training R² (2020-2022): {train_r2:.4f}")
print(f"Test R² (2023-2024): {test_r2:.4f}" if not np.isnan(test_r2) else "Test R²: N/A (no test data)")
print(f"Test MAE: {test_mae:.4f}" if not np.isnan(test_mae) else "Test MAE: N/A")
print(f"\nPrice coefficient (2020-2022 trained): {train_price_coef:.6f}")
print(f"Price coefficient (full data): {full_price_coef:.6f}")
print(f"Coefficient change: {coef_change:.6f}")

coef_stable = coef_change < 0.01
print(f"\nTreatment effect stable: {'YES' if coef_stable else 'NO'} (change {'<' if coef_stable else '>='} 0.01)")

# Missing test metrics are stored as the string 'N/A'; the report and CSV
# export cells check for that sentinel.
validation_results['temporal_cv'] = {
    'train_r2': train_r2,
    'test_r2': test_r2 if not np.isnan(test_r2) else 'N/A',
    'test_mae': test_mae if not np.isnan(test_mae) else 'N/A',
    'coef_change': coef_change,
    'stable': coef_stable
}

---
## Validation Report

In [None]:
# Pull each section's results once so the template arguments stay readable.
temporal_summary = validation_results['temporal']
placebo_summary = validation_results['placebo']
manski_summary = validation_results['manski']
cv_summary = validation_results['temporal_cv']

# Test R² is either a number or the 'N/A' sentinel string; round the number so
# it matches the precision of the other metrics instead of printing a raw float.
cv_r2_value = cv_summary['test_r2']
cv_r2_text = cv_r2_value if isinstance(cv_r2_value, str) else f"{cv_r2_value:.4f}"

placebo_ok = placebo_summary['passed']
cv_ok = cv_summary['stable']

# Sample sizes are taken from the actual result collections rather than
# hard-coded, so the report stays truthful if the sampling cells change.
report = """
================================================================================
                        VALIDATION RESULTS REPORT
================================================================================

### Temporal Validation (N={temporal_n} campaigns)
- Mean Absolute Error: {temporal_mae:.4f}
- Median Absolute Error: {temporal_median:.4f}
- 90% of predictions within ±{temporal_p90:.4f} of actual

### Placebo Test (N={placebo_n} campaigns)
- Mean phantom effect: {placebo_mean:.6f} (should be ~0)
- 95% of effects < {placebo_p95:.6f} (should be small)
- Test: {placebo_status}

### Manski Bounds Check (N={manski_n} campaigns)
- {manski_pct:.1f}% of point estimates fall within theoretical bounds
- Average bound width: {manski_width:.3f}

### Temporal Cross-Validation
- 2020-2022 training → 2023-2024 test: R² = {cv_r2}
- Treatment effect stable: {cv_stable} (ΔCoef = {cv_change:.6f})

### Conclusion
Model is {conclusion} for counterfactual prediction because:
- Placebo test {placebo_reason}
- {manski_pct:.0f}% of estimates are within bounds
- Treatment effects are {stability_reason} over time

================================================================================
""".format(
    temporal_n=len(temporal_df),
    temporal_mae=temporal_summary['mae'],
    temporal_median=temporal_summary['median_ae'],
    temporal_p90=temporal_summary['p90_error'],
    placebo_n=len(placebo_effects),
    placebo_mean=placebo_summary['mean_phantom_effect'],
    placebo_p95=placebo_summary['p95_phantom_effect'],
    placebo_status='PASSED ✓' if placebo_ok else 'FAILED ✗',
    manski_n=len(bounds_df),
    manski_pct=manski_summary['pct_within_bounds'],
    manski_width=manski_summary['avg_bound_width'],
    cv_r2=cv_r2_text,
    cv_stable='YES ✓' if cv_ok else 'NO ✗',
    cv_change=cv_summary['coef_change'],
    # The overall verdict requires both the placebo test and coefficient stability.
    conclusion='RELIABLE' if placebo_ok and cv_ok else 'NEEDS IMPROVEMENT',
    placebo_reason='passed (no phantom effects detected)' if placebo_ok else 'failed (detecting spurious effects)',
    stability_reason='stable' if cv_ok else 'unstable'
)

print(report)

In [None]:
# Save validation metrics
# (metric label, value) pairs in the order they appear in the exported CSV.
# Boolean outcomes are written as 0/1; a missing test R² ('N/A') becomes NaN.
cv_test_r2 = validation_results['temporal_cv']['test_r2']
metric_pairs = [
    ('Temporal MAE', validation_results['temporal']['mae']),
    ('Temporal Median AE', validation_results['temporal']['median_ae']),
    ('Placebo Mean Effect', validation_results['placebo']['mean_phantom_effect']),
    ('Placebo Test Passed', int(validation_results['placebo']['passed'])),
    ('Manski Pct Within Bounds', validation_results['manski']['pct_within_bounds']),
    ('Temporal CV Test R2', cv_test_r2 if cv_test_r2 != 'N/A' else np.nan),
    ('Coefficient Stable', int(validation_results['temporal_cv']['stable'])),
]
validation_df = pd.DataFrame(
    [{'metric': label, 'value': amount} for label, amount in metric_pairs]
)

validation_df.to_csv('../data/processed/validation_results.csv', index=False)
print("Validation results saved to data/processed/validation_results.csv")

---
## Summary

The validation suite tests our causal models across four dimensions:

1. **Temporal Validation**: How accurately can we predict outcomes?
2. **Placebo Test**: Does the model avoid detecting false effects?
3. **Manski Bounds**: Are estimates within theoretical limits?
4. **Temporal CV**: Are causal effects stable over time?

→ Proceed to building the Streamlit dashboard with validated models.