# 03 - Causal Modeling

This notebook implements causal inference methods:
1. Baseline models (Linear Regression, Random Forest)
2. Two-Stage Least Squares (2SLS) with instrumental variables
3. Causal Forest for heterogeneous treatment effects
4. Quantile regression for uncertainty quantification (prediction intervals)

In [None]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Put ../src at the front of the module search path before the project import below.
sys.path.insert(0, '../src')

from causal_inference import (
    BaselineModels,
    TwoStageLeastSquares,
    CausalForestEstimator,
    QuantileRegressionModel,
    CounterfactualPredictor
)

# Notebook-wide plotting defaults: seaborn theme first, then the default figure size.
sns.set_theme(style='darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Read the raw campaign CSV; the later cells all work from this `df` frame.
raw_csv_path = '../data/raw/kickstarter_raw_data.csv'
df = pd.read_csv(raw_csv_path)
print(f"Loaded {len(df)} campaigns")

## 1. Baseline Models (Ignoring Causality)

In [None]:
# Outcome to predict and the columns used as model inputs.
outcome_col = 'funding_ratio'
feature_cols = [
    'goal',
    'duration_days',
    'num_reward_tiers',
    'trend_index',
    'concurrent_campaigns',
    'description_length',
    'avg_reward_price',
]

# Build the feature matrix through the project helper, then pull the outcome as an array.
# NOTE(review): assumes prepare_features keeps one row per row of `df`, in order, so that
# X and y stay aligned - confirm against causal_inference.BaselineModels.
baseline = BaselineModels()
X = baseline.prepare_features(df, feature_cols)
y = df[outcome_col].values

In [None]:
# Linear-regression baseline: fit on (X, y), then report each metric to four decimals.
linear_results = baseline.fit_linear(X, y)

print("Linear Regression Results:")
for metric_label, metric_key in (('R²', 'r2'), ('RMSE', 'rmse'), ('MAE', 'mae')):
    print(f"  {metric_label}: {linear_results[metric_key]:.4f}")

In [None]:
# Random-forest baseline: fit on (X, y), then report R² and the cross-validated R² spread.
rf_results = baseline.fit_random_forest(X, y)

print("Random Forest Results:")
print(f"  R²: {rf_results['r2']:.4f}")
cv_mean = rf_results['cv_r2_mean']
cv_std = rf_results['cv_r2_std']
print(f"  CV R² (mean ± std): {cv_mean:.4f} ± {cv_std:.4f}")

In [None]:
# Pair each feature name with its importance and sort ascending, so the horizontal
# bar chart draws the most important feature last (at the top).
# NOTE(review): assumes rf_results['feature_importance'] has one entry per name in
# feature_cols, in the same order - confirm against fit_random_forest.
importance_df = (
    pd.DataFrame({'feature': feature_cols, 'importance': rf_results['feature_importance']})
    .sort_values(by='importance', ascending=True)
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(y=importance_df['feature'], width=importance_df['importance'], color='steelblue')
ax.set_title('Random Forest Feature Importance')
ax.set_xlabel('Feature Importance')
plt.tight_layout()
plt.show()

## 2. Two-Stage Least Squares (2SLS)

Address the endogeneity of reward pricing by instrumenting `avg_reward_price` with launch-timing variables (`day_of_week`, `holiday_proximity`).

In [None]:
# Instruments for the endogenous price, and the controls included as exogenous regressors.
instrument_cols = ['day_of_week', 'holiday_proximity']
exogenous_cols = ['goal', 'duration_days', 'num_reward_tiers', 'trend_index']

# Estimate the effect of avg_reward_price on funding_ratio with the project's 2SLS wrapper.
tsls = TwoStageLeastSquares()
tsls_results = tsls.fit(
    df,
    outcome_col='funding_ratio',
    endogenous_col='avg_reward_price',
    instrument_cols=instrument_cols,
    exogenous_cols=exogenous_cols,
)

In [None]:
# Report the 2SLS estimate for avg_reward_price together with its diagnostics.
effect = tsls_results['treatment_effect']
p_value = tsls_results['treatment_pvalue']

print("2SLS Results:")
print(f"  Treatment Effect (price on funding ratio): {effect:.6f}")
print(f"  Standard Error: {tsls_results['treatment_se']:.6f}")
print(f"  t-statistic: {tsls_results['treatment_t_stat']:.2f}")
print(f"  p-value: {p_value:.4f}")
print(f"  First Stage F-stat: {tsls_results['first_stage_f_stat']:.2f}")
print(f"  Instrument Strength: {tsls_results['instrument_strength']}")

print("\nInterpretation:")
print(f"  A $1 increase in avg reward price {'decreases' if effect < 0 else 'increases'}")
# Use the same six-decimal precision as the estimate above; four decimals can
# round a small per-dollar effect down to "0.0000".
print(f"  the funding ratio by {abs(effect):.6f}")

# `not (p < 0.05)` rather than `p >= 0.05` so that a NaN p-value also triggers the caution.
if not p_value < 0.05:
    print("  Caution: this estimate is not statistically significant at the 5% level.")

## 3. Causal Forest for Heterogeneous Effects

In [None]:
# Prepare data for causal forest: work on a copy so `df` itself does not gain new columns.
# (LabelEncoder is imported in the notebook's top import cell.)
df_cf = df.copy()

# Encode the `category` column as integer labels so it can be passed as a feature.
le = LabelEncoder()
df_cf['category_encoded'] = le.fit_transform(df_cf['category'])

# Covariates over which the treatment effect is allowed to vary.
cf_features = ['goal', 'duration_days', 'trend_index', 'concurrent_campaigns', 'category_encoded']

In [None]:
# Fit the project's causal-forest wrapper: treatment is avg_reward_price, outcome is
# funding_ratio, and cf_features are the covariates.
causal_forest = CausalForestEstimator(n_estimators=100)
cf_results = causal_forest.fit(
    df_cf,
    outcome_col='funding_ratio',
    treatment_col='avg_reward_price',
    feature_cols=cf_features,
)

In [None]:
# Attach the estimated effects to the frame so they can be grouped by category below.
df_cf['treatment_effect'] = causal_forest.treatment_effects

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: distribution of the estimated effects, with the average effect marked.
hist_ax.hist(df_cf['treatment_effect'], bins=50, color='teal', edgecolor='black')
ate = cf_results['ATE']
hist_ax.axvline(x=ate, color='red', linestyle='--', label=f'ATE: {ate:.4f}')
hist_ax.set_title('Distribution of Heterogeneous Treatment Effects')
hist_ax.set_xlabel('Treatment Effect')
hist_ax.set_ylabel('Count')
hist_ax.legend()

# Right panel: effects grouped by campaign category.
df_cf.boxplot(column='treatment_effect', by='category', ax=box_ax)
box_ax.set_title('Treatment Effect by Category')
box_ax.set_xlabel('Category')
box_ax.set_ylabel('Treatment Effect')
# Blank out the figure-level title that the grouped boxplot adds.
plt.suptitle('')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

## 4. Quantile Regression

In [None]:
# Quantile levels to fit; the plotting cell reads predictions back by these same keys.
quantile_levels = [0.1, 0.25, 0.5, 0.75, 0.9]
qr = QuantileRegressionModel(quantiles=quantile_levels)
qr_results = qr.fit(X, y)

In [None]:
# Get predictions for a random sample of campaigns.
# A locally seeded generator keeps the figure identical across Restart & Run All,
# and the sample size is capped so a frame with fewer than 50 rows does not raise.
SAMPLE_SEED = 42
rng = np.random.default_rng(SAMPLE_SEED)
sample_size = min(50, len(df))
sample_idx = rng.choice(len(df), size=sample_size, replace=False)
# NOTE(review): integer-array indexing assumes X is a NumPy array whose rows line up
# with `df` and `y` - confirm against BaselineModels.prepare_features.
X_sample = X[sample_idx]

# `predictions` is indexed by quantile level (0.1, 0.25, 0.5, 0.75, 0.9).
predictions = qr.predict(X_sample)

# Visualize prediction intervals
fig, ax = plt.subplots(figsize=(12, 6))

x_axis = range(len(sample_idx))
ax.fill_between(x_axis, predictions[0.1], predictions[0.9], alpha=0.3, color='blue', label='80% Interval')
ax.fill_between(x_axis, predictions[0.25], predictions[0.75], alpha=0.5, color='blue', label='50% Interval')
ax.plot(x_axis, predictions[0.5], color='red', label='Median Prediction')
# zorder lifts the observed values above the shaded bands.
ax.scatter(x_axis, y[sample_idx], color='black', s=20, label='Actual', zorder=5)

ax.set_xlabel('Sample Index')
ax.set_ylabel('Funding Ratio')
ax.set_title('Quantile Regression Prediction Intervals')
ax.legend()

plt.tight_layout()
plt.show()

## 5. Save Models

In [None]:
# Fit every model through the combined predictor so they can be saved together.
# The treatment column is passed separately, so it is not repeated in feature_cols here.
predictor = CounterfactualPredictor()

all_results = predictor.fit_all_models(
    df,
    outcome_col='funding_ratio',
    treatment_col='avg_reward_price',
    feature_cols=[
        'goal',
        'duration_days',
        'num_reward_tiers',
        'trend_index',
        'concurrent_campaigns',
        'description_length',
    ],
    instrument_cols=['day_of_week', 'holiday_proximity'],
    categorical_cols=['category'],
)

In [None]:
# Save the fitted models. The output directory is created first so the save does not
# fail on a fresh checkout where ../data/processed does not exist yet.
model_path = '../data/processed/causal_models.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
predictor.save_models(model_path)
print(f"Models saved to {model_path}")