# 02 - Feature Engineering for Causal Inference

This notebook constructs features that each have a clear causal interpretation:
- Treatment variables (endogenous pricing decisions)
- Instrumental variables (exogenous shocks)
- Confounders and controls
- Outcome variables

In [None]:
import sys
from pathlib import Path

sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn theme first, then the default
# figure size (kept in this order, as in the original cell).
sns.set_theme(style='darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Load raw data
df = pd.read_csv('../data/raw/kickstarter_raw_data.csv')

# Fail fast if any column used later in this notebook is absent, instead of
# hitting a KeyError several cells down after partial work has been done.
required_columns = {
    'funding_ratio', 'avg_reward_price', 'goal', 'category',
    'day_of_week', 'holiday_proximity', 'is_successful', 'got_press_coverage',
}
missing_columns = sorted(required_columns - set(df.columns))
if missing_columns:
    raise KeyError(f"Raw data is missing required columns: {missing_columns}")

print(f"Loaded {len(df)} campaigns")
df.head()

## 1. Causal Variable Construction

In [None]:
# Demand-censoring flag.
# A campaign is flagged (1) when its funding_ratio is above the threshold and
# left at 0 otherwise; for flagged campaigns the observed funding may
# understate true demand.
# NOTE(review): presumably funding_ratio = pledged / goal, so 3.0 means
# "raised more than 300% of goal" — confirm against the data dictionary.
FUNDING_CAP_RATIO = 3.0
df['hit_funding_cap'] = df['funding_ratio'].gt(FUNDING_CAP_RATIO).astype(int)

n_capped = df['hit_funding_cap'].sum()
pct_capped = df['hit_funding_cap'].mean() * 100
print(f"Campaigns hitting funding cap (>300%): {n_capped} ({pct_capped:.1f}%)")

In [None]:
# Price positioning
# Average reward price expressed relative to 1% of the funding goal
# (avg_reward_price / (goal / 100)).
# A goal of 0 would produce +/-inf, which distorts describe() and would be
# written to the output CSV; mask those rows to NaN instead.
nonzero_goal = df['goal'].where(df['goal'] != 0)
df['price_to_goal_ratio'] = df['avg_reward_price'] / (nonzero_goal / 100)

print("Price to Goal Ratio Statistics:")
print(df['price_to_goal_ratio'].describe())

In [None]:
# Goal ambition by category: each campaign's goal divided by the median goal
# of its category (1.0 == exactly the category median).
# The built-in 'median' transform gives the same result as a per-group
# lambda without calling Python once per category.
category_median_goal = df.groupby('category')['goal'].transform('median')
df['goal_ambition'] = df['goal'] / category_median_goal

# Visualize
fig, ax = plt.subplots(figsize=(10, 6))
df.boxplot(column='goal_ambition', by='category', ax=ax)
ax.set_title('Goal Ambition by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Goal / Median Goal')
ax.axhline(y=1, color='r', linestyle='--', label='Median')
# Without an explicit legend the 'Median' label on the reference line is
# never displayed.
ax.legend()
# DataFrame.boxplot(by=...) adds its own figure-level title; clear it.
plt.suptitle('')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 2. Instrumental Variable Checks

For valid instruments, we need:
1. **Relevance**: The instrument affects the treatment (launch timing shifts pricing)
2. **Exclusion**: The instrument does not affect the outcome directly (only through the treatment)

In [None]:
# Instrument diagnostics by launch day.
# Left panel (first stage): does launch day move the treatment (price)?
#   Relevance requires that it DOES -- a flat profile means a weak instrument.
# Right panel (reduced form): launch day vs. the outcome. Under the exclusion
#   restriction any variation here must operate through price; that cannot be
#   verified from this plot alone.
# NOTE(review): assumes day_of_week is coded 0=Mon ... 6=Sun — confirm.
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']


def _mean_by_launch_day(column):
    """Return the mean of `column` per launch day, indexed by day name."""
    by_day = df.groupby('day_of_week')[column].mean()
    # int() keeps the lookup working if day_of_week was read as float.
    by_day.index = [day_names[int(i)] for i in by_day.index]
    return by_day


fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Day of week vs. price (first stage)
price_by_day = _mean_by_launch_day('avg_reward_price')
price_by_day.plot(kind='bar', ax=axes[0], color='coral')
axes[0].set_title('Avg Price by Launch Day (first stage: should vary)')
axes[0].set_ylabel('Average Reward Price ($)')

# Day of week vs. funding ratio (reduced form)
ratio_by_day = _mean_by_launch_day('funding_ratio')
ratio_by_day.plot(kind='bar', ax=axes[1], color='steelblue')
axes[1].set_title('Avg Funding Ratio by Launch Day (reduced form)')
axes[1].set_ylabel('Funding Ratio')

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix for instrument diagnostics.
# Columns: two candidate instruments, the treatment (avg_reward_price) and
# the outcome (funding_ratio).
# NOTE(review): DataFrame.corr() defaults to Pearson; day_of_week is a day
# code, so its linear correlation is only a rough screen — confirm whether a
# per-day comparison (as in the bar charts) is the intended check.
iv_cols = ['day_of_week', 'holiday_proximity', 'avg_reward_price', 'funding_ratio']
corr_matrix = df[iv_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r', center=0, ax=ax)
ax.set_title('Correlation Matrix: Instruments, Treatment, Outcome')
plt.tight_layout()
plt.show()

# The original text inverted the relevance condition (it asked for LOW
# instrument-treatment correlation); corrected to match the IV assumptions.
print("\nInterpretation:")
print("- Relevance: instruments should be clearly correlated with treatment (avg_reward_price)")
print("- Exclusion: any correlation with outcome (funding_ratio) should arise only through treatment")
print("  (exclusion cannot be verified from correlations alone)")

## 3. Confounding Visualization

In [None]:
# Selection bias check: price distribution for successful vs failed campaigns
fig, ax = plt.subplots(figsize=(10, 6))

# Missing prices are dropped up front; the means printed below are unchanged
# because Series.mean() skips NaN anyway.
successful = df.loc[df['is_successful'] == 1, 'avg_reward_price'].dropna()
failed = df.loc[df['is_successful'] == 0, 'avg_reward_price'].dropna()

# Use ONE set of bin edges for both groups. Passing bins=30 to each hist()
# call separately gives each group its own edges and bin widths, so the
# overlaid bar heights would not be comparable.
shared_bins = np.histogram_bin_edges(pd.concat([successful, failed]), bins=30)

ax.hist(successful, bins=shared_bins, alpha=0.6, label='Successful', color='green')
ax.hist(failed, bins=shared_bins, alpha=0.6, label='Failed', color='red')
ax.set_xlabel('Average Reward Price ($)')
ax.set_ylabel('Count')
ax.set_title('Price Distribution: Successful vs Failed Campaigns')
ax.legend()

plt.tight_layout()
plt.show()

print(f"Mean price (successful): ${successful.mean():.2f}")
print(f"Mean price (failed): ${failed.mean():.2f}")

In [None]:
# Press coverage as confounder: price vs. funding ratio, one colour per
# press-coverage group.
fig, ax = plt.subplots(figsize=(10, 6))

# Single source of truth for each group's colour and legend label. Plotting
# each group with its own label lets ax.legend() build the legend directly,
# so no hand-made legend handles (or mid-notebook import) are needed.
# Rows whose got_press_coverage is neither 0 nor 1 (e.g. NaN) are not drawn.
press_styles = {0: ('blue', 'No Press'), 1: ('red', 'Got Press')}
for press_flag, (color, label) in press_styles.items():
    group = df[df['got_press_coverage'] == press_flag]
    ax.scatter(group['avg_reward_price'], group['funding_ratio'],
               color=color, alpha=0.5, label=label)

ax.set_xlabel('Average Reward Price ($)')
ax.set_ylabel('Funding Ratio')
ax.set_title('Price vs Funding Ratio (colored by press coverage)')
ax.legend()

plt.tight_layout()
plt.show()

## 4. Save Engineered Features

In [None]:
# Save processed data
output_path = '../data/processed/kickstarter_causal_features.csv'
# to_csv does not create missing directories; make sure the target folder
# exists so the notebook also runs on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Saved {len(df)} campaigns with {len(df.columns)} features to {output_path}")

print("\nFinal features:")
print(df.columns.tolist())