In [1]:
# Import required libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Make the project's src/ directory importable.
# The path is resolved to an absolute location once, so imports keep working
# even if the working directory changes later, and it is only appended when
# missing, so re-running this cell does not pile up duplicate sys.path entries.
src_dir = str(Path('../src').resolve())
if src_dir not in sys.path:
    sys.path.append(src_dir)

from causal_methods.psm import PropensityScoreMatching, load_and_analyze_psm
from data_simulation import generate_and_save_data

# Plot appearance: start from matplotlib's default style sheet, then switch the
# colour cycle to seaborn's "husl" palette (the order matters: the style sheet
# resets the colour cycle that the palette call sets).
plt.style.use('default')
sns.set_palette("husl")

# pandas display: show every column, auto-detect the output width, and
# truncate long cell contents at 50 characters.
for option_name, option_value in {
    'display.max_columns': None,
    'display.width': None,
    'display.max_colwidth': 50,
}.items():
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")


Libraries imported successfully!


In [2]:
# Location of the synthetic dataset used throughout this demonstration
data_path = "../data/synthetic_tax_data.csv"

# Reuse the CSV when it is already on disk; otherwise simulate it first
if Path(data_path).exists():
    print("Loading existing data...")
    df = pd.read_csv(data_path)
else:
    print("Generating synthetic data...")
    df = generate_and_save_data(
        output_path=data_path,
        n_users=1000,  # Larger sample for PSM
        config_path="../config/simulation_config.yaml"
    )

# Quick sanity summary: size, share of treated users, share who filed in 2024
treatment_rate = df['used_smart_assistant'].mean()
filing_rate_2024 = df['filed_2024'].mean()
print(f"\nDataset shape: {df.shape}")
print(f"Treatment rate: {treatment_rate:.1%}")
print(f"Filing rate 2024: {filing_rate_2024:.1%}")

# Preview the first rows (rendered as the cell's output)
df.head()


Loading existing data...

Dataset shape: (1000, 20)
Treatment rate: 61.4%
Filing rate 2024: 91.2%


Unnamed: 0,user_id,age,income_bracket,device_type,user_type,region,tech_savviness,filed_2023,time_to_complete_2023,sessions_2023,support_tickets_2023,early_login_2024,used_smart_assistant,filed_2024,time_to_complete_2024,sessions_2024,support_tickets_2024,satisfaction_2024,time_improvement,session_improvement
0,user_000000,41,50k-75k,tablet,returning,Midwest,41,True,113,3,0,False,True,True,75,1,0,9.619331,38,2
1,user_000001,37,<30k,desktop,returning,Midwest,43,True,54,1,0,False,False,True,38,3,0,7.41611,16,-2
2,user_000002,19,30k-50k,mobile,new,East,61,True,62,2,0,False,False,True,60,1,0,7.508821,2,1
3,user_000003,66,75k-100k,mobile,new,East,31,True,99,3,0,True,True,True,59,6,0,6.923859,40,-3
4,user_000004,27,30k-50k,desktop,returning,West,85,True,57,3,0,True,True,False,0,0,0,,57,3


In [3]:
# Quick propensity score matching (PSM) run on `df`: fit the propensity model,
# match treated and control units, estimate the effect on `filed_2024`, and
# print the library's summary report.
psm = PropensityScoreMatching(df)

# Covariates passed to the propensity score model
covariates = [
    'age', 'tech_savviness', 'income_bracket', 'device_type',
    'user_type', 'region', 'filed_2023', 'early_login_2024'
]

# Estimate propensity scores and report the model's AUC
ps_results = psm.estimate_propensity_scores(covariates=covariates)
print(f"AUC Score: {ps_results['auc_score']:.3f}")

# Perform nearest-neighbour matching with a caliper of 0.1
# NOTE(review): the caliper's scale (raw score vs. SD units) is defined by the
# PSM implementation and is not visible here -- confirm before tuning it.
matching_results = psm.perform_matching(method='nearest_neighbor', caliper=0.1)
print(f"Matching rate: {matching_results['matching_rate']:.1%}")

# Estimate treatment effects on the 2024 filing outcome
effects = psm.estimate_treatment_effects(outcome_cols='filed_2024')
effect = effects['filed_2024']
p_value = effect['p_value']
# The estimator may return NaN for the p-value; track that once so that both
# the printed value and the significance verdict below treat it consistently.
p_value_missing = bool(np.isnan(p_value))

print(f"Treatment effect on filing: {effect['ate']:.4f}")
print(f"P-value: {p_value}" if p_value_missing else f"P-value: {p_value:.6f}")
print(f"95% CI: [{effect['ci_lower']:.4f}, {effect['ci_upper']:.4f}]")

# Statistical significance
# A NaN p-value compares False against every threshold, so without the first
# branch it would fall through and be reported as "not significant" even
# though significance simply could not be assessed.
if p_value_missing:
    print("⚠️ Statistical significance could not be assessed (p-value is NaN)")
elif p_value < 0.05:
    print("✅ Result is statistically significant (p < 0.05)")
elif p_value < 0.1:
    print("⚠️ Result is marginally significant (p < 0.1)")
else:
    print("❌ Result is not statistically significant")

# Generate comprehensive report
print("\n" + "="*60)
print(psm.generate_summary_report())


AUC Score: 0.775
Matching rate: 33.2%
Treatment effect on filing: 0.0245
P-value: 0.365174
95% CI: [-0.0285, 0.0775]
❌ Result is not statistically significant

PROPENSITY SCORE MATCHING ANALYSIS SUMMARY

1. PROPENSITY SCORE MODEL:
------------------------------
Model type: Logistic Regression
Propensity score range: [0.056, 0.983]
Mean propensity score: 0.614

2. MATCHING RESULTS:
------------------------------
Original treated units: 614
Matched treated units: 204
Matched control units: 204
Matching rate: 33.2%

4. TREATMENT EFFECTS:
------------------------------
filed_2024:
  ATE: 0.025
  95% CI: [-0.028, 0.078]
  P-value: 0.3652

