In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_simulation import TaxSoftwareDataSimulator, generate_and_save_data

# Plot appearance. Both calls change global plotting settings, so the
# style sheet is applied first and the seaborn palette second, as before.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# pandas display: no cap on the number of columns shown, and width=None
# so pandas works out the display width itself.
pd.options.display.max_columns = None
pd.options.display.width = None

print("✅ Imports successful!")


In [None]:
# Preview the simulation parameters stored in the main YAML configuration.
from src.data_simulation import load_config
import yaml  # kept from the original cell; not referenced directly below

# Load the main configuration; everything below indexes into this nested structure.
config = load_config("../config/simulation_config.yaml")

print("📋 SIMULATION CONFIGURATION PREVIEW")
print("="*50)
print(f"Random seed: {config['simulation']['random_seed']}")
print(f"Default users: {config['simulation']['default_n_users']:,}")
print(f"Treatment base rate: {config['treatment']['base_adoption_rate']:.1%}")
print(f"Treatment base effect: {config['outcomes_2024']['filing']['treatment_effects']['base_effect']:.1%}")

# Each demographic section provides parallel `values` / `weights` lists;
# pair them up so every category is printed next to its weight.
print("\n📊 DEMOGRAPHIC DISTRIBUTIONS:")
for demo_type in ['income_brackets', 'device_types', 'user_types']:
    values = config['demographics'][demo_type]['values']
    weights = config['demographics'][demo_type]['weights']
    print(f"  {demo_type}: {dict(zip(values, weights))}")

print("\n🎯 TECH-SAVVINESS PARAMETERS:")
tech_config = config['tech_savviness']
print(f"  Base score: {tech_config['base_score']}")
print(f"  Young boost (age < {tech_config['age_adjustments']['young_threshold']}): +{tech_config['age_adjustments']['young_boost']}")
print(f"  High income boost: +{tech_config['income_adjustments']['high_income_boost']}")

# Single backslash: the original "\\n" printed the two characters `\n`
# instead of emitting a blank line before the message.
print("\n✅ All parameters are now configurable in YAML files!")


In [None]:
# Generate one simulated dataset per scenario config and record headline metrics.
from src.data_simulation import TaxSoftwareDataSimulator
import os

# Scenario name -> path of the YAML config that drives it.
scenarios = {
    'baseline': '../config/simulation_config.yaml',
    'high_treatment': '../config/scenario_high_treatment.yaml',
    'low_adoption': '../config/scenario_low_adoption.yaml'
}

# Scenario name -> {'data', 'treatment_rate', 'filing_rate_2024', 'naive_effect'}.
results = {}
sample_size = 5000  # Smaller for demo

print("🔄 GENERATING MULTIPLE SCENARIOS")
print("="*50)

for scenario_name, config_path in scenarios.items():
    # A missing config only skips that scenario; the rest are still generated.
    if not os.path.exists(config_path):
        print(f"⚠️  Config file not found: {config_path}")
        continue

    # "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
    print(f"\n📊 Generating {scenario_name} scenario...")

    simulator = TaxSoftwareDataSimulator(n_users=sample_size, config_path=config_path)
    df = simulator.generate_complete_dataset()

    # Naive effect = raw difference in mean 2024 filing between users who used
    # the assistant (flag == 1) and those who did not (flag == 0); no adjustment
    # of any kind is applied. `.loc` replaces the original chained indexing.
    treated_filing = df.loc[df['used_smart_assistant'] == 1, 'filed_2024'].mean()
    control_filing = df.loc[df['used_smart_assistant'] == 0, 'filed_2024'].mean()

    results[scenario_name] = {
        'data': df,
        'treatment_rate': df['used_smart_assistant'].mean(),
        'filing_rate_2024': df['filed_2024'].mean(),
        'naive_effect': treated_filing - control_filing
    }

    print(f"   Treatment rate: {results[scenario_name]['treatment_rate']:.1%}")
    print(f"   2024 filing rate: {results[scenario_name]['filing_rate_2024']:.1%}")
    print(f"   Naive treatment effect: {results[scenario_name]['naive_effect']:.1%}")

print("\n✅ Scenario generation complete!")


In [None]:
# Side-by-side comparison of the headline metrics across the generated scenarios
import matplotlib.pyplot as plt
import pandas as pd

if len(results) <= 1:
    print("⚠️  Only baseline scenario available for comparison")
else:
    # One record per scenario, labelled with a title-cased scenario name
    comparison_data = []
    for scenario, result in results.items():
        row = {
            'Scenario': scenario.replace('_', ' ').title(),
            'Treatment Rate': result['treatment_rate'],
            'Filing Rate 2024': result['filing_rate_2024'],
            'Naive Treatment Effect': result['naive_effect'],
        }
        comparison_data.append(row)

    comparison_df = pd.DataFrame(comparison_data)

    # One bar panel per metric: (column, panel title, y-axis label, bar colour)
    panel_specs = [
        ('Treatment Rate', 'Treatment Adoption Rates', 'Rate', 'skyblue'),
        ('Filing Rate 2024', '2024 Filing Rates', 'Rate', 'lightgreen'),
        ('Naive Treatment Effect', 'Naive Treatment Effects', 'Effect Size', 'coral'),
    ]

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for panel_ax, (column, panel_title, y_label, bar_color) in zip(axes, panel_specs):
        panel_ax.bar(comparison_df['Scenario'], comparison_df[column], color=bar_color)
        panel_ax.set_title(panel_title)
        panel_ax.set_ylabel(y_label)
        panel_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print("📈 SCENARIO COMPARISON TABLE:")
    print(comparison_df.round(3))


In [None]:
# Choose the working dataset: reuse the baseline frame from `results` when it exists
if 'baseline' not in results:
    # No baseline scenario was generated, so fall back to generate_and_save_data()
    from src.data_simulation import generate_and_save_data
    df = generate_and_save_data()
else:
    df = results['baseline']['data']

# Headline numbers for the selected dataset
treatment_rate = df['used_smart_assistant'].mean()
filing_rate_2024 = df['filed_2024'].mean()

print("📊 BASELINE DATASET LOADED")
print("="*40)
print(f"Dataset shape: {df.shape}")
print(f"Treatment rate: {treatment_rate:.1%}")
print(f"2024 filing rate: {filing_rate_2024:.1%}")

# Cell output: the first rows of the dataset
df.head()


In [None]:
# Load the generated dataset from disk when it is available.
# Nothing earlier in this notebook is guaranteed to have written this CSV (the
# scenario cell keeps its frames in memory), so a missing file should not abort
# a fresh Restart & Run All. In that case the `df` already in memory is kept.
csv_path = "../data/simulated_users.csv"
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
else:
    print(f"⚠️  {csv_path} not found; continuing with the in-memory dataset")

# Basic info about the dataset
print("Dataset Shape:", df.shape)
print("\nColumn Names:")
print(df.columns.tolist())

# First few rows
df.head()


In [None]:
# Summary statistics
# Show pandas' describe() table for the current `df` as the cell output.
# For a mixed-dtype frame, describe() summarizes only the numeric columns by
# default (count, mean, std, min, quartiles, max).
df.describe()


In [None]:
# Treatment and outcome distributions (2x2 overview figure)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Treatment distribution.
# value_counts() orders by frequency, so hard-coded ['No', 'Yes'] labels would
# be swapped whenever treated users are the majority. Sort by the flag and
# derive each label from its own index value so wedges are always labelled
# correctly (this also works when only one class is present).
treatment_counts = df['used_smart_assistant'].value_counts().sort_index()
treatment_labels = ['Yes' if flag else 'No' for flag in treatment_counts.index]
axes[0,0].pie(treatment_counts.values, labels=treatment_labels, autopct='%1.1f%%', startangle=90)
axes[0,0].set_title('Smart Assistant Usage Distribution')

# 2024 filing rates by treatment.
# Labels and colours are derived from the group keys so they stay aligned with
# the bars even if one group is missing from the data.
filing_by_treatment = df.groupby('used_smart_assistant')['filed_2024'].mean()
group_labels = ['Treated' if flag else 'Control' for flag in filing_by_treatment.index]
group_colors = ['lightblue' if flag else 'lightcoral' for flag in filing_by_treatment.index]
axes[0,1].bar(group_labels, filing_by_treatment.values, color=group_colors)
axes[0,1].set_title('2024 Filing Rate by Treatment')
axes[0,1].set_ylabel('Filing Rate')
# Annotate each bar with its rate, placed slightly above the bar top.
for i, v in enumerate(filing_by_treatment.values):
    axes[0,1].text(i, v + 0.01, f'{v:.1%}', ha='center')

# Income distribution (bars appear in value_counts() order, most frequent first)
income_counts = df['income_bracket'].value_counts()
axes[1,0].bar(range(len(income_counts)), income_counts.values)
axes[1,0].set_xticks(range(len(income_counts)))
axes[1,0].set_xticklabels(income_counts.index, rotation=45)
axes[1,0].set_title('Income Bracket Distribution')
axes[1,0].set_ylabel('Count')

# Tech-savviness distribution
axes[1,1].hist(df['tech_savviness'], bins=20, alpha=0.7, edgecolor='black')
axes[1,1].set_title('Tech-Savviness Distribution')
axes[1,1].set_xlabel('Tech-Savviness Score (0-100)')
axes[1,1].set_ylabel('Count')

plt.tight_layout()
plt.show()
