# 01 - Data Exploration

This notebook explores the ECLS-K:2011 public-use data for the fairness study.

## Objectives
1. Load and inspect the data
2. Check variable availability and missing-data rates
3. Generate descriptive statistics
4. Create analytic sample

In [None]:
# Imports (standard library, third-party, then project-local)
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add the parent of the current working directory to the import path so that
# the `src` package can be imported below
sys.path.insert(0, str(Path.cwd().parent))

from src.data_loader import (
    create_at_risk_indicator,
    create_race_variable,
    create_ses_variable,
    get_variable_lists,
    handle_missing_values,
    load_config,
    load_ecls_data,
)

# Settings
# Let pandas display up to 50 columns before truncating wide frames
pd.set_option('display.max_columns', 50)
# Apply seaborn's 'whitegrid' style to the figures drawn in this notebook
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

## 1. Load Configuration

In [None]:
# Load the project configuration and the variable lists derived from it.
# The result is stored as `variable_lists` (not `vars`) so the builtin `vars()`
# stays usable for the rest of the kernel session.
config = load_config('../config.yaml')
variable_lists = get_variable_lists(config)

# Quick sanity check of what the configuration declares
print("Outcome variables:", variable_lists['outcomes'])
print("Demographic variables:", variable_lists['demographics'])
print("Number of predictors:", len(variable_lists['predictors']))

## 2. Load Data

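**Note:** You need to download the ECLS-K:2011 public-use data first from
<https://nces.ed.gov/ecls/dataproducts.asp> and place the raw file(s) (`.csv` or `.dat`) in `../data/raw/`.
If neither a processed sample nor raw files are found, the next cell falls back to synthetic data for demonstration purposes only.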

In [None]:
# Locate the data and load it into `df`. Sources are tried in this order:
#   1. the processed analytic sample (parquet),
#   2. a raw file under ../data/raw/ (requires `raw_file_name` to be set),
#   3. synthetic data for demonstration only.
# Every branch either defines `df` or stops with an explicit error, so later
# cells never run against a missing or stale `df`.
data_path = Path('../data/processed/analytic_sample.parquet')
raw_path = Path('../data/raw/')

# Name of the raw data file inside `raw_path`; set this once the data has been
# downloaded, e.g. raw_file_name = 'your_data_file.csv'
raw_file_name = None

# Raw files that could be loaded (CSV first, then .dat)
raw_candidates = sorted(raw_path.glob('*.csv')) + sorted(raw_path.glob('*.dat'))

if data_path.exists():
    print(f"Loading processed data from {data_path}")
    df = pd.read_parquet(data_path)
elif raw_candidates:
    if raw_file_name is None:
        # Previously this branch only printed a hint and left `df` undefined,
        # which surfaced later as a confusing NameError. Fail here instead.
        found = ', '.join(p.name for p in raw_candidates)
        raise RuntimeError(
            f"Raw data files found in {raw_path} ({found}) but raw_file_name is not set. "
            "Set raw_file_name at the top of this cell to the file to load."
        )
    print("Loading raw data...")
    # NOTE(review): assumes load_ecls_data accepts a path string, as in the
    # call that was previously commented out here - confirm against src.data_loader.
    df = load_ecls_data(str(raw_path / raw_file_name))
else:
    print("Data not found. Please download from NCES:")
    print("https://nces.ed.gov/ecls/dataproducts.asp")
    print("\nFor now, creating synthetic data for demonstration...")

    # Create synthetic data for demonstration (fixed seed for reproducibility).
    # All columns are drawn independently of each other.
    np.random.seed(42)
    n = 5000

    df = pd.DataFrame({
        # Demographics
        'X_RACETH_R': np.random.choice([1, 2, 3, 4, 7], n, p=[0.5, 0.15, 0.25, 0.05, 0.05]),
        'X_CHSEX_R': np.random.choice([1, 2], n),
        'X1SESQ5': np.random.choice([1, 2, 3, 4, 5], n),
        'X12LANGST': np.random.choice([1, 2], n, p=[0.8, 0.2]),

        # Baseline scores
        'X1RTHETK': np.random.normal(0, 1, n),
        'X2RTHETK': np.random.normal(0.2, 1, n),
        'X1MTHETK': np.random.normal(0, 1, n),
        'X2MTHETK': np.random.normal(0.2, 1, n),

        # Executive function
        'X4DCCSSCR': np.random.normal(50, 10, n),
        'X6DCCSSCR': np.random.normal(55, 10, n),

        # Approaches to learning
        'X1TCHAPP': np.random.normal(3, 0.5, n),
        'X2TCHAPP': np.random.normal(3.1, 0.5, n),

        # Outcomes
        'X9RTHETA': np.random.normal(1.5, 1, n),
        'X9MTHETA': np.random.normal(1.5, 1, n)
    })

    print(f"Created synthetic data with {len(df)} records")

## 3. Data Overview

In [None]:
# Report the frame's dimensions and how many columns there are of each dtype,
# then preview the first rows as the cell output.
frame_shape = df.shape
dtype_tally = df.dtypes.value_counts()

print(f"Dataset shape: {frame_shape}")
print(f"\nColumn types:\n{dtype_tally}")
df.head()

In [None]:
# Per-column missingness: absolute count and percentage of rows (1 decimal),
# ordered from most to least missing.
n_missing = df.isna().sum()

missing_df = pd.DataFrame({
    'Missing Count': n_missing,
    'Missing %': (n_missing / len(df) * 100).round(1),
})
missing_df = missing_df.sort_values('Missing %', ascending=False)

# Only list columns that actually have gaps, capped at the top 20
has_gaps = missing_df['Missing Count'] > 0
print("Variables with missing data:")
print(missing_df.loc[has_gaps].head(20))

## 4. Create Derived Variables

In [None]:
# Run the preparation steps from src.data_loader in order, each one taking the
# frame returned by the previous step: missing-value handling first, then the
# race and SES variables.
preparation_steps = (
    handle_missing_values,
    create_race_variable,
    create_ses_variable,
)
for apply_step in preparation_steps:
    df = apply_step(df)

# Preview the two derived columns
derived_columns = ['race_ethnicity', 'ses_category']
print("New variables created:")
print(df[derived_columns].head())

## 5. Descriptive Statistics

In [None]:
# Side-by-side bar charts of group sizes for race/ethnicity and SES
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One entry per panel: (counts to plot, bar colour, panel title).
# Race keeps value_counts' frequency order; SES is ordered by category.
panels = [
    (df['race_ethnicity'].value_counts(), 'steelblue', 'Race/Ethnicity Distribution'),
    (df['ses_category'].value_counts().sort_index(), 'coral', 'SES Distribution'),
]

for ax, (counts, bar_color, panel_title) in zip(axes, panels):
    ax.bar(counts.index, counts.values, color=bar_color)
    ax.set_title(panel_title)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Box plots of each outcome score split by race/ethnicity, one panel per outcome
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Outcome column -> panel title (reading on the left, math on the right)
outcome_titles = {
    'X9RTHETA': '5th Grade Reading Scores by Race/Ethnicity',
    'X9MTHETA': '5th Grade Math Scores by Race/Ethnicity',
}

for ax, (score_col, panel_title) in zip(axes, outcome_titles.items()):
    sns.boxplot(data=df, x='race_ethnicity', y=score_col, ax=ax)
    ax.set_title(panel_title)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap of the baseline and outcome score columns.
# 'X2MTHETK' is included so math has the same two baseline columns as reading
# (it was previously omitted from this list).
numeric_cols = ['X1RTHETK', 'X2RTHETK', 'X1MTHETK', 'X2MTHETK', 'X9RTHETA', 'X9MTHETA']
# Restrict to columns actually present so the cell also works on partial extracts
available_cols = [c for c in numeric_cols if c in df.columns]

if len(available_cols) > 1:
    # Explicit figure/axes instead of the pyplot state machine
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        df[available_cols].corr(),
        annot=True,
        cmap='coolwarm',
        center=0,
        fmt='.2f',
        ax=ax
    )
    ax.set_title('Correlation Between Cognitive Scores')
    fig.tight_layout()
    plt.show()
else:
    # A correlation matrix needs at least two columns; say so rather than
    # silently producing no output.
    print("Fewer than two score columns available; skipping correlation heatmap.")

## 6. Create At-Risk Indicator

In [None]:
# Flag at-risk students on the reading outcome and compare prevalence by group.
# `create_at_risk_indicator` is imported with the other src.data_loader helpers
# in the imports cell at the top of the notebook.

# Percentile cutoff used to define "at risk" (scores below this percentile;
# the exact comparison is implemented in create_at_risk_indicator)
at_risk_percentile = 25

# Create at-risk indicator (< 25th percentile)
df = create_at_risk_indicator(df, 'X9RTHETA', percentile=at_risk_percentile)

# Check prevalence by group: the mean of the indicator is the at-risk share,
# the count is the group size. Reads the 'X9RTHETA_at_risk' column, which is
# presumably added by the helper above - verify against src.data_loader.
prevalence = df.groupby('race_ethnicity')['X9RTHETA_at_risk'].agg(['mean', 'count'])
prevalence.columns = ['At-Risk Rate', 'N']
# Format the share as a percentage string with one decimal place
prevalence['At-Risk Rate'] = (prevalence['At-Risk Rate'] * 100).round(1).astype(str) + '%'
print("At-Risk Prevalence by Race/Ethnicity:")
print(prevalence)

## 7. Save Processed Data

In [None]:
# Write the analytic sample to ../data/processed/ as parquet, creating the
# directory (and any missing parents) first.
output_path = Path('../data/processed/')
output_path.mkdir(parents=True, exist_ok=True)

parquet_file = output_path / 'analytic_sample.parquet'
df.to_parquet(parquet_file)

n_saved = len(df)
print(f"Saved {n_saved} records to {parquet_file}")

## Next Steps

1. **02_model_development.ipynb** - Train and evaluate ML models
2. **03_fairness_analysis.ipynb** - Evaluate algorithmic fairness
3. **04_results_summary.ipynb** - Generate final figures and tables