In [None]:
# ============================================================================
# Import all required libraries
# ============================================================================

import glob
import os
import string
import sys
import textwrap
import unicodedata

sys.path.append('..')

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from Levenshtein import distance as levenshtein_distance
from scipy import stats

from utils import (
    normalize_text,
    process_primary_language,
    categorize_language,
    group_language_by_family,
    check_english_only,
    check_multilingual,
    check_not_english,
    read_transcription_data,
)

# ============================================================================
# Configuration
# ============================================================================
# Global matplotlib defaults. The plotting cells below pass explicit `fontsize`
# arguments for titles/labels/legends, which override the default size set here.
matplotlib.rcParams['font.family'] = 'Arial'
matplotlib.rcParams['font.size'] = 24

# Cap the number of DataFrame rows pandas renders when a frame is displayed.
pd.set_option('display.max_rows', 20)


In [None]:
# Model family whose transcription TSVs are loaded by read_transcription_data.
# Only the "whisper" family is selected here; passing 'all' to the loader
# would load every family directory instead.
MODEL_FAMILY = "whisper"

# Maps each model name (as it appears in the `model` column) to its family.
# Model names missing from this dict map to NaN in `model_family`.
MODEL_TO_FAMILY = {
    # Whisper models
    'tiny': 'whisper',
    'base': 'whisper',
    'small': 'whisper', 
    'medium': 'whisper',
    'large': 'whisper',
    'turbo': 'whisper',
}

# Define model sizes (in millions of parameters) for use in visualizations.
# Models without an entry here (e.g. 'turbo') get a NaN `model_size` and are
# dropped by the size-based plots, which call dropna(subset=['model_size']).
model_sizes = {
    'tiny': 39,
    'base': 74,
    'small': 244,
    'medium': 769,
    'large': 1550,
}

In [None]:

def read_transcription_data(MODEL_FAMILY, meta_demographics_file, verbose=True):
    """
    Load transcription data and join with demographics from meta file.

    NOTE: this notebook-local definition shadows the ``read_transcription_data``
    imported from ``utils`` at the top of the notebook; calls made after this
    cell runs use this version.

    Args:
        MODEL_FAMILY: Model family to load ('whisper', 'all', etc.). 'all'
            loads ``transcriptions/*/*.tsv``; anything else loads
            ``transcriptions/{MODEL_FAMILY}/*.tsv``.
        meta_demographics_file: Path to CSV file with all demographic data.
            Only rows whose ``Status`` is 'APPROVED' are used.
        verbose: Print loading progress

    Returns:
        DataFrame with one row per transcription that has a matching approved
        participant. Added/derived columns: ``transcription_og`` (raw
        transcription), ``prompt`` (missing -> "No prompt"), ``english_only``,
        ``multilingual``, ``not_english``, ``answer`` (normalized target text),
        ``transcription`` (normalized), ``levenshtein_distance``, ``is_correct``,
        ``age_decade`` and ``Sex`` recoded to 0 (Male) / 1 (anything else).

    Raises:
        ValueError: If no TSV files match the requested model family.
    """
    # Load all TSV files from the selected model family. glob returns files in
    # arbitrary OS order, so sort to make the row order reproducible.
    if MODEL_FAMILY == 'all':
        pattern = "transcriptions/*/*.tsv"
    else:
        pattern = f"transcriptions/{MODEL_FAMILY}/*.tsv"
    tsv_files = sorted(glob.glob(pattern))

    if len(tsv_files) == 0:
        raise ValueError(f"No TSV files found for MODEL_FAMILY='{MODEL_FAMILY}'")

    # concat handles the single-file case as well, so no special branch is needed.
    data = pd.concat([pd.read_csv(f, sep='\t') for f in tsv_files], ignore_index=True)

    if verbose:
        print(f"Loaded {len(data)} rows from {len(tsv_files)} file(s) ({MODEL_FAMILY})")

    # Keep the raw model output before normalization overwrites `transcription`.
    data['transcription_og'] = data['transcription']
    if 'prompt' in data.columns:
        data['prompt'] = data['prompt'].fillna("No prompt")
    else:
        data['prompt'] = "No prompt"
    data = data.drop_duplicates()

    # Load demographics from meta file. Copy the filtered slice so the column
    # assignment below does not write into a view (SettingWithCopyWarning).
    demo = pd.read_csv(meta_demographics_file)
    demo = demo[demo['Status'] == 'APPROVED'].copy()
    if verbose:
        print(f"Loaded {len(demo)} approved records from meta demographics file")

    demo['Primary language'] = demo['Primary language'].apply(process_primary_language)

    # Join transcription data with demographics. The two indexes have different
    # names, and whether the joined index keeps a name depends on the pandas
    # version; name it explicitly so reset_index always yields `participant_id`
    # as the first column and never touches a data column called 'index'.
    joined = data.set_index("participant_id").join(demo.set_index("Participant id"), how='left')
    data = joined.rename_axis("participant_id").reset_index()

    # Rows without demographics (participant not approved / not found) are dropped.
    data = data.dropna(subset=['Primary language'])

    data['english_only'] = data['Primary language'].apply(check_english_only)
    data['multilingual'] = data['Primary language'].apply(check_multilingual)
    data['not_english'] = data['Primary language'].apply(check_not_english)

    # The target answer is the text after the last ':' in `original_text`,
    # with double quotes removed, lower-cased and then normalized.
    data['answer'] = data['original_text'].apply(lambda x: str(x).split(":")[-1].replace('"', "").lower())
    data['answer'] = data['answer'].apply(lambda x: normalize_text(x))
    data['transcription'] = data['transcription'].apply(lambda x: normalize_text(x))

    data['levenshtein_distance'] = data.apply(lambda row: levenshtein_distance(row['answer'], row['transcription']), axis=1)
    data['is_correct'] = data['levenshtein_distance'] == 0

    data['Age'] = data['Age'].astype(int)
    data['age_decade'] = data['Age'] // 10

    # prefer not to say grouped as 1
    data['Sex'] = data['Sex'].apply(lambda x: 0 if x == 'Male' else 1)

    return data


In [None]:

# Path to the demographics CSV that read_transcription_data joins onto the transcriptions.
meta_demographics_file = "demographic_data/grandfather_task_demographics.csv"

data = read_transcription_data(MODEL_FAMILY, meta_demographics_file=meta_demographics_file)

# Replace non-breaking spaces with regular spaces so that identical prompts
# compare equal (the same prompt was passed in; only the whitespace differs).
data['prompt'] = data['prompt'].str.replace('\xa0', ' ', regex=False)

# NOTE(review): the file has a .tsv extension but is read with read_csv's
# default comma separator — confirm the file's actual delimiter.
street_origin = pd.read_csv("../street_names.tsv")
street_origin['name'] = street_origin['name'].str.lower()


# Left-join street metadata onto the normalized answer (street name).
data = data.set_index("answer").join(street_origin.set_index("name"), how='left').reset_index()
# Column labels are reassigned POSITIONALLY: this list must match the joined
# frame's column count and order exactly, otherwise pandas raises (length
# mismatch) or columns get silently mislabeled (order mismatch).
data.columns = ['answer', 'participant_id', 'index', 'model', 'original_text',
       'transcription', 'transcription_og', 'prompt', 'Submission id',
       'Status', 'Custom study tncs accepted at', 'Started at', 'Completed at',
       'Reviewed at', 'Archived at', 'Time taken', 'Completion code',
       'Total approvals', 'Primary language', 'Age', 'Sex',
       'Ethnicity simplified', 'Country of birth', 'Country of residence',
       'Nationality', 'Language', 'Student status', 'Employment status',
       'english_only', 'multilingual', 'not_english', 'levenshtein_distance',
       'is_correct', 'age_decade', 'origin']
# Exact string match between normalized transcription and answer; `is_correct`
# (set in the loader) is the Levenshtein-distance-zero version of the same check.
data['correct'] = data['transcription'] == data['answer']
data['language_family'] = data['Primary language'].apply(group_language_by_family)

# Add model_family column based on model name
# (mapped before the rename below, so the fine-tuned model, whose raw name is
# not a key of MODEL_TO_FAMILY, gets NaN here).
data['model_family'] = data['model'].map(MODEL_TO_FAMILY)

data['language_group'] = data['Primary language'].apply(categorize_language)
# Cast bool -> float so means/bootstraps operate on 0.0/1.0 values.
data['is_correct'] = data['is_correct'].astype(float)

# Give the fine-tuned checkpoint a short display name.
data['model'] = data['model'].replace("whisper_large_synthetic_16633_20251216_160230", "large-finetuned")
print(f"\nModel families loaded: {data['model_family'].unique().tolist()}")
print(f"Models per family:\n{data.groupby('model_family')['model'].unique()}")


In [None]:
#TODO some data cleaning



In [None]:
# Non-null value counts per column for each participant, restricted to the 'base' model rows.
base_rows = data.loc[data['model'] == 'base']
base_rows.groupby("participant_id").count()

In [None]:
# One row per participant, used for the demographic summary tables.
# random_state makes the sampled row (and therefore the per-column non-null
# counts) reproducible across runs.
participant_data = data.groupby("participant_id").sample(n=1, random_state=0)

# Use the actual number of participants as the denominator instead of a
# hard-coded count, so the proportions stay correct if the data changes.
n_participants = len(participant_data)
sex_counts = participant_data.groupby("Sex").count()
display(sex_counts)
display(sex_counts / n_participants)

In [None]:
# Counts and proportions of participants per age decade. The denominator is
# the actual participant count rather than a hard-coded number.
age_decade_counts = participant_data.groupby("age_decade").count()
display(age_decade_counts)
display((age_decade_counts / len(participant_data)).round(3))

In [None]:
# Counts and proportions of participants per language group. The denominator
# is the actual participant count rather than a hard-coded number.
language_group_counts = participant_data.groupby("language_group").count()
display(language_group_counts)
display(language_group_counts / len(participant_data))

In [None]:
# Collect the distinct individual languages reported by participants.
# Each "Primary language" entry is a comma-separated string; split it directly
# instead of round-tripping the whole list through str(), which depends on
# Python's repr quoting/escaping rules.
values = [
    language.replace("'", "").replace(" ", "").replace("[", "").replace("]", "")
    for entry in participant_data["Primary language"].unique()
    for language in entry.split(",")
]

unique_languages = set(values)
print(unique_languages)
print(len(unique_languages))

In [None]:

# Bootstrap sampling function with confidence intervals
def bootstrap_accuracy(data, n_bootstrap=10000, confidence_level=0.95, random_state=None):
    """
    Estimate mean accuracy and a percentile bootstrap confidence interval.

    Rows of ``data`` are resampled with replacement ``n_bootstrap`` times; each
    resample has the same number of rows as ``data`` and contributes the mean
    of its ``is_correct`` values (NaNs ignored, as pandas' ``mean`` does).

    Parameters:
    -----------
    data : DataFrame
        Must contain a numeric/boolean ``is_correct`` column.
    n_bootstrap : int
        Number of bootstrap resamples (default: 10000)
    confidence_level : float
        Coverage of the percentile interval (default: 0.95)
    random_state : int, numpy Generator/SeedSequence or None
        Seed passed to ``np.random.default_rng``. ``None`` (default) draws
        fresh entropy, so results vary between calls; pass an int for
        reproducible intervals.

    Returns:
    --------
    dict with keys ``mean`` (observed mean of ``is_correct``), ``lower`` and
    ``upper`` (percentile CI bounds) and ``std`` (standard deviation of the
    bootstrap means). All values are NaN when ``data`` is empty.
    """
    n = len(data)
    if n == 0:
        return {'mean': np.nan, 'lower': np.nan, 'upper': np.nan, 'std': np.nan}

    outcomes = data['is_correct'].to_numpy(dtype=float)
    rng = np.random.default_rng(random_state)

    # Store bootstrap accuracies
    bootstrap_accuracies = np.empty(n_bootstrap)

    # Resample row indices with NumPy instead of calling DataFrame.sample once
    # per iteration (orders of magnitude faster). Work in batches so the index
    # matrix stays around one million entries regardless of n.
    batch_size = max(1, 1_000_000 // n)
    for start in range(0, n_bootstrap, batch_size):
        stop = min(start + batch_size, n_bootstrap)
        resample_idx = rng.integers(0, n, size=(stop - start, n))
        bootstrap_accuracies[start:stop] = np.nanmean(outcomes[resample_idx], axis=1)

    # Calculate confidence interval
    alpha = 1 - confidence_level
    lower_percentile = (alpha / 2) * 100
    upper_percentile = (1 - alpha / 2) * 100

    ci_lower = np.percentile(bootstrap_accuracies, lower_percentile)
    ci_upper = np.percentile(bootstrap_accuracies, upper_percentile)

    return {
        'mean': data['is_correct'].mean(),
        'lower': ci_lower,
        'upper': ci_upper,
        'std': np.std(bootstrap_accuracies)
    }

def calculate_bootstrap_by_model_group(data, n_bootstrap=10000):
    """
    Bootstrap accuracy for every (model, language_group) combination in ``data``.

    Parameters:
    -----------
    data : DataFrame
        Must contain ``model``, ``language_group`` and ``is_correct`` columns.
    n_bootstrap : int
        Number of bootstrap resamples per group (default: 10000)

    Returns:
    --------
    DataFrame with columns ``model``, ``language_group``, ``mean``, ``lower``,
    ``upper``, ``std`` and ``model_size`` (looked up in ``model_sizes``; NaN
    for models without an entry). The frame is empty, but still has these
    columns, when ``data`` contains no groups.
    """
    result_columns = ['model', 'language_group', 'mean', 'lower', 'upper', 'std']
    results = []

    for (model, language_group), group_data in data.groupby(['model', 'language_group'], observed=True):
        boot_result = bootstrap_accuracy(group_data, n_bootstrap=n_bootstrap)
        results.append({
            'model': model,
            'language_group': language_group,
            'mean': boot_result['mean'],
            'lower': boot_result['lower'],
            'upper': boot_result['upper'],
            'std': boot_result['std']
        })

    # Passing `columns` keeps the schema stable even when `results` is empty;
    # without it the 'model' lookup below raises KeyError on an empty frame.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df['model_size'] = results_df['model'].map(model_sizes)

    return results_df

In [None]:
# Mean accuracy for each (prompt, language group) pair.
prompt_group_accuracy = data.groupby(['prompt', 'language_group'])[['is_correct']].mean()
prompt_group_accuracy


In [None]:
# Visualization with bootstrap confidence bands

def visualize_data_with_confidence_bands(data, ax, prompt, n_bootstrap=10000, colors=None):
    """
    Draw accuracy versus model size for each language group on ``ax``, with
    shaded 95% bootstrap confidence bands.

    Parameters:
    -----------
    data : DataFrame
        Rows to plot; needs ``Primary language``, ``model`` and ``is_correct``.
    ax : matplotlib axis
        Axis to draw on (its x-scale is switched to log).
    prompt : str
        Prompt text; truncated and wrapped to form the subplot title.
    n_bootstrap : int
        Number of bootstrap samples (default: 10000)
    colors : dict
        Maps language group name -> color (default: None, uses built-in palette)
    """
    # Work on a copy so the caller's frame is left untouched, then (re)derive
    # the language group from the primary language.
    plot_frame = data.copy()
    plot_frame['language_group'] = plot_frame['Primary language'].apply(categorize_language)

    # Per (model, language group) bootstrap statistics
    print(f"Calculating bootstrap confidence intervals for: {prompt[:50]}...")
    summary = calculate_bootstrap_by_model_group(plot_frame, n_bootstrap=n_bootstrap)

    # Default palette for the three language groups
    if colors is None:
        colors = {'English only': '#59A14F', 'Multilingual (English)': '#76B7B2', 'Non-English': '#E15759'}

    # Keep only models that have a known size in model_sizes
    summary = summary.dropna(subset=['model_size'])

    # Model sizes span orders of magnitude, so use a log x-axis
    ax.set_xscale('log')

    # One line plus confidence band per language group
    for group_name in ('English only', 'Multilingual (English)', 'Non-English'):
        group_rows = summary[summary['language_group'] == group_name]
        group_rows = group_rows.sort_values('model_size').reset_index(drop=True)

        if len(group_rows) == 0:
            continue

        group_color = colors[group_name]
        sizes = group_rows['model_size'].values
        means = group_rows['mean'].values
        lowers = group_rows['lower'].values
        uppers = group_rows['upper'].values

        # Shaded confidence band
        ax.fill_between(sizes, lowers, uppers, alpha=0.2, color=group_color)

        # Faint connecting line
        ax.plot(sizes, means, linewidth=2, alpha=0.3, color=group_color)

        # Fully opaque markers; these carry the legend label
        ax.plot(sizes, means, marker='o', linewidth=0, markersize=8,
                color=group_color, label=group_name)

    ax.set_xlabel('Model Size (Millions of Parameters)', fontsize=12)
    ax.set_ylabel('Accuracy', fontsize=12)

    # Title: first 80 characters of the prompt, wrapped at 50 characters per line
    title_lines = textwrap.wrap(prompt[:80], width=50)
    ax.set_title('\n'.join(title_lines), fontsize=14)

    ax.set_ylim(0, 1.0)
    ax.set_xlim(25, 20000)
    ax.grid(True, alpha=0.3)
    


In [None]:

# Create one subplot per prompt, each with bootstrap confidence bands
print("Creating visualization with bootstrap confidence bands...")
print("Note: This may take a few minutes due to 10,000 bootstrap samples per group.\n")

sns.set_style("whitegrid")

# Get unique prompts, filtering out NaN values, and sort
unique_prompts = sorted([p for p in data['prompt'].unique() if pd.notna(p)])

# Size the grid from the number of prompts instead of assuming exactly three;
# squeeze=False keeps `axes` indexable even when there is a single prompt.
n_panels = max(len(unique_prompts), 1)
fig, axes = plt.subplots(figsize=(5 * n_panels, 6), nrows=1, ncols=n_panels, squeeze=False)
axes = axes.ravel()

for n, prompt in enumerate(unique_prompts):
    visualize_data_with_confidence_bands(data[data['prompt'] == prompt], axes[n], prompt, n_bootstrap=10000)

# Update axis labels to match style
for ax in axes:
    ax.set_xlabel('Model Size (Millions of Parameters)', fontsize=14)
    ax.set_ylabel('Average Accuracy', fontsize=14)

# Create a single shared legend below all subplots
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, title='Language Group', 
           bbox_to_anchor=(0.5, -0.02), loc='upper center', ncol=3, 
           frameon=True, fontsize=12, title_fontsize=12)

fig.suptitle('Transcription Accuracy by Prompt Type w/ 95% CI', fontsize=16, y=1.02)

plt.tight_layout()

# Save to the figures folder. The figure is written as a PNG, so build the
# path once and report that same path (the previous message announced a PDF
# file that was never written).
output_dir = "figures"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "accuracy_by_figure_2.png")

plt.savefig(output_path, dpi=150, bbox_inches='tight')
print(f"\n✓ Saved figure to: {output_path}")

plt.show()

print("\n✓ Visualization complete with 95% bootstrap confidence intervals!")



In [None]:
# Single graph with one line per prompt type
# Shows average performance across all language groups

sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(10, 8))

# Get unique prompts
unique_prompts = sorted([p for p in data['prompt'].unique() if pd.notna(p)])

# Colors and legend labels, matched to the prompts by their sorted position.
# Colors cycle if there are more prompts than colors; prompts without a
# hand-written label fall back to their first 30 characters.
prompt_colors = ['#2E86AB', '#F18F01', '#A23B72']
prompt_labels = ['No prompt', 'Prompt - The user is going to give you their location via an address ', 'The user is going to give you their location via one of the following addresses:...']

ax.set_xscale('log')

for idx, prompt in enumerate(unique_prompts):
    prompt_data = data[data['prompt'] == prompt].copy()
    prompt_color = prompt_colors[idx % len(prompt_colors)]

    # Only models with a known parameter count can be placed on the x-axis
    sized_models = [m for m in prompt_data['model'].unique() if m in model_sizes]

    # Calculate bootstrap CI for each model (across all language groups)
    ci_results = []
    for model in sized_models:
        model_data = prompt_data[prompt_data['model'] == model]
        if len(model_data) > 0:
            boot_result = bootstrap_accuracy(model_data, n_bootstrap=10000)
            ci_results.append({
                'model': model,
                'model_size': model_sizes.get(model, np.nan),
                'mean': boot_result['mean'],
                'lower': boot_result['lower'],
                'upper': boot_result['upper']
            })

    # Nothing to draw for this prompt (also avoids a KeyError on an empty frame)
    if not ci_results:
        continue

    ci_df = pd.DataFrame(ci_results).dropna(subset=['model_size']).sort_values('model_size')

    if len(ci_df) > 0:
        x = ci_df['model_size'].values
        y_mean = ci_df['mean'].values
        y_lower = ci_df['lower'].values
        y_upper = ci_df['upper'].values

        # Plot confidence band
        ax.fill_between(x, y_lower, y_upper, alpha=0.2, color=prompt_color)

        # Plot line
        ax.plot(x, y_mean, marker='o', linewidth=2, markersize=8, 
                color=prompt_color, label=prompt_labels[idx] if idx < len(prompt_labels) else prompt[:30])

        # Add model labels (only for first prompt to avoid clutter)
        if idx == 0:
            label_y = 0.98  # Fixed y-coordinate for all labels
            for i, row in ci_df.iterrows():
                # Draw vertical line from label to data point
                ax.plot([row['model_size'], row['model_size']], [row['mean'], label_y - 0.07], 
                       color='gray', linewidth=0.5, linestyle='-', alpha=0.5)
                # Add label at fixed y position
                ax.text(row['model_size'], label_y - 0.07, row['model'], 
                       ha='center', va='bottom', fontsize=14, rotation=30)

# Add dashed horizontal lines for average accuracy of each prompt
for idx, prompt in enumerate(unique_prompts):
    prompt_data = data[data['prompt'] == prompt]
    prompt_color = prompt_colors[idx % len(prompt_colors)]
    avg_accuracy = prompt_data['is_correct'].mean()
    ax.axhline(y=avg_accuracy, color=prompt_color, linestyle='--', linewidth=.7, alpha=0.7)
    # Add label on right side (x is just past the axis limit, so it sits outside the plot area)
    ax.text(22000, avg_accuracy, f'{avg_accuracy:.2f}', 
            color=prompt_color, fontsize=14, va='center')

ax.set_xlabel('Model Size in Millions of Parameters (Log Scale)', fontsize=14)
ax.set_ylabel('Average Accuracy', fontsize=14)
ax.set_title('Transcription Accuracy by Prompt Type w/ 95% CI', 
             fontsize=16)
ax.set_ylim(0, 1.0)
ax.set_xlim(25, 20000)
ax.grid(True, alpha=0.3)
ax.legend(title='Prompt Type', fontsize=12, title_fontsize=12, loc='lower right')

# Apply the layout BEFORE saving so the file on disk matches what is shown,
# and make sure the output folder exists when this cell is run on its own.
plt.tight_layout()
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/overall_accuracies_figure_1.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Number of bootstrap resamples behind each error bar. The figure is labelled
# as a 95% CI, which needs far more than a handful of resamples for the
# 2.5th/97.5th percentiles to be meaningful; lower this only for a quick draft.
N_BOOTSTRAP_BAR_CHART = 10000

# Baseline subset: no prompt, fine-tuned model excluded. Both conditions are
# combined into ONE mask; chaining two boolean selections indexes the
# already-filtered frame with a mask built on the full frame.
baseline_data = data[(data['prompt'] == 'No prompt') & (data['model'] != 'large-finetuned')]

# Calculate grouped means
grouped_means = baseline_data.groupby(['model', 'language_group']).mean(numeric_only=True).sort_values('is_correct', ascending=False)[['is_correct']]

# Calculate confidence intervals for each group using the bootstrap_accuracy function
ci_results = []
for (model, language_group), group_data in baseline_data.groupby(['model', 'language_group'], observed=True):
    boot_result = bootstrap_accuracy(group_data, n_bootstrap=N_BOOTSTRAP_BAR_CHART)
    ci_results.append({
        'model': model,
        'language_group': language_group,
        'mean': boot_result['mean'],
        'ci_lower': boot_result['lower'],
        'ci_upper': boot_result['upper'],
        # matplotlib's errorbar expects distances from the mean, not absolute bounds
        'error_lower': boot_result['mean'] - boot_result['lower'],
        'error_upper': boot_result['upper'] - boot_result['mean'],
        'n': len(group_data),
        'std': boot_result['std']
    })

ci_df = pd.DataFrame(ci_results)

# Add model_family to ci_df
ci_df['model_family'] = ci_df['model'].map(MODEL_TO_FAMILY)
ci_df = ci_df.sort_values(by='mean', ascending=True)

# Get unique model families (excluding NaN)
model_families = [f for f in ci_df['model_family'].unique() if pd.notna(f)]

# Define colors for language groups
colors = {
    'English only': '#2E86AB', 
    'Multilingual (English)': '#F18F01', 
    'Non-English': '#A23B72'
}
language_groups = ['English only', 'Multilingual (English)', 'Non-English']

# Create single figure with horizontal subplots scaled by number of models
n_families = len(model_families)

# Calculate width ratios based on number of models in each family
width_ratios = [len(ci_df[ci_df['model_family'] == f]['model'].unique()) for f in model_families]
total_models = sum(width_ratios)

fig, axes = plt.subplots(1, n_families, figsize=(1.2 * total_models, 8), sharey=True,
                         gridspec_kw={'width_ratios': width_ratios})
if n_families == 1:
    axes = [axes]

# Three bars per model, side by side
width = 0.25
offsets = np.array([-width, 0, width])

for idx, family in enumerate(model_families):
    ax = axes[idx]
    family_df = ci_df[ci_df['model_family'] == family]
    models = family_df['model'].unique()

    # Set up bar positions
    n_models = len(models)
    x = np.arange(n_models)

    # Plot bars for each language group
    for i, lang_group in enumerate(language_groups):
        group_data = family_df[family_df['language_group'] == lang_group].copy()
        # Align rows to the model order on the x-axis; missing combinations become NaN
        group_data = group_data.set_index('model').reindex(models).reset_index()

        bars = ax.bar(x + offsets[i], 
                      group_data['mean'], 
                      width, 
                      label=lang_group,
                      color=colors[lang_group],
                      alpha=0.8,
                      edgecolor='white',
                      linewidth=1.5)

        # Only plot error bars for non-NaN values
        valid_mask = ~group_data['mean'].isna()
        if valid_mask.any():
            ax.errorbar(x[valid_mask.to_numpy()] + offsets[i], 
                        group_data.loc[valid_mask, 'mean'],
                        yerr=[group_data.loc[valid_mask, 'error_lower'].fillna(0), 
                              group_data.loc[valid_mask, 'error_upper'].fillna(0)],
                        fmt='none',
                        ecolor='black',
                        elinewidth=0.7,
                        capsize=4,
                        capthick=0.7,
                        alpha=0.7)

    ax.set_title(f'{family.upper()}', fontsize=16)
    ax.set_xticks(x)
    ax.set_xticklabels(models, fontsize=12, rotation=45, ha='right')

    ax.set_xlim(-0.5, n_models - 0.5)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_ylim(0, 1.0)

# Only add y-label to first subplot
axes[0].set_ylabel('Accuracy (Mean ± 95% CI)', fontsize=14)

# Add shared legend
handles, labels = axes[0].get_legend_handles_labels()
leg = fig.legend(handles, labels, title='Primary Language', loc='upper center', 
           bbox_to_anchor=(0.5, 0.97), ncol=3, fontsize=11, title_fontsize=11,
           columnspacing=2, handletextpad=0.8, frameon=True, fancybox=False,
           edgecolor='gray', facecolor='white', framealpha=1.0)
leg.get_frame().set_linewidth(0.5)

fig.suptitle('Transcription Accuracy by Model Family and Language Group\n(with 95% Bootstrap Confidence Intervals)', 
             fontsize=14, y=1.02)

# Adjust the layout BEFORE saving so the saved file matches the displayed
# figure, and make sure the output folder exists when this cell runs alone.
plt.subplots_adjust(wspace=0.15, left=0.06, right=0.98, bottom=0.18, top=0.85)
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/potential_figure_2.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Prepare data for heatmap: mean accuracy for each (street name, primary
# language) pair, using only rows transcribed without a prompt.
grouped_data = data[data['prompt']=='No prompt'].groupby(["answer", "Primary language"]).mean(numeric_only=True)[['is_correct']]

# Pivot the data for heatmap (Primary language as rows, answer as columns)
heatmap_data = grouped_data.reset_index().pivot(index='Primary language', columns='answer', values='is_correct')

# Sort y-axis by average accuracy across all streets (descending)
# A temporary helper column holds the row mean and is dropped after sorting.
heatmap_data['_avg_row'] = heatmap_data.mean(axis=1)
heatmap_data = heatmap_data.sort_values('_avg_row', ascending=False)
heatmap_data = heatmap_data.drop('_avg_row', axis=1)

# Sort x-axis by average accuracy across all languages (descending)
col_averages = heatmap_data.mean(axis=0).sort_values(ascending=False)
heatmap_data = heatmap_data[col_averages.index]

# Create the heatmap
# Diverging colormap centered at 0.5 on a fixed 0-1 accuracy scale.
fig, ax = plt.subplots(figsize=(18, 6))
sns.heatmap(heatmap_data, annot=False, cmap='RdYlGn', center=0.5, 
            vmin=0, vmax=1, cbar_kws={'label': 'Accuracy'}, 
            linewidths=0.5, linecolor='gray', ax=ax)
ax.set_title('Transcription Accuracy by Street Name and Language', fontsize=16, pad=15)
ax.set_xlabel('')
ax.set_ylabel('Language', fontsize=12)
# Clean up y-tick labels: remove "English" unless it's the only language
ytick_labels = [label.get_text() for label in ax.get_yticklabels()]
cleaned_labels = []
for label in ytick_labels:
    if label.lower().strip() == 'english':
        # English-only rows keep their label unchanged
        cleaned_labels.append(label)
    else:
        # Remove "English, " or ", English" from the label
        cleaned = label.replace('English, ', '').replace(', English', '')
        cleaned_labels.append(cleaned)
ax.set_yticklabels(cleaned_labels, fontsize=10)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=8)

# Layout is adjusted before saving, so the saved file matches the displayed figure.
plt.subplots_adjust(left=0.15, right=0.95, top=0.92, bottom=0.25)
plt.savefig(f'figures/all_models_accuracy_by_street_language.png', dpi=150, bbox_inches='tight')
plt.show()





In [None]:
# Denominator used to turn LEP head counts into proportions.
# NOTE(review): presumably the total LEP population for San Francisco — confirm
# the source of this figure (it is not derived from the file loaded below).
TOTAL_LEP_POPULATION = 151388

lep_data = pd.read_csv("population_by_language.tsv", sep="\t")
# Copy the column subset so the assignments below write to an independent
# frame rather than a slice of the original (avoids SettingWithCopyWarning).
lep_data = lep_data[['Language', 'LEP_Population']].copy()
lep_data['LEP_Population'] = lep_data['LEP_Population'].astype(int)
lep_data['LEP_Population_Percent'] = lep_data['LEP_Population'] / TOTAL_LEP_POPULATION
lep_data = lep_data.sort_values('LEP_Population_Percent', ascending=False)

# Create visualization
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))

# Create bar chart
bars = ax.barh(lep_data['Language'], lep_data['LEP_Population_Percent'], color='steelblue', alpha=0.8)

# Add value labels just to the right of each bar
for bar, val in zip(bars, lep_data['LEP_Population_Percent']):
    ax.text(val + 0.005, bar.get_y() + bar.get_height()/2, 
            f'{val:.1%}', va='center', fontsize=9)

ax.set_xlabel('Proportion of LEP Population', fontsize=12)
ax.set_ylabel('Language', fontsize=12)
ax.set_title('Limited English Proficiency (LEP) Population by Language\n(San Francisco)', 
             fontsize=14)
# Leave headroom on the right for the value labels
ax.set_xlim(0, max(lep_data['LEP_Population_Percent']) * 1.15)

# Apply the layout BEFORE saving so the file on disk matches what is shown,
# and make sure the output folder exists when this cell is run on its own.
plt.tight_layout()
os.makedirs('figures', exist_ok=True)
plt.savefig('figures/LEP_SF.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# ============================================================================
# OLS Regression: Demographic Features Predicting Transcription Accuracy
# ============================================================================

def _dummies_with_reference(series, prefix, reference=None):
    """
    One-hot encode ``series`` and drop one level as the reference category.

    Returns ``(dummies, reference_label)``. ``reference`` is used as the
    dropped level when it is present in the data; otherwise the first level
    (what ``drop_first=True`` would drop) is used. ``reference_label`` is the
    level actually dropped, or None if the series has no levels.
    """
    full = pd.get_dummies(series, prefix=prefix, dtype=float)
    if full.shape[1] == 0:
        return full, None
    reference_col = f"{prefix}_{reference}"
    if reference is None or reference_col not in full.columns:
        reference_col = full.columns[0]
    return full.drop(columns=reference_col), reference_col[len(prefix) + 1:]


# Prepare the data for regression
regression_data = data.copy()

# Sex dummy (Male=0 as reference). The loader in this notebook has already
# recoded `Sex` to 0 (Male) / 1 (everything else, incl. "prefer not to say"),
# so comparing against the string 'Female' alone would be all zeros. Accept
# both encodings so the dummy is correct either way.
sex = regression_data['Sex']
regression_data['is_female'] = ((sex == 1) | (sex == 'Female')).astype(float)

# Language family dummies. 'Germanic' is the intended reference (it contains
# English); if that label is absent the first level is dropped instead, and
# the level actually used is reported in the summary below.
language_family_dummies, lang_family_reference = _dummies_with_reference(
    regression_data['language_family'], 'lang_family', reference='Germanic')
regression_data = pd.concat([regression_data, language_family_dummies], axis=1)

# Convert boolean columns to float
regression_data['multilingual'] = regression_data['multilingual'].astype(float)
regression_data['not_english'] = regression_data['not_english'].astype(float)

# Create age decade buckets (use existing age_decade column or create from Age)
if 'age_decade' not in regression_data.columns:
    regression_data['age_decade'] = (regression_data['Age'] // 10 * 10).astype(int)

# Age decade dummies (youngest decade as reference)
age_decade_dummies, age_reference = _dummies_with_reference(regression_data['age_decade'], 'age')
regression_data = pd.concat([regression_data, age_decade_dummies], axis=1)

# Take the dummy column names from the dummy frames themselves. Selecting by
# the 'age_' prefix would also pick up the raw `age_decade` column, which is
# an exact linear combination of its own dummies (perfect collinearity).
age_decade_cols = list(age_decade_dummies.columns)
lang_family_cols = list(language_family_dummies.columns)

# Define independent variables (demographic features)
demographic_features = ['is_female', 'multilingual', 'not_english'] + age_decade_cols + lang_family_cols

# Aggregate by participant: mean accuracy per participant, keep demographic features
participant_data = regression_data.groupby('participant_id').agg({
    'is_correct': 'mean',  # Mean accuracy across all street names
    **{feat: 'first' for feat in demographic_features}  # Demographics are constant per participant
}).reset_index()

print(f"Aggregated to {len(participant_data)} participants (from {len(regression_data)} observations)")

# ============================================================================
# Test each feature independently for variance explained
# ============================================================================
print("\n" + "=" * 80)
print("Univariate Tests: Each Feature Tested Independently")
print("=" * 80)

# Prepare y for univariate tests
y_uni = participant_data['is_correct'].values.astype(float)
valid_y = ~np.isnan(y_uni)

# Define feature groups for testing
feature_groups = {
    'Sex (is_female)': ['is_female'],
    'Multilingual': ['multilingual'],
    'Non-English Speaker': ['not_english'],
    'Age Decade': age_decade_cols,
    'Language Family': lang_family_cols
}

univariate_results = []

for group_name, features in feature_groups.items():
    if not features or not all(f in participant_data.columns for f in features):
        continue

    X_uni = participant_data[features].values.astype(float)
    valid_idx = valid_y & ~np.isnan(X_uni).any(axis=1)
    X_valid = X_uni[valid_idx]
    y_valid = y_uni[valid_idx]

    # Always add an intercept column, even if a feature happens to be constant
    X_valid = sm.add_constant(X_valid, has_constant='add')

    # Fit univariate model
    model = sm.OLS(y_valid, X_valid).fit()

    univariate_results.append({
        'Feature': group_name,
        'N Features': len(features),
        'R²': model.rsquared,
        'Adj. R²': model.rsquared_adj,
        'F-statistic': model.fvalue,
        'F p-value': model.f_pvalue,
        'N': len(y_valid)
    })

# Display results as table
univariate_df = pd.DataFrame(univariate_results)
univariate_df['Significant'] = univariate_df['F p-value'].apply(lambda p: '***' if p < 0.001 else '**' if p < 0.01 else '*' if p < 0.05 else '')
print("\n")
print(univariate_df.to_string(index=False, float_format=lambda x: f'{x:.4f}'))
print("\nSignificance: * p<0.05, ** p<0.01, *** p<0.001")

# ============================================================================
# Full Model
# ============================================================================
print("\n" + "=" * 80)
print("Full Model: All Demographic Features")
print("=" * 80)

# Prepare X and y as numpy arrays (from participant-level data)
X = participant_data[demographic_features].values.astype(float)
y = participant_data['is_correct'].values.astype(float)
feature_names = ['const'] + demographic_features

# Drop rows with NaN
valid_idx = ~(np.isnan(X).any(axis=1) | np.isnan(y))
X = X[valid_idx]
y = y[valid_idx]

# Add constant for intercept. has_constant='add' guarantees the column is
# prepended, so X always lines up with `feature_names` for the summary table.
X = sm.add_constant(X, has_constant='add')

# Fit OLS model
ols_model = sm.OLS(y, X).fit()

# Print regression summary
print("=" * 80)
print("OLS Regression: Demographic Features Predicting Transcription Accuracy")
print("=" * 80)
print(f"\nDependent Variable: is_correct (mean accuracy per participant)")
print(f"Number of observations: {len(y)}")
print(f"\nReference categories:")
print(f"  - Sex: Male")
print(f"  - Age: Youngest decade in data (age_decade={age_reference})")
print(f"  - Language family: {lang_family_reference}")
print(f"  - Language background: English only (monolingual)")
print(f"\nFeature names: {feature_names}")
print()
print(ols_model.summary(xname=feature_names))