In [None]:
# ============================================================================
# Import all required libraries
# ============================================================================

import glob
import os
import string
import sys
import textwrap
import unicodedata

sys.path.append('..')

import jiwer
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from IPython.display import Audio, display, HTML
from Levenshtein import distance as levenshtein_distance
from scipy import stats

from utils import (
    normalize_text,
    process_primary_language,
    categorize_language,
    group_language_by_family,
    check_english_only,
    check_multilingual,
    check_not_english,
    read_transcription_data,
)

# ============================================================================
# Configuration
# ============================================================================
# Typography shared by every matplotlib figure in the notebook.
matplotlib.rcParams['font.family'] = 'Arial'
matplotlib.rcParams['font.size'] = 24

# Cap the number of rows pandas renders when a frame is displayed.
pd.set_option('display.max_rows', 20)

# Set to small for quick testing, normally set to 10,000
BOOTSTRAP_SAMPLES = 10000

# Seed NumPy's global RNG so the stochastic steps below (bootstrap resampling,
# `groupby(...).sample(...)`) give the same numbers on every
# Restart Kernel -> Run All. pandas' `.sample` draws from this global RNG
# when no `random_state` is passed.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)


In [None]:
# Family selector handed to read_transcription_data: load all model families together
MODEL_FAMILY = "all"

# Members of each model family. The listing order fixes the key order of
# MODEL_TO_FAMILY, which is later turned into the list of allowed models.
_MODELS_BY_FAMILY = {
    'whisper': ['tiny', 'base', 'small', 'medium', 'large'],
    'phi4': ['phi-4-multimodal'],
    'deepgram': [
        'nova-2',
        'nova-3',
        'enhanced-phonecall',
        'enhanced-general',
        'base-phonecall',
        'base-general',
        'telephony',
    ],
    'googlev2': ['chirp_2', 'chirp_3'],
}

# Model to model family mapping (model name -> family name)
MODEL_TO_FAMILY = {
    model_name: family_name
    for family_name, member_models in _MODELS_BY_FAMILY.items()
    for model_name in member_models
}

# Model sizes (in millions of parameters) for use in visualizations.
# Only models listed here appear on the model-size plots.
# NOTE(review): confirm the 14000 entry for 'phi-4-multimodal' against the
# published parameter count for that model.
model_sizes = dict([
    ('tiny', 39),
    ('base', 74),
    ('small', 244),
    ('medium', 769),
    ('large', 1550),
    ('phi-4-multimodal', 14000),
])


In [None]:
# Get list of allowed models (excluding finetuned models)
allowed_models = list(MODEL_TO_FAMILY.keys())

data = read_transcription_data(MODEL_FAMILY, allowed_models=allowed_models)

# Street-name metadata; names are lower-cased so they can be matched against
# the `answer` column in the join below.
# NOTE(review): the file has a .tsv extension but is read with the default
# comma separator -- confirm the delimiter matches the file contents.
street_origin = pd.read_csv("../street_names.tsv")
street_origin['name'] = street_origin['name'].str.lower()

# Left join keeps every transcription row, including streets without metadata.
data = data.set_index("answer").join(street_origin.set_index("name"), how='left').reset_index()
# Columns are renamed positionally, so this list must stay in sync with the
# column order produced by read_transcription_data plus the joined columns
# (pandas raises if the number of names does not match).
data.columns = ['answer', 'participant_id', 'index', 'model', 'prompt', 'original_text',
       'transcription', 'transcription_og', 'Status', 'Primary language',
       'Age', 'Sex', 'Language', 'english_only', 'multilingual', 'not_english',
       'levenshtein_distance', 'is_correct', 'age_decade', 'origin']

# Add model_family column based on model name
data['model_family'] = data['model'].map(MODEL_TO_FAMILY)

# Filter to only include models in MODEL_TO_FAMILY dictionary.
# `.copy()` makes the filtered result an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
data = data[data['model'].isin(allowed_models)].copy()
print(f"After filtering: {len(data)} rows, {data['model'].nunique()} unique models")
print(f"Models included: {sorted(data['model'].unique())}")

data['language_group'] = data['Primary language'].apply(categorize_language)
data['is_correct'] = data['is_correct'].astype(float)


In [None]:
# Very noisy data / input error: drop both affected participants in one pass.
# To hear the audio files, go to the bottom of the notebook
excluded_participants = ['PARTICIPANT_001', 'PARTICIPANT_002']
data = data[~data['participant_id'].isin(excluded_participants)]

In [None]:
# Deduplicate transcriptions: for every participant/street/model/prompt
# combination only the first occurrence is kept.
rows_before = len(data)
repeated_rows = data.duplicated(subset=['participant_id', 'answer', 'model', 'prompt'], keep='first')
data = data[~repeated_rows]
print(f"Deduplication: {rows_before:,} → {len(data):,} rows (removed {rows_before - len(data):,})")

In [None]:
# Prompt filtering: only Whisper and Phi4 models keep their prompted
# transcriptions; every other model (Deepgram, Google) keeps "No prompt" rows only.
whisper_models = ['tiny', 'base', 'small', 'medium', 'large']
phi4_models = ['phi-4-multimodal']
models_with_all_prompts = whisper_models + phi4_models

rows_before = len(data)
# A row survives when its model is whisper/phi4 OR its prompt is "No prompt".
supports_prompts = data['model'].isin(models_with_all_prompts)
is_unprompted = data['prompt'] == 'No prompt'
data = data[supports_prompts | is_unprompted]
print(f"Prompt filtering: {rows_before:,} → {len(data):,} rows (removed {rows_before - len(data):,})")


In [None]:
# Number of participants remaining after filtering: one randomly drawn row is
# taken per participant_id and the rows are counted.
len(data.groupby("participant_id").sample(n=1))

In [None]:
# Data quality check: every (model, prompt) pair should cover the same number
# of rows (participants x streets, 29*78 per the original note).
# Note: the column-width option set here stays in effect for later cells.
pd.set_option('display.max_colwidth', 10)
rows_per_model_prompt = data.groupby(["model", "prompt"]).count()
display(rows_per_model_prompt.sort_values("is_correct", ascending=False).reset_index())

In [None]:
# Add is_correct_prefix: Check if "i'm on" appears in original_text and transcription
def check_prefix_correct(row):
    """Score whether the spoken prefix "i'm on" survived transcription.

    Both texts are passed through normalize_text first (per the original
    note, this is what makes variants such as "i am" count as "i'm").

    Returns:
        NaN  when original_text or transcription is missing,
        0    when the normalized original text does not contain the prefix,
        1.0 / 0.0 otherwise, depending on whether the normalized
        transcription contains the prefix.
    """
    prefix = "i'm on"

    if any(pd.isna(row[field]) for field in ('original_text', 'transcription')):
        return np.nan

    # Normalize both sides up front so the comparison is like-for-like
    normalized_original = normalize_text(row['original_text'])
    normalized_transcription = normalize_text(row['transcription'])

    # Guard clause: without the prefix in the original there is nothing to match
    if prefix not in normalized_original:
        return 0

    return float(prefix in normalized_transcription)


# Add is_correct_street_name: Check if the street name (answer) appears in the transcription
def check_street_name_correct(row):
    """
    Check whether the street name appears in the transcription.

    Note: row['answer'] and row['transcription'] are expected to be already
    normalized (per the original note, by read_transcription_data), so a
    case-insensitive substring test is all that is done here.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) with 'answer' and 'transcription'.

    Returns
    -------
    float
        1.0 if the street name is a substring of the transcription, else 0.0.
        A missing or blank answer, or a missing transcription, scores 0.0.
    """
    # A missing transcription must be caught explicitly: str(NaN) is the
    # literal text 'nan', which could otherwise be matched by accident.
    if pd.isna(row['answer']) or pd.isna(row['transcription']):
        return 0.0

    answer = str(row['answer']).lower().strip()
    transcription = str(row['transcription']).lower().strip()

    # An empty string is a substring of everything; never count that as a hit.
    if not answer:
        return 0.0

    # Check if the street name appears in the transcription
    return float(answer in transcription)

# Add word error rate calculation using jiwer library
def calculate_wer(row):
    """Word Error Rate (WER) of the transcription against the street name.

    The reference is row['answer'] and the hypothesis is
    row['transcription']; both are stringified and stripped of surrounding
    whitespace before being handed to jiwer.wer.

    Returns NaN when either field is missing, otherwise the WER as a float.
    """
    if any(pd.isna(row[field]) for field in ('answer', 'transcription')):
        return np.nan

    reference = str(row['answer']).strip()
    hypothesis = str(row['transcription']).strip()
    return float(jiwer.wer(reference, hypothesis))



## Numbers for paper

In [None]:
# Analysis for no prompt: derive per-row metrics on the unprompted
# transcriptions, then summarise them overall, per language group, and for
# the 'large' model alone.
metric_columns = ['is_correct', 'is_correct_street_name', 'is_correct_prefix', 'word_error_rate']

no_prompt_df = data[data['prompt']=='No prompt'].copy()
for column_name, row_metric in (
    ('is_correct_prefix', check_prefix_correct),
    ('is_correct_street_name', check_street_name_correct),
    ('word_error_rate', calculate_wer),
):
    no_prompt_df[column_name] = no_prompt_df.apply(row_metric, axis=1)

overall_means = no_prompt_df.mean(numeric_only=True)
display(overall_means[metric_columns])

means_by_language_group = no_prompt_df.groupby("language_group").mean(numeric_only=True)
display(means_by_language_group[metric_columns])

large_model_means = no_prompt_df[no_prompt_df['model']=='large'].mean(numeric_only=True)
display(large_model_means[metric_columns])

In [None]:
# Per-model means of the four no-prompt metrics (columns selected before aggregating).
no_prompt_df.groupby("model")[['is_correct', 'is_correct_street_name', 'is_correct_prefix', 'word_error_rate']].mean(numeric_only=True)

In [None]:
# Overall accuracy for every (prompt, language group) combination
data.groupby(['prompt', 'language_group'])[['is_correct']].mean(numeric_only=True)


In [None]:
# Overall accuracy per prompt
data.groupby("prompt")[['is_correct']].mean(numeric_only=True)

In [None]:
# One (randomly drawn) row per participant, used for the demographic breakdowns
participant_data = data.groupby("participant_id").sample(n=1)
demographic_variables = ["Sex", "age_decade", "language_group"]
n_participants = len(participant_data)

# Participant counts and percentage shares for each demographic variable
for variable in demographic_variables:
    counts_by_level = participant_data.groupby(variable).count()
    display(counts_by_level[['answer']])
    display(((counts_by_level / n_participants).round(3) * 100)[['answer']])

# Accuracy (in percent) over all transcriptions for each demographic variable
for variable in demographic_variables:
    accuracy_by_level = data.groupby([variable]).mean(numeric_only=True)[['is_correct']]
    display(accuracy_by_level.round(3) * 100)




In [None]:
# Means of the four no-prompt metrics split by participant sex
no_prompt_df.groupby("Sex")[['is_correct', 'is_correct_street_name', 'is_correct_prefix', 'word_error_rate']].mean(numeric_only=True)

In [None]:
# Total number of unique languages spoken by participants.
# A "Primary language" entry can list several languages separated by commas,
# so each entry is split and the individual names are collected in a set.
unique_languages = set()
for entry in participant_data["Primary language"].dropna().unique():
    for token in str(entry).split(","):
        # Drop stray quote/bracket characters but keep interior spaces so
        # multi-word language names stay readable.
        language = token.replace("'", "").replace("[", "").replace("]", "").strip()
        if language:
            unique_languages.add(language)

# Sorted so the printed list is identical from run to run
values = sorted(unique_languages)

print("Unique languages spoken by participants", len(values), values)

## Figures

In [None]:
# Bootstrap sampling function with confidence intervals
def bootstrap_accuracy(data, n_bootstrap=None, confidence_level=0.95, random_state=None):
    """
    Percentile-bootstrap confidence interval for the mean of `is_correct`.

    Parameters
    ----------
    data : DataFrame
        Must contain an `is_correct` column.
    n_bootstrap : int, optional
        Number of resamples. Defaults to the notebook-level BOOTSTRAP_SAMPLES,
        looked up at call time so that editing the config cell takes effect
        without re-running this definition.
    confidence_level : float
        Coverage of the interval (default 0.95).
    random_state : int, optional
        Seed for a private RNG. When None, NumPy's global RNG is used, so a
        global `np.random.seed(...)` also makes the result reproducible.

    Returns
    -------
    dict with keys 'mean' (observed mean), 'lower' / 'upper' (interval
    bounds) and 'std' (standard deviation of the bootstrap means). All values
    are NaN when `data` is empty.
    """
    if n_bootstrap is None:
        n_bootstrap = BOOTSTRAP_SAMPLES

    n = len(data)
    if n == 0:
        return {'mean': np.nan, 'lower': np.nan, 'upper': np.nan, 'std': np.nan}

    # Only the is_correct column matters, so resample a plain float array
    # rather than the whole DataFrame on every iteration.
    values = data['is_correct'].to_numpy(dtype=float)
    # Missing values are skipped when averaging, matching pandas' Series.mean
    has_missing = np.isnan(values).any()

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Store bootstrap accuracies
    bootstrap_accuracies = np.empty(n_bootstrap)
    for i in range(n_bootstrap):
        # Sample n rows with replacement
        resample = values[rng.randint(0, n, size=n)]
        bootstrap_accuracies[i] = np.nanmean(resample) if has_missing else resample.mean()

    # Calculate confidence interval from the percentiles of the bootstrap means
    alpha = 1 - confidence_level
    lower_percentile = (alpha / 2) * 100
    upper_percentile = (1 - alpha / 2) * 100

    ci_lower = np.percentile(bootstrap_accuracies, lower_percentile)
    ci_upper = np.percentile(bootstrap_accuracies, upper_percentile)

    return {
        'mean': data['is_correct'].mean(),
        'lower': ci_lower,
        'upper': ci_upper,
        'std': np.std(bootstrap_accuracies)
    }

def calculate_bootstrap_by_model_group(data, n_bootstrap=BOOTSTRAP_SAMPLES):
    """
    Bootstrap the accuracy of every (model, language_group) combination.

    Returns a DataFrame with one row per combination and the columns
    model, language_group, mean, lower, upper, std and model_size, where
    model_size is looked up in `model_sizes` (NaN for models not listed there).
    """
    stat_keys = ('mean', 'lower', 'upper', 'std')
    rows = []

    grouped = data.groupby(['model', 'language_group'], observed=True)
    for (model, language_group), group_data in grouped:
        group_stats = bootstrap_accuracy(group_data, n_bootstrap=n_bootstrap)
        row = {'model': model, 'language_group': language_group}
        row.update((key, group_stats[key]) for key in stat_keys)
        rows.append(row)

    results_df = pd.DataFrame(rows)
    return results_df.assign(model_size=results_df['model'].map(model_sizes))

In [None]:
# Visualization with bootstrap confidence bands

def visualize_data_with_confidence_bands(data, ax, prompt, n_bootstrap=10000, colors=None):
    """
    Visualize accuracy data with 95% confidence bands using bootstrap sampling.

    One line (with shaded band) is drawn per language group, with model size
    on a log-scaled x-axis. Models without an entry in `model_sizes` are left
    out. The caller's `data` is not modified.

    Parameters:
    -----------
    data : DataFrame
        Data to visualize; needs 'Primary language', 'model' and 'is_correct'
    ax : matplotlib axis
        Axis to plot on
    prompt : str
        Prompt text for the title
    n_bootstrap : int
        Number of bootstrap samples (default: 10000)
    colors : dict
        Colors for each language group (default: None, uses default colors)
    """

    # Add language group on a new frame: callers usually pass a filtered
    # slice, and writing a column into it would mutate their data (and
    # trigger SettingWithCopyWarning).
    data = data.assign(language_group=data['Primary language'].apply(categorize_language))

    # Calculate bootstrap confidence intervals
    print(f"Calculating bootstrap confidence intervals for: {prompt[:50]}...")
    bootstrap_results = calculate_bootstrap_by_model_group(data, n_bootstrap=n_bootstrap)

    # Default colors for the three language groups
    if colors is None:
        colors = {'English only': '#59A14F', 'Multilingual (English)': '#76B7B2', 'Non-English': '#E15759'}

    # Filter to only models with sizes in model_sizes
    bootstrap_results = bootstrap_results.dropna(subset=['model_size'])

    # Use log scale for x-axis
    ax.set_xscale('log')

    # Plot each language group as a separate line with confidence bands
    for language_group in ['English only', 'Multilingual (English)', 'Non-English']:
        group_data = bootstrap_results[bootstrap_results['language_group'] == language_group]
        # Sort by model size and reset index to ensure proper alignment
        group_data = group_data.sort_values('model_size').reset_index(drop=True)

        if len(group_data) > 0:
            # Use model sizes for x positions
            x = group_data['model_size'].values
            y_mean = group_data['mean'].values
            y_lower = group_data['lower'].values
            y_upper = group_data['upper'].values

            # Plot confidence band (shaded area)
            ax.fill_between(x, y_lower, y_upper, alpha=0.2, color=colors[language_group])

            # Plot line (lighter)
            ax.plot(x, y_mean, linewidth=2, alpha=0.3, color=colors[language_group])

            # Plot markers with full opacity; only this artist carries the
            # legend label, so each group appears once in the legend
            ax.plot(x, y_mean, marker='o', linewidth=0, markersize=8,
                    color=colors[language_group], label=language_group)

    ax.set_xlabel('Model Size (Millions of Parameters)', fontsize=12)
    ax.set_ylabel('Accuracy', fontsize=12)

    # Wrap title to multiple lines if needed (prompt is cut at 80 characters)
    wrapped_title = '\n'.join(textwrap.wrap(prompt[:80], width=50))
    ax.set_title(wrapped_title, fontsize=14)

    ax.set_ylim(0, 1.0)
    ax.set_xlim(25, 20000)
    ax.grid(True, alpha=0.3)
    


In [None]:

# Create plot with one subplot per prompt and confidence bands
print("Creating visualization with bootstrap confidence bands...")
print(f"Note: This may take a few minutes due to {BOOTSTRAP_SAMPLES:,} bootstrap samples per group.\n")

sns.set_style("whitegrid")

# Get unique prompts, filtering out NaN values, and sort
unique_prompts = sorted([p for p in data['prompt'].unique() if pd.notna(p)])

# One panel per prompt. squeeze=False + ravel() always yields a flat array of
# axes, so the loop below cannot run past the number of panels.
fig, axes = plt.subplots(figsize=(15, 6), nrows=1, ncols=len(unique_prompts), squeeze=False)
axes = axes.ravel()

for n, prompt in enumerate(unique_prompts):
    visualize_data_with_confidence_bands(data[data['prompt'] == prompt], axes[n], prompt, n_bootstrap=BOOTSTRAP_SAMPLES)

# Update axis labels to match style
for ax in axes:
    ax.set_xlabel('Model Size (Millions of Parameters)', fontsize=14)
    ax.set_ylabel('Average Accuracy', fontsize=14)

# Create a single shared legend below all subplots
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, title='Language Group',
           bbox_to_anchor=(0.5, -0.02), loc='upper center', ncol=3,
           frameon=True, fontsize=12, title_fontsize=12)

fig.suptitle('Transcription Accuracy by Prompt Type w/ 95% CI', fontsize=16, y=1.02)

plt.tight_layout()

output_dir = "figures"
os.makedirs(output_dir, exist_ok=True)

# Save the figure and report the path that was actually written
output_path = os.path.join(output_dir, "accuracy_by_figure_2.png")
plt.savefig(output_path, dpi=150, bbox_inches='tight')
print(f"\n✓ Saved figure to: {output_path}")

plt.show()

print("\n✓ Visualization complete with 95% bootstrap confidence intervals!")



In [None]:
# Single graph with three lines - one for each prompt type
# Shows average performance across all language groups

sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(10, 8))

# Get unique prompts. Missing prompts are filtered out first: sorting a mix of
# strings and NaN raises TypeError, and a NaN prompt has no rows to plot.
unique_prompts = sorted([p for p in data['prompt'].unique() if pd.notna(p)])

# Define colors and legend labels for each prompt; both lists are matched to
# the prompts by their position in the sorted `unique_prompts`.
prompt_colors = ['#2E86AB', '#F18F01', '#A23B72']
prompt_labels = ['No prompt', 'Prompt - The user is going to give you their location via an address ', 'The user is going to give you their location via one of the following addresses:...']

ax.set_xscale('log')

# Draw one accuracy-vs-model-size line (with bootstrap confidence band) per prompt.
for idx, prompt in enumerate(unique_prompts):
    prompt_data = data[data['prompt'] == prompt].copy()
    
    # Calculate average accuracy by model (across all language groups)
    model_accuracy = prompt_data.groupby('model').agg({
        'is_correct': 'mean'
    }).reset_index()
    
    # Add model sizes; models without an entry in model_sizes are dropped
    model_accuracy['model_size'] = model_accuracy['model'].map(model_sizes)
    model_accuracy = model_accuracy.dropna(subset=['model_size'])
    model_accuracy = model_accuracy.sort_values('model_size')
    
    # Calculate bootstrap CI for each model
    ci_results = []
    for model in model_accuracy['model'].unique():
        model_data = prompt_data[prompt_data['model'] == model]
        if len(model_data) > 0:
            boot_result = bootstrap_accuracy(model_data, n_bootstrap=BOOTSTRAP_SAMPLES)
            ci_results.append({
                'model': model,
                'model_size': model_sizes.get(model, np.nan),
                'mean': boot_result['mean'],
                'lower': boot_result['lower'],
                'upper': boot_result['upper']
            })
    
    # NOTE(review): if no sized model exists for a prompt, ci_results is empty
    # and the 'model_size' column is missing here -- confirm that cannot happen.
    ci_df = pd.DataFrame(ci_results).dropna(subset=['model_size']).sort_values('model_size')
    
    if len(ci_df) > 0:
        x = ci_df['model_size'].values
        y_mean = ci_df['mean'].values
        y_lower = ci_df['lower'].values
        y_upper = ci_df['upper'].values
        
        # Plot confidence band
        ax.fill_between(x, y_lower, y_upper, alpha=0.2, color=prompt_colors[idx])
        
        # Plot line; falls back to the first 30 characters of the prompt when
        # there is no hand-written label for this position
        ax.plot(x, y_mean, marker='o', linewidth=2, markersize=8, 
                color=prompt_colors[idx], label=prompt_labels[idx] if idx < len(prompt_labels) else prompt[:30])
        
        # Add model labels (only for first prompt to avoid clutter)
        if idx == 0:
            label_y = 0.98  # Fixed y-coordinate for all labels
            for i, row in ci_df.iterrows():
                # Draw vertical line from label to data point
                ax.plot([row['model_size'], row['model_size']], [row['mean'], label_y - 0.07], 
                       color='gray', linewidth=0.5, linestyle='-', alpha=0.5)
                # Add label at fixed y position
                ax.text(row['model_size'], label_y - 0.07, row['model'], 
                       ha='center', va='bottom', fontsize=14, rotation=30)

# Mark each prompt's overall average accuracy with a dashed horizontal line
# and print the value just beyond the right edge of the x-axis range.
for idx, prompt in enumerate(unique_prompts):
    prompt_data = data[data['prompt'] == prompt]
    avg_accuracy = prompt_data['is_correct'].mean()
    line_color = prompt_colors[idx]
    ax.axhline(y=avg_accuracy, color=line_color, linestyle='--', linewidth=.7, alpha=0.7)
    ax.text(
        22000,
        avg_accuracy,
        f'{avg_accuracy:.2f}',
        color=line_color,
        fontsize=14,
        va='center',
    )

# Axis labelling, limits and legend for the prompt-comparison figure
ax.set_xlabel('Model Size in Millions of Parameters (Log Scale)', fontsize=14)
ax.set_ylabel('Average Accuracy', fontsize=14)
ax.set_title('Transcription Accuracy by Prompt Type w/ 95% CI', 
             fontsize=16)
ax.set_ylim(0, 1.0)
ax.set_xlim(25, 20000)
ax.grid(True, alpha=0.3)
ax.legend(title='Prompt Type', fontsize=12, title_fontsize=12, loc='lower right')

# Apply the layout BEFORE saving so the file on disk matches what is shown,
# and make sure the output directory exists when this cell is run on its own.
plt.tight_layout()
os.makedirs('figures', exist_ok=True)
plt.savefig(f'figures/overall_accuracies_figure_1.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Calculate grouped means
grouped_means = no_prompt_df.groupby(['model', 'language_group']).mean(numeric_only=True).sort_values('is_correct', ascending=False)[['is_correct']]

# Calculate confidence intervals for each (model, language group) with
# bootstrap_accuracy. The full BOOTSTRAP_SAMPLES count is used: the figure is
# labelled as showing 95% bootstrap intervals, and percentiles taken from a
# handful of resamples would not support that.
ci_results = []
for (model, language_group), group_data in no_prompt_df.groupby(['model', 'language_group'], observed=True):
    boot_result = bootstrap_accuracy(group_data, n_bootstrap=BOOTSTRAP_SAMPLES)
    ci_results.append({
        'model': model,
        'language_group': language_group,
        'mean': boot_result['mean'],
        'ci_lower': boot_result['lower'],
        'ci_upper': boot_result['upper'],
        # Distances from the mean to each bound, as expected by errorbar's yerr
        'error_lower': boot_result['mean'] - boot_result['lower'],
        'error_upper': boot_result['upper'] - boot_result['mean'],
        'n': len(group_data),
        'std': boot_result['std']
    })

ci_df = pd.DataFrame(ci_results)

# Add model_family to ci_df
ci_df['model_family'] = ci_df['model'].map(MODEL_TO_FAMILY)
ci_df = ci_df.sort_values(by='mean', ascending=True)

# Get unique model families (excluding NaN)
model_families = [f for f in ci_df['model_family'].unique() if pd.notna(f)]

# Define colors for language groups
# (this rebinds the notebook-level name `colors` for the cells that follow)
colors = {
    'English only': '#2E86AB', 
    'Multilingual (English)': '#F18F01', 
    'Non-English': '#A23B72'
}
language_groups = ['English only', 'Multilingual (English)', 'Non-English']

# Create single figure with horizontal subplots scaled by number of models
n_families = len(model_families)

# Calculate width ratios based on number of models in each family
width_ratios = [len(ci_df[ci_df['model_family'] == f]['model'].unique()) for f in model_families]
total_models = sum(width_ratios)

fig, axes = plt.subplots(1, n_families, figsize=(1.2 * total_models, 8), sharey=True,
                         gridspec_kw={'width_ratios': width_ratios})
# plt.subplots returns a bare Axes for a single panel; wrap it so the loop
# below can always index into `axes`
if n_families == 1:
    axes = [axes]

# Bar width and the horizontal offsets of the three language-group bars
# around each model's tick position
width = 0.25
offsets = np.array([-width, 0, width])

# One subplot per model family, with three grouped bars per model
for idx, family in enumerate(model_families):
    ax = axes[idx]
    family_df = ci_df[ci_df['model_family'] == family]
    models = family_df['model'].unique()
    
    # Set up bar positions
    n_models = len(models)
    x = np.arange(n_models)
    
    # Plot bars for each language group
    for i, lang_group in enumerate(language_groups):
        group_data = family_df[family_df['language_group'] == lang_group].copy()
        # Align rows to the family's model order; models with no data for
        # this language group become NaN rows
        group_data = group_data.set_index('model').reindex(models).reset_index()
        
        bars = ax.bar(x + offsets[i], 
                      group_data['mean'], 
                      width, 
                      label=lang_group,
                      color=colors[lang_group],
                      alpha=0.8,
                      edgecolor='white',
                      linewidth=1.5)
        
        
        # Only plot error bars for non-NaN values
        valid_mask = ~group_data['mean'].isna()
        if valid_mask.any():
            ax.errorbar(x[valid_mask] + offsets[i], 
                        group_data.loc[valid_mask, 'mean'],
                        yerr=[group_data.loc[valid_mask, 'error_lower'].fillna(0), 
                              group_data.loc[valid_mask, 'error_upper'].fillna(0)],
                        fmt='none',
                        ecolor='black',
                        elinewidth=0.7,
                        capsize=4,
                        capthick=0.7,
                        alpha=0.7)
    
    
    ax.set_title(f'{family.upper()}', fontsize=16)
    # ax.set_xlabel('Model', fontsize=14)
    ax.set_xticks(x)
    ax.set_xticklabels(models, fontsize=12, rotation=45, ha='right')
    

    ax.set_xlim(-0.5, n_models - 0.5)
    ax.grid(axis='y', alpha=0.3, linestyle='--')
    ax.set_ylim(0, 1.0)

# Only add y-label to first subplot (the y-axis is shared across panels)
axes[0].set_ylabel('Accuracy (Mean ± 95% CI)', fontsize=14)

# Add shared legend, built from the first panel's handles
handles, labels = axes[0].get_legend_handles_labels()
leg = fig.legend(handles, labels, title='Primary Language', loc='upper center', 
           bbox_to_anchor=(0.5, 0.97), ncol=3, fontsize=11, title_fontsize=11,
           columnspacing=2, handletextpad=0.8, frameon=True, fancybox=False,
           edgecolor='gray', facecolor='white', framealpha=1.0)
leg.get_frame().set_linewidth(0.5)

fig.suptitle('Transcription Accuracy by Model Family and Language Group\n(with 95% Bootstrap Confidence Intervals)', 
             fontsize=14, y=1.02)

# Adjust the subplot spacing BEFORE saving so the saved file uses the same
# layout as the figure that is shown; also make sure the output directory
# exists when this cell is run on its own.
plt.subplots_adjust(wspace=0.15, left=0.06, right=0.98, bottom=0.18, top=0.85)
os.makedirs('figures', exist_ok=True)
plt.savefig(f'figures/potential_figure_2.png', dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Mean no-prompt accuracy for every (street name, primary language) pair
grouped_data = no_prompt_df.groupby(["answer", "Primary language"]).mean(numeric_only=True)[['is_correct']]

# Reshape for the heatmap: one row per primary language, one column per street
heatmap_data = grouped_data.reset_index().pivot(index='Primary language', columns='answer', values='is_correct')

# Order rows (languages) by their average accuracy over all streets, best first
row_order = heatmap_data.mean(axis=1).sort_values(ascending=False).index
heatmap_data = heatmap_data.loc[row_order]

# Order columns (streets) by their average accuracy over all languages, best first
col_averages = heatmap_data.mean(axis=0).sort_values(ascending=False)
heatmap_data = heatmap_data[col_averages.index]

# Draw the heatmap on a fixed 0-1 colour scale centred at 0.5
fig, ax = plt.subplots(figsize=(18, 6))
sns.heatmap(
    heatmap_data,
    annot=False,
    cmap='RdYlGn',
    center=0.5,
    vmin=0,
    vmax=1,
    cbar_kws={'label': 'Accuracy'},
    linewidths=0.5,
    linecolor='gray',
    ax=ax,
)
ax.set_title('Transcription Accuracy by Street Name and Language', fontsize=16, pad=15)
ax.set_xlabel('')
ax.set_ylabel('Language', fontsize=12)

# Shorten y-tick labels: "English" is removed from multi-language labels and
# kept only where it is the whole label
ytick_labels = [tick.get_text() for tick in ax.get_yticklabels()]
cleaned_labels = [
    text if text.lower().strip() == 'english'
    else text.replace('English, ', '').replace(', English', '')
    for text in ytick_labels
]
ax.set_yticklabels(cleaned_labels, fontsize=10)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=8)

plt.subplots_adjust(left=0.15, right=0.95, top=0.92, bottom=0.25)
plt.savefig(f'figures/all_models_accuracy_by_street_language.png', dpi=150, bbox_inches='tight')
plt.show()





In [None]:
# Load the LEP population per language and express each as a share of a fixed total.
# NOTE(review): 151388 is a hard-coded denominator -- presumably the total
# LEP population; confirm its source.
lep_data = (
    pd.read_csv("population_by_language.tsv", sep="\t")[['Language', 'LEP_Population']]
    .astype({'LEP_Population': int})
    .assign(LEP_Population_Percent=lambda frame: frame['LEP_Population'] / 151388)
    .sort_values('LEP_Population_Percent', ascending=False)
)

# Create visualization
fig, ax = plt.subplots(figsize=(11, 9))

# Tableau 20 for more distinct colors (one per language row)
colors = plt.cm.tab20.colors[:len(lep_data)]

# Create pie chart without labels (we'll add them with lines)
pie_style = {'linewidth': 2, 'edgecolor': 'white'}
wedges, texts = ax.pie(
    lep_data['LEP_Population'],
    colors=colors,
    startangle=90,
    radius=0.6,
    wedgeprops=pie_style,
)

# Add labels with leader lines
bbox_props = dict(boxstyle="square,pad=0.15", fc="white", ec="none", alpha=0.8)

# Walk the wedges together with their language, population and share
for i, (wedge, (lang, pop, pct)) in enumerate(zip(wedges, 
    zip(lep_data['Language'], lep_data['LEP_Population'], lep_data['LEP_Population_Percent']))):
    
    # Skip labels for slices under 3% of the total
    if pct < 0.03:
        continue
    
    # Direction of the wedge's mid-angle, as a point on the unit circle
    ang = (wedge.theta2 - wedge.theta1) / 2. + wedge.theta1
    x = np.cos(np.deg2rad(ang))
    y = np.sin(np.deg2rad(ang))
    
    # Position labels outside the pie: anchor text away from the centre
    horizontalalignment = "left" if x >= 0 else "right"
    
    # Label text: include population count
    # Split long labels onto multiple lines
    if lang == "Other Asian and Pacific Island languages":
        lang = "Other Asian&\nPacific Island languages"
    
    
    # Calculate label position (adjusted for smaller pie)
    label_x = 0.8 * x
    label_y = .95* y
    
    # Adjust specific labels: Russian/Slavic entries get a fixed, hand-tuned position
    if "Russian" in lang or "Slavic" in lang:
        label_x = 0.0  # horizontally centred on the pie
        label_y = 0.9 # Higher up (north)
    
    # Add language name (bold) with leader line
    ax.annotate(lang, 
                xy=(x * 0.6, y * 0.6),  # Point on pie edge (pie radius is 0.6)
                xytext=(label_x, label_y),
                horizontalalignment=horizontalalignment,
                fontsize=18, fontweight='bold',
                arrowprops=dict(arrowstyle="-", color="gray", lw=0.8),
                bbox=bbox_props)
    
    # Add population (regular weight) below the language name;
    # two-line labels need a larger offset
    pop_offset = -0.06 if lang.count('\n') == 0 else -0.09
    ax.text(label_x, label_y + pop_offset, f"Population: {pop:,}",
            horizontalalignment=horizontalalignment,
            fontsize=12, fontweight='normal', color='#555')

# Let the axes fill the whole figure, then save and show
plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
plt.savefig('figures/LEP_SF.png', dpi=200, bbox_inches='tight', pad_inches=0.1)
plt.show()

In [None]:
# Audio playback for the 2 participants who were filtered out due to noisy data.
# Maps each excluded participant id to the label shown above its samples.
filtered_participants = dict(
    PARTICIPANT_001='Participant 1',
    PARTICIPANT_002='Participant 2',
)

def _street_name_from_audio_filename(filename, pid):
    """Extract a display-ready street name from an audio file name.

    Expected formats (per the notebook's note):
    participant_id_uuid_streetname.webm or
    participant_id_uuid.oga_streetname.webm.
    Assumes the uuid token itself contains no underscores -- TODO confirm.
    """
    stem = filename[:-len('.webm')] if filename.endswith('.webm') else filename

    prefix = f"{pid}_"
    if stem.startswith(prefix):
        # What remains is "<uuid>_<street name>"; everything after the first
        # underscore is the street name, which may itself contain underscores.
        _, separator, street = stem[len(prefix):].partition('_')
        if separator and street:
            return street.replace('_', ' ').title()

    # Unexpected name: fall back to the last underscore-separated token
    return stem.split('_')[-1].title()


def play_audio_samples_from_filtered_participants(n_samples=3, audio_dir='audio_files'):
    """
    Display audio samples from participants who were filtered due to noisy data.

    For every entry in `filtered_participants`, the matching
    `<participant_id>_*.webm` files in `audio_dir` are listed in sorted order
    and the first `n_samples` are rendered as inline audio players.

    Parameters:
    -----------
    n_samples : int
        Number of audio samples to display per participant (default: 3)
    audio_dir : str
        Directory containing audio files (default: 'audio_files')
    """

    display(HTML("<h2>Audio Samples from Filtered Participants (Noisy Data)</h2>"))

    for pid, label in filtered_participants.items():
        # Find audio files for this participant
        pattern = os.path.join(audio_dir, f"{pid}_*.webm")
        audio_files = sorted(glob.glob(pattern))

        if not audio_files:
            print(f"\n{label} ({pid}): No audio files found")
            continue

        display(HTML(f"<h3>{label} - {pid}</h3>"))
        display(HTML(f"<p><b>Total audio files:</b> {len(audio_files)}</p>"))

        # Play first n_samples
        samples_to_play = audio_files[:n_samples]

        for i, audio_file in enumerate(samples_to_play, 1):
            # Extract street name from filename; multi-word street names
            # (joined by underscores) are kept whole
            filename = os.path.basename(audio_file)
            street_name = _street_name_from_audio_filename(filename, pid)

            display(HTML(f"<p><b>Sample {i}/{len(samples_to_play)}:</b> {street_name}</p>"))
            display(Audio(audio_file))

        display(HTML("<hr>"))

# Display audio samples (3 per participant by default) for the participants
# listed in `filtered_participants`
play_audio_samples_from_filtered_participants(n_samples=3)