In [None]:
import ast
import base64
import os
import sys
import time

sys.path.append('..')

import folium
import googlemaps
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polyline
import seaborn as sns
from geopy.distance import geodesic
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from utils import read_transcription_data, categorize_language

# ============================================================================
# Configuration
# ============================================================================

# Ignore locations farther than this from the target location.
# NOTE(review): this threshold is compared directly against the `distance`
# column, which calculate_distance() returns in kilometres, so the effective
# cutoff is 20 km rather than the 20 miles originally described here.
# Confirm the intended unit.
CITY_BOUNDS = 20

In [None]:

def parse_coord(c):
    """Parse a coordinate from a string/dict into a (lat, lng) tuple.

    Args:
        c: One of
            * a string holding a Python literal, e.g. ``"{'lat': 1, 'lng': 2}"``;
            * a dict with ``'lat'`` and ``'lng'`` keys;
            * any other value, which is returned unchanged.

    Returns:
        ``(lat, lng)`` for a dict (or a string that evaluates to one), ``None``
        for the ``"No location"`` placeholder or an empty string, and otherwise
        the (parsed) value as-is.

    Raises:
        ValueError, SyntaxError: if a non-placeholder string is not a valid
            Python literal.
    """
    if isinstance(c, str):
        # "No location" is the placeholder stored when geocoding found nothing.
        # It is not a Python literal, so literal_eval would raise on it.
        if c == "No location" or not c:
            return None
        c = ast.literal_eval(c)
    if isinstance(c, dict):
        return (c['lat'], c['lng'])
    return c


def calculate_distance(coord1, coord2):
    """Return the geodesic distance, in kilometres, between two coordinates.

    Each argument is first normalised with ``parse_coord`` and the pair is then
    handed to ``geodesic``; the ``.km`` attribute of the result is returned.
    """
    start = parse_coord(coord1)
    end = parse_coord(coord2)
    separation = geodesic(start, end)
    return separation.km

def parse_out_keyword(row, index):
    """Extract the trailing keyword (street name) from ``row[index]``.

    The value is converted to ``str``, split on single spaces and each word is
    capitalised. A trailing slice of the words is kept, joined with spaces, and
    every "." is removed.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing ``row[index]``
            and ``row['keyword_index']``.
        index: Key of the text field to parse.

    Returns:
        The extracted keyword string.
    """
    # street names are 1 or 2 words
    words = [token.capitalize() for token in str(row[index]).split(' ')]

    if not words:
        # NOTE(review): str.split(' ') always yields at least one element, so
        # this fallback looks unreachable in practice.
        print(len(words), row[index], "No location, should not happen")
        return "No location"

    # Multi-word text keeps (keyword_index - 2) trailing words, single-word text
    # keeps (keyword_index - 1).
    # NOTE(review): when the count works out to 0 the slice is [-0:], i.e. the
    # whole list; a negative count slices from the front instead. Confirm this
    # is intended.
    offset = 2 if len(words) > 1 else 1
    tail = words[-(row['keyword_index'] - offset):]
    return ' '.join(tail).replace(".", "")


def search_maps(keyword):
    """Look up ``keyword`` with the Google Maps "find place" endpoint.

    Uses the module-level ``gmaps`` client.

    Args:
        keyword: Street keyword to search for.

    Returns:
        ``(name, location)`` of the first candidate, where ``location`` is the
        candidate's ``geometry.location`` entry, or
        ``("No location", "No location")`` when there are no candidates.
    """
    # Always add "Blvd, San Francisco" to the end of the query to help limit to SF.
    # Using find_place (similar to findPlaceFromQuery in JS)
    response = gmaps.find_place(
        input=keyword + " Blvd, San Francisco",
        input_type='textquery',
        fields=['name', 'geometry', 'place_id', 'formatted_address', 'types'],
    )

    candidates = response['candidates']
    if len(candidates) == 0:
        print(f"No location found for {keyword}")
        return ("No location", "No location")

    top_hit = candidates[0]
    return (top_hit.get('name'), top_hit['geometry']['location'])

In [None]:
# Set up the Google Maps client.
# The API key is read from the GOOGLE_MAPS_API_KEY environment variable; the
# cell fails with setup instructions when the variable is missing or empty.
API_KEY = os.getenv('GOOGLE_MAPS_API_KEY')
if not API_KEY:
    raise EnvironmentError(
        "GOOGLE_MAPS_API_KEY environment variable is not set.\n"
        "Set it with: export GOOGLE_MAPS_API_KEY='your-api-key'"
    )

gmaps = googlemaps.Client(key=API_KEY)


In [None]:
# #identifying places with google maps api

def find_locations_google_maps(data, keyword_columns=('keyword_transcribed', 'keyword_answer')):
    """Geocode every unique keyword in ``data`` and cache the results as CSV.

    The lookup table is later joined against both the transcribed keyword and
    the ground-truth answer keyword, so keywords from both columns are geocoded.
    Geocoding only the transcribed keywords would leave any answer that was
    never transcribed verbatim without coordinates.

    Args:
        data: DataFrame holding the keyword columns.
        keyword_columns: Columns whose unique values are geocoded. Columns that
            are absent from ``data`` are skipped.

    Returns:
        DataFrame with one row per unique keyword and the columns
        ``transcription_keyword``, ``transcription_name`` and
        ``transcription_coords``. The same frame is written to
        ``google_places_data_whisper.csv``.

    Raises:
        KeyError: if none of ``keyword_columns`` exists in ``data``.
    """
    present_columns = [column for column in keyword_columns if column in data.columns]
    if not present_columns:
        raise KeyError(f"None of the keyword columns {list(keyword_columns)} found in data")

    # Unique keywords across all columns, in first-seen order, so each keyword
    # costs at most one API call.
    keywords = []
    seen = set()
    for column in present_columns:
        for keyword in data[column].unique():
            if keyword not in seen:
                seen.add(keyword)
                keywords.append(keyword)

    records = []
    for n, keyword in enumerate(keywords):
        print(n)
        if keyword == "No location":
            # Placeholder keyword: nothing to look up.
            name, coords = "No location", "No location"
        else:
            name, coords = search_maps(keyword)
        records.append({
            'transcription_keyword': keyword,
            'transcription_name': name,
            'transcription_coords': coords,
        })

    google_map_results = pd.DataFrame(
        records,
        columns=['transcription_keyword', 'transcription_name', 'transcription_coords'],
    )

    # save for easy reloading
    google_map_results.to_csv("google_places_data_whisper.csv", index=False)
    return google_map_results

In [None]:
# Reading the Whisper transcriptions. Work on a copy so re-running this cell
# always starts from the untouched frame.
data_og = read_transcription_data("whisper")
data = data_og.copy()

# Number of space-separated words in the answer; parse_out_keyword reads it to
# decide how many trailing words to keep.
data['keyword_index'] = data['answer'].apply(lambda x: len(x.split(' ')))


data['keyword_transcribed'] = data.apply(lambda x: parse_out_keyword(x, 'transcription_og'), axis=1)
data['keyword_answer'] = data.apply(lambda x: parse_out_keyword(x, 'answer'), axis=1)
# (Not rendered: only the last expression of a cell is displayed.)
data[['keyword_transcribed', 'keyword_answer', 'transcription_og', 'transcription', 'answer']]

#very noisy data // input error
data = data[data['participant_id']!='PARTICIPANT_001']
data = data[data['participant_id']!='PARTICIPANT_002']

data = data.drop_duplicates(subset=['participant_id', 'answer', 'model', 'prompt'], keep='first')
# Apply both conditions as ONE mask. Chaining two indexers (df[m1][m2]) applies a
# mask built on the unfiltered frame to the already-filtered one, which pandas
# only tolerates by reindexing the mask (with a UserWarning).
data = data[(data['prompt'] == 'No prompt') & (data['model'] == 'base')]
len(data)


# Uncomment to re-query Google Maps and rebuild the cached CSV.
# google_map_results = find_locations_google_maps(data)
#reload values

google_map_results = pd.read_csv("google_places_data_whisper.csv")

# Attach geocoding results twice: once for the transcribed keyword, once for the
# ground-truth keyword (those columns get the `_answer` suffix).
data = data.set_index('keyword_transcribed').join(google_map_results.set_index("transcription_keyword"), how='left').reset_index()
data = data.set_index('keyword_answer').join(google_map_results.set_index("transcription_keyword"), how='left', rsuffix='_answer').reset_index()
print(len(data))
# Keep only rows where both the transcription and the answer were geocoded.
data = data[data['transcription_name'] != "No location"]
data = data[data['transcription_name_answer'] != "No location"]
data = data.dropna(subset=['transcription_coords', 'transcription_coords_answer'])
print(len(data))

# Distance (km) between the transcribed location and the ground-truth location.
data['distance'] = data.apply(
    lambda x: calculate_distance(x['transcription_coords'], x['transcription_coords_answer']), axis=1
)

len(data[data['distance'] < CITY_BOUNDS])


In [None]:
# Visualize all locations: Ground Truth vs Predicted (using folium maps)

def parse_coord(c):
    """Normalise a stored coordinate to a ``(lat, lng)`` tuple, or ``None``.

    Strings are evaluated as Python literals, except for the ``"No location"``
    placeholder and the empty string, which give ``None``. Dicts are read via
    their ``'lat'`` / ``'lng'`` keys; any other truthy value contributes its
    first two items, and any other falsy value gives ``None``.
    """
    if isinstance(c, str):
        if c in ("No location", ""):
            return None
        c = ast.literal_eval(c)

    if isinstance(c, dict):
        return (c['lat'], c['lng'])

    if not c:
        return None
    return (c[0], c[1])

# Count valid and invalid locations: a location is valid when its coords are
# neither the "No location" placeholder nor missing.
# NOTE(review): `data` was already filtered on "No location" names and missing
# coords when it was built, so these "no location" counts are presumably always
# zero at this point. Confirm.
valid_answers = data.loc[
    data['transcription_coords_answer'] != "No location", 'transcription_coords_answer'
].dropna()
valid_predicted = data.loc[
    data['transcription_coords'] != "No location", 'transcription_coords'
].dropna()
no_location_answers = len(data) - len(valid_answers)
no_location_predicted = len(data) - len(valid_predicted)

# Map 1: Ground Truth (transcription_name_answer), one blue dot per answer location
m1 = folium.Map(location=[37.7749, -122.4194], zoom_start=12, tiles='cartodbpositron')
for coord_str in valid_answers:
    coord = parse_coord(coord_str)
    if not coord:
        continue
    folium.CircleMarker(
        location=coord,
        radius=5,
        weight=1,
        color='#2E86AB',
        fill=True,
        fill_color='#2E86AB',
        fill_opacity=0.6,
    ).add_to(m1)
m1.save('ground_truth_locations.html')
print(f"✅ Ground Truth map saved to ground_truth_locations.html ({len(valid_answers)} locations)")

In [None]:
# Map 2: Predicted (transcription_name), one red dot per transcribed location
m2 = folium.Map(location=[37.7749, -122.4194], zoom_start=12, tiles='cartodbpositron')
for coord_str in valid_predicted:
    coord = parse_coord(coord_str)
    if not coord:
        continue
    folium.CircleMarker(
        location=coord,
        radius=5,
        weight=1,
        color='#E63946',
        fill=True,
        fill_color='#E63946',
        fill_opacity=0.6,
    ).add_to(m2)
m2.save('predicted_locations.html')
print(f"✅ Predicted map saved to predicted_locations.html ({len(valid_predicted)} locations)")

# Report how many rows had no usable location on either side
print("\n" + "=" * 50)
print("NO LOCATION COUNT")
print("=" * 50)
print(f"Ground Truth (answer):     {no_location_answers:,} / {len(data):,} ({100*no_location_answers/len(data):.1f}%)")
print(f"Predicted (transcription): {no_location_predicted:,} / {len(data):,} ({100*no_location_predicted/len(data):.1f}%)")

# Display the ground truth map inline
m1

In [None]:
# (Not rendered: only the last expression of a cell is displayed.)
data.sort_values(by='distance', ascending=False)[['transcription', 'transcription_name', 'transcription_name_answer', 'distance']]

# Add language group
data['language_group'] = data['Primary language'].apply(categorize_language)

# Plot by language group - overlaid histograms of log10(distance)
fig, ax = plt.subplots(figsize=(12, 6))
groups = ['English only', 'Non-English']
colors = ['#2E86AB', '#E63946']  # Blue, Red

for group, color in zip(groups, colors):
    group_data = data[data['language_group'] == group]['distance']
    # Bin on log10(distance) so the bars are evenly spaced on a log scale.
    # Zero distances have no logarithm and are left out of the histogram;
    # the n in the legend still counts them.
    log_data = np.log10(group_data[group_data > 0])
    sns.histplot(log_data, bins=30, stat='percent', color=color, alpha=0.4,
                 edgecolor='white', linewidth=0.5, ax=ax,
                 label=f'{group} (n={len(group_data)})')

# Fix x-axis labels to show actual distances
ax.set_xlabel('Distance (km)', fontsize=12)
ax.set_ylabel('Percent', fontsize=12)
# Tick positions are log10(distance); convert them back to real values with a
# formatter. Calling set_xticklabels without pinning the tick positions raises
# matplotlib's FixedFormatter warning and lets labels drift from their ticks if
# the axis is rescaled.
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda t, _pos: f'{10**t:.1g}'))
ax.set_title('Transcription Error Distances by Speaker Group', fontsize=14, fontweight='bold')
ax.legend(fontsize=11, frameon=False)
sns.despine()
plt.tight_layout()


In [None]:
# Tag each row with its speaker group, then show the mean error distance per
# group for the rows below the CITY_BOUNDS cutoff.
data['language_group'] = data['Primary language'].apply(categorize_language)
data.loc[data['distance'] < CITY_BOUNDS].groupby("language_group")[['distance']].mean()

In [None]:
# Independent copy of one participant's "No prompt" rows, used for the route map.
example_participant = data[
    (data['prompt'] == 'No prompt') & (data['participant_id'] == 'PARTICIPANT_003')
].copy()


In [None]:


def show_worst_mistakes_with_routes(dataframe, n=5):
    """Show top N worst mistakes with actual routes between points.

    Takes the ``n`` rows of ``dataframe`` with the largest ``distance``. For
    each, the transcribed location is marked with ✗ and the answer location
    with ✓ on a folium map, and the two are connected. The connection is the
    driving route from the module-level ``gmaps`` client when one is returned,
    and a dashed straight line when there is no route or the request fails.
    Rows whose coordinates are the "No location" placeholder are skipped, and a
    row that raises is reported and skipped. The map is saved to
    ``worst_{n}_mistakes_routes.html``.

    Args:
        dataframe: Frame with ``distance``, ``transcription_coords`` and
            ``transcription_coords_answer`` columns.
        n: Number of rows to plot.

    Returns:
        The ``folium.Map`` that was built and saved.
    """

    def parse(c):
        """Return a ``(lat, lng)`` pair; ``(None, None)`` for the placeholder/empty string."""
        if isinstance(c, str):
            if c == "No location" or not c:
                return None, None
            c = ast.literal_eval(c)
        if isinstance(c, dict):
            return (c['lat'], c['lng'])
        return c[0], c[1]

    def add_badge(target_map, location, symbol, color):
        """Place a small round badge showing ``symbol`` at ``location``."""
        folium.Marker(
            location,
            icon=folium.DivIcon(html=f'''
                    <div style="
                        font-size: 11px; font-weight: bold; color: {color};
                        background: white; border: 1.5px solid {color}; border-radius: 50%;
                        width: 18px; height: 18px; text-align: center; line-height: 15px;
                        box-shadow: 0 1px 2px rgba(0,0,0,0.3);
                    ">{symbol}</div>
                ''')
        ).add_to(target_map)

    def add_straight_line(target_map, start, end, color):
        """Connect two points with a dashed straight line (fallback when no route)."""
        folium.PolyLine(
            [start, end], color=color, weight=1.5, opacity=0.6, dash_array='8, 4'
        ).add_to(target_map)

    worst = dataframe.nlargest(n, 'distance')

    print(f"Found {len(worst)} valid mistakes")

    # Create map
    m = folium.Map(location=[37.7749, -122.4194], zoom_start=12, tiles='cartodbpositron')

    # Tableau 10 color palette
    colors = ['#4E79A7', '#F28E2B', '#E15759', '#76B7B2', '#59A14F',
              '#EDC948', '#B07AA1', '#FF9DA7', '#9C755F', '#BAB0AC']

    # Collect all coordinates for fitting bounds
    all_coords = []

    for idx, (_, row) in enumerate(worst.iterrows()):
        # Best effort per row: one bad row must not abort the whole map.
        try:
            lat1, lng1 = parse(row['transcription_coords'])
            lat2, lng2 = parse(row['transcription_coords_answer'])

            if lat1 is None or lat2 is None:
                continue

            all_coords.append([lat1, lng1])
            all_coords.append([lat2, lng2])

            # One color per mistake, cycling through the palette.
            color = colors[idx % len(colors)]

            # Start marker (✗ - wrong transcription location)
            add_badge(m, [lat1, lng1], '✗', color)
            # End marker (✓ - correct answer location)
            add_badge(m, [lat2, lng2], '✓', color)

            # Get route
            try:
                directions = gmaps.directions((lat1, lng1), (lat2, lng2), mode="driving")

                if directions:
                    route_coords = polyline.decode(directions[0]['overview_polyline']['points'])
                    folium.PolyLine(route_coords, color=color, weight=2, opacity=0.7).add_to(m)
                else:
                    add_straight_line(m, [lat1, lng1], [lat2, lng2], color)
            except Exception as e:
                print(f"Route error {idx}: {e}")
                add_straight_line(m, [lat1, lng1], [lat2, lng2], color)

        except Exception as e:
            print(f"Error {idx}: {e}")

    # Fit map bounds to show all markers with padding
    if all_coords:
        m.fit_bounds(all_coords, padding=[30, 30])

    html_path = f'worst_{n}_mistakes_routes.html'
    m.save(html_path)
    print(f"✅ Saved HTML to {html_path}")

    return m


# For visualization purposes, keep only this participant's mistakes that stayed
# within the CITY_BOUNDS cutoff and have real coordinates on both sides.
data_frame = example_participant.loc[
    (example_participant['distance'] < CITY_BOUNDS)
    & (example_participant['transcription_coords_answer'] != "No location")
    & (example_participant['transcription_coords'] != "No location")
]
show_worst_mistakes_with_routes(data_frame, n=5)


In [None]:
# Export the map to high-resolution PNG and PDF via headless Chrome
html_path = 'worst_5_mistakes_routes.html'
pdf_path = 'worst_5_mistakes_routes.pdf'
png_path = 'worst_5_mistakes_routes.png'

# Use a very large window + high DPI for maximum quality
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--window-size=2800,2000')
chrome_options.add_argument('--force-device-scale-factor=3')

driver = webdriver.Chrome(options=chrome_options)
# Always shut the browser down, even if loading or exporting fails; otherwise a
# headless Chrome process is left running after an error.
try:
    driver.get('file://' + os.path.abspath(html_path))

    # Wait for map tiles to fully load at high resolution
    time.sleep(8)

    # Save high-res PNG screenshot (2800x2000 @ 3x = 8400x6000 effective pixels).
    # save_screenshot reports a write failure through its return value rather
    # than by raising, so only announce success when it returned True.
    if driver.save_screenshot(png_path):
        print(f"✅ Saved high-res PNG to {png_path}")
    else:
        print(f"⚠️ Could not write PNG to {png_path}")

    # Also save PDF (landscape, large paper, minimal margins)
    pdf_data = driver.execute_cdp_cmd('Page.printToPDF', {
        'landscape': True,
        'printBackground': True,
        'paperWidth': 20,
        'paperHeight': 14,
        'marginTop': 0.1,
        'marginBottom': 0.1,
        'marginLeft': 0.1,
        'marginRight': 0.1,
        'scale': 1.0,
        'preferCSSPageSize': False,
    })

    # The CDP call returns the PDF as base64 text under 'data'.
    with open(pdf_path, 'wb') as f:
        f.write(base64.b64decode(pdf_data['data']))
finally:
    driver.quit()

print(f"✅ Saved PDF to {pdf_path}")
