In [1]:
import re
import pandas as pd
import numpy as np
from glob import glob
import os
import plotly.graph_objects as go
import plotly.express as px
from geopy.geocoders import Nominatim
import json
import time
from pathlib import Path

In [2]:
# Check which speakers are represented in the data, then filter region_data by present speakers
output = pd.read_csv("output/detected_quotatives_.csv")
region_data = dict()

columns_to_use = set()
for f in glob("data/*_metadata_*.txt"):
    region = os.path.basename(f)[0:3]
    print(region)
    
    curr = pd.read_csv(f, sep="\t")
    print(f"{len(curr)} speakers")
    
    curr = curr.loc[curr['CORAAL.Spkr'].isin(output.speaker_id.tolist())]
    print(f"{len(curr)} speakers data used")
    
    if len(columns_to_use):
        columns_to_use = columns_to_use & set(curr.columns.tolist())
    else:
        columns_to_use = set(curr.columns.tolist())
        
    region_data[region] = curr[list(columns_to_use)]

ATL
14 speakers
12 speakers data used
VLD
14 speakers
14 speakers data used
DCB
75 speakers
71 speakers data used
ROC
20 speakers
20 speakers data used
PRV
38 speakers
38 speakers data used
LES
15 speakers
15 speakers data used


In [3]:
# Load or create cache
cache_file = Path('geocode_cache.json')
if cache_file.exists():
    with open(cache_file) as f:
        geocode_cache = json.load(f)
else:
    geocode_cache = {}

geolocator = Nominatim(user_agent="study_migration_viz")

def get_coords(location_str):
    """Get coordinates with caching"""
    if location_str in geocode_cache:
        return geocode_cache[location_str]
    
    try:
        time.sleep(1)  # Respect rate limit
        location = geolocator.geocode(location_str)
        if location:
            coords = [location.latitude, location.longitude]
        else:
            coords = [None, None]
    except:
        coords = [None, None]
    
    geocode_cache[location_str] = coords
    return coords

# US state abbreviations and full names for detection
US_STATES = {
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'DC'
}

US_STATE_NAMES = {
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
    'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
    'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming', 'District of Columbia'
}

def split_multi_city(loc):
    """
    Detect and split entries like "Jackson, MO, Buffalo, NY"
    or "Prince George's County, Maryland, Northern Virginia, Southern Maryland"
    Returns list of individual location entries
    """
    parts = [p.strip() for p in loc.split(',')]
    
    # Single city-state pair - don't split
    if len(parts) == 2:
        return [loc]
    
    # Pattern 1: City, ST, City, ST (where ST is state abbrev)
    if len(parts) >= 4 and len(parts) % 2 == 0:
        potential_states = parts[1::2]
        if all(p.upper() in US_STATES for p in potential_states):
            # Split into city-state pairs
            cities = []
            for i in range(0, len(parts), 2):
                cities.append(f"{parts[i]}, {parts[i+1]}")
            return cities
    
    # Pattern 2: Location, State, Location, State (where State is full name)
    if len(parts) >= 3:
        potential_full_states = parts[1::2]
        if all(p in US_STATE_NAMES for p in potential_full_states):
            locations = []
            for i in range(0, len(parts), 2):
                if i+1 < len(parts):
                    locations.append(f"{parts[i]}, {parts[i+1]}")
                else:
                    locations.append(parts[i])
            return locations
    
    # Pattern 3: Multiple locations with regional prefixes
    regional_prefixes = ['Northern', 'Southern', 'Eastern', 'Western', 'Central',
                        'North', 'South', 'East', 'West', 'Upper', 'Lower']
    
    locations = []
    i = 0
    while i < len(parts):
        part = parts[i].strip()
        
        if i + 1 < len(parts):
            next_part = parts[i + 1].strip()
            combined = f"{part} {next_part}"
            
            if any(part.startswith(prefix) for prefix in regional_prefixes):
                if next_part in US_STATE_NAMES or next_part.upper() in US_STATES:
                    locations.append(combined)
                    i += 2
                    continue
        
        locations.append(part)
        i += 1
    
    if len(locations) > 1:
        return locations
    
    return [loc]

loc_map = {
    "LES": 'Lower East Side, NYC, NY',
    "PG County": "Prince George's County, MD",
    'Harrisburg (3 years)': "Harrisburg, PA",
    "Harlem": 'Harlem, NYC, NY',
}

def clean_location(loc):
    """Clean location string and validate"""
    if not loc or pd.isna(loc):
        return []
    
    loc = str(loc).strip()
    
    # Remove parenthetical info like "(7 years)" or "(military)"
    loc = re.sub(r'\s*\([^)]*\)', '', loc)
    loc = loc.strip()
    
    # Skip if empty after cleaning
    if not loc:
        return []
    
    # Apply known corrections for ambiguous strings
    if loc in loc_map:
        loc = loc_map[loc]
    
    # Try to split multi-city entries
    locations = split_multi_city(loc)

    valid_locations = []
    for single_loc in locations:
        single_loc = single_loc.strip()
        
        # Skip vague county abbreviations like "PG County" without full name
        if re.search(r'County$', single_loc, re.IGNORECASE):
            if ',' not in single_loc and len(single_loc.split()) <= 2:
                continue
        
        # Skip just state abbreviation like "FL"
        if re.match(r'^[A-Z]{2}$', single_loc):
            continue
        
        # Skip non-US locations (optional)
        non_us = ['Germany', 'Japan', 'Korea', 'England', 'France', 'Italy', 'Spain']
        if any(country in single_loc for country in non_us):
            continue
        
        # Handle "Northern Virginia" type entries - treat as just the state
        regional_prefixes = ['Northern', 'Southern', 'Eastern', 'Western', 'Central',
                            'North', 'South', 'East', 'West', 'Upper', 'Lower']
        
        for prefix in regional_prefixes:
            if single_loc.startswith(prefix + ' '):
                # Extract the state name
                state_part = single_loc[len(prefix)+1:].strip()
                if state_part in US_STATE_NAMES:
                    single_loc = state_part + ', USA'
                    break
        
        # Handle entries without commas
        if ',' not in single_loc:
            # Check if it's a US state name (accept it)
            if single_loc in US_STATE_NAMES:
                single_loc = single_loc + ', USA'
            else:
                # Skip single-word vague entries
                if len(single_loc.split()) == 1:
                    continue
                # Multi-word entries without commas - add USA
                single_loc = single_loc + ', USA'
        
        valid_locations.append(single_loc)
    
    return valid_locations

# City abbreviation mapping
city_map = {
    'ATL': 'Atlanta, GA',
    'DCB': 'Washington, D.C., USA',
    'LES': 'Lower East Side, NYC, NY',
    'ROC': 'Rochester, NY',
    'PRV': 'Princeville, NC',
    'VLD': 'Valdosta, GA'
}

In [4]:
# Transform: expand all migrations
rows = []
skipped = []

for residence_abbrev, df in region_data.items():
    current = city_map.get(residence_abbrev, residence_abbrev)
    
    for _, row in df.iterrows():
        # Skip empty values
        if pd.isna(row['Other.Places.Lived']) or row['Other.Places.Lived'] == '':
            continue

        speaker_id = row['CORAAL.Spkr']
        
        original = row['Other.Places.Lived']
        
        # First split on semicolons (primary delimiter)
        semicolon_parts = [p.strip() for p in str(original).split(';')]
        
        for part in semicolon_parts:
            # Then try to clean and split each part
            cleaned_locations = clean_location(part)
            
            if cleaned_locations:
                for cleaned in cleaned_locations:
                    rows.append({
                        'speaker_id': speaker_id,
                        'previous': cleaned, 
                        'current': current, 
                        'original': original
                    })
            else:
                skipped.append({'location': part, 'original': original, 'speaker_id': speaker_id})

df_expanded = pd.DataFrame(rows)

print(f"Processed {len(rows)} valid locations")
print(f"Skipped {len(skipped)} invalid/vague locations:")
for skip in skipped: 
    print(f"  - '{skip['location']}' (from: {skip['original']})")

# Show some examples of what was kept
if len(df_expanded) > 0:
    print(f"\nExample valid locations:")
    for _, row in df_expanded.head(10).iterrows():
        print(f"  - {row['previous']} → {row['current']}")

# Geocode all unique locations
all_locations = set(df_expanded['previous']) | set(df_expanded['current'])
print(f"\nGeocoding {len(all_locations)} unique locations...")

failed_geocodes = []
for loc in all_locations:
    if loc not in geocode_cache:
        print(f"Geocoding: {loc}")
        coords = get_coords(loc)
        if coords == [None, None]:
            failed_geocodes.append(loc)

# Save cache
with open(cache_file, 'w') as f:
    json.dump(geocode_cache, f, indent=2)

if failed_geocodes:
    print(f"\nFailed to geocode {len(failed_geocodes)} locations:")
    for loc in failed_geocodes:
        print(f"  - {loc}")

# Add coordinates to dataframe
df_expanded['prev_lat'] = df_expanded['previous'].apply(lambda x: get_coords(x)[0])
df_expanded['prev_lon'] = df_expanded['previous'].apply(lambda x: get_coords(x)[1])
df_expanded['curr_lat'] = df_expanded['current'].apply(lambda x: get_coords(x)[0])
df_expanded['curr_lon'] = df_expanded['current'].apply(lambda x: get_coords(x)[1])

# Remove rows with missing coordinates
df_expanded = df_expanded.dropna()

print(f"\nSuccessfully mapped {len(df_expanded)} migration paths")

Processed 133 valid locations
Skipped 43 invalid/vague locations:
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'Germany' (from: Germany)
  - 'Germany' (from: Germany)
  - 'Harrisburg (3 years)' (from: Harrisburg (3 years))
  - 'Germany (military)' (from: Germany (military))
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'unknown' (from: unknown)
  - 'unknown' (from: unknown)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'none' (from: none)
  - 'unknown' (from: unknown)
  - 'unknown' (from: unknown

In [5]:
df_expanded.speaker_id.value_counts().describe()

count    70.000000
mean      1.685714
std       1.173905
min       1.000000
25%       1.000000
50%       1.000000
75%       2.000000
max       7.000000
Name: count, dtype: float64

In [6]:
# Create map with color coding by current residence
fig = go.Figure()

# Create color mapping
unique_residences = sorted(df_expanded['current'].unique())
n_residences = len(unique_residences)
colors = px.colors.sample_colorscale(
    "plasma", 
    [n/(n_residences -1) for n in range((n_residences))]
)

color_map = {}

for i, res in enumerate(unique_residences):
    color_map[res] = colors[i]

print(f"\nCreating visualization with {len(df_expanded)} paths...")

# Verify no NaN values slipped through
print(f"Checking for NaN values:")
print(f"  prev_lat: {df_expanded['prev_lat'].isna().sum()}")
print(f"  prev_lon: {df_expanded['prev_lon'].isna().sum()}")
print(f"  curr_lat: {df_expanded['curr_lat'].isna().sum()}")
print(f"  curr_lon: {df_expanded['curr_lon'].isna().sum()}")

# Add lines grouped by current residence
for current_res in unique_residences:
    df_subset = df_expanded[df_expanded['current'] == current_res]
    
    # Collect all line segments for this residence
    line_lons = []
    line_lats = []
    
    for _, row in df_subset.iterrows():
        line_lons.extend([row['prev_lon'], row['curr_lon'], None])
        line_lats.extend([row['prev_lat'], row['curr_lat'], None])
    
    # Add as single trace per residence
    fig.add_trace(go.Scattergeo(
        lon=line_lons,
        lat=line_lats,
        mode='lines',
        line=dict(width=1.5, color=color_map[current_res]),
        opacity=0.6,
        showlegend=False,
        hoverinfo='skip'
    ))

# Add origin points (gray)
fig.add_trace(go.Scattergeo(
    lon=df_expanded['prev_lon'].tolist(),
    lat=df_expanded['prev_lat'].tolist(),
    mode='markers',
    marker=dict(size=6, color='lightgray', line=dict(width=0.5, color='white')),
    text=df_expanded['previous'].tolist(),
    name='Other places lived',
    hovertemplate='<b>%{text}</b><extra></extra>'
))

# Add current location points with halos for high traffic
current_counts = df_expanded.groupby(['current', 'curr_lat', 'curr_lon']).size().reset_index(name='count')
max_count = current_counts['count'].max()

for _, row in current_counts.iterrows():
    # Add outer halo for destinations with multiple paths
    if row['count'] > 1:
        fig.add_trace(go.Scattergeo(
            lon=[row['curr_lon']],
            lat=[row['curr_lat']],
            mode='markers',
            marker=dict(
                size=14 + row['count'] * 2,
                color=color_map[row['current']],
                opacity=0.3,
                line=dict(width=0)
            ),
            showlegend=False,
            hoverinfo='skip'
        ))
    
    # Add main marker
    fig.add_trace(go.Scattergeo(
        lon=[row['curr_lon']],
        lat=[row['curr_lat']],
        mode='markers',
        marker=dict(
            size=14,
            color=color_map[row['current']],
            line=dict(width=2, color='white')
        ),
        text=row['current'],
        name=row['current'],
        hovertemplate=f"<b>{row['current']}</b><br>Participants: {row['count']}<extra></extra>"
    ))

fig.update_layout(
    title=None,
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showland=True,
        landcolor='rgb(250, 250, 250)',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
        coastlinecolor='rgb(200, 200, 200)',
        # Add these for state borders:
        showsubunits=True,  # Show state boundaries
        subunitcolor='rgb(150, 150, 150)',  # State border color
        subunitwidth=0.5,  # State border thickness
        showcountries=False,
        showframe=False
    ),
    width=800,   # Standard page width
    height=550,  # Good aspect ratio for letter/A4
    showlegend=True,
    legend=dict(
        title=None,
        yanchor="top",
        y=0.98,
        xanchor="left",
        x=0.02,
        font=dict(size=20),
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='rgb(250, 250, 250)',
        borderwidth=1
    ),
    font=dict(size=10, color='black', family='Arial'),
    paper_bgcolor='white',
    margin=dict(l=10, r=10, t=50, b=10)
)

# Also adjust marker and line sizes for print clarity
# In the lines section:
line=dict(width=1, color=color_map[current_res]),  # Thinner lines

# In the origin points:
marker=dict(size=4, color='lightgray', line=dict(width=0.3, color='white')),  # Smaller

# In the current location points:
marker=dict(
    size=10,  # Slightly smaller
    color=color_map[row['current']],
    line=dict(width=1.5, color='white')
)
# fig.show()
fig.write_image("viz/migration_map.png", scale=2, width=800, height=600)


Creating visualization with 118 paths...
Checking for NaN values:
  prev_lat: 0
  prev_lon: 0
  curr_lat: 0
  curr_lon: 0


In [8]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

# Aggregate paths by (previous, current) to get counts
path_counts = df_expanded.groupby(['previous', 'current', 'prev_lat', 'prev_lon', 'curr_lat', 'curr_lon']).size().reset_index(name='path_count')

# Create map
fig = go.Figure()

# Create color mapping for destinations
unique_residences = df_expanded['current'].unique()
n_residences = len(unique_residences)

colors = px.colors.sample_colorscale(
    "portland", 
    [n/(n_residences -1) for n in range(n_residences)]
)

color_map = {res: colors[i] for i, res in enumerate(unique_residences)}

# Define scaling functions
def get_line_width(count, min_count=1, max_count=7):
    """Scale line width from 0.5 to 6 based on path count"""
    if max_count == min_count:
        return 3
    normalized = (count - min_count) / (max_count - min_count)
    return 0.5 + normalized * 5.5

def get_opacity(count, min_count=1, max_count=7):
    """Scale opacity from 0.25 to 0.9 based on path count"""
    if max_count == min_count:
        return 0.6
    normalized = (count - min_count) / (max_count - min_count)
    return 0.25 + normalized * 0.65

min_path_count = path_counts['path_count'].min()
max_path_count = path_counts['path_count'].max()

# Add lines grouped by current residence (for consistent coloring)
for current_res in unique_residences:
    df_subset = path_counts[path_counts['current'] == current_res]
    
    # Add each path as a separate trace to control width/opacity individually
    for _, row in df_subset.iterrows():
        line_width = get_line_width(row['path_count'], min_path_count, max_path_count)
        line_opacity = get_opacity(row['path_count'], min_path_count, max_path_count)
        
        fig.add_trace(go.Scattergeo(
            lon=[row['prev_lon'], row['curr_lon']],
            lat=[row['prev_lat'], row['curr_lat']],
            mode='lines',
            line=dict(width=line_width, color=color_map[current_res]),
            opacity=line_opacity,
            showlegend=False,
            hovertemplate=f"<b>{row['previous']} → {row['current']}</b><br>Participants: {row['path_count']}<extra></extra>"
        ))

# Add origin points (gray)
origin_points = path_counts.groupby(['previous', 'prev_lat', 'prev_lon']).size().reset_index(name='count')
fig.add_trace(go.Scattergeo(
    lon=origin_points['prev_lon'].tolist(),
    lat=origin_points['prev_lat'].tolist(),
    mode='markers',
    marker=dict(size=4, color='lightgray', line=dict(width=0.3, color='white')),
    text=origin_points['previous'].tolist(),
    name='Other places lived',
    hovertemplate='<b>%{text}</b><extra></extra>'
))

# Add current location points with halos for high traffic
current_counts = df_expanded.groupby(['current', 'curr_lat', 'curr_lon']).size().reset_index(name='count')

for _, row in current_counts.iterrows():
    # Add outer halo for destinations with multiple paths
    if row['count'] > 1:
        fig.add_trace(go.Scattergeo(
            lon=[row['curr_lon']],
            lat=[row['curr_lat']],
            mode='markers',
            marker=dict(
                size=12 + row['count'] * 0.8,
                color=color_map[row['current']],
                opacity=0.2,
                line=dict(width=0)
            ),
            showlegend=False,
            hoverinfo='skip'
        ))
    
    # Add main marker
    fig.add_trace(go.Scattergeo(
        lon=[row['curr_lon']],
        lat=[row['curr_lat']],
        mode='markers',
        marker=dict(
            size=10,
            color=color_map[row['current']],
            line=dict(width=1.5, color='white')
        ),
        text=row['current'],
        name=row['current'],
        hovertemplate=f"<b>{row['current']}</b><br>Participants: {row['count']}<extra></extra>"
    ))

fig.update_layout(
    title=None,
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showland=True,
        landcolor='rgb(250, 250, 250)',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
        coastlinecolor='rgb(200, 200, 200)',
        showsubunits=True,
        subunitcolor='rgb(150, 150, 150)',
        subunitwidth=0.5,
        showcountries=False,
        showframe=False
    ),
    width=800,
    height=550,
    showlegend=True,
    legend=dict(
        title=None,
        yanchor="top",
        y=0.98,
        xanchor="left",
        x=0.02,
        font=dict(size=20),
        bgcolor='rgba(255, 255, 255, 0.8)',
        bordercolor='rgb(250, 250, 250)',
        borderwidth=1
    ),
    font=dict(size=10, color='black', family='Arial'),
    paper_bgcolor='white',
    margin=dict(l=10, r=10, t=50, b=10)
)

# fig.show()
fig.write_image("viz/migration_map.png", scale=3, width=800, height=600)
# print("\nMap saved to viz/migration_map.png")