### Setup

In [None]:
import numpy as np
import pandas as pd
from plotnine import *
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio
import random

### Internal data: flight delay records

In [None]:
# Train-Test-Validation Files
# NOTE: this is a machine-specific absolute path - point it at your local copy of the data.
train_data_path = r'D:\Summer Practicum\1_Data\training_airlines.csv'
rdw_df = pd.read_csv(train_data_path, low_memory=False)  # read_csv already returns a fresh frame, no .copy() needed

# Build a direction-agnostic route key for every flight: the two airport codes in
# alphabetical order joined by '-' (we don't mind which direction the flight is heading).
# Done with vectorised column operations instead of a row-wise apply(); the result is the
# same as '-'.join(sorted([Origin, Dest])) but avoids one Python call per flight.
origin, dest = rdw_df['Origin'], rdw_df['Dest']
origin_first = origin <= dest  # True where Origin already sorts before (or equals) Dest
rdw_df['AIRPORT_PAIR'] = origin.where(origin_first, dest) + '-' + dest.where(origin_first, origin)

In [None]:
# create aggregated dataframe for the map
# NOTE: only late arrivals (ArrDelay > 0) are aggregated, so every statistic below -
# including num_flights - describes the delayed flights of a route, not all of its flights.
agg_df = rdw_df[rdw_df['ArrDelay'] > 0].groupby('AIRPORT_PAIR').agg(
    avg_delay=('ArrDelay', 'mean'),
    median_delay=('ArrDelay', 'median'),
    total_delay=('ArrDelay', 'sum'),
    num_flights=('ArrDelay', 'count'),
).reset_index()

# Optional: filter for routes with a significant number of flights
min_flights = 10
agg_df = agg_df[agg_df['num_flights'] >= min_flights]


# Get one DistanceGroup per AIRPORT_PAIR
# (first occurrence - I checked and there's no variability at all so it's safe)
pair_to_group = rdw_df[['AIRPORT_PAIR', 'DistanceGroup']].drop_duplicates(subset='AIRPORT_PAIR')
agg_df = agg_df.merge(pair_to_group, on='AIRPORT_PAIR', how='left') # Merge with agg_df

# Normalize delay by distance, using distance groups (every 250 miles)
    # need to consider if we want groups or just miles
agg_df['norm_delay'] = agg_df['avg_delay'] / agg_df['DistanceGroup'].astype(float)

# Optional: filter for n-worst routes to avoid over-cluttering the map
n_filter = 10
top_df = agg_df.sort_values('norm_delay', ascending=False).head(n_filter).copy()

# Normalize values to [0,1] within top-N (combats skewness that messes up the color gradient)
min_nd = top_df['norm_delay'].min()
max_nd = top_df['norm_delay'].max()
nd_range = max_nd - min_nd
if nd_range > 0:
    top_df['norm_scaled'] = (top_df['norm_delay'] - min_nd) / nd_range
else:
    # A single route (n_filter = 1), all-equal values or an empty selection give a zero/NaN
    # range; dividing by it would turn every value into NaN and break the color lookup.
    # Every remaining route is "the worst" in that case, so use the top of the scale.
    top_df['norm_scaled'] = 1.0


### External data: airport coordinates (OpenFlights)

In [None]:
# set up column headers because OpenFlights doesn't have them
columns = [
    'AirportID', 'Name', 'City', 'Country',
    'IATA', 'ICAO', 'Latitude', 'Longitude',
    'Altitude', 'Timezone', 'DST', 'TzDatabaseTimeZone',
    'Type', 'Source'
]

# Get coordinates from the public OpenFlights DB (downloaded on every run - needs network access)
of_airports = pd.read_csv(r'https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat', header=None, names=columns)

# Drop airports without an IATA code: the file marks them with the literal string "\N",
# which repeats many times and would make the IATA index below non-unique.
of_airports = of_airports[of_airports['IATA'].notna() & (of_airports['IATA'] != '\\N')]

# to_dict('index') raises on a non-unique index, so keep only the first row for any
# IATA code that still appears more than once.
of_airports = of_airports.drop_duplicates(subset='IATA', keep='first')

# Create coordinates dictionary for all airports, to later map into the dataframe
# Shape: {'JFK': {'Latitude': ..., 'Longitude': ...}, ...}
airport_coords = of_airports.set_index('IATA')[['Latitude', 'Longitude']].to_dict('index')
airport_coords['IFP'] = {'Latitude': 35.1574, 'Longitude': -114.5596} # manually add Laughlin/Bullhead, AZ - it is missing from the OpenFlights DB

In [None]:
# Split each route key back into its two airports and add their coordinates.
# n=1 guarantees exactly two columns come out of the split.
top_df[['AIRPORT1', 'AIRPORT2']] = top_df['AIRPORT_PAIR'].str.split('-', n=1, expand=True)

# Same lookup for both ends of the route: LAT1/LON1 from AIRPORT1, LAT2/LON2 from AIRPORT2.
# Airports that are not in airport_coords come back as missing values.
for end in ('1', '2'):
    codes = top_df[f'AIRPORT{end}']
    top_df[f'LAT{end}'] = codes.map(lambda code: airport_coords.get(code, {}).get('Latitude'))
    top_df[f'LON{end}'] = codes.map(lambda code: airport_coords.get(code, {}).get('Longitude'))

# A route with an unknown airport cannot be drawn; report it instead of letting it
# silently disappear from (or break) the map, then drop it.
missing_coords = top_df[['LAT1', 'LON1', 'LAT2', 'LON2']].isna().any(axis=1)
if missing_coords.any():
    print(
        'WARNING: no coordinates found for route(s) '
        + ', '.join(top_df.loc[missing_coords, 'AIRPORT_PAIR'])
        + ' - add the missing airport(s) to airport_coords. These routes are left off the map.'
    )
    top_df = top_df.loc[~missing_coords].copy()

## Map Generation

In [None]:
# Matplotlib 'YlOrRd' colormap; get_rgba() calls it with a route's scaled value to pick the line color
cmap = plt.get_cmap('YlOrRd')

# Helper function to get line color to work with the scaled Normalized values
def get_rgba(norm_val, alpha=0.8, colormap=None):
    """Convert a scaled value into a CSS-style ``rgba(...)`` color string.

    Parameters
    ----------
    norm_val : float
        Value passed to the colormap (the callers pass the [0, 1] scaled delay).
    alpha : float, optional
        Alpha component written into the string. Defaults to 0.8. The alpha
        returned by the colormap itself is ignored.
    colormap : callable, optional
        Callable mapping ``norm_val`` to an ``(r, g, b, ...)`` sequence with
        components in [0, 1]. Defaults to the module-level ``cmap``.

    Returns
    -------
    str
        ``'rgba(R, G, B, alpha)'`` with R, G, B rounded to whole numbers on a 0-255 scale.
    """
    palette = cmap if colormap is None else colormap
    red, green, blue = palette(norm_val)[:3]
    return f'rgba({red*255:.0f}, {green*255:.0f}, {blue*255:.0f}, {alpha})'


# Helper function to add markers to any flight path for tooltip hovering
def interpolate_coords(lat1, lon1, lat2, lon2, steps=5):
    lats = np.linspace(lat1, lat2, steps)
    lons = np.linspace(lon1, lon2, steps)
    return list(lats), list(lons)



fig = go.Figure() # create map plot
use_start = True  # initialize toggle for label positioning

# Dedicated, seeded generator for the label jitter so the figure (and the saved HTML)
# comes out identical on every run instead of depending on the global random state.
jitter_rng = random.Random(42)

# Fill map with flight paths (lines). Each route adds three traces: the colored line,
# near-invisible hover markers along it, and a text label at one of its endpoints.
for _, row in top_df.iterrows():
    route = row['AIRPORT_PAIR']
    cust_color = get_rgba(row['norm_scaled'])

    # Tooltip shared by the line and its hover markers
    # (num_flights is the count produced by the aggregation step for this route)
    hover_text = f"Route: {route}<br>Avg Delay: {row['avg_delay']:.1f} min<br>Flights: {row['num_flights']}"

    # Add color-coded trace of flight path; line width grows with the average delay
    fig.add_trace(go.Scattergeo(
        locationmode='USA-states',
        lon=[row['LON1'], row['LON2']],
        lat=[row['LAT1'], row['LAT2']],
        mode='lines',
        line=dict(width=0.5 + row['avg_delay'] / 30, color=cust_color),
        opacity=0.8,
        name=route,
        legendgroup=route,  # clicking the legend entry toggles the markers and label too
        text=hover_text,
        hoverinfo='text'
    ))

    # Interpolate midpoints for better hover coverage
    interp_lats, interp_lons = interpolate_coords(row['LAT1'], row['LON1'], row['LAT2'], row['LON2'], steps=10)

    # Add invisible markers along the flight path to enable tooltip when hovering over the line
    # (otherwise the tooltip only shows up when hovering around the endpoints)
    fig.add_trace(go.Scattergeo(
        locationmode='USA-states',
        lon=interp_lons,
        lat=interp_lats,
        mode='markers',
        marker=dict(size=3, color=cust_color, opacity=0.001),  # marker indistinguishable from line but still renders - need to see if this causes issues on non-GPU-accelerated machines
        text=hover_text,
        hoverinfo='text',
        legendgroup=route,
        showlegend=False
    ))

    # Alternate label position between the two endpoints, with a small latitude jitter
    # to prevent label overlapping (will be useful if we increase the N-Filter):
    # nudged up at the first endpoint, down at the second.
    if use_start:
        label_lat, label_lon = row['LAT1'], row['LON1']
        label_lat += jitter_rng.uniform(0, 0.5)
    else:
        label_lat, label_lon = row['LAT2'], row['LON2']
        label_lat -= jitter_rng.uniform(0, 0.2)

    # Flip toggle
    use_start = not use_start

    # Add label trace
    fig.add_trace(go.Scattergeo(
        locationmode='USA-states',
        lon=[label_lon],
        lat=[label_lat],
        mode='text',
        text=[route],
        textfont=dict(size=12, color='black'),
        legendgroup=route,
        showlegend=False,
        hoverinfo='skip'
    ))

# Create the plot's layout
fig.update_layout(
    title=dict(
        text=f'Flight Routes with Worst Normalized Delays (per 250 Miles)<br>Worst {n_filter} Routes with >=10 Flights',
        x=0.5,  # Center the title
        xanchor='center',
        font=dict(size=18)
    ),
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showland=True,
    ),
    showlegend=True
)

fig.show()

# Save the interactive map to an HTML file (written to the current working directory)
pio.write_html(fig, file="worst_flight_delays_map.html", auto_open=False)
