Trace all flight routes by airline, for the top 10 airlines.
Find average delay per route.
Display map with 5 worst routes per airline.

# Setup

### Packages

In [1]:
import numpy as np
import pandas as pd
from plotnine import *
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.io as pio

### Load Data

In [2]:
# NOTE(review): absolute, machine-specific path - consider moving it to a config cell.
train_data_path = r'D:\Summer Practicum\1_Data\training_airlines.csv'
df = pd.read_csv(train_data_path, low_memory=False)

# Build a direction-agnostic route key: the two airport codes in alphabetical order,
# joined by '-', so that A->B and B->A flights share the same AIRPORT_PAIR string.
# This is computed column-wise rather than with a row-wise DataFrame.apply(axis=1),
# which runs a Python function once per flight; the resulting strings are the same.
df['AIRPORT_PAIR'] = np.where(
    df['Origin'] <= df['Dest'],
    df['Origin'] + '-' + df['Dest'],
    df['Dest'] + '-' + df['Origin'],
)

### Delay Impact Score Function

In [3]:
def delay_score(minutes):
    """Convert an arrival delay in minutes into a delay impact score.

    Returns 0 for delays of 20 minutes or less, a quadratic ramp
    ``((minutes - 20) / 160) ** 2`` for delays below 180 minutes, and a flat 5
    for everything else.

    The checks are deliberately written as "<= 20" then "< 180": a NaN input
    fails both comparisons and therefore ends up with the flat score of 5.
    """
    # On-time or only slightly late: no impact
    if minutes <= 20:
        return 0

    # Moderate delay: squashed quadratic curve rising from 0 towards ~1
    if minutes < 180:
        ramp_position = (minutes - 20) / 160
        return ramp_position ** 2

    # 180 minutes or more: treated as a serious, cancellation-like disruption
    return 5

Apply the delay impact score to every flight (used later when aggregating by route)

In [4]:
# Score every flight's arrival delay element by element; the per-route mean of
# this column is computed in the aggregation step
df['DelayImpact'] = df['ArrDelay'].map(delay_score)

# Data Prep

### Aggregate dataframe

In [5]:
# Flag flights that arrived at least 180 minutes late
df['SevereDelay'] = df['ArrDelay'].ge(180)

# Summarize only the flights that arrived late (ArrDelay > 0), one row per
# airline / airport-pair combination. Because of that filter, num_flights and
# avg_delay describe delayed flights only.
agg_df = (
    df.loc[df['ArrDelay'].gt(0)]
    .groupby(['IATA_Code_Operating_Airline', 'AIRPORT_PAIR'])
    .agg(
        avg_delay=pd.NamedAgg(column='ArrDelay', aggfunc='mean'),
        impact_score=pd.NamedAgg(column='DelayImpact', aggfunc='mean'),
        num_flights=pd.NamedAgg(column='ArrDelay', aggfunc='count'),
        num_severe_delays=pd.NamedAgg(column='SevereDelay', aggfunc='sum'),
    )
    .reset_index()
)

# Split the 'AAA-BBB' route key back into its two airport codes
agg_df[['AIRPORT1', 'AIRPORT2']] = agg_df['AIRPORT_PAIR'].str.split('-', expand=True)

### Filter for Significant Routes by Flights

In [6]:
# Minimum number of delayed flights a route needs in order to be kept
flights_per_route_filter = 100

# Drop routes that fall below the threshold
agg_df = agg_df.loc[agg_df['num_flights'].ge(flights_per_route_filter)]

## Normalize by Performance

In [7]:
# Order routes from highest to lowest impact score
agg_df = agg_df.sort_values(by='impact_score', ascending=False)

# Min-max scale the impact score over all remaining routes; the result is used
# for line coloring. The small epsilon keeps the division defined when every
# route has the same score.
impact_scores = agg_df['impact_score']
min_score, max_score = impact_scores.min(), impact_scores.max()
agg_df['norm_scaled'] = impact_scores.sub(min_score).div(max_score - min_score + 1e-8)

### Filter for N-Worst Performing Routes

In [8]:
# This avoids over-cluttering the map
# NOTE(review): n_worst is only referenced by the disabled line below in the
# visible notebook; the per-airline selection further down uses head(5) instead.
n_worst = 10
# Disabled global top-N filter, kept for reference.
# NOTE(review): it sorts by 'norm_delay', a column no visible cell creates
# (the normalized column is 'norm_scaled') - fix the name before re-enabling.
#agg_df = agg_df.sort_values('norm_delay', ascending=False).head(n_worst)

### Airport Coordinates - OpenFlights

In [9]:
# The OpenFlights airports file has no header row, so the column names are supplied here
columns = [
    'AirportID',
    'Name',
    'City',
    'Country',
    'IATA',
    'ICAO',
    'Latitude',
    'Longitude',
    'Altitude',
    'Timezone',
    'DST',
    'TzDatabaseTimeZone',
    'Type',
    'Source',
]

# Download the public OpenFlights airport table and keep only rows with a usable
# IATA code: missing values and the literal '\N' placeholder are dropped, since
# repeated placeholder codes would collide when IATA becomes the lookup key below.
of_airports = (
    pd.read_csv(
        r'https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat',
        header=None,
        names=columns,
    )
    .loc[lambda airports: airports['IATA'].notna() & airports['IATA'].ne('\\N')]
)

# IATA code -> {'Latitude': ..., 'Longitude': ...}, used to attach coordinates to routes
airport_coords = (
    of_airports
    .set_index('IATA')
    .loc[:, ['Latitude', 'Longitude']]
    .to_dict(orient='index')
)

# Laughlin/Bullhead, AZ is missing from the OpenFlights DB, so it is added by hand
airport_coords['IFP'] = dict(Latitude=35.1574, Longitude=-114.5596)

In [10]:
# Attach latitude/longitude columns for both endpoints of every route.
# Airport codes absent from airport_coords yield a missing value instead of raising.
for endpoint in ('1', '2'):
    codes = agg_df[f'AIRPORT{endpoint}']
    for column_prefix, coord_key in (('LAT', 'Latitude'), ('LON', 'Longitude')):
        agg_df[f'{column_prefix}{endpoint}'] = codes.map(
            lambda code, coord_key=coord_key: airport_coords.get(code, {}).get(coord_key)
        )

#### Filter Worst Routes per Airline

In [11]:
# Rank each airline's routes from highest to lowest impact score ...
routes_ranked_by_airline = agg_df.sort_values(
    by=['IATA_Code_Operating_Airline', 'impact_score'],
    ascending=[True, False],
)

# ... and keep the first five rows of every airline
top_routes_df = routes_ranked_by_airline.groupby('IATA_Code_Operating_Airline').head(n=5)

Add Airline Names and Save Dataframe

In [12]:
# Human-readable names for the operating-airline codes; codes that are not
# listed here end up with a missing Airline_Name
airline_names = {
    "AA": "American Airlines",
    "AS": "Alaska Airlines",
    "B6": "JetBlue Airways",
    "DL": "Delta Air Lines",
    "F9": "Frontier Airlines",
    "G4": "Allegiant Air",
    "HA": "Hawaiian Airlines",
    "NK": "Spirit Airlines",
    "UA": "United Airlines",
    "WN": "Southwest Airlines"
}

# Add the Airline Name field. top_routes_df is a row selection taken from agg_df,
# so writing a column with top_routes_df[...] = ... can raise pandas'
# SettingWithCopyWarning; assign() builds a new frame instead and is safe to re-run.
top_routes_df = top_routes_df.assign(
    Airline_Name=top_routes_df['IATA_Code_Operating_Airline'].map(airline_names)
)

# NOTE(review): absolute, machine-specific output path - consider a configurable directory.
top_df_filename = 'D:/Summer Practicum/1_Data/worst_5_routes_per_airline.csv'
top_routes_df.to_csv(top_df_filename, index=False)

# Mapping

### Helper Functions

In [21]:
# Matplotlib 'YlOrRd' colormap used to color routes by their normalized score
cmap = plt.get_cmap('YlOrRd')


def get_rgba(norm_val):
    """Return the colormap color for ``norm_val`` as a CSS-style ``rgba(...)`` string.

    The colormap's red/green/blue channels are scaled by 255 and rounded to
    whole numbers; the colormap's own alpha is discarded and a fixed alpha of
    0.8 is used instead.
    """
    red, green, blue, _alpha = cmap(norm_val)
    channels = ', '.join(f'{channel * 255:.0f}' for channel in (red, green, blue))
    return f'rgba({channels}, 0.8)'


# Helper function to add markers to any flight path for tooltip hovering
def interpolate_coords(lat1, lon1, lat2, lon2, steps=5):
    """Return ``steps`` points evenly spaced along the great-circle arc between two points.

    Plotly's Scattergeo renders a two-point line as a great-circle arc, so the
    hover markers have to follow that arc; interpolating latitude and longitude
    linearly drifts away from the drawn line on long routes.

    Parameters
    ----------
    lat1, lon1 : float
        Start point, in degrees.
    lat2, lon2 : float
        End point, in degrees.
    steps : int, default 5
        Number of points to return, endpoints included.

    Returns
    -------
    tuple of (list, list)
        Latitudes and longitudes in degrees; the first and last entries are
        exactly the given endpoints.

    Notes
    -----
    Falls back to plain linear spacing when the arc is not well defined:
    fewer than two steps, missing (NaN) coordinates, or endpoints that are
    (nearly) identical or (nearly) antipodal.
    """
    def _linear():
        return list(np.linspace(lat1, lat2, steps)), list(np.linspace(lon1, lon2, steps))

    if steps < 2:
        return _linear()

    lat1_rad, lon1_rad, lat2_rad, lon2_rad = np.radians([lat1, lon1, lat2, lon2])

    # Endpoints as unit vectors on the sphere
    start = np.array([
        np.cos(lat1_rad) * np.cos(lon1_rad),
        np.cos(lat1_rad) * np.sin(lon1_rad),
        np.sin(lat1_rad),
    ])
    end = np.array([
        np.cos(lat2_rad) * np.cos(lon2_rad),
        np.cos(lat2_rad) * np.sin(lon2_rad),
        np.sin(lat2_rad),
    ])

    # Angle between the endpoints; clip guards against rounding just outside [-1, 1]
    angle = np.arccos(np.clip(np.dot(start, end), -1.0, 1.0))
    sin_angle = np.sin(angle)
    if not np.isfinite(angle) or sin_angle < 1e-6:
        return _linear()

    # Spherical linear interpolation between the two unit vectors
    fractions = np.linspace(0.0, 1.0, steps)
    start_weight = np.sin((1.0 - fractions) * angle) / sin_angle
    end_weight = np.sin(fractions * angle) / sin_angle
    xyz = np.outer(start_weight, start) + np.outer(end_weight, end)

    lats = np.degrees(np.arctan2(xyz[:, 2], np.hypot(xyz[:, 0], xyz[:, 1])))
    lons = np.degrees(np.arctan2(xyz[:, 1], xyz[:, 0]))

    # Pin the endpoints to the exact inputs (removes trig round-off)
    lats[0], lats[-1] = lat1, lat2
    lons[0], lons[-1] = lon1, lon2
    return list(lats), list(lons)

### Map 1: All Routes Per Airline, Uncolored

In [None]:
fig = go.Figure()  # create map plot

# One multi-segment trace per airline containing all of that airline's routes
for airline, group in agg_df.groupby('IATA_Code_Operating_Airline'):

    # Flatten every route into [start, end, None]; the None breaks the line so
    # consecutive routes are not joined to each other
    lines_lats = [
        lat
        for lat1, lat2 in zip(group['LAT1'], group['LAT2'])
        for lat in (lat1, lat2, None)
    ]
    lines_lons = [
        lon
        for lon1, lon2 in zip(group['LON1'], group['LON2'])
        for lon in (lon1, lon2, None)
    ]

    # Trace all routes per airline; traces start hidden and are toggled from the legend
    fig.add_trace(go.Scattergeo(
        locationmode='USA-states',
        lon=lines_lons,
        lat=lines_lats,
        mode='lines',
        line=dict(width=1, color='gray'),
        opacity=0.7,
        name=airline,
        hoverinfo='none',
        visible='legendonly'
    ))


# Create the plot's layout
fig.update_layout(
    title=dict(
        # num_flights counts only flights with ArrDelay > 0, so the threshold is
        # on delayed flights - the title says so explicitly
        text=f'Flight Routes by Airline<br>(Only Routes With {flights_per_route_filter} Delayed Flights Or More)',
        x=0.5,  # Center the title
        xanchor='center',
        font=dict(size=18)
    ),
    geo=dict(
        projection_type='mercator',  # use 'natural earth' or 'mercator'
        showland=True,
        center=dict(lat=37, lon=-95),
        lataxis=dict(range=[-15, 75]),
        lonaxis=dict(range=[-170, -50]),
        countrycolor='lightgray',       # borders between countries
        showcountries=True,             # show country borders
    ),
    showlegend=True
)

# Save the interactive map to an HTML file
pio.write_html(fig, file="D:/Summer Practicum/3_Exports/flight_routes_map.html", auto_open=False)

### Map 2: Worst 5 Routes Per Airline, Colored

#### Generate Map (All Airlines)

In [37]:
fig = go.Figure()

# One colored line per route; both width and color grow with the impact score
for row in top_routes_df.to_dict('records'):
    route_label = f"{row['IATA_Code_Operating_Airline']} {row['AIRPORT_PAIR']}"
    hover_text = f"Airline: {row['IATA_Code_Operating_Airline']}<br>Route: {row['AIRPORT_PAIR']}<br>Impact Score: {row['impact_score']:.2f}<br>Flights: {row['num_flights']}"
    route_line = go.Scattergeo(
        locationmode='USA-states',
        lon=[row['LON1'], row['LON2']],
        lat=[row['LAT1'], row['LAT2']],
        mode='lines',
        line=dict(width=1 + row['impact_score'], color=get_rgba(row['norm_scaled'])),
        opacity=0.8,
        name=route_label,
        text=hover_text,
        hoverinfo='text',
        showlegend=False,  # too many routes for a readable legend
    )
    fig.add_trace(route_line)

# Centered two-line title
map_title = dict(
    text='Worst 5 Routes per Airline<br>Colored by Normalized Impact Score',
    x=0.5,
    xanchor='center',
    font=dict(size=18),
)

# Map window centered on lat 37 / lon -95 with land and country borders shown
map_geo = dict(
    projection_type='natural earth',  # use 'natural earth' or 'mercator'
    center=dict(lat=37, lon=-95),
    lataxis=dict(range=[-15, 75]),
    lonaxis=dict(range=[-170, -50]),
    showland=True,
    landcolor='rgb(240,240,240)',
    showcountries=True,
    countrycolor='gray',
)

fig.update_layout(title=map_title, geo=map_geo)

# Write the interactive map to disk without opening a browser
pio.write_html(fig, file="D:/Summer Practicum/3_Exports/worst_5_routes_per_airline.html", auto_open=False)


#### Generate Map (Airline Selectable)

In [42]:
fig = go.Figure()

# Airline code of EVERY trace, in the exact order the traces are added to the figure.
# The dropdown's 'visible' mask is built from this list, so it needs one entry per
# trace (four per route: line, two airport labels, hover markers) - not one per route.
trace_airlines = []

for _, row in top_routes_df.iterrows():
    airline = row['IATA_Code_Operating_Airline']
    color = get_rgba(row['norm_scaled'])
    hover_text = f"Airline: {row['IATA_Code_Operating_Airline']}<br>Route: {row['AIRPORT_PAIR']}<br>Impact Score: {row['impact_score']:.2f}<br>Flights: {row['num_flights']}"

    # Interpolate midpoints for better hover coverage
    interp_lats, interp_lons = interpolate_coords(row['LAT1'], row['LON1'], row['LAT2'], row['LON2'], steps=30)

    route_traces = [
        # Flight path, colored by impact score
        go.Scattergeo(
            locationmode='USA-states',
            lon=[row['LON1'], row['LON2']],
            lat=[row['LAT1'], row['LAT2']],
            mode='lines',
            line=dict(width=1 + row['impact_score'], color=color),
            opacity=0.8,
            name=f"{row['IATA_Code_Operating_Airline']} {row['AIRPORT_PAIR']}",
            text=hover_text,
            hoverinfo='text',
            visible=False  # start hidden
        ),
        # Origin airport label
        go.Scattergeo(
            locationmode='USA-states',
            lon=[row['LON1']],
            lat=[row['LAT1']],
            mode='text',
            text=[row['AIRPORT1']],
            textfont=dict(size=10, color='gray'),
            showlegend=False,
            hoverinfo='skip',
            visible=False  # start hidden, like the route line it belongs to
        ),
        # Destination airport label
        go.Scattergeo(
            locationmode='USA-states',
            lon=[row['LON2']],
            lat=[row['LAT2']],
            mode='text',
            text=[row['AIRPORT2']],
            textfont=dict(size=10, color='gray'),
            showlegend=False,
            hoverinfo='skip',
            visible=False  # start hidden, like the route line it belongs to
        ),
        # Invisible markers along the flight path to enable the tooltip when hovering over the line
        # (otherwise the tooltip only shows up when hovering around the endpoints)
        go.Scattergeo(
            locationmode='USA-states',
            lon=interp_lons,
            lat=interp_lats,
            mode='markers',
            marker=dict(size=3, color=color, opacity=0.001),  # marker indistinguishable from line but still renders - need to see if this causes issues on non-GPU-accelerated machines
            text=hover_text,
            hoverinfo='text',
            showlegend=False,
            visible=False  # start hidden, like the route line it belongs to
        ),
    ]

    # Register the airline once per trace so the mask lines up with fig.data
    for trace in route_traces:
        fig.add_trace(trace)
        trace_airlines.append(airline)

# Every trace must have an owner, otherwise the dropdown toggles the wrong traces
assert len(trace_airlines) == len(fig.data)


airlines = sorted(top_routes_df['IATA_Code_Operating_Airline'].unique())
buttons = []

for airline in airlines:
    # One boolean per trace: show only the traces that belong to this airline
    visible_mask = [airline == trace_airline for trace_airline in trace_airlines]
    buttons.append(dict(
        label=airline,
        method='update',
        args=[{'visible': visible_mask},
            {'title': dict(
                text=f'Top 5 Impactful Routes — {airline}',
                x=0.5,
                xanchor='center',
                font=dict(size=18)
            )}]
    ))


# NOTE(review): active=0 makes the dropdown display the first airline although every
# trace starts hidden until a selection is made - confirm this initial state is wanted.
fig.update_layout(
    updatemenus=[dict(
        active=0,
        buttons=buttons,
        x=1.02,
        y=0.9,
        xanchor='left',
        yanchor='top'
    )],
    title=dict(
        text='Top 5 Impactful Routes — Select an Airline',
        x=0.5,
        xanchor='center',
        font=dict(size=18)
    ),
    geo=dict(
        projection_type='natural earth',  # use 'natural earth' or 'mercator'
        center=dict(lat=37, lon=-95),
        lataxis=dict(range=[-15, 75]),
        lonaxis=dict(range=[-170, -50]),
        showland=True,
        landcolor='rgb(240,240,240)',
        showcountries=True,
        countrycolor='gray'
    ),
    showlegend=False
)

fig.write_html("D:/Summer Practicum/3_Exports/worst_5_routes_per_airline_dropdown.html", auto_open=False)

### previous version (1 trace per route)

In [None]:
# Start from a fresh figure so this cell does not depend on (or append traces to)
# whatever `fig` was left behind by a previously executed cell
fig = go.Figure()

# Fill map with flight paths (lines): one line trace plus one hover-marker trace per route
for _, row in agg_df.iterrows():
    cust_color = get_rgba(row['norm_scaled'])

    # The hover markers exist only to extend the line's tooltip along the path,
    # so both traces share the same text
    hover_text = f"Route: {row['AIRPORT_PAIR']}<br>Airline: {row['IATA_Code_Operating_Airline']}<br>Avg Delay: {row['avg_delay']:.1f} min<br>Impact Score: {row['impact_score']:.2f}<br>Severe Delays: {row['num_severe_delays']} of {row['num_flights']}"

    # Add color-coded trace of flight path
    fig.add_trace(go.Scattergeo(
        locationmode='USA-states',
        lon=[row['LON1'], row['LON2']],
        lat=[row['LAT1'], row['LAT2']],
        mode='lines',
        line=dict(width=0.5 + row['impact_score'] / 30, color=cust_color),
        opacity=0.8,
        name=row['IATA_Code_Operating_Airline'],
        text=hover_text,
        hoverinfo='text',
        visible='legendonly',
        showlegend=True
    ))

    # Interpolate midpoints for better hover coverage
    interp_lats, interp_lons = interpolate_coords(row['LAT1'], row['LON1'], row['LAT2'], row['LON2'], steps=10)

    # Add invisible markers along the flight path to enable tooltip when hovering over the line
    # (otherwise the tooltip only shows up when hovering around the endpoints)
    fig.add_trace(go.Scattergeo(
        locationmode='USA-states',
        lon=interp_lons,
        lat=interp_lats,
        mode='markers',
        marker=dict(size=3, color=cust_color, opacity=0.001),  # marker indistinguishable from line but still renders - need to see if this causes issues on non-GPU-accelerated machines
        text=hover_text,
        hoverinfo='text',
        showlegend=False
    ))