# Python Visualization Dashboard

In [15]:
### Mounting Drive
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# NOTE(review): later cells read relative 'gdrive/...' paths, which presumably
# resolve against /content (the default Colab working directory) — confirm.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [24]:
# Install Dash into the runtime (no version is pinned here).
!pip install dash
# NOTE(review): dash_core_components is deprecated — this cell's own output
# recommends `from dash import dcc` instead, which is already done below.
!pip install dash_core_components
import dash
# Deprecated import; the `dcc` name is rebound by the `from dash import` line that follows.
import dash_core_components as dcc
from dash import dcc, html
from dash.dependencies import Input, Output, State
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import holidays
from sklearn.metrics import classification_report, confusion_matrix

Collecting dash_core_components
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Installing collected packages: dash_core_components
Successfully installed dash_core_components-2.0.0




The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`



## Training XGBoost Predictive Model

In [19]:
# Merge into one df
# Load every cleaned flight CSV, join the daily weather table, engineer the
# calendar features, and train an XGBoost classifier that predicts whether a
# departure is delayed by 15 minutes or more.
import glob

DATA_DIR = 'gdrive/My Drive/OMSA/CSE6242/clean_data_v2/'
WEATHER_CSV = DATA_DIR + 'la_weather.csv'

# Enumerate the files with glob instead of parsing captured `!ls` output:
# splitting only the first output line on tabs silently drops files whenever
# the listing wraps onto several lines or is printed one name per line.
files = sorted(glob.glob(DATA_DIR + '*.csv'))
flight_files = [f for f in files if f != WEATHER_CSV]
if not flight_files:
    raise FileNotFoundError(f"No flight CSV files found in {DATA_DIR!r}")

# Load your data
data = pd.concat([pd.read_csv(f) for f in flight_files], ignore_index=True)
# The weather CSV is read with its first three lines skipped (skiprows=3).
weather_data = pd.read_csv(WEATHER_CSV, skiprows=3)
data = data.merge(weather_data, left_on='flight_date', right_on='time', how='left')
data = data.drop(columns=['time'])

# Convert 'dep_delay' to categorical (delayed or not delayed).
# A missing dep_delay compares False and is therefore labelled 0 (not delayed).
data['delayed'] = (data['dep_delay'] >= 15).astype(int)

# Check if flight_date is a weekday or a holiday
us_holidays = holidays.US()
data["flight_date_pd"] = pd.to_datetime(data["flight_date"])
data["weekday"] = data["flight_date_pd"].dt.day_name()
data["is_weekday"] = (data["flight_date_pd"].dt.weekday < 5).astype(int)
data["is_holiday"] = data["flight_date_pd"].apply(lambda x: 1 if x in us_holidays else 0)

# Feature selection
features = ['Carrier Code', 'destination_airport', 'scheduled_dep_hour', 'month', 'day', 'scheduled_elapsed', 'is_holiday', 'is_weekday','weather_code (wmo code)', 'temperature_2m_mean (°C)',
       'rain_sum (mm)', 'sunshine_duration (s)', 'snowfall_sum (cm)',
       'precipitation_sum (mm)', 'wind_direction_10m_dominant (°)',
       'daylight_duration (s)', 'wind_speed_10m_max (km/h)']
X = data[features]
y = data['delayed']

# One-hot encoding string features.
# The fitted encoders are kept as globals because the dashboard callback
# reuses them to encode user input.
carrier_encoder = OneHotEncoder(sparse_output=False, drop=None)
dest_encoder = OneHotEncoder(sparse_output=False, drop=None)

# Build the encoded frames on X's own index so the column-wise concat below
# aligns row-for-row by construction rather than by coincidence.
carrier_encoded = carrier_encoder.fit_transform(X[['Carrier Code']])
carrier_df = pd.DataFrame(
    carrier_encoded,
    columns=carrier_encoder.get_feature_names_out(['Carrier Code']),
    index=X.index,
)

dest_encoded = dest_encoder.fit_transform(X[['destination_airport']])
dest_df = pd.DataFrame(
    dest_encoded,
    columns=dest_encoder.get_feature_names_out(['destination_airport']),
    index=X.index,
)

X_encoded = pd.concat([X.drop(columns=['Carrier Code', 'destination_airport']), carrier_df, dest_df], axis=1)

X_encoded = X_encoded.reset_index(drop=True)
y = y.reset_index(drop=True)

# Balancing the dataset: undersample the larger class to the size of the smaller.
# NOTE(review): balancing happens before the train/test split, so the test set
# is balanced too and the metrics below describe a 50/50 class mix rather than
# the delay rate present in the raw data.
delayed_mask = y == 1
X_delayed = X_encoded[delayed_mask]
y_delayed = y[delayed_mask]
X_not_delayed = X_encoded[~delayed_mask]
y_not_delayed = y[~delayed_mask]

n_samples = min(len(X_delayed), len(X_not_delayed))

X_delayed_sampled = X_delayed.sample(n=n_samples, random_state=42)
y_delayed_sampled = y_delayed.loc[X_delayed_sampled.index]

X_not_delayed_sampled = X_not_delayed.sample(n=n_samples, random_state=42)
y_not_delayed_sampled = y_not_delayed.loc[X_not_delayed_sampled.index]

X_balanced = pd.concat([X_delayed_sampled, X_not_delayed_sampled], axis=0)
y_balanced = pd.concat([y_delayed_sampled, y_not_delayed_sampled], axis=0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)
features_encoded = X_balanced.columns  # Encoded feature names

# Train XGBoost model
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)
y_pred = xgb_classifier.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
importances = xgb_classifier.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features_encoded, 'Importance': importances})
print("Feature Importances:\n", feature_importance_df.sort_values(by='Importance', ascending=False))

Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.64      0.64     21647
           1       0.64      0.65      0.64     21618

    accuracy                           0.64     43265
   macro avg       0.64      0.64      0.64     43265
weighted avg       0.64      0.64      0.64     43265

Confusion Matrix:
 [[13785  7862]
 [ 7631 13987]]
Feature Importances:
                      Feature  Importance
21           Carrier Code_WN    0.100171
0         scheduled_dep_hour    0.057080
50   destination_airport_EWR    0.026435
20           Carrier Code_UA    0.025803
1                      month    0.024788
..                       ...         ...
81   destination_airport_PBI    0.000000
86   destination_airport_PSP    0.000000
94   destination_airport_SBN    0.000000
101  destination_airport_SNA    0.000000
104  destination_airport_TUL    0.000000

[106 rows x 2 columns]


## Interactive Dashboard

In [26]:
app = dash.Dash(__name__)


def _labeled_field(label_text, control):
    """Wrap a form control in a Div together with its descriptive label."""
    return html.Div([html.Label(label_text), control])


def _yes_no_dropdown(component_id, default):
    """Build a No/Yes dropdown whose option values are the integers 0 and 1."""
    return dcc.Dropdown(
        id=component_id,
        options=[{'label': 'No', 'value': 0}, {'label': 'Yes', 'value': 1}],
        value=default,
    )


# Form layout: one labelled control per user-supplied model feature, followed by
# the submit button and the container that the prediction callback fills in.
app.layout = html.Div([
    html.H1("Flight Delay Predictor"),
    _labeled_field(
        "Carrier Code (e.g., 'AA' 'AS' 'DL' 'B6' 'NK' 'UA' 'WN'):",
        dcc.Input(id='carrier', type='text', value='AA'),
    ),
    _labeled_field(
        "Destination Airport (e.g., JFK, SFO, HNL):",
        dcc.Input(id='dest', type='text', value='JFK'),
    ),
    _labeled_field(
        "Scheduled Departure Hour (0-23):",
        dcc.Input(id='dep_hour', type='number', value=12, min=0, max=23),
    ),
    _labeled_field(
        "Month (1-12):",
        dcc.Input(id='month', type='number', value=1, min=1, max=12),
    ),
    _labeled_field(
        "Day (1-31):",
        dcc.Input(id='day', type='number', value=1, min=1, max=31),
    ),
    _labeled_field(
        "Scheduled Elapsed Time (minutes, e.g., 60-1000):",
        dcc.Input(id='elapsed', type='number', value=180, min=60, max=1000),
    ),
    _labeled_field(
        "Is Holiday (0 = No, 1 = Yes):",
        _yes_no_dropdown('holiday', default=0),
    ),
    _labeled_field(
        "Is Weekday (0 = No, 1 = Yes):",
        _yes_no_dropdown('weekday', default=1),
    ),
    html.Button('Predict Delay', id='submit-btn', n_clicks=0),
    # Dash style dictionaries use camelCased CSS property names.
    html.Div(id='prediction-output', style={'marginTop': '20px'})
])


@app.callback(
    Output('prediction-output', 'children'),
    Input('submit-btn', 'n_clicks'),
    [State('carrier', 'value'), State('dest', 'value'), State('dep_hour', 'value'),
     State('month', 'value'), State('day', 'value'), State('elapsed', 'value'),
     State('holiday', 'value'), State('weekday', 'value')]
)
def update_prediction(n_clicks, carrier, dest, dep_hour, month, day, elapsed, holiday, weekday):
    """Predict the delay probability for the flight described by the form.

    Runs when the 'Predict Delay' button is clicked; the form values arrive as
    State, so editing a field alone does not trigger a prediction.

    Args:
        n_clicks: Click count of the submit button (0 or None before any click).
        carrier: Carrier code text; blank falls back to 'AA'.
        dest: Destination airport code text; blank falls back to 'JFK'.
        dep_hour, month, day, elapsed: Numeric inputs; None falls back to the
            same defaults the form starts with (12, 1, 1, 180).
        holiday, weekday: 0/1 flags from the dropdowns; None falls back to 0 / 1.

    Returns:
        A prompt string before the first click, an error paragraph when the
        carrier or destination is rejected by its fitted encoder, otherwise a
        list of HTML components forming the prediction report.
    """
    # `not n_clicks` covers both the initial 0 and a None click count.
    if not n_clicks:
        return "Enter flight details and click 'Predict Delay'."

    # Trim stray whitespace before upper-casing so input such as " aa " is accepted.
    carrier_code = (carrier or '').strip().upper() or 'AA'
    dest_code = (dest or '').strip().upper() or 'JFK'

    user_input = {
        'Carrier Code': carrier_code,
        'destination_airport': dest_code,
        'scheduled_dep_hour': int(dep_hour) if dep_hour is not None else 12,
        'month': int(month) if month is not None else 1,
        'day': int(day) if day is not None else 1,
        'scheduled_elapsed': int(elapsed) if elapsed is not None else 180,
        'is_holiday': int(holiday) if holiday is not None else 0,
        'is_weekday': int(weekday) if weekday is not None else 1
    }
    # `features` also lists the weather columns, which the form does not collect;
    # pandas fills those absent keys with NaN.
    # NOTE(review): XGBoost's default is to treat NaN as a missing value —
    # confirm that predicting without weather is the intended behaviour.
    input_df = pd.DataFrame([user_input], columns=features)

    try:
        # Pass single-column DataFrames (not bare lists) so the encoders see the
        # same feature names they were fitted with.
        carrier_encoded = carrier_encoder.transform(input_df[['Carrier Code']])
        dest_encoded = dest_encoder.transform(input_df[['destination_airport']])
    except ValueError as e:
        return html.P(f"Error: Invalid input - {str(e)}. Use a valid carrier or destination from the data.")

    carrier_onehot = pd.DataFrame(
        carrier_encoded, columns=carrier_encoder.get_feature_names_out(['Carrier Code']))
    dest_onehot = pd.DataFrame(
        dest_encoded, columns=dest_encoder.get_feature_names_out(['destination_airport']))

    input_encoded = pd.concat(
        [input_df.drop(columns=['Carrier Code', 'destination_airport']), carrier_onehot, dest_onehot],
        axis=1,
    )
    # Match the training column order; any column absent from the input is added as 0.
    input_encoded = input_encoded.reindex(columns=X_balanced.columns, fill_value=0)

    prob = xgb_classifier.predict_proba(input_encoded)[0]
    delay_prob = prob[1]  # probability of class 1 (delayed)

    # Report
    report = [
        html.H3("Flight Delay Prediction Report"),
        html.P(f"Flight Details: {user_input}"),
        html.P(f"Probability of Delay: {delay_prob * 100:.1f}%"),
        html.P(f"Assessment: This flight is {'likely' if delay_prob >= 0.5 else 'unlikely'} to be delayed.")
    ]
    return report

# Start the Dash app with debug mode enabled when this code runs as __main__.
if __name__ == '__main__':
    app.run(debug=True)

<IPython.core.display.Javascript object>

## Flight Path Map with Training Data

To draw a map of the U.S. with the flight paths out of LAX, I first need latitude/longitude coordinates for each airport, which I import from this [airport coordinates data](https://www.partow.net/miscellaneous/airportdatabase/).

In [None]:
### Getting coordinates
# Column names for the airport database file, which is read without a header
# row and with ':' as the field separator.
column_names = [
    "icao_code",
    "iata_code",
    "airport_name",
    "city",
    "country",
    "lat_degrees",
    "lat_minutes",
    "lat_seconds",
    "lat_direction",
    "lon_degrees",
    "lon_minutes",
    "lon_seconds",
    "lon_direction",
    "altitude",
    "lat_decimal",
    "lon_decimal"
]

coords_df = pd.read_csv(
    'gdrive/MyDrive/OMSA/CSE6242/GlobalAirportDatabase/GlobalAirportDatabase.txt',
    sep=':',
    header=None,
    names=column_names
)

# Keep only rows whose country field is exactly 'USA'.
usa_airports = coords_df[coords_df['country'] == 'USA']

# Columns needed for plotting: identifiers for hover text plus decimal coordinates.
# (The former mid-cell `usa_airports.head()` was dropped: only the last
# expression of a cell is displayed, so it had no effect.)
map_data = usa_airports[['iata_code', 'airport_name', 'city','country', 'lat_decimal', 'lon_decimal']]


# Merge flight delay data w/ map data
# Reduce the coordinate table to one row per IATA code first: a repeated or
# missing code on the right side of the merge would otherwise duplicate flight
# rows and inflate the per-destination flight counts computed below.
airport_coords = (
    map_data[['iata_code', 'lat_decimal', 'lon_decimal']]
    .dropna(subset=['iata_code'])
    .drop_duplicates(subset='iata_code', keep='first')
)

flight_data = data.merge(
    airport_coords,
    left_on='destination_airport',
    right_on='iata_code',
    how='left',
    validate='many_to_one',  # fail loudly if the lookup is ever not unique
).drop(columns=['iata_code'])

# LAX is the origin coordinates
flight_data['origin_lat'] = 33.9416
flight_data['origin_lon'] = -118.4085


# Calculate sum of flights to each airport and calculate average delay for each airport
# ('delayed' is a 0/1 flag, so its mean is the proportion of delayed flights).
# Destinations without a coordinate match keep NaN latitude/longitude.
flight_summary = (
    flight_data.groupby('destination_airport')
    .agg(
        lat_decimal=('lat_decimal', 'first'),
        lon_decimal=('lon_decimal', 'first'),
        delayed=('delayed', 'mean'),
        flight_count=('Carrier Code', 'count'),
    )
    .reset_index()
)

fig = go.Figure()

# Every path starts at the origin stored on the first flight row.
lax_lon = flight_data['origin_lon'].iloc[0]
lax_lat = flight_data['origin_lat'].iloc[0]

# One line trace per destination, drawn from the origin to that airport.
destination_coords = zip(flight_summary['lon_decimal'], flight_summary['lat_decimal'])
for dest_lon, dest_lat in destination_coords:
    route = go.Scattergeo(
        lon=[lax_lon, dest_lon],
        lat=[lax_lat, dest_lat],
        mode='lines',
        line={'width': 1, 'color': 'blue'},
        opacity=0.5,
        showlegend=False,
    )
    fig.add_trace(route)

# Origin marker (red star).
origin_marker = go.Scattergeo(
    lon=[lax_lon],
    lat=[lax_lat],
    mode='markers',
    marker={'size': 15, 'color': 'red', 'symbol': 'star'},
    text='LAX - Los Angeles',
    hoverinfo='text',
    name='LAX',
)
fig.add_trace(origin_marker)

# Faint background points for every airport in map_data.
airport_hover = map_data['iata_code'] + '<br>' + map_data['city'] + '<br>' + map_data['country']
fig.add_trace(go.Scattergeo(
    lon=map_data['lon_decimal'],
    lat=map_data['lat_decimal'],
    mode='markers',
    marker={'size': 5, 'color': 'blue', 'opacity': 0.3},
    text=airport_hover,
    hoverinfo='text',
    name='U.S. Airports',
))

# Destination markers: size scales with flight count (largest = 20),
# colour encodes the proportion of delayed flights.
flight_counts = flight_summary['flight_count']
relative_size = flight_counts / flight_counts.max() * 20
delay_percent = (flight_summary['delayed'] * 100).round(1).astype(str)
destination_hover = (
    flight_summary['destination_airport'] + '<br>' +
    delay_percent + '% Delayed' + '<br>' +
    flight_counts.astype(str) + ' Flights'
)
fig.add_trace(go.Scattergeo(
    lon=flight_summary['lon_decimal'],
    lat=flight_summary['lat_decimal'],
    mode='markers',
    marker={
        'size': relative_size,
        'color': flight_summary['delayed'],
        'colorscale': 'RdYlGn_r',
        'colorbar': {'title': 'Delay Proportion', 'x': 1, 'y': 0.3, 'xanchor': 'left'},
        'showscale': True,
    },
    text=destination_hover,
    hoverinfo='text',
    name='LAX Destinations',
))

# Map styling; the axis ranges are widened to include Hawaii and Alaska on the map.
geo_settings = {
    'scope': 'north america',
    'projection_type': 'equirectangular',
    'showland': True,
    'landcolor': 'rgb(243, 243, 243)',
    'countrycolor': 'rgb(204, 204, 204)',
    'lataxis': {'range': [20, 75]},
    'lonaxis': {'range': [-170, -60]},
}
fig.update_layout(
    title_text='Flight Paths from LAX to U.S. Destinations',
    geo=geo_settings,
    height=700,
)

fig.show()