In [None]:
# Import necessary packages
import pandas as pd
import numpy as np
from numpy import asarray
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

# Load the source CSV into a DataFrame (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="top_ports_lines.csv")

In [None]:
# Separate the target from the predictors (the actual train/validation/test split happens later)

# Target variable, kept as a single-column DataFrame
y = df[["cancelled"]]

# Predictors chosen a priori as important for the logistic regression model.
# The target column is dropped first, then only the chosen predictors are kept.
selected_predictors = ["operating_airline", "origin", "crs_dep_time", "dep_delay", "distance"]
X = df.drop(columns="cancelled")[selected_predictors]

In [None]:
# One-hot encode the categorical variables ("operating_airline" and "origin")
categorical_cols = ["operating_airline", "origin"]

# drop="first" omits the first category of each variable (it becomes the reference level);
# sparse_output=False makes the encoder return a dense array.
# NOTE(review): the encoder is fitted on the full dataset, i.e. before the
# train/validation/test split. Confirm this is intended.
encoder = OneHotEncoder(sparse_output=False, drop="first")

# Learn the categories and encode in one pass
onehot = encoder.fit_transform(X[categorical_cols])

# Column names of the form "<variable>_<category>"
col_names = encoder.get_feature_names_out(categorical_cols)

# Reuse X's index so that the later column-wise concat with X lines up row-for-row
# even when X does not carry a default 0..n-1 index.
one_hot_df = pd.DataFrame(onehot, columns=col_names, index=X.index)

In [None]:
# Append the one-hot-encoded columns to the original X DataFrame (column-wise, aligned on index)
x_encoded = pd.concat(objs=[X, one_hot_df], axis="columns")

In [None]:
# Hold out data before any exploratory data analysis so that model development and
# feature selection only ever use the training set (prevents data leakage and overfitting).
# Resulting proportions are 50:40:10 for train:validation:test.

# Stage 1: 50% training, 50% held out
first_stage = train_test_split(x_encoded, y, train_size=0.50, random_state=123)
x_train, x_valtest, y_train, y_valtest = first_stage

# Stage 2: 80% of the held-out half becomes validation (40% overall), the rest test (10% overall)
second_stage = train_test_split(x_valtest, y_valtest, train_size=0.8, random_state=123)
x_val, x_test, y_val, y_test = second_stage

In [None]:
# Recode the binary target "cancelled" from values 1 and 2 to 0 and 1 by shifting each split down by one.
# Running this cell more than once would shift the labels again.
y_train, y_val, y_test = (labels - 1 for labels in (y_train, y_val, y_test))

In [None]:
# Create new var crs_dep_hour to only include the expected hour of departure and drop the crs_dep_time var
def _with_dep_hour(features):
    """Return a new frame with "crs_dep_hour" appended and "crs_dep_time" removed.

    Assumes crs_dep_time is a numeric HHMM clock value (e.g. 930 for 09:30).
    The hour is obtained arithmetically (HHMM // 100) rather than by slicing the
    string form, so a float-typed column (930.0) still yields 9 instead of a
    mis-sliced "93". Building a new frame with assign() also avoids
    SettingWithCopyWarning on the frames returned by train_test_split.
    """
    dep_hour = (features["crs_dep_time"] // 100).astype(int)
    return features.assign(crs_dep_hour=dep_hour).drop(columns=["crs_dep_time"])


# Apply the identical transformation to every split
x_train = _with_dep_hour(x_train)
x_val = _with_dep_hour(x_val)
x_test = _with_dep_hour(x_test)

In [None]:
# Impute missing values in dep_delay using the median (data is skewed).
# The median is estimated on the training set only and reused for validation and test:
# using each hold-out split's own median would leak hold-out statistics into preprocessing
# and would preprocess the three splits inconsistently.
# NOTE(review): if dep_delay is missing mainly for cancelled flights, imputing it hides a
# signal that is tied directly to the target. Confirm the missingness pattern.
dep_delay_median = x_train["dep_delay"].median()

for split in (x_train, x_val, x_test):
    # Fill gaps with the training median, then convert departure delay to type integer
    split["dep_delay"] = split["dep_delay"].fillna(dep_delay_median).astype(int)

In [None]:
# SMOTEENN (SMOTE over-sampling followed by Edited Nearest Neighbours cleaning)
# applied to the stepwise selected variables
stepwise_features = [
    'dep_delay', 'distance',
    'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL',
    'operating_airline_G4', 'operating_airline_HA', 'operating_airline_NK',
    'operating_airline_UA', 'operating_airline_WN',
    'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS',
    'origin_MCO', 'origin_MIA', 'origin_ORD',
    'crs_dep_hour',
]
smote_enn_stepwise = SMOTEENN(random_state=123)

# Resample the training data only, to obtain a more balanced dataset for model fitting
X_resampled_step, y_resampled_step = smote_enn_stepwise.fit_resample(
    x_train[stepwise_features],
    y_train,
)

In [None]:
# Fit logistic model to resampled training data (SMOTEENN)
log_model_res = LogisticRegression(random_state=123)

# The target originates as a one-column DataFrame; np.ravel hands scikit-learn the
# 1-D label array it expects and avoids a DataConversionWarning.
log_model_res.fit(X_resampled_step, np.ravel(y_resampled_step))

# Predict on the validation set using exactly the columns (and column order) the model
# was trained on, instead of a second hand-maintained copy of the feature list.
y_pred_val_res = log_model_res.predict(x_val[list(X_resampled_step.columns)])
print(f"Validation accuracy after SMOTEENN (stepwise selected features): {accuracy_score(y_val, y_pred_val_res)}")

In [None]:
# Evaluate the model on the validation set
print(classification_report(y_val, y_pred_val_res))
print(f"f1 score is {f1_score(y_val, y_pred_val_res, average='binary')}")

cm = confusion_matrix(y_val, y_pred_val_res)
print("Confusion Matrix: ")
print(cm)

# ROC-AUC measures how well the model ranks positives above negatives, so it must be
# computed from continuous scores. Passing hard 0/1 predictions collapses the ROC curve
# to a single operating point and misreports the metric.
# Column 1 of predict_proba is the probability of the positive class (label 1).
val_features = x_val[list(log_model_res.feature_names_in_)]
y_score_val = log_model_res.predict_proba(val_features)[:, 1]
print("ROC-AUC:", roc_auc_score(y_val, y_score_val))

In [None]:
# Testing our best model - SMOTEENN with stepwise selected variables

# Select exactly the columns (and column order) the model was trained on
test_features = x_test[list(log_model_res.feature_names_in_)]

# Generate predictions on test data
y_pred_test_res = log_model_res.predict(test_features)

# Report test-set performance
print(classification_report(y_test, y_pred_test_res))
cm2 = confusion_matrix(y_test, y_pred_test_res)
print("Confusion Matrix: ")
print(cm2)

# ROC-AUC must be computed from predicted probabilities of the positive class (label 1),
# not from hard 0/1 predictions, which reduce the ROC curve to a single point.
y_score_test = log_model_res.predict_proba(test_features)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, y_score_test))
print(f"Test accuracy after SMOTEENN (stepwise selected features): {accuracy_score(y_test, y_pred_test_res)}")