In [2]:
# packages
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('max_colwidth', None)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from imblearn.under_sampling import RandomUnderSampler
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Data Preprocessing

In [3]:
#preparing dataframe
#Save the dataframes in the same directory as the notebook for the below reads to work. 
#df2018 = pd.read_parquet('Combined_Flights_2018.parquet', engine='pyarrow')
#df2019 = pd.read_parquet('Combined_Flights_2019.parquet', engine='pyarrow')
#df2020 = pd.read_parquet('Combined_Flights_2020.parquet', engine='pyarrow')
df2021 = pd.read_parquet('Combined_Flights_2021.parquet', engine='pyarrow')
#df2022 = pd.read_parquet('Combined_Flights_2022.parquet', engine='pyarrow')

#frames = [df2018, df2019,df2020,df2021,df2022]
#df=pd.concat(frames)

# I was struggling to get code to run with all data, dropped down to a single year here. BH
df = df2021

In [4]:
#cleaning dataset
df.dropna(inplace=True)

df = df.select_dtypes(include=['number'])
# bad features below that are technically numbers, but useless including groups, IDs
df = df.drop('Year', axis=1) #we may want to bring this one back in if analyzing multiple years
df = df.drop('DivAirportLandings', axis=1)
df = df.drop('OriginAirportID', axis=1)
df = df.drop('OriginAirportSeqID', axis=1)
df = df.drop('OriginCityMarketID', axis=1)
df = df.drop('DestAirportID', axis=1)
df = df.drop('DestAirportSeqID', axis=1)
df = df.drop('DestCityMarketID', axis=1)
df = df.drop('DOT_ID_Operating_Airline', axis=1)
df = df.drop('DOT_ID_Marketing_Airline', axis=1)
df = df.drop('OriginStateFips', axis=1)
df = df.drop('Flight_Number_Marketing_Airline', axis=1)
df = df.drop('Flight_Number_Operating_Airline', axis=1)
df = df.drop('OriginWac', axis=1)
df = df.drop('DestWac', axis=1)
df = df.drop('DestStateFips', axis=1)
df = df.drop('DepartureDelayGroups', axis=1)
df = df.drop('ArrDelay', axis=1) #removing this since having a delay over 15 will automatically trigger the classification
df = df.drop('ArrivalDelayGroups', axis=1)
df = df.drop('DistanceGroup', axis=1)
df = df.drop('DepDelay', axis=1)#removing since having a departure deplay people can assume arrival will also be delayed
df = df.drop('DepDelayMinutes', axis=1)#removing since having a departure deplay people can assume arrival will also be delayed
df = df.drop('DepDel15', axis=1) #removing since having a departure deplay people can assume arrival will also be delayed
df = df.drop('Quarter', axis=1) #removing this due to high correlation between quarter and month
df = df.drop('ActualElapsedTime', axis=1) #removing due to high correlation with other variables
df = df.drop('CRSElapsedTime', axis=1) #removing due to high correlation with other variables
df = df.drop('WheelsOff', axis=1) #removing due to high correlation with other variables
df = df.drop('WheelsOn', axis=1) #removing due to high correlation with other variables
df = df.drop('Distance', axis=1) #removing due to high correlation with other variables
df = df.drop('CRSArrTime', axis=1) #removing due to high correlation with other variables
df = df.drop('CRSDepTime', axis=1) #removing due to high correlation with other variables

In [14]:
#df.info()

In [15]:
#completing correlation matrix to determine highly correlated variables that we can remove CS
#correlation_matrix = df.corr()
#threshold = 0.8  # You can adjust this threshold as needed
#highly_correlated_pairs = []
#for i in range(len(correlation_matrix.columns)):
    #for j in range(i + 1, len(correlation_matrix.columns)):
        #if abs(correlation_matrix.iloc[i, j]) > threshold:
           # feature1 = correlation_matrix.columns[i]
            #feature2 = correlation_matrix.columns[j]
            #highly_correlated_pairs.append((feature1, feature2, correlation_matrix.iloc[i, j]))
#for feature1, feature2, correlation in highly_correlated_pairs:
    #print(f"Highly correlated pair: {feature1} and {feature2} (Correlation: {correlation:.2f})")

In [6]:
#splitting data for test/training
X = df.drop('ArrDel15', axis=1)
y = df['ArrDel15']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)   #creating training data set, 70% of the data


In [7]:
#undersampling not delayed class. I am undersampling before normalizing since the class imbalance is severe.
undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)


In [8]:
#X_train_undersampled.info()

In [9]:
#normalizing the data
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train_undersampled)
X_test_normalized = scaler.transform(X_test)

# QDA

In [9]:
# create qda
qda = QuadraticDiscriminantAnalysis()

# grid search paramters
param_grid = {'reg_param': np.linspace(0, 1, 8)}

# create grid search with cross validation of 5 folds
grid_search = GridSearchCV(qda, param_grid, cv = 5,scoring='accuracy', verbose=2, n_jobs=-1)

# perform grid search on training data
grid_search.fit(X_train_normalized, y_train_undersampled)

# model with best parameters
best_qda_classifier = grid_search.best_estimator_

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [10]:
# Calculate accuracy, precision, recall, and F1 score 
y_test_pred_qda = best_qda_classifier.predict(X_test_normalized)
accuracy = accuracy_score(y_test, y_test_pred_qda)
precision = precision_score(y_test, y_test_pred_qda)
recall = recall_score(y_test, y_test_pred_qda)
f1 = f1_score(y_test, y_test_pred_qda)

print("QDA Accuracy: ", accuracy)
print("QDA Precision: ", precision)
print("QDA Recall: ", recall)
print("QDA F1: ", f1)

QDA Accuracy:  0.9453124621112309
QDA Precision:  0.7591727771054317
QDA Recall:  1.0
QDA F1:  0.8631020067904707


# Random Forest Classifier

In [11]:

# create random forest classifier
rfc = RandomForestClassifier(random_state=1)

# grid search paramters
param_grid = {'n_estimators': range(1, 200, 75),
              'max_depth': range(5, 16, 5),
              'min_samples_split': range(10, 41, 15),
              'min_samples_leaf': range(20, 50, 15)}
              

# create grid search with cross validation of 5 folds
grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# perform grid search on training data
grid_search.fit(X_train_normalized, y_train_undersampled)

# model with best parameters-P0[]
best_rf_classifier = grid_search.best_estimator_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [12]:
y_test_pred_rfc = best_rf_classifier.predict(X_test_normalized)
accuracy = accuracy_score(y_test, y_test_pred_rfc)
precision = precision_score(y_test,y_test_pred_rfc)
recall = recall_score(y_test, y_test_pred_rfc)
f1 = f1_score(y_test, y_test_pred_rfc)

print("RFC Accuracy: ", accuracy)
print("RFC Precision: ", precision)
print("RFC Recall: ", recall)
print("RFC F1: ", f1)

RFC Accuracy:  1.0
RFC Precision:  1.0
RFC Recall:  1.0
RFC F1:  1.0


In [1]:
#feature importance
importances = best_rf_classifier.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
feature_names = df.columns.tolist()
for f in range(df.shape[1]):
    print(f"{f}. Feature {feature_names[indices[f]]} - {importances[indices[f]]}")

NameError: name 'best_rf_classifier' is not defined

In [5]:
#removing unimportant features
df = df.drop('DayOfWeek', axis=1)
df = df.drop('DayofMonth', axis=1)
df = df.drop('AirTime', axis=1)
X = df.drop('ArrDel15', axis=1)
y = df['ArrDel15']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)   #creating training data set, 70% of the data
undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_train_undersampled, y_train_undersampled = undersampler.fit_resample(X_train, y_train)
scaler = MinMaxScaler()
X_train_normalized = scaler.fit_transform(X_train_undersampled)
X_test_normalized = scaler.transform(X_test)

rfcAgain = RandomForestClassifier(random_state=1)

# grid search paramters
param_grid = {'n_estimators': range(1, 200, 75),
              'max_depth': range(5, 16, 5),
              'min_samples_split': range(10, 41, 15),
              'min_samples_leaf': range(20, 50, 15)}
              

# create grid search with cross validation of 5 folds
grid_search1 = GridSearchCV(rfcAgain, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# perform grid search on training data
grid_search1.fit(X_train_normalized, y_train_undersampled)


Fitting 5 folds for each of 54 candidates, totalling 270 fits


NameError: name 'grid_search' is not defined

In [8]:
# model with best parameters-P0[]
best_rf_classifier_again = grid_search1.best_estimator_
y_test_pred_rfc = best_rf_classifier_again.predict(X_test_normalized)
accuracy = accuracy_score(y_test, y_test_pred_rfc)
precision = precision_score(y_test,y_test_pred_rfc)
recall = recall_score(y_test, y_test_pred_rfc)
f1 = f1_score(y_test, y_test_pred_rfc)

print("RFC Accuracy: ", accuracy)
print("RFC Precision: ", precision)
print("RFC Recall: ", recall)
print("RFC F1: ", f1)

RFC Accuracy:  1.0
RFC Precision:  1.0
RFC Recall:  1.0
RFC F1:  1.0


In [10]:
#feature importance
importances = best_rf_classifier_again.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
feature_names = df.columns.tolist()
for f in range(df.shape[1]):
    print(f"{f}. Feature {feature_names[indices[f]]} - {importances[indices[f]]}")

Feature ranking:
0. Feature ArrDelayMinutes - 0.930020581147701
1. Feature ArrTime - 0.06997941885229894
2. Feature TaxiIn - 0.0
3. Feature TaxiOut - 0.0
4. Feature Month - 0.0
5. Feature DepTime - 0.0


IndexError: index 6 is out of bounds for axis 0 with size 6

# Random Forest Classifier with ADABoost

In [12]:
ada_rf_classifier = AdaBoostClassifier(base_estimator=rfc, random_state=1)

param_grid = {'n_estimators': range(1, 200, 75),
              'max_depth': range(5, 16, 5),
              'min_samples_split': range(10, 41, 15),
              'min_samples_leaf': range(20, 50, 15)}

grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=1), n_jobs=-1,
             param_grid={'max_depth': range(5, 16, 5),
                         'min_samples_leaf': range(20, 50, 15),
                         'min_samples_split': range(10, 41, 15),
                         'n_estimators': range(1, 200, 75)},
             scoring='accuracy', verbose=2)

In [14]:
# Calculate accuracy, precision, recall, and F1 score 
best_rfada_classifier = grid_search.best_estimator_
y_test_pred_rcfada = best_rfada_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred_rcfada)
precision = precision_score(y_test, y_test_pred_rcfada)
recall = recall_score(y_test, y_test_pred_rcfada)
f1 = f1_score(y_test, y_test_pred_rcfada)

print("RFCAda Accuracy: ", accuracy)
print("RFCAda Precision: ", precision)
print("RFCAda Recall: ", recall)
print("RFCAda F1: ", f1)

RFCAda Accuracy:  1.0
RFCAda Precision:  1.0
RFCAda Recall:  1.0
RFCAda F1:  1.0


In [10]:
#both random forest's models are over fitting.

# KNN

In [20]:
knn = KNeighborsClassifier()

param_grid = {
    'n_neighbors': [10, 15, 25, 30],  # Try different values of k
    'metric': ['euclidean', 'manhattan', 'minkowski']  # Try different distance metrics
}
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid_search.fit(X_train_normalized, y_train_undersampled)

best_knn_classifier = grid_search.best_estimator_
y_test_pred_knn = best_knn_classifier.predict(X_test_normalized)
accuracy = accuracy_score(y_test, y_test_pred_knn)
precision = precision_score(y_test, y_test_pred_knn)
recall = recall_score(y_test, y_test_pred_knn)
f1 = f1_score(y_test, y_test_pred_knn)

print("RFCAda Accuracy: ", accuracy)
print("RFCAda Precision: ", precision)
print("RFCAda Recall: ", recall)
print("RFCAda F1: ", f1)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


RFCAda Accuracy:  0.8918535307078875
RFCAda Precision:  0.788477730677661
RFCAda Recall:  0.5093116156075056
RFCAda F1:  0.6188691069648198


In [21]:
print("KNN Accuracy: ", accuracy)
print("KNN Precision: ", precision)
print("KNN Recall: ", recall)
print("KNN F1: ", f1)

KNN Accuracy:  0.8918535307078875
KNN Precision:  0.788477730677661
KNN Recall:  0.5093116156075056
KNN F1:  0.6188691069648198


In [23]:
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 15}
Best Score: 0.8125583000511174
