In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
# Load the data
data = pd.read_csv('resources/flights.csv')
data.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,...,DEST_AIRPORT_SEQ_ID,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
0,1,3,EV,20366,EV,N48901,4397,13930,1393007,ORD,...,1197705,GRB,1003.0,0.0,1000-1059,1117.0,0.0,0.0,0.0,174.0
1,1,3,EV,20366,EV,N16976,4401,15370,1537002,TUL,...,1393007,ORD,1027.0,0.0,1000-1059,1216.0,0.0,0.0,0.0,585.0
2,1,3,EV,20366,EV,N12167,4404,11618,1161802,EWR,...,1541205,TYS,1848.0,0.0,1800-1859,2120.0,0.0,0.0,0.0,631.0
3,1,3,EV,20366,EV,N14902,4405,10781,1078105,BTR,...,1226603,IAH,1846.0,0.0,1800-1859,2004.0,0.0,0.0,0.0,253.0
4,1,3,EV,20366,EV,N606UX,4407,14524,1452401,RIC,...,1226603,IAH,1038.0,0.0,1000-1059,1330.0,0.0,0.0,0.0,1157.0


In [3]:
# Drop unnecessary columns
data = data.drop(columns=['OP_CARRIER_AIRLINE_ID', 'OP_CARRIER', 'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_SEQ_ID', 'DEST_AIRPORT_SEQ_ID','ARR_TIME'])

In [4]:
# check for missing values
data.isnull().sum()

DAY_OF_MONTH            0
DAY_OF_WEEK             0
OP_UNIQUE_CARRIER       0
ORIGIN_AIRPORT_ID       0
ORIGIN                  0
DEST_AIRPORT_ID         0
DEST                    0
DEP_TIME             6664
DEP_DEL15            6699
DEP_TIME_BLK            0
ARR_DEL15            8078
CANCELLED               0
DIVERTED                0
DISTANCE                0
dtype: int64

In [5]:
# Drop rows with missing values
data = data.dropna()


In [6]:
# Convert time columns to numeric values
data['DEP_TIME'] = pd.to_numeric(data['DEP_TIME'], errors='coerce', downcast='integer')
data.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN_AIRPORT_ID,ORIGIN,DEST_AIRPORT_ID,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
0,1,3,EV,13930,ORD,11977,GRB,1003,0.0,1000-1059,0.0,0.0,0.0,174.0
1,1,3,EV,15370,TUL,13930,ORD,1027,0.0,1000-1059,0.0,0.0,0.0,585.0
2,1,3,EV,11618,EWR,15412,TYS,1848,0.0,1800-1859,0.0,0.0,0.0,631.0
3,1,3,EV,10781,BTR,12266,IAH,1846,0.0,1800-1859,0.0,0.0,0.0,253.0
4,1,3,EV,14524,RIC,12266,IAH,1038,0.0,1000-1059,0.0,0.0,0.0,1157.0


In [7]:
data = pd.get_dummies(data, columns=['DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'DEP_TIME_BLK'])

In [8]:
# Define features and target
X = data.drop(columns=['ARR_DEL15'])
y = data['ARR_DEL15']

In [9]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Create and train the RandomForestClassifier
clf_rf = RandomForestClassifier(random_state=42)
clf_rf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [12]:
# Predictions and evaluation
y_pred_rf = clf_rf.predict(X_test)
print("Random Forest Classifier")
print("Training Accuracy: ", clf_rf.score(X_train, y_train))
print("Testing Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Classifier
Training Accuracy:  0.9999618581743892
Testing Accuracy:  0.9325290214205061

Confusion Matrix:
 [[150411   4614]
 [  7516  17240]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.97      0.96    155025
         1.0       0.79      0.70      0.74     24756

    accuracy                           0.93    179781
   macro avg       0.87      0.83      0.85    179781
weighted avg       0.93      0.93      0.93    179781



In [13]:
# Create and train the GradientBoostingClassifier
clf_gb = GradientBoostingClassifier(random_state=42)
clf_gb.fit(X_train, y_train)

GradientBoostingClassifier(random_state=42)

In [14]:
# Predictions and evaluation
y_pred_gb = clf_gb.predict(X_test)
print("\nGradient Boosting Classifier")
print("Training Accuracy: ", clf_gb.score(X_train, y_train))
print("Testing Accuracy: ", accuracy_score(y_test, y_pred_gb))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_gb))


Gradient Boosting Classifier
Training Accuracy:  0.9323030272690215
Testing Accuracy:  0.9314888670104182

Confusion Matrix:
 [[149057   5968]
 [  6349  18407]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.96      0.96    155025
         1.0       0.76      0.74      0.75     24756

    accuracy                           0.93    179781
   macro avg       0.86      0.85      0.85    179781
weighted avg       0.93      0.93      0.93    179781



In [15]:
# Create and train the LogisticRegression model
clf_lr = LogisticRegression(random_state=42, max_iter=1000)
clf_lr.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42)

In [16]:
# Predictions and evaluation
y_pred_lr = clf_lr.predict(X_test)
print("Logistic Regression")
print("Training Accuracy: ", clf_lr.score(X_train, y_train))
print("Testing Accuracy: ", accuracy_score(y_test, y_pred_lr))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression
Training Accuracy:  0.9312255206955162
Testing Accuracy:  0.9300649123099771

Confusion Matrix:
 [[149133   5892]
 [  6681  18075]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.96      0.96    155025
         1.0       0.75      0.73      0.74     24756

    accuracy                           0.93    179781
   macro avg       0.86      0.85      0.85    179781
weighted avg       0.93      0.93      0.93    179781



### Here is an optimized version the Gradient Boosting Classifier using Grid Search to find the best hyperparameters

In [17]:

# Create the Gradient Boosting Classifier
gb_clf = GradientBoostingClassifier(random_state=42)



In [18]:
# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3]
}

In [19]:
# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=gb_clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

In [20]:
# Fit the GridSearchCV to the data
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [None]:
# Get the best parameters and the best score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

In [None]:
# Evaluate the best model on the test data
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("Testing Accuracy: ", accuracy_score(y_test, y_pred_best))