# Building a Logistic Regression Model to Predict Flight Cancellations
Author: Tina Lin
Date: 6/30/3035

In [2]:
### Import necessary packages
import pandas as pd
import numpy as np
from numpy import asarray
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# import statsmodels.formula.api as smf
    # from textbook but requires combined X and y

In [127]:
### ORIGINAL MERGED CSV
# read original csv file into DataFrame
# data = pd.read_csv("airline.csv")

# print(data.shape)
    # (7065617, 120)

In [128]:
### cLEANED BUT UNFILTERED CSV
# read csv file into DataFrame (cleaned data)
# clean = pd.read_csv("airline_data_clean.csv")

# print(clean.shape)
    # (7065617, 69)


### Initial DataFrame
Read cleaned and filtered csv file (top_ports_lines.csv) in as a DataFrame

In [34]:
# READ CLEANED AND FILTERED CSV FILE INTO DATAFRAME
df = pd.read_csv("top_ports_lines.csv")
# print(df.head())
# print(df.shape)
    # (1672943, 69)

In [35]:
# retrieve the data types of all columns in the full DataFrame
column_info = df.dtypes

# print(column_info)

A quick look at the available data for our target variable of interest.

In [36]:
print(f"The total number of cancellations is {df["cancelled"].sum()}.")
print(f"The total number of provided reasons for flights being canceled is {df["cancellation_code"].notna().sum()}.")
print(f"The number of cancellations due to carrier is {sum(df["cancellation_code"] == "A")}.")
print(f"The number of cancellations due to weather is {sum(df["cancellation_code"] == "B")}.")
print(f"The number of cancellations due to national air system is {sum(df["cancellation_code"] == "C")}.")
print(f"The number of cancellations due to security is {sum(df["cancellation_code"] == "D")}.")

The total number of cancellations is 1705972.
The total number of provided reasons for flights being canceled is 33029.
The number of cancellations due to carrier is 14199.
The number of cancellations due to weather is 16438.
The number of cancellations due to national air system is 2262.
The number of cancellations due to security is 130.


### Train-Validation-Test Split

Split the DataFrame into X (features) and y (target variable)

In [6]:
y = df[["cancelled"]]
X = df.drop("cancelled", axis=1) #dropping the column "cancelled"

Selecting which variables we think are important to use in our logistic regression model, a priori.

In [7]:
X = X[["operating_airline", "origin", "crs_dep_time", "dep_delay", "distance"]]

One-hot encode the categorical variables ("operating_airline" and "origin")

In [8]:
# define one hot encoding
encoder = OneHotEncoder(sparse_output=False, drop="first")
# transform data

encoder.fit(X[["operating_airline", "origin"]])

onehot = encoder.transform(X[["operating_airline", "origin"]])
# print(onehot)

# get column names
col_names = encoder.get_feature_names_out(["operating_airline", "origin"])

# Create DataFrame with proper column names
one_hot_df = pd.DataFrame(onehot, columns=col_names)


In [9]:
one_hot_df.head()

Unnamed: 0,operating_airline_AS,operating_airline_B6,operating_airline_DL,operating_airline_F9,operating_airline_G4,operating_airline_HA,operating_airline_NK,operating_airline_UA,operating_airline_WN,origin_CLT,origin_DEN,origin_DFW,origin_LAS,origin_LAX,origin_MCO,origin_MIA,origin_ORD,origin_PHX
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# merged one-hot-encoded DataFrame with original X DataFrame
x_encoded = pd.concat([X, one_hot_df], axis=1)

Create 50:40:10 train:validation:test split before performing exploratory data analysis to prevent data leakage and overfitting. Model development and feature selection will be done on the training set only.

In [11]:
### Train, Validation, Test Split

x_train, x_valtest, y_train, y_valtest = train_test_split(x_encoded, y, train_size=0.50, random_state=123)
x_val, x_test, y_val, y_test = train_test_split(x_valtest, y_valtest, train_size=0.8, random_state=123)

In [12]:
# print(x_train.shape)
#     # (836471, 23) 
# print(x_val.shape)
#     # (669177, 23)
# print(x_test.shape)
#     # (167295, 23)
# print(y_train.shape)
#     # (836471, 1)
# print(y_val.shape)
#     # (669177, 1)
# print(y_test.shape)
#     # (167295, 1)

### Exploratory Data Analysis

First, let's take a look at our target variable.

In [None]:
print(y_train.value_counts())
    # currently, the binary variable has values 1 and 2 when originally they were 0 (no) and 1 (yes)

# transform binary variable "cancelled" with values 1 and 2 to 0 and 1, respectively
y_train = y_train - 1
y_val = y_val - 1
y_test = y_test - 1

print(y_train.value_counts()) # check to make sure response variable is binary with 0,1

cancelled
1            819915
2             16556
Name: count, dtype: int64
cancelled
0            819915
1             16556
Name: count, dtype: int64


Only keep the hour when the flight was expected to depart

In [13]:
### create new var crs_dep_hour to only include the expected hour of departure and drop the crs_dep_time var
x_train["crs_dep_hour"] = x_train["crs_dep_time"].astype(str).str.zfill(4).str[:2].astype(int)
x_train = x_train.drop(columns=["crs_dep_time"])

x_val["crs_dep_hour"] = x_val["crs_dep_time"].astype(str).str.zfill(4).str[:2].astype(int)
x_val = x_val.drop(columns=["crs_dep_time"])

x_test["crs_dep_hour"] = x_test["crs_dep_time"].astype(str).str.zfill(4).str[:2].astype(int)
x_test = x_test.drop(columns=["crs_dep_time"])

In [14]:
# impute missing values in dep_delay using the median (data is skewed)
x_train["dep_delay"] = x_train['dep_delay'].fillna(x_train["dep_delay"].median())

x_val["dep_delay"] = x_val['dep_delay'].fillna(x_val["dep_delay"].median())

x_test["dep_delay"] = x_test['dep_delay'].fillna(x_test["dep_delay"].median())

In [None]:
# convert departure delay variable into type integer
x_train["dep_delay"] = x_train["dep_delay"].astype(int)

x_val["dep_delay"] = x_val["dep_delay"].astype(int)

x_test["dep_delay"] = x_test["dep_delay"].astype(int)

In [None]:
# check to make sure features are of correct type
x_train.columns
x_train.dtypes

operating_airline        object
origin                   object
dep_delay                 int64
distance                  int64
operating_airline_AS    float64
operating_airline_B6    float64
operating_airline_DL    float64
operating_airline_F9    float64
operating_airline_G4    float64
operating_airline_HA    float64
operating_airline_NK    float64
operating_airline_UA    float64
operating_airline_WN    float64
origin_CLT              float64
origin_DEN              float64
origin_DFW              float64
origin_LAS              float64
origin_LAX              float64
origin_MCO              float64
origin_MIA              float64
origin_ORD              float64
origin_PHX              float64
crs_dep_hour              int64
dtype: object

Encode categorical features as a one-hot numeric array

In [16]:
x_train.columns

Index(['operating_airline', 'origin', 'dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL',
       'operating_airline_F9', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS', 'origin_LAX',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'origin_PHX', 'crs_dep_hour'],
      dtype='object')

### Basic Logistic Regression Model Using Five Predictors (Operating Airline, Origin Airport, Departure Hour, Departure Delay, and Distance)

In [17]:
# Initialize logistic regression model
log_model = LogisticRegression(max_iter=1000, random_state=123, class_weight="balanced")
    # max_iter can be increased if needed

# fit the model to training data
log_model.fit(x_train[['dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL',
       'operating_airline_F9', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS', 'origin_LAX',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'origin_PHX', 'crs_dep_hour']], y_train)

# predict on validation data
y_pred = log_model.predict(x_val[['dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL',
       'operating_airline_F9', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS', 'origin_LAX',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'origin_PHX', 'crs_dep_hour']])

# evaluate model
print("Accuracy: ", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))
# print(log_model.coef_)


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy:  0.5807282677079457
              precision    recall  f1-score   support

           0       0.99      0.58      0.73    655944
           1       0.03      0.69      0.06     13233

    accuracy                           0.58    669177
   macro avg       0.51      0.63      0.40    669177
weighted avg       0.97      0.58      0.72    669177



In [None]:
# Feature importance from our basic logistic regression
feature_names_basic = x_train[['dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL',
       'operating_airline_F9', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS', 'origin_LAX',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'origin_PHX', 'crs_dep_hour']].columns
coefficients_basics = log_model.coef_[0]
feature_importance_basics = pd.DataFrame({
    'Feature': feature_names_basic,
    'Coefficient': coefficients_basics,
    'Importance': np.abs(coefficients_basics)
})

feature_importance_basics = feature_importance_basics.sort_values(by = 'Importance', ascending = False)
print(feature_importance_basics)

                 Feature  Coefficient  Importance
16            origin_MCO     0.729350    0.729350
13            origin_DFW     0.619709    0.619709
18            origin_ORD     0.569038    0.569038
9   operating_airline_UA    -0.526788    0.526788
17            origin_MIA     0.491253    0.491253
10  operating_airline_WN     0.400410    0.400410
12            origin_DEN     0.332347    0.332347
2   operating_airline_AS     0.317871    0.317871
7   operating_airline_HA    -0.308680    0.308680
11            origin_CLT     0.267527    0.267527
8   operating_airline_NK     0.252459    0.252459
4   operating_airline_DL    -0.235266    0.235266
14            origin_LAS     0.177300    0.177300
6   operating_airline_G4    -0.169986    0.169986
3   operating_airline_B6     0.091324    0.091324
5   operating_airline_F9     0.082547    0.082547
19            origin_PHX    -0.062877    0.062877
15            origin_LAX     0.048165    0.048165
20          crs_dep_hour     0.040423    0.040423


### Synthetic Minority Oversampling Technique (SMOTE)
After noticing an imbalance in our response variable, we implemented SMOTE to help oversample our minority class (canceled flights)

In [None]:
#Apply SMOTE
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=123)

# resample dataset to create a more balanced dataset for training 
x_res, y_res = sm.fit_resample(x_train[['dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL',
       'operating_airline_F9', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS', 'origin_LAX',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'origin_PHX', 'crs_dep_hour']], y_train)

In [44]:
log_model_res = LogisticRegression(random_state = 123)
log_model_res.fit(x_res, y_res)
       # fit logistic model to resampled training data (SMOTE)
y_pred_val_res = log_model_res.predict(x_val[['dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL',
       'operating_airline_F9', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS', 'origin_LAX',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'origin_PHX', 'crs_dep_hour']])
       # generate predictions on validation data
print(f"Validation accuracy after SMOTE: {accuracy_score(y_val, y_pred_val_res)}")


  y = column_or_1d(y, warn=True)


Validation accuracy after SMOTE: 0.5835586100538422


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Summary of the performance of our logistic regression model that has been balanced using the SMOTE technique

In [21]:
from sklearn.metrics import f1_score
print(classification_report(y_val, y_pred_val_res))
print(f"f1 score is {f1_score(y_val, y_pred_val_res, average='binary')}")

              precision    recall  f1-score   support

           0       0.99      0.58      0.73    655944
           1       0.03      0.68      0.06     13233

    accuracy                           0.58    669177
   macro avg       0.51      0.63      0.40    669177
weighted avg       0.97      0.58      0.72    669177

f1 score is 0.06040682560715333


In [41]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
cm = confusion_matrix(y_val, y_pred_val_res)
print("Confusion Matrix:")
print(cm)
print("ROC-AUC:", roc_auc_score(y_val, y_pred_val_res))

Confusion Matrix:
[[381546 274398]
 [  4275   8958]]
ROC-AUC: 0.629309329390392


### SMOTE + Tomek (SMOTEENN) - SMOTE with Edited Nearests Neighbors
As SMOTE may produce noisy or irrelavent synthetic samples for the minority class, especially in regions where the minority class is highly concentrated, overfitting may occur. SMOTEENN is a technique that can help remove those noisy/irrelavent samples, making the overall model that is produced more generalizable compared to the SMOTE technique for oversampling. 

In [None]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=123)
# resample dataset to create a more balanced dataset for training using SMOTEENN on all five features selected a priori
X_resampled, y_resampled = smote_enn.fit_resample(x_train[['dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL',
       'operating_airline_F9', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS', 'origin_LAX',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'origin_PHX', 'crs_dep_hour']], y_train)

In [43]:
log_model_res2 = LogisticRegression(random_state = 123)
log_model_res2.fit(X_resampled, y_resampled)
       # fit logistic model to resampled training data (SMOTEENN)
y_pred_val_res2 = log_model_res2.predict(x_val[['dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL',
       'operating_airline_F9', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS', 'origin_LAX',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'origin_PHX', 'crs_dep_hour']])
       # generate predictions on validation data
print(f"Validation accuracy after SMOTEENN: {accuracy_score(y_val, y_pred_val_res2)}")

  y = column_or_1d(y, warn=True)


Validation accuracy after SMOTEENN: 0.814649935667245


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [45]:
from sklearn.metrics import f1_score
print(classification_report(y_val, y_pred_val_res2))
print(f"f1 score is {f1_score(y_val, y_pred_val_res2, average='binary')}")
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
cm2 = confusion_matrix(y_val, y_pred_val_res2)
print("Confusion Matrix after SMOTEENN:")
print(cm2)
print("ROC-AUC:", roc_auc_score(y_val, y_pred_val_res2))

              precision    recall  f1-score   support

           0       0.99      0.82      0.90    655944
           1       0.04      0.40      0.08     13233

    accuracy                           0.81    669177
   macro avg       0.51      0.61      0.49    669177
weighted avg       0.97      0.81      0.88    669177

f1 score is 0.07806205122868569
Confusion Matrix after SMOTEENN:
[[539894 116050]
 [  7982   5251]]
ROC-AUC: 0.6099452060069502


In [None]:
# Feature importance from our logistic regression after SMOTEENN
feature_names = X_resampled.columns
coefficients = log_model_res2.coef_[0]
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Importance': np.abs(coefficients)
})

feature_importance = feature_importance.sort_values(by = 'Importance', ascending = False)
print(feature_importance)

                 Feature  Coefficient  Importance
9   operating_airline_UA    -0.710299    0.710299
4   operating_airline_DL    -0.596166    0.596166
16            origin_MCO     0.433197    0.433197
10  operating_airline_WN     0.394439    0.394439
11            origin_CLT    -0.322381    0.322381
19            origin_PHX    -0.318522    0.318522
8   operating_airline_NK     0.262003    0.262003
18            origin_ORD     0.230118    0.230118
15            origin_LAX    -0.217438    0.217438
2   operating_airline_AS     0.172238    0.172238
13            origin_DFW     0.165238    0.165238
5   operating_airline_F9     0.093228    0.093228
3   operating_airline_B6     0.084386    0.084386
12            origin_DEN     0.083570    0.083570
14            origin_LAS    -0.074850    0.074850
17            origin_MIA     0.064480    0.064480
7   operating_airline_HA    -0.047383    0.047383
6   operating_airline_G4    -0.044087    0.044087
20          crs_dep_hour     0.034294    0.034294


### SMOTEENN using stepwise selected variables
After performing a stepwise selection on our logistic regression model in R, we noted the significant predictors and created a new resampled dataset using only those stepwise selected features

In [None]:
from imblearn.combine import SMOTEENN

smote_enn_stepwise = SMOTEENN(random_state=123)
# resample dataset to create a more balanced dataset for training using SMOTEENN on stepwise selected features
X_resampled_step, y_resampled_step = smote_enn_stepwise.fit_resample(x_train[['dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'crs_dep_hour']], y_train)

In [46]:
log_model_res3 = LogisticRegression(random_state = 123)
# fit logistic model to resampled training data (SMOTEENN)
log_model_res3.fit(X_resampled_step, y_resampled_step)
y_pred_val_res3 = log_model_res3.predict(x_val[['dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'crs_dep_hour']])
print(f"Validation accuracy after SMOTEENN (stepwise selected features): {accuracy_score(y_val, y_pred_val_res3)}")

  y = column_or_1d(y, warn=True)


Validation accuracy after SMOTEENN (stepwise selected features): 0.8248998396537837


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
from sklearn.metrics import f1_score
print(classification_report(y_val, y_pred_val_res3))
print(f"f1 score is {f1_score(y_val, y_pred_val_res3, average='binary')}")
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
cm2 = confusion_matrix(y_val, y_pred_val_res3)
print("Confusion Matrix: ")
print(cm2)
print("ROC-AUC:", roc_auc_score(y_val, y_pred_val_res3))

              precision    recall  f1-score   support

           0       0.99      0.83      0.90    655944
           1       0.04      0.37      0.08     13233

    accuracy                           0.82    669177
   macro avg       0.51      0.60      0.49    669177
weighted avg       0.97      0.82      0.89    669177

f1 score is 0.07770475028533197
Confusion Matrix: 
[[547068 108876]
 [  8297   4936]]
ROC-AUC: 0.6035115976068678


In [32]:
feature_names = X_resampled.columns
coefficients = log_model_res2.coef_[0]
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Importance': np.abs(coefficients)
})

feature_importance = feature_importance.sort_values(by = 'Importance', ascending = False)
print(feature_importance)

                 Feature  Coefficient  Importance
9   operating_airline_UA    -0.710299    0.710299
4   operating_airline_DL    -0.596166    0.596166
16            origin_MCO     0.433197    0.433197
10  operating_airline_WN     0.394439    0.394439
11            origin_CLT    -0.322381    0.322381
19            origin_PHX    -0.318522    0.318522
8   operating_airline_NK     0.262003    0.262003
18            origin_ORD     0.230118    0.230118
15            origin_LAX    -0.217438    0.217438
2   operating_airline_AS     0.172238    0.172238
13            origin_DFW     0.165238    0.165238
5   operating_airline_F9     0.093228    0.093228
3   operating_airline_B6     0.084386    0.084386
12            origin_DEN     0.083570    0.083570
14            origin_LAS    -0.074850    0.074850
17            origin_MIA     0.064480    0.064480
7   operating_airline_HA    -0.047383    0.047383
6   operating_airline_G4    -0.044087    0.044087
20          crs_dep_hour     0.034294    0.034294


## Testing our Best Model - Smoteen from stepwise selected variables

In [None]:
y_pred_test_res3 = log_model_res3.predict(x_test[['dep_delay', 'distance',
       'operating_airline_AS', 'operating_airline_B6', 'operating_airline_DL', 'operating_airline_G4', 'operating_airline_HA',
       'operating_airline_NK', 'operating_airline_UA', 'operating_airline_WN',
       'origin_CLT', 'origin_DEN', 'origin_DFW', 'origin_LAS',
       'origin_MCO', 'origin_MIA', 'origin_ORD', 'crs_dep_hour']])
# generate predictions on test data
print(classification_report(y_test, y_pred_test_res3))
cm3 = confusion_matrix(y_test, y_pred_test_res3)
print("Confusion Matrix: ")
print(cm3)
print("ROC-AUC:", roc_auc_score(y_test, y_pred_test_res3))
print(f"Test accuracy after SMOTEENN (stepwise selected features): {accuracy_score(y_test, y_pred_test_res3)}")

              precision    recall  f1-score   support

           0       0.99      0.83      0.90    164055
           1       0.04      0.39      0.08      3240

    accuracy                           0.83    167295
   macro avg       0.52      0.61      0.49    167295
weighted avg       0.97      0.83      0.89    167295

Confusion Matrix: 
[[136898  27157]
 [  1969   1271]]
ROC-AUC: 0.613374001153633
Test accuracy after SMOTEENN (stepwise selected features): 0.825900355659165
