# Random Forest - Predicting Flight Delay

In [13]:
### Mounting Drive
# Makes the Google Drive contents available on the Colab filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [14]:
# Necessary Packages
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

## Reading in cleaned data & Extracting features

In [15]:
# Merge every cleaned CSV into one df.
# The directory is globbed from Python instead of splitting the first line of
# captured `!ls` output: that listing is formatted for a terminal, so it can
# wrap onto several lines (dropping files) and breaks on unusual file names.
DATA_DIR = Path('/content/gdrive/My Drive/OMSA/CSE6242/clean_data_v2')
files = [str(csv_path) for csv_path in sorted(DATA_DIR.glob('*.csv'))]
if not files:
    # Fail with a clear message rather than pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f'No CSV files found in {DATA_DIR}')

# Load the data; ignore_index=True gives the merged frame a fresh 0..n-1 index
data = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# Convert 'dep_delay' to a binary target: 1 when dep_delay >= 15, else 0.
# Missing dep_delay values compare False and are therefore labelled 0,
# exactly as with the previous element-wise lambda.
data['delayed'] = (data['dep_delay'] >= 15).astype(int)

In [16]:
# Feature selection: choose relevant features for the model
features = ['Carrier Code', 'destination_airport', 'scheduled_dep_hour', 'month', 'day', 'scheduled_elapsed']
# String-valued columns that must be one-hot encoded before fitting
categorical_features = ['Carrier Code', 'destination_airport']

X = data[features]
y = data['delayed']

# One hot encoding string features (drop=None keeps one column per category)
encoder = OneHotEncoder(sparse_output=False, drop=None)
encoded_cols = encoder.fit_transform(X[categorical_features])
# Reuse X's index: pd.concat(axis=1) aligns rows by index label, so a default
# RangeIndex here would misalign rows (and insert NaNs) whenever `data` does
# not carry a plain 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_cols,
    columns=encoder.get_feature_names_out(categorical_features),
    index=X.index,
)
# Numeric columns first (original order), followed by the one-hot columns
X_encoded = pd.concat([X.drop(columns=categorical_features), encoded_df], axis=1)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
# From here on `features` holds the encoded column names used for the importance table
features = X_encoded.columns

In [17]:
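# Optional: standardize the features. Not needed for a random forest, because
# tree splits are unaffected by rescaling a feature. Note that enabling this
# replaces X_train / X_test with NumPy arrays.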
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

## Preliminary Implementation of random forest

In [18]:
# Baseline model: 100 trees with a fixed seed, fitted on the current training split
rf = RandomForestClassifier(n_estimators=100, random_state=7)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Evaluate on the held-out split
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", matrix)

# Feature importance (optional): pair each column name with the forest's
# importance score and list the most influential features first
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
ranked_importances = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances:\n", ranked_importances)

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.87     78993
           1       0.33      0.21      0.26     17928

    accuracy                           0.78     96921
   macro avg       0.58      0.56      0.56     96921
weighted avg       0.74      0.78      0.76     96921

Confusion Matrix:
 [[71472  7521]
 [14184  3744]]
Feature Importances:
                     Feature    Importance
2                       day  5.203685e-01
3         scheduled_elapsed  1.756191e-01
0        scheduled_dep_hour  1.425025e-01
1                     month  1.205915e-01
8           Carrier Code_WN  6.336007e-03
..                      ...           ...
66  destination_airport_PSP  7.704133e-07
55  destination_airport_MSN  4.274535e-07
21  destination_airport_CID  1.834186e-07
17  destination_airport_BTR  0.000000e+00
29  destination_airport_DSM  0.000000e+00

[85 rows x 2 columns]


## Balancing Datasets

In [19]:
# Undersample so that both classes contribute the same number of rows.
# NOTE(review): the train/test split below is taken from the balanced data, so
# the test set is balanced as well; its metrics are not directly comparable
# with the run on the original class distribution.
delayed_mask = y.eq(1)
X_delayed, X_not_delayed = X_encoded.loc[delayed_mask], X_encoded.loc[~delayed_mask]
y_delayed, y_not_delayed = y.loc[delayed_mask], y.loc[~delayed_mask]

# Row count of the smaller class
n_samples = min(map(len, (X_delayed, X_not_delayed)))

# Draw the same number of rows from each class (seeded for reproducibility)
X_delayed_sampled = X_delayed.sample(n=n_samples, random_state=42)
X_not_delayed_sampled = X_not_delayed.sample(n=n_samples, random_state=42)

# Pick the labels that belong to the sampled rows via their index labels
y_delayed_sampled = y_delayed.loc[X_delayed_sampled.index]
y_not_delayed_sampled = y_not_delayed.loc[X_not_delayed_sampled.index]

# Stack delayed rows first, then not-delayed rows
X_balanced = pd.concat([X_delayed_sampled, X_not_delayed_sampled])
y_balanced = pd.concat([y_delayed_sampled, y_not_delayed_sampled])

X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    random_state=42,
)
features = X_balanced.columns

In [20]:
# Same forest configuration as before, now fitted on the balanced training split
rf = RandomForestClassifier(n_estimators=100, random_state=7).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Print each evaluation metric under its heading
for heading, metric in (
    ("Classification Report:\n", classification_report),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))

# Feature importance (optional: to understand which features are most influential)
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
print(
    "Feature Importances:\n",
    feature_importance_df.sort_values(by='Importance', ascending=False),
)

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.59      0.59     17980
           1       0.59      0.60      0.59     17677

    accuracy                           0.59     35657
   macro avg       0.59      0.59      0.59     35657
weighted avg       0.59      0.59      0.59     35657

Confusion Matrix:
 [[10582  7398]
 [ 7139 10538]]
Feature Importances:
                     Feature  Importance
2                       day    0.442390
3         scheduled_elapsed    0.192491
0        scheduled_dep_hour    0.169204
1                     month    0.132225
8           Carrier Code_WN    0.007952
..                      ...         ...
29  destination_airport_DSM    0.000000
21  destination_airport_CID    0.000000
55  destination_airport_MSN    0.000000
66  destination_airport_PSP    0.000000
80  destination_airport_SNA    0.000000

[85 rows x 2 columns]


In [8]:
# ### Testing 1 airline
# test = pd.read_csv('gdrive/My Drive/OMSA/CSE6242/clean_data_v2/AA.csv')
# test['delayed'] = test['dep_delay'].apply(lambda x: 1 if x >= 15 else 0)
# # Feature selection: choose relevant features for the model
# features = ['Carrier Code', 'destination_airport', 'scheduled_dep_hour', 'month', 'day', 'scheduled_elapsed']

# X = test[features]
# y = test['delayed']

# # One hot encoding string features
# encoder = OneHotEncoder(sparse_output=False, drop=None)
# encoded_cols = encoder.fit_transform(X[['Carrier Code', 'destination_airport']])
# encoded_df = pd.DataFrame(encoded_cols, columns = encoder.get_feature_names_out(['Carrier Code', 'destination_airport']))
# X_encoded = pd.concat([X, encoded_df], axis=1).drop(columns=['Carrier Code', 'destination_airport'])


# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
# features = X_encoded.columns

# rf = RandomForestClassifier(n_estimators=100, random_state=7)
# rf.fit(X_train, y_train)
# y_pred = rf.predict(X_test)
# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# # Feature importance (optional: to understand which features are most influential)
# importances = rf.feature_importances_
# feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
# print("Feature Importances:\n", feature_importance_df.sort_values(by='Importance', ascending=False))


In [9]:
# ### Testing 1 airline from old cleaned data
# test = pd.read_csv('gdrive/My Drive/OMSA/CSE6242/clean_data/AA_new.csv')
# test['delayed'] = test['dep_delay'].apply(lambda x: 1 if x >= 15 else 0)
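# # NOTE(review): 'day' is taken from field [1] and 'month' from field [2] of flight_date;
# # for a 'YYYY-MM-DD' string these would be swapped - confirm the date format before reuse.
# test['day'] = test['flight_date'].apply(lambda x: int(x.split('-')[1]))
# test['month'] = test['flight_date'].apply(lambda x: int(x.split('-')[2]))
# print(test.columns)
# # NOTE(review): for an 'HH:MM' string, split(":")[1] is the minutes field, not the hour -
# # confirm the format of scheduled_dep_time before reuse.
# test['scheduled_dep_hour'] = test['scheduled_dep_time'].apply(lambda x: int(x.split(":")[1]))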

# # Feature selection: choose relevant features for the model
# features = ['Carrier Code', 'destination_airport', 'scheduled_dep_hour', 'month', 'day', 'scheduled_elapsed']
# X = test[features]
# y = test['delayed']

# # One hot encoding string features
# encoder = OneHotEncoder(sparse_output=False, drop=None)
# encoded_cols = encoder.fit_transform(X[['Carrier Code', 'destination_airport']])
# encoded_df = pd.DataFrame(encoded_cols, columns = encoder.get_feature_names_out(['Carrier Code', 'destination_airport']))
# X_encoded = pd.concat([X, encoded_df], axis=1).drop(columns=['Carrier Code', 'destination_airport'])


# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
# features = X_encoded.columns

# rf = RandomForestClassifier(n_estimators=100, random_state=7)
# rf.fit(X_train, y_train)
# y_pred = rf.predict(X_test)
# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# # Feature importance (optional: to understand which features are most influential)
# importances = rf.feature_importances_
# feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
# print("Feature Importances:\n", feature_importance_df.sort_values(by='Importance', ascending=False))


## Tuning Hyperparameters

In [10]:
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'criterion': ['gini', 'entropy']
# }
# rf = RandomForestClassifier(random_state=7)
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')
# grid_search.fit(X_train, y_train)

# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

# # Test set w/ best parameters
# best_rf = grid_search.best_estimator_
# y_pred = best_rf.predict(X_test)

# print("Classification Report:\n", classification_report(y_test, y_pred))
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# # Feature importance
# importances = best_rf.feature_importances_
# feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
# print("Feature Importances:\n", feature_importance_df.sort_values(by='Importance', ascending=False))


Fitting 5 folds for each of 162 candidates, totalling 810 fits




KeyboardInterrupt: 