In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

cur_dir = os.getcwd()


def _read_split(relative_path):
    """Read one CSV split whose path is given relative to the working directory."""
    return pd.read_csv(cur_dir + relative_path, low_memory=False)


train_set = _read_split('/../data/train_set_artificial.csv')
test_set = _read_split('/../data/test_set.csv')
# NOTE(review): the validation split is read from the same file as the test
# split, so val_set is a duplicate of test_set. This looks like a copy-paste
# slip; confirm the intended validation file before relying on X_val / y_val.
val_set = _read_split('/../data/test_set.csv')

# All three splits stacked row-wise into one frame.
df = pd.concat([train_set, test_set, val_set], axis=0)

# Separate the 'delay_class' target from the remaining feature columns.
X_train, y_train = train_set.drop(columns=['delay_class']), train_set['delay_class']

X_val, y_val = val_set.drop(columns=['delay_class']), val_set['delay_class']

X_test, y_test = test_set.drop(columns=['delay_class']), test_set['delay_class']

In [2]:
# Preview the combined train + test + val frame.
df

Unnamed: 0,carrier,origin,dest,air_time,distance,Maximum,Minimum,Average,Departure,Precipitation,New Snow,Snow Depth,days_in_365,sched_time_in_min,Precipitation Binary,New Snow Binary,Snow Depth Binary,delay_class
0,DL,JFK,MCO,0.170370,0.176219,0.358025,0.346667,0.350649,0.418301,0.000000,0.0,0.0,0.917582,0.316067,yes,no,no,no
1,EV,EWR,MCI,0.229630,0.206404,0.407407,0.333333,0.370130,0.254902,0.000000,0.0,0.0,0.840659,0.374012,yes,no,no,no
2,B6,JFK,MSY,0.232593,0.224760,0.308642,0.293333,0.298701,0.241830,0.000000,0.0,0.0,0.250000,0.087796,no,no,no,no
3,B6,EWR,FLL,0.195556,0.200897,0.506173,0.493333,0.500000,0.459695,0.000000,0.0,0.0,0.274725,0.579456,yes,no,no,yes
4,WN,LGA,BNA,0.130370,0.139506,0.691358,0.773333,0.733766,0.250545,0.004988,0.0,0.0,0.530220,0.267779,yes,no,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65465,EV,EWR,MEM,0.182222,0.176627,0.135802,0.160000,0.142857,0.023965,0.000000,0.0,0.0,0.898352,0.636523,no,no,no,no
65466,EV,EWR,MCI,0.201481,0.206404,0.938272,0.893333,0.922078,0.583878,0.022444,0.0,0.0,0.549451,0.352063,yes,no,no,no
65467,EV,EWR,CHS,0.102222,0.111768,0.740741,0.613333,0.681818,0.422658,0.134663,0.0,0.0,0.722527,0.140474,yes,no,no,no
65468,EV,LGA,PIT,0.048889,0.052009,0.654321,0.573333,0.616883,0.230937,0.000000,0.0,0.0,0.706044,0.340650,no,no,no,no


In [3]:
from sklearn.preprocessing import OneHotEncoder

# Transform categorical features into binary (one-hot) features.
categorical_columns = list(X_train.select_dtypes(include=['object']).columns)
encoder = OneHotEncoder()

# Fit on the combined frame (train + test + val) so every split is encoded
# with the same set of binary columns. For example, if the test set has
# dest_LAX but the train set does not, both still get a dest_LAX column.
# NOTE(review): this exposes the encoder to test-set categories. Fitting on
# X_train with OneHotEncoder(handle_unknown='ignore') would avoid that;
# confirm which behaviour is wanted.
encoder.fit(df[categorical_columns])

# Numerical features: only float64 columns are selected, so a column with any
# other dtype (e.g. integer) ends up in neither list and is not fed to the model.
numerical_columns = list(X_train.select_dtypes(include=['float64']).columns)


def encode_features(features, encoder, categorical_columns, numerical_columns):
    """Build the model input frame for one data split.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns of one split (target column already removed).
    encoder : fitted OneHotEncoder
        Encoder used to expand ``categorical_columns`` into binary columns.
    categorical_columns : list of str
        Columns passed through ``encoder``.
    numerical_columns : list of str
        Columns copied unchanged.

    Returns
    -------
    pandas.DataFrame
        The numerical columns followed by the one-hot columns, on a fresh
        0..n-1 index.
    """
    # The encoder returns a sparse matrix; densify it to build a DataFrame.
    category_df = pd.DataFrame(
        encoder.transform(features[categorical_columns]).toarray(),
        columns=encoder.get_feature_names_out(),
    )
    # Reset the index so both halves line up row by row in the concat.
    numerical_df = features[numerical_columns].reset_index(drop=True)
    return pd.concat([numerical_df, category_df], axis=1)


# Combine the numerical and categorical features of every split the same way.
X_train_encoded = encode_features(X_train, encoder, categorical_columns, numerical_columns)
X_test_encoded = encode_features(X_test, encoder, categorical_columns, numerical_columns)
X_val_encoded = encode_features(X_val, encoder, categorical_columns, numerical_columns)

In [4]:
# Preview the encoded training features (numerical columns, then one-hot columns).
X_train_encoded

Unnamed: 0,air_time,distance,Maximum,Minimum,Average,Departure,Precipitation,New Snow,Snow Depth,days_in_365,...,dest_TUL,dest_TVC,dest_TYS,dest_XNA,Precipitation Binary_no,Precipitation Binary_yes,New Snow Binary_no,New Snow Binary_yes,Snow Depth Binary_no,Snow Depth Binary_yes
0,0.170370,0.176219,0.358025,0.346667,0.350649,0.418301,0.000000,0.0,0.0,0.917582,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,0.229630,0.206404,0.407407,0.333333,0.370130,0.254902,0.000000,0.0,0.0,0.840659,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,0.232593,0.224760,0.308642,0.293333,0.298701,0.241830,0.000000,0.0,0.0,0.250000,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.195556,0.200897,0.506173,0.493333,0.500000,0.459695,0.000000,0.0,0.0,0.274725,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.130370,0.139506,0.691358,0.773333,0.733766,0.250545,0.004988,0.0,0.0,0.530220,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293895,0.432344,0.483009,0.716049,0.666667,0.694805,0.405229,0.000000,0.0,0.0,0.445055,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
293896,0.119368,0.122401,0.740741,0.680000,0.714286,0.601307,0.000000,0.0,0.0,0.758242,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
293897,0.182222,0.207220,0.595520,0.589003,0.593587,0.350269,0.002057,0.0,0.0,0.372975,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
293898,0.048651,0.045074,0.738151,0.736785,0.740578,0.395707,0.018992,0.0,0.0,0.480462,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [5]:
# Preview the encoded test features (numerical columns, then one-hot columns).
X_test_encoded

Unnamed: 0,air_time,distance,Maximum,Minimum,Average,Departure,Precipitation,New Snow,Snow Depth,days_in_365,...,dest_TUL,dest_TVC,dest_TYS,dest_XNA,Precipitation Binary_no,Precipitation Binary_yes,New Snow Binary_no,New Snow Binary_yes,Snow Depth Binary_no,Snow Depth Binary_yes
0,0.305185,0.267387,0.444444,0.253333,0.350649,0.627451,0.112219,0.0,0.0,0.082418,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.407407,0.488476,0.827160,0.840000,0.837662,0.501089,0.000000,0.0,0.0,0.516484,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.405926,0.437895,0.296296,0.253333,0.272727,0.294118,0.000000,0.0,0.0,0.197802,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,0.183704,0.139099,0.790123,0.813333,0.805195,0.385621,0.000000,0.0,0.0,0.607143,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.171852,0.133184,0.345679,0.266667,0.305195,0.405229,0.000000,0.0,0.1,0.967033,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65465,0.182222,0.176627,0.135802,0.160000,0.142857,0.023965,0.000000,0.0,0.0,0.898352,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
65466,0.201481,0.206404,0.938272,0.893333,0.922078,0.583878,0.022444,0.0,0.0,0.549451,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
65467,0.102222,0.111768,0.740741,0.613333,0.681818,0.422658,0.134663,0.0,0.0,0.722527,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
65468,0.048889,0.052009,0.654321,0.573333,0.616883,0.230937,0.000000,0.0,0.0,0.706044,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Baseline: random forest with default hyper-parameters.
# The seed is fixed so the fitted forest, and therefore the accuracy printed
# below, is the same on every Restart & Run All.
RANDOM_STATE = 42
classifier = RandomForestClassifier(random_state=RANDOM_STATE)

# Train on the encoded training split, then score on the encoded test split.
classifier.fit(X_train_encoded, y_train)
accuracy = classifier.score(X_test_encoded, y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 77.34%


In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Per-class view of the baseline forest on the test split:
# the confusion matrix first, then the precision / recall / f1 report.
y_pred = classifier.predict(X_test_encoded)

matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(matrix, report, sep="\n")

[[43206  5896]
 [ 8939  7429]]
              precision    recall  f1-score   support

          no       0.83      0.88      0.85     49102
         yes       0.56      0.45      0.50     16368

    accuracy                           0.77     65470
   macro avg       0.69      0.67      0.68     65470
weighted avg       0.76      0.77      0.77     65470



In [8]:
# Hyper-parameter search: exhaustive grid over forest size and tree-shape limits.
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 150],
              'max_depth': [10, 20, 30],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

# 3 * 3 * 3 * 3 = 81 candidates, each evaluated with 5-fold cross-validation.
# n_jobs=-1 spreads those independent fits over all CPU cores; it affects
# run time only, not which parameters are selected.
grid_search = GridSearchCV(classifier, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_encoded, y_train)

# NOTE(review): best_score_ is measured on folds of the training data only.
# If the training file contains synthetic / resampled rows (its name suggests
# so; confirm), this score can be optimistic compared with the test set.
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Estimator for the best parameter combination found by the search.
best_random_forest = grid_search.best_estimator_

Best parameters:  {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Best score:  0.8418441646818646


In [9]:
# Predict the test split with the tuned forest (overwrites the earlier y_pred).
y_pred = best_random_forest.predict(X_test_encoded)

In [10]:
# Test-split accuracy of the tuned forest, reported as a percentage.
accuracy = best_random_forest.score(X_test_encoded, y_test)
accuracy_pct = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_pct))

Accuracy: 77.32%


In [11]:
# Confusion matrix and per-class report for the current y_pred on the test split.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(matrix, report, sep="\n")

[[42513  6589]
 [ 8262  8106]]
              precision    recall  f1-score   support

          no       0.84      0.87      0.85     49102
         yes       0.55      0.50      0.52     16368

    accuracy                           0.77     65470
   macro avg       0.69      0.68      0.69     65470
weighted avg       0.77      0.77      0.77     65470

