In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
df = pd.read_csv("aggregated.csv")
# df = df.fillna(df.mean())
df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,FL_DATE,UNIQUE_CARRIER,FL_NUM,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE,Unnamed: 13
0,2.0,6.0,2017-02-25,B6,28.0,MCO,"Orlando, FL",EWR,"Newark, NJ",1000.0,0.0,156.0,937.0,
1,2.0,7.0,2017-02-26,B6,28.0,MCO,"Orlando, FL",EWR,"Newark, NJ",739.0,0.0,153.0,937.0,
2,2.0,1.0,2017-02-27,B6,28.0,MCO,"Orlando, FL",EWR,"Newark, NJ",1028.0,0.0,158.0,937.0,
3,2.0,2.0,2017-02-28,B6,28.0,MCO,"Orlando, FL",EWR,"Newark, NJ",739.0,0.0,153.0,937.0,
4,2.0,3.0,2017-02-01,B6,33.0,BTV,"Burlington, VT",JFK,"New York, NY",1907.0,0.0,90.0,266.0,


In [3]:
df = df.drop(df.columns[[13]], axis=1)
# df = df.drop("FL_DATE", axis=1)
df = df.drop("ORIGIN_CITY_NAME", axis=1)
df = df.drop("DEST_CITY_NAME", axis=1)
df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,FL_DATE,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE
0,2.0,6.0,2017-02-25,B6,28.0,MCO,EWR,1000.0,0.0,156.0,937.0
1,2.0,7.0,2017-02-26,B6,28.0,MCO,EWR,739.0,0.0,153.0,937.0
2,2.0,1.0,2017-02-27,B6,28.0,MCO,EWR,1028.0,0.0,158.0,937.0
3,2.0,2.0,2017-02-28,B6,28.0,MCO,EWR,739.0,0.0,153.0,937.0
4,2.0,3.0,2017-02-01,B6,33.0,BTV,JFK,1907.0,0.0,90.0,266.0


After viewing the data, I found there is a unnamed and empty column in dataset, so I removed it from data, also, I figured that city and airport code have roughly the same meaning, there might be two airports in one city but not one airport in two cities, so I removed the columns of city name and keep the columns of airport code.

In [4]:
print("NaN value exists?:", df.isnull().values.any())

NaN value exists?: True


In [5]:
print("NaN value exists?:")
for col in df.columns:
    print(col, df[col].isnull().values.any())

NaN value exists?:
MONTH False
DAY_OF_WEEK False
FL_DATE False
UNIQUE_CARRIER False
FL_NUM False
ORIGIN False
DEST False
CRS_DEP_TIME False
ARR_DEL15 True
CRS_ELAPSED_TIME True
DISTANCE False


Turned out there is NaN value in the data so I checked what columns are having NaN value. For different columns, I specify particular strategy for different imputer that I think is suitable for that type of value. Most frequent for ARR_DEL15 which is a boolean value that indicates whether or not the delay time was more than 15 minutes and mean for CRS_ELAPSED_TIME which is a value indicates how long the filght took.

In [6]:
from sklearn.preprocessing import Imputer
def get_imputed_data(option, data):
    print("-----------------------------------------")
    print("Imputer:", option)
    imputer = Imputer(missing_values="NaN", strategy=option, axis=0)
    data_imputed = imputer.fit_transform(data)
    return data_imputed

In [7]:
df[["ARR_DEL15"]] = get_imputed_data("most_frequent", df[["ARR_DEL15"]])
df[["CRS_ELAPSED_TIME"]] = get_imputed_data("mean", df[["CRS_ELAPSED_TIME"]])
print("NaN value exists?:", df.isnull().values.any())
df.head()

-----------------------------------------
Imputer: most_frequent
-----------------------------------------
Imputer: mean
NaN value exists?: False


Unnamed: 0,MONTH,DAY_OF_WEEK,FL_DATE,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE
0,2.0,6.0,2017-02-25,B6,28.0,MCO,EWR,1000.0,0.0,156.0,937.0
1,2.0,7.0,2017-02-26,B6,28.0,MCO,EWR,739.0,0.0,153.0,937.0
2,2.0,1.0,2017-02-27,B6,28.0,MCO,EWR,1028.0,0.0,158.0,937.0
3,2.0,2.0,2017-02-28,B6,28.0,MCO,EWR,739.0,0.0,153.0,937.0
4,2.0,3.0,2017-02-01,B6,33.0,BTV,JFK,1907.0,0.0,90.0,266.0


In [8]:
from sklearn import preprocessing
def encode_labels(labels):
    le = preprocessing.LabelEncoder()
    return le.fit_transform(labels)

In [9]:
df.dtypes

MONTH               float64
DAY_OF_WEEK         float64
FL_DATE              object
UNIQUE_CARRIER       object
FL_NUM              float64
ORIGIN               object
DEST                 object
CRS_DEP_TIME        float64
ARR_DEL15           float64
CRS_ELAPSED_TIME    float64
DISTANCE            float64
dtype: object

Do label encoding for all the varaibles that are string, so the model can recognize them and learn.

In [10]:
for col in df.columns:
    if col == "ARR_DEL15":
        continue
    if df[col].dtype != np.float64 and df[col].dtype != np.int64:
        print(col)
        df[col] = encode_labels(df[col])
df.head()

FL_DATE
UNIQUE_CARRIER
ORIGIN
DEST


Unnamed: 0,MONTH,DAY_OF_WEEK,FL_DATE,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,ARR_DEL15,CRS_ELAPSED_TIME,DISTANCE
0,2.0,6.0,330,2,28.0,188,98,1000.0,0.0,156.0,937.0
1,2.0,7.0,331,2,28.0,188,98,739.0,0.0,153.0,937.0
2,2.0,1.0,332,2,28.0,188,98,1028.0,0.0,158.0,937.0
3,2.0,2.0,333,2,28.0,188,98,739.0,0.0,153.0,937.0
4,2.0,3.0,306,2,33.0,48,158,1907.0,0.0,90.0,266.0


Splitting data to predited variable and predictor variables.

In [11]:
x = df.drop("ARR_DEL15", axis=1)
x.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,FL_DATE,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE
0,2.0,6.0,330,2,28.0,188,98,1000.0,156.0,937.0
1,2.0,7.0,331,2,28.0,188,98,739.0,153.0,937.0
2,2.0,1.0,332,2,28.0,188,98,1028.0,158.0,937.0
3,2.0,2.0,333,2,28.0,188,98,739.0,153.0,937.0
4,2.0,3.0,306,2,33.0,48,158,1907.0,90.0,266.0


In [12]:
y = df["ARR_DEL15"]
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: ARR_DEL15, dtype: float64

Since the data is pretty big, I decided to use Random Forest Classifier, it handles large amounts of data very well and usually has good performance on predicting, plus we can run other booster classifiers as well and compare their performance.

In [13]:
from sklearn.model_selection import cross_validate

For running 10-fold cross validation, although accuracy is an obvious way to examine the performance of a model, we should also take all the true negative, false positive prediction into consideration, so I also included precision, recall, and F1 to score display.

In [14]:
scoring_list = ["accuracy", "precision", "recall", "f1"]
model = RandomForestClassifier(n_estimators=10, max_depth=4)
RF = cross_validate(model, x, y, cv=5, verbose=3, return_train_score=False, scoring=scoring_list)
pd.DataFrame(RF)

[CV]  ................................................................


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   51.0s remaining:    0.0s


[CV]  , accuracy=0.8223803749395636, precision=0.0, recall=0.0, f1=0.0, total=  49.8s
[CV]  ................................................................
[CV]  , accuracy=0.8195562599976021, precision=0.09067796610169492, recall=0.0017616648556093384, f1=0.0034561839852708417, total=  50.0s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  , accuracy=0.8223811765806812, precision=0.0, recall=0.0, f1=0.0, total=  41.8s
[CV]  ................................................................


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[CV]  , accuracy=0.8223810034409818, precision=0.0, recall=0.0, f1=0.0, total=  45.8s
[CV]  ................................................................


  'precision', 'predicted', average, warn_for)


[CV]  , accuracy=0.8223810034409818, precision=0.0, recall=0.0, f1=0.0, total=  45.2s


  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.9min finished


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
0,45.924666,3.920647,0.82238,0.0,0.0,0.0
1,46.194027,3.766332,0.819556,0.003456,0.090678,0.001762
2,38.102208,3.71135,0.822381,0.0,0.0,0.0
3,42.04741,3.71712,0.822381,0.0,0.0,0.0
4,39.388852,5.820057,0.822381,0.0,0.0,0.0


From the result above, we can see that although the models do have accuracy around 80%, but the scores for precision, recall and F1 are nearly 0. which means it's actually predicting almost every result to a majority value. I figured it might because of the unbalanced percentage of variety of value in our predicted variable. At this point it is not good enough to "only" check accuracy, we should also check out the f1, precision, recall scores.

In [15]:
print("value percentages:")
y.value_counts(normalize=True)

value percentages:


0.0    0.822381
1.0    0.177619
Name: ARR_DEL15, dtype: float64

And it is, it has an unbalanced quantity of data among variety of values. Around 82% of the predited variable are output as 0.0, near 17% of them are 1.0.

So I think it would be better if I make a balanced sample of data that contains 50% of predited variable value output as 1.0 and another 50% as 0.0.

In [16]:
df_val1 = df[df["ARR_DEL15"] == 1]
print(len(df_val1))
df_val0 = df[df["ARR_DEL15"] == 0]
print(len(df_val0))
df_val0 = df[df["ARR_DEL15"] == 0].sample(n=len(df_val1))
new_df = pd.concat([df_val1, df_val0], ignore_index=True)

print("value percentages:")
new_df["ARR_DEL15"].value_counts(normalize=True)

911071
4218283
value percentages:


1.0    0.5
0.0    0.5
Name: ARR_DEL15, dtype: float64

Creating new predicted variable and predictor variables from new balanced data.

In [17]:
x = new_df.drop("ARR_DEL15", axis=1)
x.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,FL_DATE,UNIQUE_CARRIER,FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,CRS_ELAPSED_TIME,DISTANCE
0,2.0,1.0,311,2,33.0,48,158,1907.0,90.0,266.0
1,2.0,3.0,313,2,33.0,48,158,1907.0,90.0,266.0
2,2.0,6.0,316,2,33.0,48,158,1907.0,90.0,266.0
3,2.0,7.0,317,2,33.0,48,158,1907.0,90.0,266.0
4,2.0,1.0,318,2,33.0,48,158,1907.0,90.0,266.0


In [18]:
y = new_df["ARR_DEL15"]
y.head()

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: ARR_DEL15, dtype: float64

In [19]:
scoring_list = ["accuracy", "precision", "recall", "f1"]

In [21]:
model = RandomForestClassifier(n_estimators=10, max_depth=4, random_state=123)
newRF = cross_validate(model, x, y, cv=10, verbose=1, return_train_score=False, scoring=scoring_list)
pd.DataFrame(newRF)

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  4.7min finished


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
0,36.376876,0.706304,0.342407,0.213815,0.26579,0.178843
1,25.800208,0.810218,0.291218,0.021902,0.035324,0.015871
2,24.589,1.03283,0.36017,0.16445,0.236923,0.125929
3,33.028751,0.674332,0.272174,0.000407,0.00065,0.000296
4,24.142147,0.743547,0.286696,0.0,0.0,0.0
5,26.656772,0.66748,0.600744,0.645706,0.58035,0.72765
6,27.151543,0.678491,0.490742,0.475941,0.490187,0.4625
7,24.580758,0.778657,0.293095,0.0,0.0,0.0
8,24.180471,0.790387,0.328603,0.06746,0.110401,0.048569
9,25.319645,0.779656,0.275034,0.0,0.0,0.0


In [22]:
newRF["test_accuracy"].mean()

0.35408822329040152

With Random Forest Classifier, this time we got lower accuracy, around 35%, but way more reasonable f1 score on this model, which is a good sign, means the model is actually doing something instead of just predicting everything to a majority value!

In [23]:
model = XGBClassifier(n_estimators=10, max_depth=4, random_state=123)
XGB = cross_validate(model, x, y, cv=10, verbose=1, return_train_score=False, scoring=scoring_list)
pd.DataFrame(XGB)

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  4.4min finished


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
0,23.822193,0.53124,0.303173,0.005888,0.010268,0.004127
1,23.990018,0.572726,0.286767,0.006559,0.010803,0.004709
2,23.664566,0.518463,0.34668,0.140278,0.205063,0.1066
3,26.166525,0.6234,0.268432,0.005075,0.00793,0.003732
4,38.83358,0.695818,0.27285,0.037876,0.055958,0.028626
5,26.999167,0.568515,0.574039,0.596632,0.566581,0.63005
6,23.423625,0.595339,0.425203,0.334062,0.397014,0.288342
7,23.447046,0.587198,0.291459,0.0,0.0,0.0
8,22.800525,0.606922,0.315239,0.079757,0.12156,0.059348
9,24.325232,0.545231,0.266571,0.0,0.0,0.0


In [24]:
XGB["test_accuracy"].mean()

0.33504143131355757

In [25]:
model = AdaBoostClassifier(n_estimators=10, random_state=123)
ADA = cross_validate(model, x, y, cv=10, verbose=1, return_train_score=False, scoring=scoring_list)
pd.DataFrame(ADA)

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  4.9min finished


Unnamed: 0,fit_time,score_time,test_accuracy,test_f1,test_precision,test_recall
0,28.989543,1.035521,0.342725,0.142237,0.204666,0.108992
1,27.179566,0.927263,0.302694,0.003764,0.006588,0.002634
2,27.350706,1.134946,0.374285,0.257866,0.316811,0.217415
3,29.933923,1.084522,0.285521,0.0,0.0,0.0
4,27.8159,1.063581,0.26723,0.041562,0.060057,0.031776
5,28.670104,0.977034,0.612829,0.652709,0.591756,0.727661
6,27.95329,1.124462,0.476626,0.433303,0.472408,0.400178
7,28.403381,0.991027,0.480649,0.433108,0.476749,0.396786
8,27.376071,1.176955,0.335177,0.06702,0.112327,0.047757
9,28.254666,1.116507,0.25689,0.0,0.0,0.0


In [26]:
ADA["test_accuracy"].mean()

0.37346269471597415

After running boosters, we can see that Adabooster is working slightly better than XGB and Random Forest with 4% and 2% difference.

Grid search on Random Forest Classifier for different parameter input.

In [27]:
param_grid = { 
    'n_estimators': [10, 15],
    'max_depth': [4, 6],
    'random_state': [123]
#     'max_features': ['auto', 'sqrt', 'log2']
}

scoring_list = ["accuracy", "precision", "recall", "f1"]
grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=10, verbose=1, scoring=scoring_list, refit="accuracy")
grid.fit(x, y)
grid.best_params_

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 30.8min finished


{'max_depth': 4, 'n_estimators': 10, 'random_state': 123}

In [28]:
grid.best_score_

0.35408821046877798

In [34]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_accuracy,mean_test_f1,mean_test_precision,mean_test_recall,mean_train_accuracy,mean_train_f1,mean_train_precision,mean_train_recall,...,std_fit_time,std_score_time,std_test_accuracy,std_test_f1,std_test_precision,std_test_recall,std_train_accuracy,std_train_f1,std_train_precision,std_train_recall
0,24.290194,0.689917,0.354088,0.158968,0.171963,0.155966,0.613861,0.635883,0.602036,0.675153,...,2.061248,0.040272,0.102718,0.216458,0.205327,0.234644,0.00732,0.010179,0.011085,0.032024
1,36.162141,0.940275,0.351597,0.15929,0.170928,0.156083,0.614856,0.638945,0.601772,0.682238,...,2.198139,0.128361,0.100587,0.215322,0.206787,0.23061,0.007313,0.009021,0.010731,0.029658
2,33.215793,0.752863,0.32202,0.094116,0.117816,0.080467,0.626289,0.648128,0.612395,0.688789,...,3.051049,0.039777,0.059808,0.136461,0.152423,0.126287,0.007526,0.010186,0.007538,0.022456
3,52.18208,1.09785,0.309295,0.066227,0.092983,0.051621,0.627001,0.647843,0.613556,0.686719,...,4.138913,0.076194,0.036019,0.081745,0.110847,0.064907,0.006434,0.010545,0.00609,0.023961


Grid search on Adaboost classifier with different parameter as input.

In [37]:
param_grid = { 
    'n_estimators': [10, 15],
    'learning_rate': [0.1],
    'random_state': [123]
#     'max_features': ['auto', 'sqrt', 'log2']
}

scoring_list = ["accuracy", "precision", "recall", "f1"]
grid_ada = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=param_grid, cv=10, verbose=2, scoring=scoring_list, refit="accuracy")
grid_ada.fit(x, y)
grid_ada.best_params_

Fitting 10 folds for each of 2 candidates, totalling 20 fits
[CV] learning_rate=0.1, n_estimators=10, random_state=123 ............
[CV]  learning_rate=0.1, n_estimators=10, random_state=123, total=  35.6s
[CV] learning_rate=0.1, n_estimators=10, random_state=123 ............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   47.1s remaining:    0.0s


[CV]  learning_rate=0.1, n_estimators=10, random_state=123, total=  34.0s
[CV] learning_rate=0.1, n_estimators=10, random_state=123 ............
[CV]  learning_rate=0.1, n_estimators=10, random_state=123, total=  36.3s
[CV] learning_rate=0.1, n_estimators=10, random_state=123 ............
[CV]  learning_rate=0.1, n_estimators=10, random_state=123, total=  35.8s
[CV] learning_rate=0.1, n_estimators=10, random_state=123 ............
[CV]  learning_rate=0.1, n_estimators=10, random_state=123, total=  38.9s
[CV] learning_rate=0.1, n_estimators=10, random_state=123 ............
[CV]  learning_rate=0.1, n_estimators=10, random_state=123, total=  39.4s
[CV] learning_rate=0.1, n_estimators=10, random_state=123 ............
[CV]  learning_rate=0.1, n_estimators=10, random_state=123, total=  36.2s
[CV] learning_rate=0.1, n_estimators=10, random_state=123 ............
[CV]  learning_rate=0.1, n_estimators=10, random_state=123, total=  40.2s
[CV] learning_rate=0.1, n_estimators=10, random_state=12

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 19.6min finished


{'learning_rate': 0.1, 'n_estimators': 10, 'random_state': 123}

In [38]:
grid_ada.best_score_

0.45020201499114776

In [39]:
pd.DataFrame(grid_ada.cv_results_)

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_accuracy,mean_test_f1,mean_test_precision,mean_test_recall,mean_train_accuracy,mean_train_f1,mean_train_precision,mean_train_recall,...,std_fit_time,std_score_time,std_test_accuracy,std_test_f1,std_test_precision,std_test_recall,std_train_accuracy,std_train_f1,std_train_precision,std_train_recall
0,35.539234,1.380383,0.450202,0.365035,0.350401,0.387402,0.595452,0.62616,0.582432,0.677833,...,1.893622,0.236359,0.137503,0.27219,0.247584,0.304164,0.011421,0.008973,0.01279,0.022529
1,49.679783,1.665556,0.421846,0.308548,0.304869,0.320517,0.599124,0.62726,0.586593,0.675042,...,0.922485,0.083384,0.136014,0.268942,0.245204,0.29661,0.011786,0.010871,0.013565,0.026608


From the two grid search results above we can tell that the 45% accuracy on Adabooster with 10 trees as n_estimators and learning_rate as 0.1 is still the best model comparing to Random Forest and XGB classifier. On top of that, I did try to run SVM, although one of the disadvantage of it is that it natively sensitive to class inbalances, but I fixed it with making a balanced version sample of data, unfortunately my laptop hangs when I ran SVM, otherwise there would be more results to compare and discuss!