In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, classification_report

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('../cleaned_data.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1851422 entries, 0 to 1851421
Data columns (total 36 columns):
 #   Column             Dtype  
---  ------             -----  
 0   FlightDate         object 
 1   Quarter            int64  
 2   Year               int64  
 3   Month              int64  
 4   DayofMonth         int64  
 5   DepTime            float64
 6   DepDel15           float64
 7   CRSDepTime         int64  
 8   DepDelayMinutes    float64
 9   OriginAirportID    int64  
 10  DestAirportID      int64  
 11  ArrTime            float64
 12  CRSArrTime         int64  
 13  ArrDelayMinutes    float64
 14  Origin             object 
 15  Dest               object 
 16  RoundedFlightDate  object 
 17  DepatHr            int64  
 18  ArrDel15           float64
 19  Delayed            int64  
 20  date               object 
 21  airport            object 
 22  windspeedKmph      int64  
 23  winddirDegree      int64  
 24  weatherCode        int64  
 25  precipMM          

In [4]:
data = pd.concat([data, pd.get_dummies(data['Origin'],prefix='Origin')], axis=1)

In [5]:
data = pd.concat([data, pd.get_dummies(data['Dest'],prefix='Destination')], axis=1)

In [6]:
data.columns

Index(['FlightDate', 'Quarter', 'Year', 'Month', 'DayofMonth', 'DepTime',
       'DepDel15', 'CRSDepTime', 'DepDelayMinutes', 'OriginAirportID',
       'DestAirportID', 'ArrTime', 'CRSArrTime', 'ArrDelayMinutes', 'Origin',
       'Dest', 'RoundedFlightDate', 'DepatHr', 'ArrDel15', 'Delayed', 'date',
       'airport', 'windspeedKmph', 'winddirDegree', 'weatherCode', 'precipMM',
       'visibility', 'pressure', 'cloudcover', 'DewPointF', 'WindGustKmph',
       'tempF', 'WindChillF', 'humidity', 'time', 'timeInHr', 'Origin_ATL',
       'Origin_CLT', 'Origin_DEN', 'Origin_DFW', 'Origin_EWR', 'Origin_IAH',
       'Origin_JFK', 'Origin_LAS', 'Origin_LAX', 'Origin_MCO', 'Origin_MIA',
       'Origin_ORD', 'Origin_PHX', 'Origin_SEA', 'Origin_SFO',
       'Destination_ATL', 'Destination_CLT', 'Destination_DEN',
       'Destination_DFW', 'Destination_EWR', 'Destination_IAH',
       'Destination_JFK', 'Destination_LAS', 'Destination_LAX',
       'Destination_MCO', 'Destination_MIA', 'Destination_O

In [7]:
Df = data[[ 'Quarter', 'Month', 'DayofMonth',
       'CRSDepTime', 'CRSArrTime', 
       'windspeedKmph', 'winddirDegree', 'weatherCode', 'precipMM',
       'visibility', 'pressure', 'cloudcover', 'DewPointF', 'WindGustKmph',
       'tempF', 'WindChillF', 'humidity',  'Origin_ATL',
       'Origin_CLT', 'Origin_DEN', 'Origin_DFW', 'Origin_EWR', 'Origin_IAH',
       'Origin_JFK', 'Origin_LAS', 'Origin_LAX', 'Origin_MCO', 'Origin_MIA',
       'Origin_ORD', 'Origin_PHX', 'Origin_SEA', 'Origin_SFO',
       'Destination_ATL', 'Destination_CLT', 'Destination_DEN',
       'Destination_DFW', 'Destination_EWR', 'Destination_IAH',
       'Destination_JFK', 'Destination_LAS', 'Destination_LAX',
       'Destination_MCO', 'Destination_MIA', 'Destination_ORD',
       'Destination_PHX', 'Destination_SEA', 'Destination_SFO',  'DepDel15']]


In [8]:
Df.head()

Unnamed: 0,Quarter,Month,DayofMonth,CRSDepTime,CRSArrTime,windspeedKmph,winddirDegree,weatherCode,precipMM,visibility,...,Destination_JFK,Destination_LAS,Destination_LAX,Destination_MCO,Destination_MIA,Destination_ORD,Destination_PHX,Destination_SEA,Destination_SFO,DepDel15
0,1,1,1,2347,714,15,123,113,0.0,10,...,0,0,0,0,1,0,0,0,0,1.0
1,1,1,1,20,705,15,123,113,0.0,10,...,0,0,0,0,0,0,0,0,0,0.0
2,1,1,1,2359,603,15,38,113,0.0,10,...,0,0,0,0,0,1,0,0,0,0.0
3,1,1,1,30,823,15,38,113,0.0,10,...,0,0,0,0,1,0,0,0,0,0.0
4,1,1,1,10,501,15,38,113,0.0,10,...,0,0,0,0,0,0,0,0,0,0.0


In [9]:
X = Df.iloc[:, :-1]
y =  Df.iloc[:, -1]

In [10]:
# from imblearn.over_sampling import RandomOverSampler
# ros = RandomOverSampler(random_state=17)
# X, y = ros.fit_resample(X, y)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75, random_state=0)

In [12]:
X_train.shape, y_train.shape

((1388566, 47), (1388566,))

In [13]:
X_test.shape, y_test.shape

((462856, 47), (462856,))

In [14]:
y_train.value_counts()

0.0    1109362
1.0     279204
Name: DepDel15, dtype: int64

In [15]:
# Negative to positive ratio
ratio = 1109510/279056
ratio

3.9759403130554443

In [17]:
xgb = XGBClassifier( n_estimators=300)

In [18]:
%%time
model = xgb.fit(X_train.values ,y_train.values)

CPU times: user 20min 37s, sys: 1min 8s, total: 21min 45s
Wall time: 3min 38s


In [19]:
prediction = model.predict(X_test)

In [20]:
accuracy_score(prediction, y_test.values )

0.8186649843579861

In [21]:
f1_score(prediction,y_test.values )

0.26791570720815017

In [22]:
print(classification_report(prediction, y_test))

              precision    recall  f1-score   support

           0       0.98      0.82      0.90    441079
           1       0.17      0.71      0.27     21777

    accuracy                           0.82    462856
   macro avg       0.57      0.76      0.58    462856
weighted avg       0.94      0.82      0.87    462856



In [23]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
ros = RandomOverSampler(random_state=42)
smote = SMOTE(random_state = 42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [24]:
# xgb = XGBClassifier( n_estimators=400, colsample_bytree= 0.8, gamma= 0,learning_rate= 0.3, max_depth= 10,reg_lambda= 15,scale_pos_weight= 3.77, subsample= 0.8)
xgb = XGBClassifier( n_estimators=300)#, gamma= 0,learning_rate= 0.7, max_depth= 20,scale_pos_weight= 11)


In [25]:
# sc = StandardScaler()
# X_train_smote = sc.fit_transform(X_train_smote)
# X_train_smote[0]

In [26]:
%%time
model = xgb.fit(X_train_smote ,y_train_smote)
# model = xgb.fit(X_train.values ,y_train.values)

CPU times: user 32min 21s, sys: 1min 25s, total: 33min 46s
Wall time: 5min 17s


In [27]:
# X_test = sc.transform(X_test)
# X_test[0]

In [28]:
prediction = model.predict(X_test)
prediction[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [29]:
y_test.values[:100]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0.,
       1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.])

In [30]:
accuracy_score(prediction, y_test.values )

0.7965673989318492

In [31]:
f1_score(prediction,y_test.values )

0.34166736583046675

In [32]:
print(classification_report(prediction, y_test))

              precision    recall  f1-score   support

           0       0.93      0.83      0.88    412699
           1       0.26      0.49      0.34     50157

    accuracy                           0.80    462856
   macro avg       0.60      0.66      0.61    462856
weighted avg       0.86      0.80      0.82    462856

