In [281]:
import pandas as pd
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

In [136]:
data = pd.read_csv('cleaned_data.csv')

In [137]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1851422 entries, 0 to 1851421
Data columns (total 36 columns):
 #   Column             Dtype  
---  ------             -----  
 0   FlightDate         object 
 1   Quarter            int64  
 2   Year               int64  
 3   Month              int64  
 4   DayofMonth         int64  
 5   DepTime            float64
 6   DepDel15           float64
 7   CRSDepTime         int64  
 8   DepDelayMinutes    float64
 9   OriginAirportID    int64  
 10  DestAirportID      int64  
 11  ArrTime            float64
 12  CRSArrTime         int64  
 13  ArrDelayMinutes    float64
 14  Origin             object 
 15  Dest               object 
 16  RoundedFlightDate  object 
 17  DepatHr            int64  
 18  ArrDel15           float64
 19  Delayed            int64  
 20  date               object 
 21  airport            object 
 22  windspeedKmph      int64  
 23  winddirDegree      int64  
 24  weatherCode        int64  
 25  precipMM          

## Classification

In [138]:
label_encoder = LabelEncoder()
data['Encoded_Origin_Airport'] = label_encoder.fit_transform(data['Origin'])
data['Encoded_Dest_Airport'] = label_encoder.fit_transform(data['Dest'])

In [139]:
Df = data[[ 'Quarter', 'Month', 'DayofMonth',
       'CRSDepTime', 'CRSArrTime', 
       'windspeedKmph', 'winddirDegree', 'weatherCode', 'precipMM',
       'visibility', 'pressure', 'cloudcover', 'DewPointF', 'WindGustKmph',
       'tempF', 'WindChillF', 'humidity', 'Encoded_Origin_Airport', 'Encoded_Dest_Airport' ,'DepDel15']]


In [140]:
X = Df.iloc[:, :-1]
y = Df.iloc[:, -1]

In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

In [142]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
smote = SMOTE(random_state = 42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [143]:
cls = ExtraTreesClassifier(random_state=42)

In [144]:
cls.fit(X_train_smote.values, y_train_smote.values)

In [145]:
prediction = cls.predict(X.values)

In [146]:
index = np.where(prediction==1)

In [147]:
X_new = X.iloc[np.where(prediction==1)]
y_new = y.iloc[np.where(prediction==1)]

## Regression Analysis

In [148]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [149]:
numerical_r = ['DepDelayMinutes']
categorical_r = ['Origin', 'Dest']

In [150]:
categorical_pipeline_r = Pipeline(
    steps=[
        ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ]
)
numerical_pipeline_r = Pipeline(
    steps=[
        ("scale", StandardScaler())
    ]
)

full_pipeline_r = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline_r, numerical_r),
        ("categorical", categorical_pipeline_r, categorical_r),
    ],
    remainder='passthrough'
)

model_pipeline_r = Pipeline(
    steps=[
        ('preprocess',  full_pipeline_r),
        ('xgb', XGBRegressor() )
    ]
)

In [201]:
columns_for_regression = ['Origin', 'Dest' ,  'ArrDel15', 'DepDelayMinutes' , 'ArrDelayMinutes']
Df = data[columns_for_regression]

In [202]:
Df = Df[Df.ArrDel15 == 1]
Df.head()

Unnamed: 0,Origin,Dest,ArrDel15,DepDelayMinutes,ArrDelayMinutes
13,SFO,IAH,1.0,639.0,645.0
22,LAX,ORD,1.0,7.0,18.0
32,DEN,LAS,1.0,79.0,67.0
40,MIA,IAH,1.0,99.0,87.0
43,PHX,LAX,1.0,0.0,32.0


In [204]:
Df.drop(['ArrDel15'], inplace=True, axis=1)

In [205]:
X_train, X_test, y_train, y_test = train_test_split(Df.iloc[:, :-1], Df.iloc[:, -1], train_size=0.75,  random_state=0)

In [206]:
# xgb = XGBRegressor(n_estimators=300, colsample_bytree= 0.8, gamma= 0, learning_rate= 0.3, max_depth= 9, reg_lambda= 10, scale_pos_weight= 1, subsample= 0.8)
xgb = XGBRegressor()
X_train_processed = full_pipeline_r.fit_transform(X_train)
X_train_processed[0]

array([-0.77310031,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ])

In [207]:
%%time
model = xgb.fit(X_train_processed ,y_train)

CPU times: user 50.8 s, sys: 9.58 s, total: 1min
Wall time: 9.06 s


In [208]:
columns_for_regression_analysis = ['Origin', 'Dest' ,   'DepDelayMinutes' , 'ArrDelayMinutes']
Df = data[columns_for_regression_analysis]

In [209]:
index

(array([      0,       6,      13, ..., 1851417, 1851419, 1851420]),)

In [210]:
Df = Df.iloc[index[0]]

In [211]:
Df

Unnamed: 0,Origin,Dest,DepDelayMinutes,ArrDelayMinutes
0,LAS,MIA,19.0,7.0
6,LAX,EWR,26.0,9.0
13,SFO,IAH,639.0,645.0
19,LAX,IAH,17.0,1.0
23,LAX,MIA,0.0,0.0
...,...,...,...,...
1851415,MCO,EWR,25.0,17.0
1851416,JFK,MCO,41.0,51.0
1851417,JFK,LAX,82.0,64.0
1851419,DEN,SFO,75.0,65.0


In [212]:
X = Df.iloc[:, :-1]
y =  Df.iloc[:, -1]

In [213]:
X['Range'] = pd.cut(X['DepDelayMinutes'], bins=[15,100,200,500, 1000, 2000])

In [214]:
X.head()

Unnamed: 0,Origin,Dest,DepDelayMinutes,Range
0,LAS,MIA,19.0,"(15.0, 100.0]"
6,LAX,EWR,26.0,"(15.0, 100.0]"
13,SFO,IAH,639.0,"(500.0, 1000.0]"
19,LAX,IAH,17.0,"(15.0, 100.0]"
23,LAX,MIA,0.0,


In [215]:
X.Range.unique()

[(15.0, 100.0], (500.0, 1000.0], NaN, (100.0, 200.0], (200.0, 500.0], (1000.0, 2000.0]]
Categories (5, interval[int64, right]): [(15, 100] < (100, 200] < (200, 500] < (500, 1000] < (1000, 2000]]

In [233]:
len(X[X['Range'] ==  X['Range'].unique()[5]]['Origin'].unique())

15

In [234]:
len(X[X['Range'] ==  X['Range'].unique()[5]]['Dest'].unique())

14

In [244]:
X.head()

Unnamed: 0,Origin,Dest,DepDelayMinutes,Range
0,LAS,MIA,19.0,"(15.0, 100.0]"
6,LAX,EWR,26.0,"(15.0, 100.0]"
13,SFO,IAH,639.0,"(500.0, 1000.0]"
19,LAX,IAH,17.0,"(15.0, 100.0]"
23,LAX,MIA,0.0,


In [256]:
ranges = X['Range'].unique()
X_processed = full_pipeline_r.fit_transform(X) 
type(X_processed)

numpy.ndarray

In [259]:
index = np.where(X_processed[:, -1] ==  ranges[0])
index

(array([     0,      1,      3, ..., 347703, 347704, 347705]),)

In [282]:
for i in range(6):
    index = np.where(X_processed[:, -1] ==  ranges[i])
    x_temp = X_processed[index]

    if len(x_temp) == 0:
        continue
    y_temp = y.iloc[index[0]]
    
    prediction = model.predict(x_temp[:,:-1])

    print('Range: ',  ranges[i], end= '\t')
    print('Count: ', len(x_temp), end='\t')
    print(f' MSE: {mean_squared_error(y_pred=prediction, y_true=y_temp)}', end = '\t')
    print(f' MAE: {mean_absolute_error(y_pred=prediction, y_true=y_temp)}')      

Range:  (15, 100]	Count:  249204	 MSE: 350.8432709867058	 MAE: 14.030095398011795
Range:  (500, 1000]	Count:  1084	 MSE: 346.1653668186026	 MAE: 13.150061969827462
Range:  (100, 200]	Count:  44264	 MSE: 407.81421662092	 MAE: 13.829298218658684
Range:  (200, 500]	Count:  13510	 MSE: 469.84052581710745	 MAE: 15.059065613651347
Range:  (1000, 2000]	Count:  142	 MSE: 351.16442034739845	 MAE: 13.845073861135564
