In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Location of the cleaned flight data, relative to this notebook.
CLEANED_DATA_PATH = '../cleaned_data.csv'
data = pd.read_csv(CLEANED_DATA_PATH)

**Details of a few columns:**

Col Name | Description
------------- | -------------
DepTime  | Actual departure time
DepDel15  | 1 if the departure delay is $\geq$ 15 minutes, else 0
CRSDepTime | Scheduled Departure Time
DepDelayMinutes | Difference between scheduled and actual departure time in minutes
ArrTime | Actual Arrival time
CRSArrTime | Scheduled Arrival Time
ArrDel15 (label) | 1 if the arrival delay is $\geq$ 15 minutes, else 0
ArrDelayMinutes (target) | Difference between scheduled and actual arrival time in minutes

In [3]:
# Columns kept for the arrival-delay regression.
# 'ArrDelayMinutes' must stay last: the train/test split cell takes the last
# column as the target and every other column as a feature.
columns_for_regression = [
    'Origin',
    'Dest',
    'DepDelayMinutes',
    'ArrDel15',
    'ArrDelayMinutes',
]

In [4]:
# Restrict the raw frame to the regression columns, in the order listed.
Df = data.loc[:, columns_for_regression]

In [5]:
# Keep only flights flagged as arriving late (ArrDel15 == 1); the regression
# is trained on delayed arrivals only.
arrived_late = Df['ArrDel15'] == 1
Df = Df[arrived_late]
Df.head()

Unnamed: 0,Origin,Dest,DepDelayMinutes,ArrDel15,ArrDelayMinutes
13,SFO,IAH,639.0,1.0,645.0
22,LAX,ORD,7.0,1.0,18.0
32,DEN,LAS,79.0,1.0,67.0
40,MIA,IAH,99.0,1.0,87.0
43,PHX,LAX,0.0,1.0,32.0


In [6]:
# Feature groups consumed by the ColumnTransformer:
# numeric columns are standardised, categorical columns are one-hot encoded.
numerical_r, categorical_r = ['DepDelayMinutes'], ['Origin', 'Dest']

In [7]:
# Preprocessing and model pipeline for the arrival-delay regression.

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current keyword first and fall back to
# the legacy one, so this cell runs on both old and new releases.
try:
    onehot_encoder_r = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot_encoder_r = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical columns: dense one-hot encoding; categories unseen during fit
# are ignored at transform time (handle_unknown="ignore").
categorical_pipeline_r = Pipeline(
    steps=[
        ("oh-encode", onehot_encoder_r)
    ]
)

# Numerical columns: standard scaling.
numerical_pipeline_r = Pipeline(
    steps=[
        ("scale", StandardScaler())
    ]
)

# Route each column group to its sub-pipeline. Columns listed in neither
# numerical_r nor categorical_r are appended unchanged (remainder='passthrough').
full_pipeline_r = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline_r, numerical_r),
        ("categorical", categorical_pipeline_r, categorical_r),
    ],
    remainder='passthrough'
)

# End-to-end estimator: preprocessing followed by an XGBoost regressor.
model_pipeline_r = Pipeline(
    steps=[
        ('preprocess', full_pipeline_r),
        ('xgb', XGBRegressor())
    ]
)

In [8]:
# Features are every column except the last; the target is the last column
# (ArrDelayMinutes, given the order of columns_for_regression).
# NOTE(review): the feature frame still contains ArrDel15, which is constant
# (== 1) after the earlier filter and reaches the model via the transformer's
# passthrough remainder.
features_r = Df.iloc[:, :-1]
target_r = Df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features_r,
    target_r,
    train_size=0.75,
    random_state=0,
)


In [9]:
# Fit the preprocessing on the training features only and materialise the
# transformed training matrix.
X_train_processed = full_pipeline_r.fit_transform(X_train)

# Alternative hyper-parameter set, left disabled:
# xgb = XGBRegressor(n_estimators=300, colsample_bytree= 0.8, gamma= 0, learning_rate= 0.3, max_depth= 9, reg_lambda= 10, scale_pos_weight= 1, subsample= 0.8)
xgb = XGBRegressor()

# Show the first transformed row as a sanity check.
X_train_processed[0]

array([-0.77310031,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  1.        ])

In [10]:
%%time
model = xgb.fit(X_train_processed ,y_train)

CPU times: user 52.2 s, sys: 9.5 s, total: 1min 1s
Wall time: 9.3 s


In [11]:
# Apply the already-fitted preprocessing to the test features (transform
# only, no refit), then predict arrival delay for every test row.
X_test_processed = full_pipeline_r.transform(X_test)
prediction = model.predict(X_test_processed)

In [12]:
# Predictions and targets should have the same length.
(prediction.shape, y_test.shape)

((97014,), (97014,))

In [13]:
# First five predictions next to the first five true arrival delays.
prediction[:5], y_test.iloc[:5].to_numpy()

(array([ 50.44641 , 165.17227 ,  26.030409,  32.252728,  59.181774],
       dtype=float32),
 array([ 61., 157.,  24.,  30.,  51.]))

In [14]:
# Coefficient of determination on the full test set.
r2_score(y_true=y_test, y_pred=prediction)

0.9438303927954105

In [15]:
# Mean squared error on the full test set.
mean_squared_error(y_true=y_test, y_pred=prediction)

291.32230889383163

In [16]:
# Mean absolute error on the full test set.
mean_absolute_error(y_true=y_test, y_pred=prediction)

11.802674231567671

## Regression analysis


In [58]:
# Recreate the split with the same arguments (including random_state=0) so
# X_test / y_test start from the unfiltered test set before the filtering
# and binning done in the following cells.
X_train, X_test, y_train, y_test = train_test_split(
    Df.iloc[:, :-1],   # features: all columns but the last
    Df.iloc[:, -1],    # target: last column
    train_size=0.75,
    random_state=0,
)


In [59]:
# Keep only test flights whose departure was delayed by more than 15 minutes.
# .copy() makes the result an independent frame, so the later column
# assignment (X_test['Range'] = ...) writes to this frame instead of to a
# slice of the split output. Without it pandas emits a SettingWithCopyWarning,
# which the blanket warnings filter at the top of the notebook hides.
# y_test is not filtered here; the per-range loop picks the matching targets
# by index label.
X_test = X_test.loc[X_test['DepDelayMinutes'] > 15].copy()

In [60]:
X_test.shape

(72164, 4)

In [61]:
# Bucket departure delay into right-closed intervals:
# (15, 100], (100, 200], (200, 500], (500, 1000], (1000, 2000].
# Values outside the outer edges get NaN from pd.cut.
delay_bin_edges = [15, 100, 200, 500, 1000, 2000]
X_test['Range'] = pd.cut(X_test['DepDelayMinutes'], bins=delay_bin_edges)

In [62]:

# Display the filtered test frame with its new 'Range' column
# (pandas elides the middle rows in the rendered output).
X_test

Unnamed: 0,Origin,Dest,DepDelayMinutes,ArrDel15,Range
641339,ORD,SFO,52.0,1.0,"(15, 100]"
380259,CLT,LAX,176.0,1.0,"(100, 200]"
1401916,MCO,ATL,34.0,1.0,"(15, 100]"
185114,DFW,ATL,63.0,1.0,"(15, 100]"
1304715,ORD,ATL,47.0,1.0,"(15, 100]"
...,...,...,...,...,...
1571945,JFK,PHX,64.0,1.0,"(15, 100]"
963254,ORD,DFW,23.0,1.0,"(15, 100]"
596254,SFO,EWR,55.0,1.0,"(15, 100]"
1151708,SFO,EWR,62.0,1.0,"(15, 100]"


In [82]:
# Per-range error analysis: MSE / MAE of the fitted model within each
# departure-delay bucket of the test set.
#
# Iterate over the bin categories themselves rather than
# `X_test['Range'].unique()[i] for i in range(5)`:
#   * categories are in bin order, whereas unique() is in first-appearance order;
#   * no IndexError when fewer than five ranges are present;
#   * NaN (delays outside the bins) is never selected as a "range";
#   * the list is computed once instead of twice per iteration.
for delay_range in X_test['Range'].cat.categories:
    x_temp = X_test[X_test['Range'] == delay_range]
    if x_temp.empty:
        # The metric functions cannot score an empty subset.
        continue

    # Align targets by index label; y_test still covers the full test split.
    y_temp = y_test.loc[x_temp.index]

    # Drop the helper column by name (it is not a model feature) and use
    # loop-local names, so the full-test `X_test_processed` / `prediction`
    # from the earlier cells are not overwritten.
    range_features = full_pipeline_r.transform(x_temp.drop(columns='Range'))
    range_prediction = model.predict(range_features)

    print('Range: ', delay_range, end='\t')
    print('Count: ', len(x_temp), end='\t')
    print(f' MSE: {mean_squared_error(y_pred=range_prediction, y_true=y_temp)}', end='\t')
    print(f' MAE: {mean_absolute_error(y_pred=range_prediction, y_true=y_temp)}')

Range:  (15, 100]	Count:  56452	 MSE: 280.73067020070823	 MAE: 11.669831591882991
Range:  (100, 200]	Count:  11764	 MSE: 403.8431759699066	 MAE: 14.197253291763364
Range:  (200, 500]	Count:  3598	 MSE: 464.7078372836462	 MAE: 15.477036553001723
Range:  (500, 1000]	Count:  311	 MSE: 650.1326963854744	 MAE: 18.015952941305766
Range:  (1000, 2000]	Count:  39	 MSE: 565.1307494675693	 MAE: 17.967593462039265


In [64]:
# Rows that fall in the same delay range as the first row of X_test.
first_row_range = X_test['Range'].iloc[0]
X_test[X_test['Range'] == first_row_range]

Unnamed: 0,Origin,Dest,DepDelayMinutes,ArrDel15,Range
641339,ORD,SFO,52.0,1.0,"(15, 100]"
1401916,MCO,ATL,34.0,1.0,"(15, 100]"
185114,DFW,ATL,63.0,1.0,"(15, 100]"
1304715,ORD,ATL,47.0,1.0,"(15, 100]"
1757143,ATL,DEN,38.0,1.0,"(15, 100]"
...,...,...,...,...,...
1571945,JFK,PHX,64.0,1.0,"(15, 100]"
963254,ORD,DFW,23.0,1.0,"(15, 100]"
596254,SFO,EWR,55.0,1.0,"(15, 100]"
1151708,SFO,EWR,62.0,1.0,"(15, 100]"
