In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('../cleaned_data.csv')

**Details of few columns:**

Col Name | Description
------------- | -------------
DepTime  | Actual departure time
DepDel15  | If departure delay is $\geq$ 15 then 1 else 0
CRSDepTime | Scheduled Departure Time
DepDelayMinutes | Difference between scheduled and actual departure time in minutes
ArrTime | Actual Arrival time
CRSArrTime | Scheduled Arrival Time
ArrDel15 (label) | If arrival delay is $\geq$ 15 then 1 else 0
ArrDelayMinutes (target) | Difference between scheduled and actual arrival time in minutes

In [3]:
columns_for_regression = ['Origin', 'Dest' , 'DepDelayMinutes' , 'ArrDel15', 'ArrDelayMinutes']

In [4]:
Df = data[columns_for_regression]

In [5]:
Df = Df[Df.ArrDel15 == 1]
Df.head()

Unnamed: 0,Origin,Dest,DepDelayMinutes,ArrDel15,ArrDelayMinutes
13,SFO,IAH,639.0,1.0,645.0
22,LAX,ORD,7.0,1.0,18.0
32,DEN,LAS,79.0,1.0,67.0
40,MIA,IAH,99.0,1.0,87.0
43,PHX,LAX,0.0,1.0,32.0


In [6]:
numerical_r = ['DepDelayMinutes']
categorical_r = ['Origin', 'Dest']

In [7]:
categorical_pipeline_r = Pipeline(
    steps=[
        ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ]
)
numerical_pipeline_r = Pipeline(
    steps=[
        ("scale", StandardScaler())
    ]
)

full_pipeline_r = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline_r, numerical_r),
        ("categorical", categorical_pipeline_r, categorical_r),
    ],
    remainder='passthrough'
)

model_pipeline_r = Pipeline(
    steps=[
        ('preprocess',  full_pipeline_r),
        ('xgb', RandomForestRegressor() )
    ]
)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(Df.iloc[:, :-1], Df.iloc[:, -1], train_size=0.75,  random_state=0)


In [9]:
# xgb = XGBRegressor(n_estimators=300, colsample_bytree= 0.8, gamma= 0, learning_rate= 0.3, max_depth= 9, reg_lambda= 10, scale_pos_weight= 1, subsample= 0.8)
rf = RandomForestRegressor()
X_train_processed = full_pipeline_r.fit_transform(X_train)
X_train_processed[0]

array([-0.77310031,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  1.        ])

In [10]:
%%time
model = rf.fit(X_train_processed ,y_train)

CPU times: user 55.8 s, sys: 294 ms, total: 56.1 s
Wall time: 56.3 s


In [11]:
X_test_processed = full_pipeline_r.transform(X_test)
prediction = model.predict(X_test_processed)

In [12]:
prediction.shape, y_test.shape

((97014,), (97014,))

In [13]:
prediction[:5], y_test[:5].values

(array([ 51.62450848, 182.10308333,  25.86473789,  33.17190674,
         52.43376984]),
 array([ 61., 157.,  24.,  30.,  51.]))

In [14]:
r2_score(y_pred=prediction, y_true=y_test)

0.9366199408663023

In [15]:
mean_squared_error(y_pred=prediction, y_true=y_test)

328.7191433866348

In [16]:
mean_absolute_error(y_pred=prediction, y_true=y_test)

12.565337573417274