In [1]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn.pipeline                import Pipeline
from sklearn.model_selection         import train_test_split, GridSearchCV
from sklearn.linear_model            import LogisticRegression
from sklearn.ensemble                import BaggingClassifier,RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier

from sklearn.tree                    import DecisionTreeClassifier
from sklearn.svm                     import SVC

import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
data = pd.read_csv('~/ga/projects/capstone_data/data/data_model.csv')

In [3]:
data.shape

(1775112, 25)

In [4]:
data.columns

Index(['Unnamed: 0', 'month', 'day_of_month', 'day_of_week', 'fl_date',
       'op_carrier', 'op_carrier_fl_num', 'origin', 'origin_city_name',
       'origin_state_abr', 'origin_state_nm', 'dest', 'dest_city_name',
       'dest_state_abr', 'dest_state_nm', 'dep_delay', 'arr_delay', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'carrier', 'total_delay'],
      dtype='object')

In [5]:
data.drop(columns = ['Unnamed: 0'], inplace=True)

In [6]:
data.columns

Index(['month', 'day_of_month', 'day_of_week', 'fl_date', 'op_carrier',
       'op_carrier_fl_num', 'origin', 'origin_city_name', 'origin_state_abr',
       'origin_state_nm', 'dest', 'dest_city_name', 'dest_state_abr',
       'dest_state_nm', 'dep_delay', 'arr_delay', 'distance', 'carrier_delay',
       'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay',
       'carrier', 'total_delay'],
      dtype='object')

In [7]:
data['total_delay'].value_counts()

0.0       1452996
15.0        12506
16.0        11791
17.0        10953
18.0        10445
           ...   
1071.0          1
1076.0          1
1078.0          1
701.0           1
1497.0          1
Name: total_delay, Length: 1186, dtype: int64

In [8]:
data['delay_indicator'] = np.where(data['total_delay']>14,1,0)

In [9]:
data['delay_indicator'].value_counts(normalize=True)

0    0.818538
1    0.181462
Name: delay_indicator, dtype: float64

In [10]:
data_negative = data[data['delay_indicator'] == 1]

data_pozitive = data[data['delay_indicator'] == 0].sample(data_negative.shape[0])

data_balanced = pd.concat([data_pozitive,data_negative])
data_balanced.shape

(644232, 25)

In [11]:
data_balanced.head()

Unnamed: 0,month,day_of_month,day_of_week,fl_date,op_carrier,op_carrier_fl_num,origin,origin_city_name,origin_state_abr,origin_state_nm,...,arr_delay,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,carrier,total_delay,delay_indicator
37046,10,19,5,2018-10-19,OH,5171,BDL,"Hartford, CT",CT,Connecticut,...,-10.0,313.0,0.0,0.0,0.0,0.0,0.0,American,0.0,0
924838,11,12,1,2018-11-12,YX,3746,ABQ,"Albuquerque, NM",NM,New Mexico,...,-38.0,1118.0,0.0,0.0,0.0,0.0,0.0,American,0.0,0
1233720,12,22,6,2018-12-22,AS,1037,JFK,"New York, NY",NY,New York,...,1.0,2569.0,0.0,0.0,0.0,0.0,0.0,Alaska Airlines,0.0,0
1318397,12,29,6,2018-12-29,F9,1777,TTN,"Trenton, NJ",NJ,New Jersey,...,-29.0,896.0,0.0,0.0,0.0,0.0,0.0,Frontier Airlines,0.0,0
468910,10,19,5,2018-10-19,WN,2488,MDW,"Chicago, IL",IL,Illinois,...,-8.0,1855.0,0.0,0.0,0.0,0.0,0.0,SouthWest,0.0,0


In [12]:
X = data_balanced.drop(columns=['delay_indicator', 'total_delay','dep_delay','arr_delay'])
y = data_balanced['delay_indicator']

In [13]:
X.shape

(644232, 21)

In [14]:
X.dtypes

month                    int64
day_of_month             int64
day_of_week              int64
fl_date                 object
op_carrier              object
op_carrier_fl_num        int64
origin                  object
origin_city_name        object
origin_state_abr        object
origin_state_nm         object
dest                    object
dest_city_name          object
dest_state_abr          object
dest_state_nm           object
distance               float64
carrier_delay          float64
weather_delay          float64
nas_delay              float64
security_delay         float64
late_aircraft_delay    float64
carrier                 object
dtype: object

In [15]:
y.shape

(644232,)

In [16]:
y.mean()

0.5

In [17]:
X = pd.get_dummies(X,columns = ['origin','dest','carrier'],drop_first=True )

In [18]:
X.head()

Unnamed: 0,month,day_of_month,day_of_week,fl_date,op_carrier,op_carrier_fl_num,origin_city_name,origin_state_abr,origin_state_nm,dest_city_name,...,dest_YUM,carrier_Allegiant Air,carrier_American,carrier_Delta,carrier_Frontier Airlines,carrier_Hawaiian Airlines,carrier_JetBlue,carrier_SouthWest,carrier_Spirit Airlines,carrier_United
37046,10,19,5,2018-10-19,OH,5171,"Hartford, CT",CT,Connecticut,"Washington, DC",...,0,0,1,0,0,0,0,0,0,0
924838,11,12,1,2018-11-12,YX,3746,"Albuquerque, NM",NM,New Mexico,"Chicago, IL",...,0,0,1,0,0,0,0,0,0,0
1233720,12,22,6,2018-12-22,AS,1037,"New York, NY",NY,New York,"San Jose, CA",...,0,0,0,0,0,0,0,0,0,0
1318397,12,29,6,2018-12-29,F9,1777,"Trenton, NJ",NJ,New Jersey,"Orlando, FL",...,0,0,0,0,1,0,0,0,0,0
468910,10,19,5,2018-10-19,WN,2488,"Chicago, IL",IL,Illinois,"San Francisco, CA",...,0,0,0,0,0,0,0,1,0,0


In [19]:
X.shape

(644232, 725)

In [20]:
X = X._get_numeric_data()
X.shape

(644232, 717)

## Logistic Regression##

In [21]:
lr = LogisticRegression()

In [22]:
X.shape

(644232, 717)

In [23]:
y.shape

(644232,)

In [24]:
y.value_counts(normalize=True)

1    0.5
0    0.5
Name: delay_indicator, dtype: float64

In [25]:
#Training and testing sets split with stratification 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [26]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
lr.score(X_train,y_train)

0.9859160468071544

In [28]:
lr.score(X_test, y_test)

0.9859553701151138

In [29]:
y_hat = lr.predict(X_test)

In [30]:
from sklearn.metrics import mean_squared_error as mse

In [31]:
mse(y_hat, y_test)

0.01404462988488619

In [32]:
coefs = lr.coef_

In [33]:
pd.DataFrame(coefs)

array([[ 4.53900158e-02,  1.13983971e-02,  5.10471916e-03,
         5.61382791e-05,  7.35927237e-06,  1.03386056e+01,
         9.00803913e+00,  1.15060886e+01,  6.40630716e+00,
         1.15665552e+01,  1.02851946e-02,  7.26563105e-02,
        -1.09752354e-02, -3.41907109e-02, -1.43647751e-02,
        -1.74924927e-02, -3.71377849e-02, -8.64581574e-02,
         8.63186660e-02, -6.71744112e-03, -3.73448629e-04,
        -5.47855729e-03,  1.60549050e-01,  1.46413438e-02,
        -1.32052933e-01, -3.55315233e-01, -1.82933952e-02,
         1.18863083e-03,  3.91026285e-04,  4.24555146e-01,
        -4.58086782e-02, -2.84825501e-03, -1.63270419e-01,
        -2.23362021e-02,  1.84098073e-01, -1.77377011e-02,
        -2.46337307e-02,  2.14398149e-02, -2.73666590e-02,
        -9.31437333e-02,  2.62187670e-02,  1.50656070e-02,
        -1.54286439e-01, -6.40503725e-02, -3.98931548e-02,
        -1.30449638e-02, -9.49380753e-03, -3.44139949e-02,
        -2.84624920e-03,  9.11158310e-02, -3.27692487e-0