# Delay estimation

## 1. Load the data

In [1]:
import numpy as np
import pandas as pd
import datetime as datetime
import os
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')
%matplotlib inline
import statsmodels.api as sm
import statsmodels.formula.api as smf
# from IPython.core.interactiveshell import InteractiveShell # show all results of commands
# InteractiveShell.ast_node_interactivity = "all"


In [2]:
path= 'F:\\Dropbox\\Flight\\DATA\\cleaned_data' #for load the data
df= pd.read_csv(path+'\\prepared_data.csv', parse_dates=['time','S_Dep_time','S_Arr_time'])
#parse data help the conversion string to datetime dtype.

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
print(df['Diverted'].value_counts())

print(df['Cancelled'].value_counts())
df=df[df['Cancelled']==0]


0.0    842990
1.0      1734
Name: Diverted, dtype: int64
0.0    831698
1.0     13026
Name: Cancelled, dtype: int64


In [122]:
len(df)
df= df[df['Diverted']==0]
print(df['Diverted'].value_counts())

0.0    829964
Name: Diverted, dtype: int64


## 2. Make dummy variables for

In [4]:
 
# 1. make dictionary 
city_to_dic={ 'BOS': 'North', 'PVD' :'North', 'MHT':'North', 'ORH': 'North', \
             'LAX' : 'South', 'LGB':'South', 'BUR':'South','SNA':'South' }
# 2. maping the function
print(df['Diverted'].value_counts())
# df=df[df['Diverted']==0]
df['loca_dep']= df['Dest'].map(city_to_dic)
df['loca_arr']= df['Origin'].map(city_to_dic)
print(df['loca_dep'].value_counts())
print(df['loca_arr'].value_counts())
# location north = 1 or south =0

mask = (df['loca_dep']=='North')|(df['loca_arr']=='North')
df['north'] =mask.astype(int)
df['north'].value_counts()
# dep: data set split  departure = 1 arrrival = 0
print(df['con'].value_counts())


## use the delay at 8 airports
df['airport']= np.where(df['con']=='Dep', df['Origin'],df['Dest'])
print(df['airport'].head())
df['delay']=np.where(df['con']=='Dep', df['DepDelay'],df['ArrDelay'])

df['taxitime']= np.where(df['con']=='Dep', df['TaxiOut'],df['TaxiIn'])
## Snow if weather type contain any SN 
mask =df['WeatherType'].str.contains("SN", na=False)
df['snow'] =mask.astype(int)
df['snow'].value_counts()

df['temp']=pd.to_numeric(df.DryBulbFarenheit, errors='coerce')
# print(df['temp'].value_counts())
# # df['temp']# # M is missin
df['HourlyPrecip']=df['HourlyPrecip'].str.replace('T', '0.005')
df['HourlyPrecip'].value_counts()
df['precip']=pd.to_numeric(df['HourlyPrecip'], errors='coerce')
# #how much of data is missing
df['precip'].value_counts()
df['precip']=np.nan_to_num(df['precip']) # Nan chage to '0'
df['vis']= pd.to_numeric(df.Visibility, errors='coerce')

## winter and summer

mask= (df.Month== 11)|(df.Month== 12)|(df.Month== 1)|(df.Month==2)|(df.Month== 3)
df['winter']= mask.astype(int)

## give id to airport

df['airs'] = df['airport'].astype('category')

df['airs'].dtypes
df['airs_cat']= df['airs'].cat.codes
df['airs_cat'].value_counts()

0.0    829964
1.0      1734
Name: Diverted, dtype: int64
South    291490
North    133286
Name: loca_dep, dtype: int64
South    291618
North    133226
Name: loca_arr, dtype: int64
Dep    415883
Arr    415815
Name: con, dtype: int64
0    BOS
1    BOS
2    BOS
3    BOS
4    BOS
Name: airport, dtype: object


2    433291
0    219234
7     77681
1     41386
6     23806
3     21789
4     13124
5      1387
Name: airs_cat, dtype: int64

In [5]:
df['FlightDate']=pd.to_datetime(df['FlightDate'])
# print(df.FlightDate)
df['DateStr']=df['FlightDate'].apply(lambda x: x.strftime('%Y%m%d'))


In [6]:
df['FlightDate'].head(3)

0   2014-10-21
1   2014-10-21
2   2014-10-21
Name: FlightDate, dtype: datetime64[ns]

In [8]:
snow_airport = df.pivot_table(index='Month', columns='airport', values='snow', aggfunc='sum')
snow_airport.head(12)

airport,BOS,BUR,LAX,LGB,MHT,ORH,PVD,SNA
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1912,0,0,0,98,14,172,0
2,2859,0,0,0,136,16,314,0
3,1519,0,0,0,66,14,184,0
4,0,0,0,0,3,0,0,0
5,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0


In [9]:
### dummies for airport
s = pd.get_dummies(df.airport)
df= df.join(s)

In [109]:
len(df.delay)
mask =df.delay.isnull()
mask.value_counts()

False    831291
True      13433
Name: delay, dtype: int64

In [None]:
df['airs'].dtypes
df['airs_cat']= df['airs'].cat.codes
df['airs_cat'].value_counts()

In [20]:
df_dep = df[df['con']=='Dep']
df_arr = df[df['con']=='Arr']


## 3. Estimations

In [30]:
#set panel data using multi indexing.
df.drop(['Unnamed: 0', 'Unnamed: 0.1'],axis =1, inplace= True)
df.set_index(['airport','FlightDate'],drop = False, inplace= True)

ValueError: labels ['Unnamed: 0' 'Unnamed: 0.1'] not contained in axis

### 3.1 LSDV (Least Square Dummy Variable)
 *regression with group dummies*

### Delay (in minutes)

In [27]:
Xs= "temp+vis+north+Distance+precip+snow+BUR+LAX+LGB+MHT+ORH+PVD+SNA+winter"
result = smf.ols(formula = "delay ~" + Xs, data= df).fit()
# omit the BOS dummy
print (result.summary())

                            OLS Regression Results                            
Dep. Variable:                  delay   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     375.0
Date:                Fri, 03 Nov 2017   Prob (F-statistic):               0.00
Time:                        11:37:50   Log-Likelihood:            -4.0974e+06
No. Observations:              824652   AIC:                         8.195e+06
Df Residuals:                  824637   BIC:                         8.195e+06
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     18.2013      0.489     37.251      0.0

### Weather dealy

In [26]:
Xs= "temp+vis+north+Distance+precip+snow+BUR+LAX+LGB+MHT+ORH+PVD+SNA+winter"
result = smf.ols(formula ="WeatherDelay ~"+Xs, data= df).fit()
print (result.summary())

                            OLS Regression Results                            
Dep. Variable:           WeatherDelay   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     74.30
Date:                Fri, 03 Nov 2017   Prob (F-statistic):          1.78e-212
Time:                        11:37:36   Log-Likelihood:            -6.9502e+05
No. Observations:              169059   AIC:                         1.390e+06
Df Residuals:                  169044   BIC:                         1.390e+06
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      3.5784      0.454      7.880      0.0

### Runway delay

In [25]:
Xs= "temp+vis+north+Distance+precip+snow+BUR+LAX+LGB+MHT+ORH+PVD+SNA+winter"
result = smf.ols(formula ="taxitime ~"+ Xs, data= df).fit()
# print(result.params)
print (result.summary())

                            OLS Regression Results                            
Dep. Variable:               taxitime   R-squared:                       0.085
Model:                            OLS   Adj. R-squared:                  0.085
Method:                 Least Squares   F-statistic:                     5486.
Date:                Fri, 03 Nov 2017   Prob (F-statistic):               0.00
Time:                        11:37:29   Log-Likelihood:            -2.8905e+06
No. Observations:              825161   AIC:                         5.781e+06
Df Residuals:                  825146   BIC:                         5.781e+06
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     15.8354      0.113    140.633      0.0

### Runway Delay : Taxi in

In [24]:
Xs= "temp+vis+north+Distance+precip+snow+BUR+LAX+LGB+MHT+ORH+PVD+SNA+winter"
result = smf.ols(formula ="TaxiIn ~" +Xs, data= df_arr).fit()
# print(result.params)
print (result.summary())

                            OLS Regression Results                            
Dep. Variable:                 TaxiIn   R-squared:                       0.208
Model:                            OLS   Adj. R-squared:                  0.208
Method:                 Least Squares   F-statistic:                     7715.
Date:                Fri, 03 Nov 2017   Prob (F-statistic):               0.00
Time:                        11:37:05   Log-Likelihood:            -1.2932e+06
No. Observations:              412422   AIC:                         2.586e+06
Df Residuals:                  412407   BIC:                         2.587e+06
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      7.2571      0.111     65.215      0.0

### Runway delay : Taxi out

In [31]:
Xs= "temp+vis+north+Distance+precip+snow+BUR+LAX+LGB+MHT+ORH+PVD+SNA+winter"
result = smf.ols(formula ="TaxiOut ~" +Xs, data= df_dep).fit()
# print(result.params)
print (result.summary())

                            OLS Regression Results                            
Dep. Variable:                TaxiOut   R-squared:                       0.158
Model:                            OLS   Adj. R-squared:                  0.158
Method:                 Least Squares   F-statistic:                     5542.
Date:                Fri, 03 Nov 2017   Prob (F-statistic):               0.00
Time:                        11:42:23   Log-Likelihood:            -1.4277e+06
No. Observations:              412739   AIC:                         2.856e+06
Df Residuals:                  412724   BIC:                         2.856e+06
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     23.1734      0.151    153.188      0.0