In [1]:
import pandas as pd
import numpy as np
import sqlite3
from sqlite3 import Error

## Using pandas

In [3]:
cols = ['Year',
    'Quarter',
    'Month',
    'DayofMonth',
    'DayOfWeek',
    'FlightDate',
    'DOT_ID_Reporting_Airline',
    'IATA_CODE_Reporting_Airline',
    'Tail_Number',
    'Flight_Number_Reporting_Airline',
    'OriginAirportID',
    'DestAirportID',
    'CRSDepTime',
    'DepTime',
    'DepDelay',
    'DepDelayMinutes',
    'CRSArrTime',
    'ArrTime',
    'ArrDelay',
    'ArrDelayMinutes',
    'Cancelled',
    'CancellationCode',
    'Diverted',
    'CRSElapsedTime',
    'ActualElapsedTime',
    'AirTime',
    'Flights',
    'Distance',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay'
]

In [4]:
df = pd.read_csv("../../datasets/delays.csv", nrows=4, names=cols)

In [5]:
df

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,...,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2014,1,1,30,4,2014-01-30,19805,AA,N006AA,2377,...,75,76,56,1,328,0,0,0,0,0
1,2014,1,1,31,5,2014-01-31,19805,AA,N003AA,2377,...,75,84,54,1,328,11,0,9,0,0
2,2014,1,1,1,3,2014-01-01,19805,AA,N002AA,2377,...,85,78,57,1,328,0,0,0,0,0
3,2014,1,1,2,4,2014-01-02,19805,AA,N002AA,2377,...,85,79,53,1,328,0,0,0,0,0


## Using sqlite

In [6]:
### download the db from link in readme
db_path = "../../dbs/delays.db"

In [7]:
def create_connection(path):
    connection = None
    try:
        connection = sqlite3.connect(path)
        connection.text_factory = str
    except Error as e:
        print("Error occurred: " + str(e))
    return connection


def execute_query(connection, query):
    cursor = connection.cursor()
    try:
        if query == "":
            return "Query Blank"
        else:
            df = pd.read_sql_query(query, connection)
            return df
    except Error as e:
        return "Error occurred: " + str(e)

In [8]:
connection = create_connection(db_path)

In [13]:
sample_query = \
    """
        select *
        from delay 
        where year = 2019
        limit 10;
    """

execute_query(connection, sample_query)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,...,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2019,1,1,4,5,2019-01-04,20304,OO,N945SW,5657,...,70,51,37,1,190,0,0,0,0,0
1,2019,1,1,4,5,2019-01-04,20304,OO,N932SW,5658,...,103,109,76,1,438,0,0,0,0,0
2,2019,1,1,4,5,2019-01-04,20304,OO,N932SW,5658,...,132,121,88,1,513,0,0,0,0,0
3,2019,1,1,4,5,2019-01-04,20304,OO,N916SW,5659,...,118,110,80,1,576,0,0,0,0,0
4,2019,1,1,4,5,2019-01-04,20304,OO,N107SY,5660,...,184,163,127,1,896,0,0,0,0,0
5,2019,1,1,4,5,2019-01-04,20304,OO,N114SY,5661,...,136,134,110,1,679,0,0,0,0,0
6,2019,1,1,4,5,2019-01-04,20304,OO,N945SW,5664,...,101,105,72,1,522,0,0,0,0,0
7,2019,1,1,4,5,2019-01-04,20304,OO,N679SA,5665,...,115,93,71,1,524,0,0,0,0,0
8,2019,1,1,4,5,2019-01-04,20304,OO,N925SW,5667,...,74,85,37,1,190,0,0,0,0,0
9,2019,1,1,4,5,2019-01-04,20304,OO,N145SY,5668,...,124,129,95,1,641,0,0,0,0,0


In [10]:
sample_query = \
    """
        select * 
        from delay 
        where Distance > 1000
        limit 10;
    """

execute_query(connection, sample_query)

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,Tail_Number,Flight_Number_Reporting_Airline,...,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2014,1,1,24,5,2014-01-24,19930,AS,N596AS,807,...,385,333,321,1,2681,0,0,0,0,0
1,2014,1,1,24,5,2014-01-24,19930,AS,N593AS,809,...,375,330,315,1,2562,0,0,0,0,0
2,2014,1,1,24,5,2014-01-24,19930,AS,N565AS,812,...,307,301,276,1,2355,0,0,0,0,0
3,2014,1,1,24,5,2014-01-24,19930,AS,N590AS,813,...,340,314,301,1,2378,0,0,0,0,0
4,2014,1,1,24,5,2014-01-24,19930,AS,N559AS,820,...,303,299,282,1,2404,0,0,0,0,0
5,2014,1,1,24,5,2014-01-24,19930,AS,N577AS,822,...,332,309,295,1,2562,0,0,0,0,0
6,2014,1,1,24,5,2014-01-24,19930,AS,N534AS,826,...,316,309,280,1,2378,0,0,0,0,0
7,2014,1,1,24,5,2014-01-24,19930,AS,N506AS,832,...,323,315,284,1,2417,0,0,0,0,0
8,2014,1,1,24,5,2014-01-24,19930,AS,N536AS,833,...,375,331,313,1,2603,0,0,0,0,0
9,2014,1,1,24,5,2014-01-24,19930,AS,N536AS,834,...,335,327,308,1,2603,0,0,0,0,0


In [12]:
df.describe()

Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Reporting_Airline,Flight_Number_Reporting_Airline,OriginAirportID,DestAirportID,CRSDepTime,...,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,2014.0,1.0,1.0,16.0,4.0,19805.0,2377.0,11788.0,11788.0,1037.5,...,80.0,79.25,55.0,1.0,328.0,2.75,0.0,2.25,0.0,0.0
std,0.0,0.0,0.0,16.753109,0.816497,0.0,0.0,565.803264,565.803264,112.583302,...,5.773503,3.40343,1.825742,0.0,0.0,5.5,0.0,4.5,0.0,0.0
min,2014.0,1.0,1.0,1.0,3.0,19805.0,2377.0,11298.0,11298.0,940.0,...,75.0,76.0,53.0,1.0,328.0,0.0,0.0,0.0,0.0,0.0
25%,2014.0,1.0,1.0,1.75,3.75,19805.0,2377.0,11298.0,11298.0,940.0,...,75.0,77.5,53.75,1.0,328.0,0.0,0.0,0.0,0.0,0.0
50%,2014.0,1.0,1.0,16.0,4.0,19805.0,2377.0,11788.0,11788.0,1037.5,...,80.0,78.5,55.0,1.0,328.0,0.0,0.0,0.0,0.0,0.0
75%,2014.0,1.0,1.0,30.25,4.25,19805.0,2377.0,12278.0,12278.0,1135.0,...,85.0,80.25,56.25,1.0,328.0,2.75,0.0,2.25,0.0,0.0
max,2014.0,1.0,1.0,31.0,5.0,19805.0,2377.0,12278.0,12278.0,1135.0,...,85.0,84.0,57.0,1.0,328.0,11.0,0.0,9.0,0.0,0.0


In [None]:
1. Categorical variables - use IATA code
2. Correlation plot
3. Stepwise regression

In [14]:
#get 2019 delay data

sample_query = \
    """
        select *
        from delay 
        where year = 2019
    """

df_2019 = execute_query(connection, sample_query)

In [15]:
#removing the ID columns 
corr = df_2019.corr()
corr.style.background_gradient(cmap='coolwarm')

  smin = np.nanmin(s.to_numpy()) if vmin is None else vmin
  smax = np.nanmax(s.to_numpy()) if vmax is None else vmax


Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Reporting_Airline,OriginAirportID,DestAirportID,CRSDepTime,DepTime,DepDelay,DepDelayMinutes,CRSArrTime,ArrTime,ArrDelay,ArrDelayMinutes,Cancelled,Diverted,CRSElapsedTime,ActualElapsedTime,AirTime,Flights,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
Year,,,,,,,,,,,,,,,,,,,,,,,,,,,,
Quarter,,1.0,0.970532,0.005507,-0.005224,0.005589,-0.002101,-0.002062,-0.000297,0.012589,-0.01381,-0.014247,-0.010532,0.007328,-0.01188,-0.016774,-0.045986,-0.003699,-0.007035,-0.000289,-0.00061,,-0.00143,-0.005941,-0.007721,-0.011167,-0.002011,-0.011014
Month,,0.970532,1.0,0.00414,0.005966,0.006097,-0.001997,-0.00196,-9e-06,0.013514,-0.013348,-0.013444,-0.01077,0.007657,-0.011596,-0.016042,-0.047177,-0.003743,-0.006185,0.00076,0.000412,,-0.000789,-0.005415,-0.008395,-0.011527,-0.00154,-0.009823
DayofMonth,,0.005507,0.00414,1.0,0.007803,0.000412,0.000163,0.000197,-0.001098,-0.004286,0.001485,0.001906,-0.002121,-0.005776,0.001715,0.001382,0.009042,0.000826,0.002715,-7e-05,0.0003,,0.002441,-0.000185,0.001183,0.003026,-0.002206,0.000157
DayOfWeek,,-0.005224,0.005966,0.007803,1.0,0.005142,0.004139,0.003825,0.004656,0.005225,-0.000764,-0.000213,0.0036,0.004307,0.000693,-0.00105,-0.006883,0.002243,0.012052,0.010899,0.013077,,0.014111,0.003834,0.001779,-0.00624,0.002047,-0.003002
DOT_ID_Reporting_Airline,,0.005589,0.006097,0.000412,0.005142,1.0,-0.047182,-0.047214,0.000388,-0.0064,0.054408,0.036491,-0.001465,-0.00549,0.062432,0.050417,0.008846,0.006124,-0.088696,-0.072538,-0.113898,,-0.133538,0.016987,0.016402,0.043519,-0.003968,0.035776
OriginAirportID,,-0.002101,-0.001997,0.000163,0.004139,-0.047182,1.0,0.014709,-0.033106,-0.030116,-0.001552,-0.004245,-0.006062,-0.001569,-0.001237,-0.004081,-0.000893,0.001204,0.066822,0.064516,0.066296,,0.088994,-0.003611,-0.006139,-0.000277,-0.001231,0.000218
DestAirportID,,-0.002062,-0.00196,0.000197,0.003825,-0.047214,0.014709,1.0,0.028974,0.028719,-0.000777,-0.000114,0.021107,0.019009,-0.001593,0.00031,-0.001849,-0.003754,0.100871,0.0986,0.105425,,0.089323,0.000492,-0.002577,0.002874,0.000565,-0.001852
CRSDepTime,,-0.000297,-9e-06,-0.001098,0.004656,0.000388,-0.033106,0.028974,1.0,0.889347,0.085162,0.091524,0.683573,0.547747,0.065773,0.081091,0.016146,0.000916,-0.019527,-0.022386,-0.019161,,-0.010483,0.015975,0.01261,0.028456,0.000984,0.101693
DepTime,,0.012589,0.013514,-0.004286,0.005225,-0.0064,-0.030116,0.028719,0.889347,1.0,0.12309,0.127757,0.635612,0.658195,0.10797,0.117692,-0.321245,0.004888,-0.012142,0.058887,0.047707,,-0.004752,0.029831,0.009402,0.056464,0.004014,0.133194


In [19]:
df_2019.nunique()

Year                                  1
Quarter                               4
Month                                12
DayofMonth                           31
DayOfWeek                             7
FlightDate                          365
DOT_ID_Reporting_Airline             17
IATA_CODE_Reporting_Airline          17
Tail_Number                        5891
Flight_Number_Reporting_Airline    7089
OriginAirportID                     360
DestAirportID                       360
CRSDepTime                         1379
DepTime                            1441
DepDelay                           1537
DepDelayMinutes                    1537
CRSArrTime                         1436
ArrTime                            1441
ArrDelay                           1519
ArrDelayMinutes                    1519
Cancelled                             2
CancellationCode                      4
Diverted                              2
CRSElapsedTime                      577
ActualElapsedTime                   712


In [62]:
#random sampling on df_2019 due to computational limitations & converting factors to categorical variables 
df_2019_sample = df_2019.sample(frac=0.05, random_state=3)

df_2019_sample['OriginAirportID'] = df_2019_sample['OriginAirportID'].astype('category')
df_2019_sample['DestAirportID'] = df_2019_sample['DestAirportID'].astype('category')
df_2019_sample['CancellationCode'] = df_2019_sample['CancellationCode'].astype('category')
df_2019_sample['Diverted'] = df_2019_sample['Diverted'].astype('category')
df_2019_sample['Quarter'] = df_2019_sample['Quarter'].astype('category')
df_2019_sample['Month'] = df_2019_sample['Month'].astype('category')
df_2019_sample['DayofMonth'] = df_2019_sample['DayofMonth'].astype('category')
df_2019_sample['DayOfWeek'] = df_2019_sample['DayOfWeek'].astype('category')

## Testing Categorical variables on Arrival delay: {2019 Sample}

In [68]:
#ANOVA for testing if variance in delay times changes according to orgin/destination IDs

import statsmodels.api as sm
from statsmodels.formula.api import ols

#Performing two-way ANOVA
model = ols('ArrDelayMinutes ~ C(OriginAirportID) + C(DestAirportID)',
            data=df_2019_sample).fit()
result = sm.stats.anova_lm(model, type=2)
  
# Print the result for F-statistic of each column
print(result)
model.summary()

#### Inference - Fstatistic based on 5% sample shows variance in delay is significant across origin & destination
#### since F statistic > 1 & probability of F-statistic < 0.05 (95% significance)

                          df        sum_sq       mean_sq         F  \
C(OriginAirportID)     359.0  5.064444e+06  14107.086698  6.406732   
C(DestAirportID)       359.0  5.126685e+06  14280.459075  6.485469   
Residual            370383.0  8.155523e+08   2201.916049       NaN   

                           PR(>F)  
C(OriginAirportID)  1.402813e-278  
C(DestAirportID)    9.708795e-284  
Residual                      NaN  


0,1,2,3
Dep. Variable:,ArrDelayMinutes,R-squared:,0.012
Model:,OLS,Adj. R-squared:,0.01
Method:,Least Squares,F-statistic:,6.446
Date:,"Tue, 25 Oct 2022",Prob (F-statistic):,0.0
Time:,13:02:28,Log-Likelihood:,-1954400.0
No. Observations:,371102,AIC:,3910000.0
Df Residuals:,370383,BIC:,3918000.0
Df Model:,718,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,14.0508,4.287,3.278,0.001,5.649,22.453
C(OriginAirportID)[T.10136],-7.2244,5.389,-1.341,0.180,-17.787,3.338
C(OriginAirportID)[T.10140],-2.9777,3.193,-0.933,0.351,-9.235,3.280
C(OriginAirportID)[T.10141],12.5387,8.565,1.464,0.143,-4.249,29.327
C(OriginAirportID)[T.10146],1.7837,6.759,0.264,0.792,-11.464,15.031
C(OriginAirportID)[T.10154],-4.9801,6.960,-0.716,0.474,-18.621,8.661
C(OriginAirportID)[T.10155],-6.3208,5.971,-1.059,0.290,-18.023,5.381
C(OriginAirportID)[T.10157],11.7155,5.480,2.138,0.033,0.974,22.457
C(OriginAirportID)[T.10158],-7.0923,4.601,-1.542,0.123,-16.109,1.925

0,1,2,3
Omnibus:,592610.36,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,597900323.291
Skew:,10.293,Prob(JB):,0.0
Kurtosis:,198.56,Cond. No.,800.0


In [59]:
#ANOVA for testing if variance in delay times changes according diversion and cancellation code

df_2019_sample['CancellationCode'].unique()
df_2019_sample['CancellationCode'].fillna('N/A',axis=0,inplace=True)

#Performing two-way ANOVA
model = ols('ArrDelayMinutes ~ CancellationCode + Diverted' ,
            data=df_2019_sample).fit()
result = sm.stats.anova_lm(model, type=2)
  
# Print the result for F-statistic of each column
print(result)
model.summary()

#### Inference - Fstatistic based on 5% sample shows diverted flights can affect delays.
####Cancellation code is not significant factor (since Probabilty of F-statistic val. > 0.05 ) and is more of a post-facto effect of delay

                        df        sum_sq       mean_sq           F  \
CancellationCode       3.0  3.461055e+06  1.153685e+06  558.603760   
Diverted               1.0  4.861577e+05  4.861577e+05  235.393107   
Residual          371097.0  7.664271e+08  2.065301e+03         NaN   

                        PR(>F)  
CancellationCode  0.000000e+00  
Diverted          4.126631e-53  
Residual                   NaN  


0,1,2,3
Dep. Variable:,ArrDelay,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,477.8
Date:,"Tue, 25 Oct 2022",Prob (F-statistic):,0.0
Time:,12:43:25,Log-Likelihood:,-1942900.0
No. Observations:,371102,AIC:,3886000.0
Df Residuals:,371097,BIC:,3886000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.319e-13,1.048,6.03e-13,1.000,-2.054,2.054
CancellationCode[T.B],-2.38e-12,1.295,-1.84e-12,1.000,-2.538,2.538
CancellationCode[T.C],1.711e-12,1.656,1.03e-12,1.000,-3.246,3.246
CancellationCode[T.N/A],22.9794,1.051,21.868,0.000,20.920,25.039
Diverted,-22.9794,1.498,-15.343,0.000,-25.915,-20.044

0,1,2,3
Omnibus:,612540.355,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,743268247.511
Skew:,10.976,Prob(JB):,0.0
Kurtosis:,221.144,Cond. No.,42.4


In [79]:
#ANOVA for testing if variance in delay times changes due to Airline carrier

#Performing one-way ANOVA
model = ols('ArrDelayMinutes ~ IATA_CODE_Reporting_Airline' ,
            data=df_2019_sample).fit()
result = sm.stats.anova_lm(model, type=1)
  
# Print the result for F-statistic of each column
print(result)
model.summary()

#### Inference - Fstatistic based on 5% sample shows airline carrier plays affects delays significantly 

                                   df        sum_sq        mean_sq  \
IATA_CODE_Reporting_Airline      16.0  4.244421e+06  265276.282895   
Residual                     371085.0  8.214990e+08    2213.775766   

                                      F  PR(>F)  
IATA_CODE_Reporting_Airline  119.829789     0.0  
Residual                            NaN     NaN  


0,1,2,3
Dep. Variable:,ArrDelayMinutes,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,119.8
Date:,"Tue, 25 Oct 2022",Prob (F-statistic):,0.0
Time:,13:38:18,Log-Likelihood:,-1955800.0
No. Observations:,371102,AIC:,3912000.0
Df Residuals:,371085,BIC:,3912000.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,14.5022,0.417,34.798,0.000,13.685,15.319
IATA_CODE_Reporting_Airline[T.AA],0.3010,0.470,0.641,0.522,-0.619,1.221
IATA_CODE_Reporting_Airline[T.AS],-3.9765,0.584,-6.807,0.000,-5.122,-2.831
IATA_CODE_Reporting_Airline[T.B6],7.3183,0.569,12.869,0.000,6.204,8.433
IATA_CODE_Reporting_Airline[T.DL],-3.8310,0.467,-8.200,0.000,-4.747,-2.915
IATA_CODE_Reporting_Airline[T.EV],6.6389,0.710,9.357,0.000,5.248,8.029
IATA_CODE_Reporting_Airline[T.F9],2.7439,0.709,3.870,0.000,1.354,4.133
IATA_CODE_Reporting_Airline[T.G4],-0.2968,0.771,-0.385,0.700,-1.808,1.215
IATA_CODE_Reporting_Airline[T.HA],-8.8046,0.838,-10.508,0.000,-10.447,-7.162

0,1,2,3
Omnibus:,593461.224,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,599014213.147
Skew:,10.325,Prob(JB):,0.0
Kurtosis:,198.737,Cond. No.,24.1


In [73]:
#ANOVA for testing if variance in delay depends on Time period

#Performing two-way ANOVA
model = ols('ArrDelay ~ Quarter + Month + DayofMonth + DayOfWeek',
            data=df_2019_sample).fit()
result = sm.stats.anova_lm(model, type=2)
  
# Print the result for F-statistic of each column
print(result)
model.summary()

#ANOVA for testing if variance in diferent types of delays depends on Time period

from statsmodels.multivariate.manova import MANOVA

# fit manova on various types of delays
manova_result1 = MANOVA.from_formula('CarrierDelay + WeatherDelay + NASDelay + SecurityDelay + LateAircraftDelay ~  Month + DayofMonth + DayOfWeek', df_2019_sample)
print(manova_result1.mv_test())

manova_result2 = MANOVA.from_formula('CarrierDelay + WeatherDelay + NASDelay + SecurityDelay + LateAircraftDelay ~  Quarter', df_2019_sample)
print(manova_result2.mv_test())

#### Inference - Fstatistic based on 5% sample shows time period affects delays
#### especially 'Quarter' & 'Month' affects delays significantly - due to weather conditions

                  df        sum_sq        mean_sq          F         PR(>F)
Quarter          3.0  3.206361e+05  106878.693122  51.666105   2.235427e-33
Month           11.0  1.437638e+06  130694.376008  63.178816  8.878691e-142
DayofMonth      30.0  8.810680e+05   29368.932460  14.197202   1.757538e-71
DayOfWeek        6.0  1.614780e+05   26913.004242  13.009984   9.018308e-15
Residual    371053.0  7.675760e+08    2068.642359        NaN            NaN


0,1,2,3
Dep. Variable:,ArrDelay,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,28.18
Date:,"Tue, 25 Oct 2022",Prob (F-statistic):,2.82e-251
Time:,13:07:48,Log-Likelihood:,-1943200.0
No. Observations:,371102,AIC:,3886000.0
Df Residuals:,371053,BIC:,3887000.0
Df Model:,48,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,24.2224,0.527,45.996,0.000,23.190,25.255
Quarter[T.2],2.081e+12,2.74e+12,0.759,0.448,-3.29e+12,7.45e+12
Quarter[T.3],-1.074e+13,1.87e+13,-0.575,0.565,-4.73e+13,2.59e+13
Quarter[T.4],-2.557e+12,6.76e+12,-0.378,0.705,-1.58e+13,1.07e+13
Month[T.2],2.1735,0.386,5.635,0.000,1.417,2.929
Month[T.3],-2.0549,0.370,-5.552,0.000,-2.780,-1.329
Month[T.4],-2.081e+12,2.74e+12,-0.759,0.448,-7.45e+12,3.29e+12
Month[T.5],-2.081e+12,2.74e+12,-0.759,0.448,-7.45e+12,3.29e+12
Month[T.6],-2.081e+12,2.74e+12,-0.759,0.448,-7.45e+12,3.29e+12

0,1,2,3
Omnibus:,611156.668,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,736120671.886
Skew:,10.924,Prob(JB):,0.0
Kurtosis:,220.093,Cond. No.,606000000000000.0


## Testing Categorical variables on Departure delay: {2019 Sample}

In [81]:
#Performing two-way ANOVA
model = ols('DepDelayMinutes ~ C(OriginAirportID) + C(DestAirportID)',
            data=df_2019_sample).fit()
result = sm.stats.anova_lm(model, type=2)
  
# Print the result for F-statistic of each column
print(result)
model.summary()

#### Inference - Fstatistic based on 5% sample shows variance in delay is significant for origin & destination 

                          df        sum_sq       mean_sq         F  \
C(OriginAirportID)     359.0  4.355332e+06  12131.843587  5.421939   
C(DestAirportID)       359.0  4.755704e+06  13247.087536  5.920361   
Residual            370383.0  8.287494e+08   2237.547081       NaN   

                           PR(>F)  
C(OriginAirportID)  4.266613e-215  
C(DestAirportID)    5.750904e-247  
Residual                      NaN  


0,1,2,3
Dep. Variable:,DepDelayMinutes,R-squared:,0.011
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,5.671
Date:,"Tue, 25 Oct 2022",Prob (F-statistic):,0.0
Time:,13:44:17,Log-Likelihood:,-1957400.0
No. Observations:,371102,AIC:,3916000.0
Df Residuals:,370383,BIC:,3924000.0
Df Model:,718,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,13.5583,4.321,3.138,0.002,5.089,22.028
C(OriginAirportID)[T.10136],-5.3960,5.433,-0.993,0.321,-16.044,5.252
C(OriginAirportID)[T.10140],-1.0353,3.218,-0.322,0.748,-7.343,5.273
C(OriginAirportID)[T.10141],13.2376,8.634,1.533,0.125,-3.686,30.161
C(OriginAirportID)[T.10146],3.1134,6.814,0.457,0.648,-10.241,16.468
C(OriginAirportID)[T.10154],-3.7487,7.016,-0.534,0.593,-17.500,10.002
C(OriginAirportID)[T.10155],-5.4117,6.019,-0.899,0.369,-17.208,6.385
C(OriginAirportID)[T.10157],12.7798,5.524,2.313,0.021,1.952,23.607
C(OriginAirportID)[T.10158],-6.7731,4.638,-1.460,0.144,-15.863,2.317

0,1,2,3
Omnibus:,591016.595,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,583555578.016
Skew:,10.243,Prob(JB):,0.0
Kurtosis:,196.184,Cond. No.,800.0


In [82]:
#ANOVA for testing if variance in delay times changes according diversion and cancellation code

#Performing two-way ANOVA
model = ols('DepDelayMinutes ~ CancellationCode + Diverted' ,
            data=df_2019_sample).fit()
result = sm.stats.anova_lm(model, type=2)
  
# Print the result for F-statistic of each column
print(result)
model.summary()

#### Inference - Departure delay not dependant on cancellation code / diversion F<1


                      df        sum_sq     mean_sq         F    PR(>F)
CancellationCode     2.0  1.383193e+02   69.159650  0.088209  0.915570
Diverted             1.0  6.193093e+00    6.193093  0.007899  0.929183
Residual          6706.0  5.257776e+06  784.040547       NaN       NaN


  return np.sqrt(eigvals[0]/eigvals[-1])


0,1,2,3
Dep. Variable:,DepDelayMinutes,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.08821
Date:,"Tue, 25 Oct 2022",Prob (F-statistic):,0.916
Time:,13:52:02,Log-Likelihood:,-31874.0
No. Observations:,6709,AIC:,63750.0
Df Residuals:,6706,BIC:,63770.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.2229,0.646,3.442,0.001,0.957,3.489
CancellationCode[T.B],0.3246,0.798,0.407,0.684,-1.239,1.888
CancellationCode[T.C],0.3042,1.020,0.298,0.766,-1.696,2.305
Diverted[T.1],0,0,,,0,0

0,1,2,3
Omnibus:,15636.811,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,159379266.286
Skew:,22.866,Prob(JB):,0.0
Kurtosis:,756.693,Cond. No.,inf


In [83]:
#ANOVA for testing if variance in delay times changes due to Airline carrier

#Performing one-way ANOVA
model = ols('DepDelayMinutes ~ IATA_CODE_Reporting_Airline' ,
            data=df_2019_sample).fit()
result = sm.stats.anova_lm(model, type=1)
  
# Print the result for F-statistic of each column
print(result)
model.summary()

#### Inference - Fstatistic based on 5% sample shows airline carrier plays affects delays significantly 

                                   df        sum_sq        mean_sq          F  \
IATA_CODE_Reporting_Airline      16.0  3.519947e+06  219996.699261  97.846714   
Residual                     371085.0  8.343405e+08    2248.381070        NaN   

                                    PR(>F)  
IATA_CODE_Reporting_Airline  1.976263e-323  
Residual                               NaN  


0,1,2,3
Dep. Variable:,DepDelayMinutes,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,97.85
Date:,"Tue, 25 Oct 2022",Prob (F-statistic):,2e-323
Time:,13:54:01,Log-Likelihood:,-1958600.0
No. Observations:,371102,AIC:,3917000.0
Df Residuals:,371085,BIC:,3917000.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,14.3330,0.420,34.126,0.000,13.510,15.156
IATA_CODE_Reporting_Airline[T.AA],0.0999,0.473,0.211,0.833,-0.828,1.027
IATA_CODE_Reporting_Airline[T.AS],-4.6409,0.589,-7.882,0.000,-5.795,-3.487
IATA_CODE_Reporting_Airline[T.B6],7.8860,0.573,13.760,0.000,6.763,9.009
IATA_CODE_Reporting_Airline[T.DL],-3.6184,0.471,-7.685,0.000,-4.541,-2.696
IATA_CODE_Reporting_Airline[T.EV],5.6941,0.715,7.963,0.000,4.293,7.096
IATA_CODE_Reporting_Airline[T.F9],3.2443,0.714,4.541,0.000,1.844,4.645
IATA_CODE_Reporting_Airline[T.G4],-0.9387,0.777,-1.208,0.227,-2.462,0.585
IATA_CODE_Reporting_Airline[T.HA],-9.5573,0.844,-11.319,0.000,-11.212,-7.902

0,1,2,3
Omnibus:,592018.552,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,585916616.825
Skew:,10.28,Prob(JB):,0.0
Kurtosis:,196.571,Cond. No.,24.1


## Testing Continuous variables: {2019 Sample}

In [75]:
df_2019_filtered = df_2019[[
                    'DepDelayMinutes',
                    'ArrDelayMinutes',
                    'Diverted',
                    'AirTime',
                    'Distance',
                    'CarrierDelay',
                    'WeatherDelay',
                    'NASDelay',
                    'SecurityDelay',
                    'LateAircraftDelay'
                ]]
corr = df_2019_filtered.corr()
corr.style.background_gradient(cmap='coolwarm')

##Airtime and distance do not impact delays
## Arrival & departure delays are related --> cascading effect
## Carrier, Weather, NAS and Late Aircraft delays are highly correlated with total Arrival/Departure delays

Unnamed: 0,DepDelayMinutes,ArrDelayMinutes,Diverted,AirTime,Distance,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
DepDelayMinutes,1.0,0.972485,0.017613,0.015011,0.009766,0.666625,0.311543,0.343039,0.033182,0.60615
ArrDelayMinutes,0.972485,1.0,-0.014847,0.026906,0.006582,0.662232,0.319832,0.426359,0.032796,0.597136
Diverted,0.017613,-0.014847,1.0,-0.07705,0.013403,-0.00671,-0.002553,-0.00855,-0.000625,-0.010168
AirTime,0.015011,0.026906,-0.07705,1.0,0.956692,0.01364,-0.001731,0.045168,0.004721,-0.002763
Distance,0.009766,0.006582,0.013403,0.956692,1.0,0.009177,-0.005624,0.01749,0.004512,-0.01048
CarrierDelay,0.666625,0.662232,-0.00671,0.01364,0.009177,1.0,-0.003263,0.015036,-0.000627,0.054433
WeatherDelay,0.311543,0.319832,-0.002553,-0.001731,-0.005624,-0.003263,1.0,0.022077,0.000168,0.029592
NASDelay,0.343039,0.426359,-0.00855,0.045168,0.01749,0.015036,0.022077,1.0,0.001167,0.04642
SecurityDelay,0.033182,0.032796,-0.000625,0.004721,0.004512,-0.000627,0.000168,0.001167,1.0,0.003962
LateAircraftDelay,0.60615,0.597136,-0.010168,-0.002763,-0.01048,0.054433,0.029592,0.04642,0.003962,1.0


In [176]:
df_2019_sample = df_2019.sample(frac=0.05, random_state=3)

In [177]:
X_train = df_2019_sample[df_2019_sample.columns.difference(['DepDelayMinutes', 'ArrDelayMinutes'])]
X_train['Total_delay_mins'] = df_2019_sample['DepDelayMinutes'] + df_2019_sample['ArrDelayMinutes']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [134]:
import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by forward selection.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model

In [138]:
FS_model2 = forward_selected(X_train,"Total_delay_mins")

MemoryError: Unable to allocate 7.32 GiB for an array with shape (6616, 148441) and data type float64

In [182]:
##Runing regression individually on 5% sample
model = smf.ols('Total_delay_mins ~ Month + IATA_CODE_Reporting_Airline + OriginAirportID + DestAirportID + CarrierDelay + LateAircraftDelay + NASDelay + WeatherDelay +SecurityDelay + Distance + Diverted ', X_train).fit()
model.rsquared_adj
#0.9849939801906086

0.9849939801906086