In [1]:
import pandas as pd
import numpy as np
import pickle

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.metrics import (mean_squared_error, mean_absolute_error, mean_absolute_percentage_error)

# Arima
from pmdarima import auto_arima
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from datetime import datetime, timedelta

import warnings
# Silence every warning for the rest of the session; note that this also hides
# any warnings emitted while the models below are being fitted.
warnings.filterwarnings("ignore")

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## Lectura de los datos

In [3]:
# Load the forecasting dataset from the parquet file next to the notebook.
df = pd.read_parquet("forecast_data.parquet")

# Distinct airport codes, in the order unique() returns them.
airports = list(df['ORIGIN_AIRPORT'].unique())

# One frame per airport (the first five codes), each re-indexed from 0.
# The ORIGIN_AIRPORT column is kept in every frame.
a1, a2, a3, a4, a5 = (
    df.loc[df['ORIGIN_AIRPORT'] == airports[position]].reset_index(drop=True)
    for position in range(5)
)

a1.head()

Unnamed: 0,DATE,ORIGIN_AIRPORT,DELAYED_FLIGHTS
0,2015-01-01,LAX,236
1,2015-01-02,LAX,409
2,2015-01-03,LAX,437
3,2015-01-04,LAX,467
4,2015-01-05,LAX,400


## Airport 1

In [4]:
# Prepare the data
# NOTE(review): str() of the unique() array yields the array's repr (brackets and
# quotes included, e.g. "['LAX']"), not the bare airport code — confirm this is intended.
airport_1 = str(a1['ORIGIN_AIRPORT'].unique())
data_1 = a1.loc[:, ['DATE', 'DELAYED_FLIGHTS']]
data_1.columns = ['ds', 'y']
# No explicit format is passed: the DATE values shown in the data preview are
# ISO-style (2015-01-01), which a "%m/%d/%Y" format would fail to parse for string input.
data_1['ds'] = pd.to_datetime(data_1['ds'])

# Fit the model: stepwise AIC search over seasonal ARIMA candidates.
# NOTE(review): m = 12 sets a 12-observation seasonal period, while the series has
# 365 observations and appears to be daily — confirm the intended seasonality
# (e.g. weekly, m = 7). The SARIMAX orders hard-coded in the save cell depend on it.
stepwise_fit = auto_arima(
    data_1['y'],
    start_p = 1, start_q = 1,
    max_p = 3, max_q = 3,
    m = 12,
    start_P = 0,
    seasonal = True,
    d = None, D = 1,
    trace = True,
    error_action = 'ignore',
    suppress_warnings = True,
    stepwise = True,
)

stepwise_fit.summary()

Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=3.95 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=4206.486, Time=0.05 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=4041.631, Time=0.87 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=2.34 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=4204.526, Time=0.04 sec
 ARIMA(1,0,0)(0,1,0)[12] intercept   : AIC=4169.646, Time=0.10 sec
 ARIMA(1,0,0)(2,1,0)[12] intercept   : AIC=4027.953, Time=1.91 sec
 ARIMA(1,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=2.60 sec
 ARIMA(1,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=1.08 sec
 ARIMA(0,0,0)(2,1,0)[12] intercept   : AIC=4124.787, Time=0.85 sec
 ARIMA(2,0,0)(2,1,0)[12] intercept   : AIC=4029.519, Time=1.70 sec
 ARIMA(1,0,1)(2,1,0)[12] intercept   : AIC=4028.996, Time=3.03 sec
 ARIMA(0,0,1)(2,1,0)[12] intercept   : AIC=4042.821, Time=1.78 sec
 ARIMA(2,0,1)(2,1,0)[12] intercept   : AIC=4028.439, Time=5.29 sec
 ARIMA(1,0,0)(2,1,0)[12]             : 

0,1,2,3
Dep. Variable:,y,No. Observations:,365.0
Model:,"SARIMAX(1, 0, 0)x(2, 1, 0, 12)",Log Likelihood,-2008.985
Date:,"Mon, 05 Dec 2022",AIC,4025.969
Time:,18:27:57,BIC,4041.435
Sample:,0,HQIC,4032.123
,- 365,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.5211,0.046,11.349,0.000,0.431,0.611
ar.S.L12,-0.7360,0.057,-12.977,0.000,-0.847,-0.625
ar.S.L24,-0.2261,0.057,-3.968,0.000,-0.338,-0.114
sigma2,5048.7440,372.086,13.569,0.000,4319.469,5778.019

0,1,2,3
Ljung-Box (L1) (Q):,0.13,Jarque-Bera (JB):,0.24
Prob(Q):,0.72,Prob(JB):,0.89
Heteroskedasticity (H):,1.03,Skew:,-0.05
Prob(H) (two-sided):,0.86,Kurtosis:,3.09


### Save

In [5]:
# Refit the selected specification with statsmodels; the orders are hard-coded to
# SARIMAX(1, 0, 0)x(2, 1, 0, 12), the model reported in the auto_arima summary.
model_1 = SARIMAX(data_1['y'], order = (1, 0, 0), seasonal_order = (2, 1, 0, 12))
final_model_1 = model_1.fit()

# Persist the fitted results. The context manager guarantees the file handle is
# flushed and closed, instead of leaving it open until garbage collection.
with open("final_model_1.pickle", 'wb') as model_file:
    pickle.dump(final_model_1, model_file)

In [6]:
# Remove the Airport 1 working frame from the namespace; the fitted results stay
# available as final_model_1. Re-running the cells above requires rebuilding data_1.
del data_1

## Airport 2

In [7]:
# Prepare the data
# NOTE(review): str() of the unique() array yields the array's repr (brackets and
# quotes included), not the bare airport code — confirm this is intended.
airport_2 = str(a2['ORIGIN_AIRPORT'].unique())
data_2 = a2.loc[:, ['DATE', 'DELAYED_FLIGHTS']]
data_2.columns = ['ds', 'y']
# No explicit format is passed: the DATE values shown in the data preview are
# ISO-style (2015-01-01), which a "%m/%d/%Y" format would fail to parse for string input.
data_2['ds'] = pd.to_datetime(data_2['ds'])

# Fit the model: stepwise AIC search over seasonal ARIMA candidates.
# NOTE(review): m = 12 sets a 12-observation seasonal period, while the series has
# 365 observations and appears to be daily — confirm the intended seasonality
# (e.g. weekly, m = 7). The SARIMAX orders hard-coded in the save cell depend on it.
stepwise_fit = auto_arima(
    data_2['y'],
    start_p = 1, start_q = 1,
    max_p = 3, max_q = 3,
    m = 12,
    start_P = 0,
    seasonal = True,
    d = None, D = 1,
    trace = True,
    error_action = 'ignore',
    suppress_warnings = True,
    stepwise = True,
)

stepwise_fit.summary()


Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=2.99 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=4364.593, Time=0.05 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=4183.435, Time=1.02 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=0.89 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=4362.702, Time=0.04 sec
 ARIMA(1,0,0)(0,1,0)[12] intercept   : AIC=4267.498, Time=0.25 sec
 ARIMA(1,0,0)(2,1,0)[12] intercept   : AIC=4153.917, Time=1.69 sec
 ARIMA(1,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=3.07 sec
 ARIMA(1,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=1.00 sec
 ARIMA(0,0,0)(2,1,0)[12] intercept   : AIC=4270.724, Time=0.92 sec
 ARIMA(2,0,0)(2,1,0)[12] intercept   : AIC=4155.862, Time=1.61 sec
 ARIMA(1,0,1)(2,1,0)[12] intercept   : AIC=4155.828, Time=2.72 sec
 ARIMA(0,0,1)(2,1,0)[12] intercept   : AIC=4175.729, Time=1.79 sec
 ARIMA(2,0,1)(2,1,0)[12] intercept   : AIC=inf, Time=4.60 sec
 ARIMA(1,0,0)(2,1,0)[12]             : AIC=4

0,1,2,3
Dep. Variable:,y,No. Observations:,365.0
Model:,"SARIMAX(1, 0, 0)x(2, 1, 0, 12)",Log Likelihood,-2071.971
Date:,"Mon, 05 Dec 2022",AIC,4151.941
Time:,18:28:38,BIC,4167.407
Sample:,0,HQIC,4158.095
,- 365,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.5371,0.042,12.887,0.000,0.455,0.619
ar.S.L12,-0.6397,0.049,-12.953,0.000,-0.736,-0.543
ar.S.L24,-0.3156,0.057,-5.551,0.000,-0.427,-0.204
sigma2,7234.6210,499.714,14.478,0.000,6255.200,8214.042

0,1,2,3
Ljung-Box (L1) (Q):,0.02,Jarque-Bera (JB):,6.33
Prob(Q):,0.9,Prob(JB):,0.04
Heteroskedasticity (H):,0.56,Skew:,0.25
Prob(H) (two-sided):,0.0,Kurtosis:,3.42


In [8]:
# Refit the selected specification with statsmodels; the orders are hard-coded to
# SARIMAX(1, 0, 0)x(2, 1, 0, 12), the model reported in the auto_arima summary.
model_2 = SARIMAX(data_2['y'], order = (1, 0, 0), seasonal_order = (2, 1, 0, 12))
final_model_2 = model_2.fit()

# Persist the fitted results. The context manager guarantees the file handle is
# flushed and closed, instead of leaving it open until garbage collection.
with open("final_model_2.pickle", 'wb') as model_file:
    pickle.dump(final_model_2, model_file)

In [9]:
# Remove the Airport 2 working frame from the namespace; the fitted results stay
# available as final_model_2. Re-running the cells above requires rebuilding data_2.
del data_2

## Airport 3

In [10]:
# Prepare the data
# NOTE(review): str() of the unique() array yields the array's repr (brackets and
# quotes included), not the bare airport code — confirm this is intended.
airport_3 = str(a3['ORIGIN_AIRPORT'].unique())
data_3 = a3.loc[:, ['DATE', 'DELAYED_FLIGHTS']]
data_3.columns = ['ds', 'y']
# No explicit format is passed: the DATE values shown in the data preview are
# ISO-style (2015-01-01), which a "%m/%d/%Y" format would fail to parse for string input.
data_3['ds'] = pd.to_datetime(data_3['ds'])

# Fit the model: stepwise AIC search over seasonal ARIMA candidates.
# NOTE(review): m = 12 sets a 12-observation seasonal period, while the series has
# 365 observations and appears to be daily — confirm the intended seasonality
# (e.g. weekly, m = 7). The SARIMAX orders hard-coded in the save cell depend on it.
stepwise_fit = auto_arima(
    data_3['y'],
    start_p = 1, start_q = 1,
    max_p = 3, max_q = 3,
    m = 12,
    start_P = 0,
    seasonal = True,
    d = None, D = 1,
    trace = True,
    error_action = 'ignore',
    suppress_warnings = True,
    stepwise = True,
)

stepwise_fit.summary()


Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=3.96 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=4678.509, Time=0.05 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=4539.376, Time=0.96 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=0.99 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=4676.674, Time=0.03 sec
 ARIMA(1,0,0)(0,1,0)[12] intercept   : AIC=4633.235, Time=0.25 sec
 ARIMA(1,0,0)(2,1,0)[12] intercept   : AIC=4511.045, Time=2.09 sec
 ARIMA(1,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=4.52 sec
 ARIMA(1,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=1.19 sec
 ARIMA(0,0,0)(2,1,0)[12] intercept   : AIC=4568.499, Time=0.91 sec
 ARIMA(2,0,0)(2,1,0)[12] intercept   : AIC=4512.952, Time=1.74 sec
 ARIMA(1,0,1)(2,1,0)[12] intercept   : AIC=4512.811, Time=2.23 sec
 ARIMA(0,0,1)(2,1,0)[12] intercept   : AIC=4513.505, Time=1.06 sec
 ARIMA(2,0,1)(2,1,0)[12] intercept   : AIC=inf, Time=2.87 sec
 ARIMA(1,0,0)(2,1,0)[12]             : AIC=4

0,1,2,3
Dep. Variable:,y,No. Observations:,365.0
Model:,"SARIMAX(1, 0, 0)x(2, 1, 0, 12)",Log Likelihood,-2250.593
Date:,"Mon, 05 Dec 2022",AIC,4509.187
Time:,18:29:17,BIC,4524.652
Sample:,0,HQIC,4515.341
,- 365,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.4006,0.045,8.810,0.000,0.311,0.490
ar.S.L12,-0.6649,0.052,-12.723,0.000,-0.767,-0.562
ar.S.L24,-0.3083,0.054,-5.658,0.000,-0.415,-0.201
sigma2,1.994e+04,1546.738,12.891,0.000,1.69e+04,2.3e+04

0,1,2,3
Ljung-Box (L1) (Q):,0.01,Jarque-Bera (JB):,10.7
Prob(Q):,0.94,Prob(JB):,0.0
Heteroskedasticity (H):,0.93,Skew:,0.43
Prob(H) (two-sided):,0.68,Kurtosis:,2.96


In [11]:
# Refit the selected specification with statsmodels; the orders are hard-coded to
# SARIMAX(1, 0, 0)x(2, 1, 0, 12), the model reported in the auto_arima summary.
model_3 = SARIMAX(data_3['y'], order = (1, 0, 0), seasonal_order = (2, 1, 0, 12))
final_model_3 = model_3.fit()

# Persist the fitted results. The context manager guarantees the file handle is
# flushed and closed, instead of leaving it open until garbage collection.
with open("final_model_3.pickle", 'wb') as model_file:
    pickle.dump(final_model_3, model_file)

In [14]:
# Remove the Airport 3 working frame from the namespace; the fitted results stay
# available as final_model_3. Re-running the cells above requires rebuilding data_3.
del data_3

## Airport 4

In [15]:
# Prepare the data
# NOTE(review): str() of the unique() array yields the array's repr (brackets and
# quotes included), not the bare airport code — confirm this is intended.
airport_4 = str(a4['ORIGIN_AIRPORT'].unique())
data_4 = a4.loc[:, ['DATE', 'DELAYED_FLIGHTS']]
data_4.columns = ['ds', 'y']
# No explicit format is passed: the DATE values shown in the data preview are
# ISO-style (2015-01-01), which a "%m/%d/%Y" format would fail to parse for string input.
data_4['ds'] = pd.to_datetime(data_4['ds'])

# Fit the model: stepwise AIC search over seasonal ARIMA candidates.
# NOTE(review): m = 12 sets a 12-observation seasonal period, while the series has
# 365 observations and appears to be daily — confirm the intended seasonality
# (e.g. weekly, m = 7). The SARIMAX orders hard-coded in the save cell depend on it.
stepwise_fit = auto_arima(
    data_4['y'],
    start_p = 1, start_q = 1,
    max_p = 3, max_q = 3,
    m = 12,
    start_P = 0,
    seasonal = True,
    d = None, D = 1,
    trace = True,
    error_action = 'ignore',
    suppress_warnings = True,
    stepwise = True,
)

stepwise_fit.summary()


Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=2.44 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=4430.122, Time=0.05 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=4275.149, Time=0.95 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=2.20 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=4428.695, Time=0.05 sec
 ARIMA(1,0,0)(0,1,0)[12] intercept   : AIC=4381.770, Time=0.22 sec
 ARIMA(1,0,0)(2,1,0)[12] intercept   : AIC=4250.401, Time=1.72 sec
 ARIMA(1,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=3.22 sec
 ARIMA(1,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=1.10 sec
 ARIMA(0,0,0)(2,1,0)[12] intercept   : AIC=4336.793, Time=1.36 sec
 ARIMA(2,0,0)(2,1,0)[12] intercept   : AIC=4252.181, Time=2.37 sec
 ARIMA(1,0,1)(2,1,0)[12] intercept   : AIC=4250.470, Time=2.96 sec
 ARIMA(0,0,1)(2,1,0)[12] intercept   : AIC=4260.895, Time=1.85 sec
 ARIMA(2,0,1)(2,1,0)[12] intercept   : AIC=inf, Time=4.54 sec
 ARIMA(1,0,0)(2,1,0)[12]             : AIC=4

0,1,2,3
Dep. Variable:,y,No. Observations:,365.0
Model:,"SARIMAX(1, 0, 0)x(0, 1, [1], 12)",Log Likelihood,-2093.611
Date:,"Mon, 05 Dec 2022",AIC,4193.221
Time:,18:30:33,BIC,4204.821
Sample:,0,HQIC,4197.837
,- 365,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.5235,0.045,11.648,0.000,0.435,0.612
ma.S.L12,-0.8820,0.037,-23.635,0.000,-0.955,-0.809
sigma2,7910.8171,637.086,12.417,0.000,6662.151,9159.483

0,1,2,3
Ljung-Box (L1) (Q):,1.15,Jarque-Bera (JB):,20.56
Prob(Q):,0.28,Prob(JB):,0.0
Heteroskedasticity (H):,0.65,Skew:,0.46
Prob(H) (two-sided):,0.02,Kurtosis:,3.75


In [16]:
# Refit the selected specification with statsmodels; the orders are hard-coded to
# SARIMAX(1, 0, 0)x(0, 1, 1, 12), the model reported in the auto_arima summary
# (this airport uses a seasonal MA term rather than seasonal AR terms).
model_4 = SARIMAX(data_4['y'], order = (1, 0, 0), seasonal_order = (0, 1, 1, 12))
final_model_4 = model_4.fit()

# Persist the fitted results. The context manager guarantees the file handle is
# flushed and closed, instead of leaving it open until garbage collection.
with open("final_model_4.pickle", 'wb') as model_file:
    pickle.dump(final_model_4, model_file)

In [17]:
# Remove the Airport 4 working frame from the namespace; the fitted results stay
# available as final_model_4. Re-running the cells above requires rebuilding data_4.
del data_4

## Airport 5

In [18]:
# Prepare the data
# NOTE(review): str() of the unique() array yields the array's repr (brackets and
# quotes included), not the bare airport code — confirm this is intended.
airport_5 = str(a5['ORIGIN_AIRPORT'].unique())
data_5 = a5.loc[:, ['DATE', 'DELAYED_FLIGHTS']]
data_5.columns = ['ds', 'y']
# No explicit format is passed: the DATE values shown in the data preview are
# ISO-style (2015-01-01), which a "%m/%d/%Y" format would fail to parse for string input.
data_5['ds'] = pd.to_datetime(data_5['ds'])

# Fit the model: stepwise AIC search over seasonal ARIMA candidates.
# NOTE(review): m = 12 sets a 12-observation seasonal period, while the series has
# 365 observations and appears to be daily — confirm the intended seasonality
# (e.g. weekly, m = 7). The SARIMAX orders hard-coded in the save cell depend on it.
stepwise_fit = auto_arima(
    data_5['y'],
    start_p = 1, start_q = 1,
    max_p = 3, max_q = 3,
    m = 12,
    start_P = 0,
    seasonal = True,
    d = None, D = 1,
    trace = True,
    error_action = 'ignore',
    suppress_warnings = True,
    stepwise = True,
)

stepwise_fit.summary()


Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=3.04 sec
 ARIMA(0,0,0)(0,1,0)[12] intercept   : AIC=4772.708, Time=0.03 sec
 ARIMA(1,0,0)(1,1,0)[12] intercept   : AIC=4617.305, Time=0.87 sec
 ARIMA(0,0,1)(0,1,1)[12] intercept   : AIC=inf, Time=0.77 sec
 ARIMA(0,0,0)(0,1,0)[12]             : AIC=4770.761, Time=0.02 sec
 ARIMA(1,0,0)(0,1,0)[12] intercept   : AIC=4739.122, Time=0.16 sec
 ARIMA(1,0,0)(2,1,0)[12] intercept   : AIC=4572.238, Time=1.46 sec
 ARIMA(1,0,0)(2,1,1)[12] intercept   : AIC=inf, Time=2.43 sec
 ARIMA(1,0,0)(1,1,1)[12] intercept   : AIC=inf, Time=1.26 sec
 ARIMA(0,0,0)(2,1,0)[12] intercept   : AIC=4634.445, Time=0.85 sec
 ARIMA(2,0,0)(2,1,0)[12] intercept   : AIC=4574.193, Time=1.78 sec
 ARIMA(1,0,1)(2,1,0)[12] intercept   : AIC=4574.002, Time=3.75 sec
 ARIMA(0,0,1)(2,1,0)[12] intercept   : AIC=4573.637, Time=1.12 sec
 ARIMA(2,0,1)(2,1,0)[12] intercept   : AIC=inf, Time=3.97 sec
 ARIMA(1,0,0)(2,1,0)[12]             : AIC=4

0,1,2,3
Dep. Variable:,y,No. Observations:,365.0
Model:,"SARIMAX(1, 0, 0)x(2, 1, 0, 12)",Log Likelihood,-2281.139
Date:,"Mon, 05 Dec 2022",AIC,4570.279
Time:,18:31:32,BIC,4585.745
Sample:,0,HQIC,4576.433
,- 365,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.4221,0.045,9.424,0.000,0.334,0.510
ar.S.L12,-0.7851,0.049,-16.187,0.000,-0.880,-0.690
ar.S.L24,-0.3702,0.052,-7.172,0.000,-0.471,-0.269
sigma2,2.354e+04,1546.660,15.217,0.000,2.05e+04,2.66e+04

0,1,2,3
Ljung-Box (L1) (Q):,0.01,Jarque-Bera (JB):,17.96
Prob(Q):,0.94,Prob(JB):,0.0
Heteroskedasticity (H):,0.85,Skew:,0.45
Prob(H) (two-sided):,0.39,Kurtosis:,3.65


In [19]:
# Refit the selected specification with statsmodels; the orders are hard-coded to
# SARIMAX(1, 0, 0)x(2, 1, 0, 12), the model reported in the auto_arima summary.
model_5 = SARIMAX(data_5['y'], order = (1, 0, 0), seasonal_order = (2, 1, 0, 12))
final_model_5 = model_5.fit()

# Persist the fitted results. The context manager guarantees the file handle is
# flushed and closed, instead of leaving it open until garbage collection.
with open("final_model_5.pickle", 'wb') as model_file:
    pickle.dump(final_model_5, model_file)

In [20]:
# Remove the Airport 5 working frame from the namespace; the fitted results stay
# available as final_model_5. Re-running the cells above requires rebuilding data_5.
del data_5