In [32]:
# Data processing
import pandas as pd
import numpy as np
# Prophet model for time series forecast
from prophet import Prophet
# Visulaization
import plotly.graph_objects as go
import plotly.express as px
# Hyperparameter tuning
import itertools
from prophet.diagnostics import cross_validation, performance_metrics

from sklearn.metrics import (mean_squared_error, mean_absolute_error)#,mean_absolute_percentage_error)

# Prophet Models

In [2]:
# read data
df = pd.read_parquet("forecast_data.parquet")

airports = list(df['ORIGIN_AIRPORT'].unique())
a1 = df[df['ORIGIN_AIRPORT'] == airports[0]].reset_index(drop=True) # .drop('ORIGIN_AIRPORT', axis = 1)
a2 = df[df['ORIGIN_AIRPORT'] == airports[1]].reset_index(drop=True)
a3 = df[df['ORIGIN_AIRPORT'] == airports[2]].reset_index(drop=True)
a4 = df[df['ORIGIN_AIRPORT'] == airports[3]].reset_index(drop=True)
a5 = df[df['ORIGIN_AIRPORT'] == airports[4]].reset_index(drop=True)

a1.head()

Unnamed: 0,DATE,ORIGIN_AIRPORT,DELAYED_FLIGHTS
0,2015-01-01,LAX,236
1,2015-01-02,LAX,409
2,2015-01-03,LAX,437
3,2015-01-04,LAX,467
4,2015-01-05,LAX,400


In [6]:
# Preparamos los datos
airport_1 = str(a1['ORIGIN_AIRPORT'].unique())
data_1 = a1.loc[:,['DATE', 'DELAYED_FLIGHTS']]
data_1.columns = ['ds','y'] 
data_1['ds'] = pd.to_datetime(data_1['ds'],format = "%m/%d/%Y")

# Ajustamos el modelo al conjunto de datos
train = data_1.iloc[:len(data_1)-31]
test = data_1.iloc[len(data_1)-31:]

16:43:52 - cmdstanpy - INFO - Chain [1] start processing
16:43:52 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x224db5cce80>

## Basic Model

In [12]:
# Modelamos
# Initiate the model
baseline_model = Prophet()
# Fit the model on the training dataset
baseline_model.fit(train)    

16:58:54 - cmdstanpy - INFO - Chain [1] start processing
16:58:54 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x224df3c54c0>

In [13]:
# Prediction
future = baseline_model.make_future_dataframe(periods=31)
future.tail()

Unnamed: 0,ds
360,2015-12-27
361,2015-12-28
362,2015-12-29
363,2015-12-30
364,2015-12-31


In [17]:
# Forecast
forecast = baseline_model.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
360,2015-12-27,159.974589,81.02337,238.040999
361,2015-12-28,161.805919,86.115036,234.438343
362,2015-12-29,117.717436,41.975805,198.906615
363,2015-12-30,132.231098,57.057309,207.745556
364,2015-12-31,168.372344,92.220334,239.249897


In [25]:
# Predictions
test['predictions'] = forecast["yhat"]

# Prediction vs Actual values representation
fig = go.Figure()

fig.add_trace(go.Scatter(x=test['ds'], y=test['y'], name = "actual", line_color = px.colors.qualitative.Vivid[5]))
fig.add_trace(go.Scatter(x=test['ds'], y=test['predictions'], name = "predictions",line_color = px.colors.qualitative.Vivid[3]))

fig.update_layout(
    title="ARIMA Model. December predictions vs. Actual values",
    xaxis_title="Dates",
    yaxis_title="Number of delays",
    legend_title="Leyend",
    template="plotly_dark",
    #color_discrete_sequence=px.colors.qualitative.Vivid,
    hovermode="x unified",    
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [33]:
# Check the MSE value
performance_arima_MSE = mean_squared_error(test['y'],test['predictions'])
print(f'The MSE for the baseline model is {performance_arima_MSE}')

# Check the MAE value
performance_arima_MAE = mean_absolute_error(test['y'],test['predictions'])
print(f'The MAE for the baseline model is {performance_arima_MAE}')

# Check the MAPE value
# performance_arima_MAPE = mean_absolute_percentage_error(test['y'],test['predictions'])
# print(f'The MAPE for the baseline model is {performance_arima_MAPE}')

The MSE for the baseline model is 15370.29392552304
The MAE for the baseline model is 92.75484653551476


## Automatic Hyperparameter Tuning with Holidays

Tras varias pruebas automatizadas hemos cogido un grupo de parámetros cercanos a los que daban un mejor resultado, para acotar más la búsqueda del modelo ideal

In [63]:
# Set up parameter grid
param_grid = {  
    'changepoint_prior_scale': [0.03, 0.03, 0.04, 0.05],
    'seasonality_prior_scale': [7, 8, 9, 10, 11],
    'seasonality_mode': ['additive', 'multiplicative'],
    'holidays_prior_scale': [0.005,0.01,0.02]
}
# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
# Create a list to store MAPE values for each combination
mapes = [] 
# Use cross validation to evaluate all parameters
for params in all_params:
    # Fit a model using one parameter combination
    m = Prophet(**params,growth='linear')
    # add holidays
    m.add_country_holidays(country_name='US') 
    # fit data
    m.fit(data_1) 
    # Cross-validation
    df_cv = cross_validation(m, initial='333 days', period='31 days', horizon = '31 days', parallel="processes")
    # Model performance
    df_p = performance_metrics(df_cv, rolling_window=1)
    # Save model performance metrics
    mapes.append(df_p['mape'].values[0])
    
# Tuning results
tuning_results = pd.DataFrame(all_params)
tuning_results['mape'] = mapes
# Find the best parameters
best_params = all_params[np.argmin(mapes)]
print(best_params)

18:17:53 - cmdstanpy - INFO - Chain [1] start processing
18:17:53 - cmdstanpy - INFO - Chain [1] done processing
18:17:57 - cmdstanpy - INFO - Chain [1] start processing
18:17:57 - cmdstanpy - INFO - Chain [1] done processing
18:18:00 - cmdstanpy - INFO - Chain [1] start processing
18:18:00 - cmdstanpy - INFO - Chain [1] done processing
18:18:03 - cmdstanpy - INFO - Chain [1] start processing
18:18:03 - cmdstanpy - INFO - Chain [1] done processing
18:18:06 - cmdstanpy - INFO - Chain [1] start processing
18:18:06 - cmdstanpy - INFO - Chain [1] done processing
18:18:10 - cmdstanpy - INFO - Chain [1] start processing
18:18:10 - cmdstanpy - INFO - Chain [1] done processing
18:18:13 - cmdstanpy - INFO - Chain [1] start processing
18:18:13 - cmdstanpy - INFO - Chain [1] done processing
18:18:16 - cmdstanpy - INFO - Chain [1] start processing
18:18:17 - cmdstanpy - INFO - Chain [1] done processing
18:18:20 - cmdstanpy - INFO - Chain [1] start processing
18:18:20 - cmdstanpy - INFO - Chain [1]

{'changepoint_prior_scale': 0.03, 'seasonality_prior_scale': 10, 'seasonality_mode': 'additive', 'holidays_prior_scale': 0.01}


In [64]:
# Fit the model using the best parameters
auto_model = Prophet(changepoint_prior_scale=best_params['changepoint_prior_scale'], 
                     seasonality_prior_scale=best_params['seasonality_prior_scale'], 
                     seasonality_mode=best_params['seasonality_mode'])
# Fit the model on the training dataset
auto_model.add_country_holidays(country_name='US').fit(data_1)
# Cross validation
auto_model_cv = cross_validation(auto_model, initial='333 days', period='31 days', horizon = '31 days', parallel="processes")
# Model performance metrics
auto_model_p = performance_metrics(auto_model_cv, rolling_window=1)
auto_model_p['mape'].values[0]

18:27:49 - cmdstanpy - INFO - Chain [1] start processing
18:27:49 - cmdstanpy - INFO - Chain [1] done processing


0.306843591902117

In [79]:
# Predictions
test['predictions'] = auto_model_cv["yhat"].values

# Prediction vs Actual values representation
fig = go.Figure()

fig.add_trace(go.Scatter(x=test['ds'], y=test['y'], name = "actual", line_color = px.colors.qualitative.Vivid[5]))
fig.add_trace(go.Scatter(x=test['ds'], y=test['predictions'], name = "predictions",line_color = px.colors.qualitative.Vivid[3]))

fig.update_layout(
    title="ARIMA Model. December predictions vs. Actual values",
    xaxis_title="Dates",
    yaxis_title="Number of delays",
    legend_title="Leyend",
    template="plotly_dark",
    #color_discrete_sequence=px.colors.qualitative.Vivid,
    hovermode="x unified",    
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [80]:
# Check the MSE value
performance_arima_MSE = mean_squared_error(test['y'],test['predictions'])
print(f'The MSE for the baseline model is {performance_arima_MSE}')

# Check the MAE value
performance_arima_MAE = mean_absolute_error(test['y'],test['predictions'])
print(f'The MAE for the baseline model is {performance_arima_MAE}')

# Check the MAPE value
# performance_arima_MAPE = mean_absolute_percentage_error(test['y'],test['predictions'])
# print(f'The MAPE for the baseline model is {performance_arima_MAPE}')

The MSE for the baseline model is 13351.450463065948
The MAE for the baseline model is 84.21782484814962


Este modelo es el mejor de prophet obtenido hasta ahora, aunque no es capaz de predecir la época de altibajos en vuelos navideños.

## Automatic Hyperparameter Tuning with Holidays in Log Scale

The prophet model documentation[2] mentioned some hyperparameters are best tuned in log scale.

In [66]:
# Create a copy of the data
data_log = data_1.copy()
# Create the log scale data by taking the natual log of the stock prices.
data_log['y_log'] = np.log(data_1['y'])
# Delete the stock price and rename the log scale stock price to y
data_log = data_log.drop('y', axis=1).rename(columns={'y_log': 'y'})
# Take a look at the data
data_log.head()

Unnamed: 0,ds,y
0,2015-01-01,5.463832
1,2015-01-02,6.013715
2,2015-01-03,6.079933
3,2015-01-04,6.146329
4,2015-01-05,5.991465


In [67]:
# Parameter grid
param_grid = {  
    'changepoint_prior_scale': [0.03, 0.03, 0.04, 0.05],
    'seasonality_prior_scale': [7, 8, 9, 10, 11],
    'seasonality_mode': ['additive', 'multiplicative'],
    'holidays_prior_scale': [0.005,0.01,0.02]
}
# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
mapes = []  # Store the MAPEs for each params here
# Use cross validation to evaluate all parameters
for params in all_params:
    # Fit a model using one parameter combination
    m = Prophet(**params, growth='linear').add_country_holidays(country_name='US').fit(data_log)
    # Cross-validation
    df_cv = cross_validation(m, initial='333 days', period='31 days', horizon = '31 days', parallel="processes")
    # Model performance
    df_p = performance_metrics(df_cv, rolling_window=1)
    # Save model performance metrics
    mapes.append(df_p['mape'].values[0])
# Tuning results
best_params = all_params[np.argmin(mapes)]
# Best parameters
print(best_params)
# Train model using best parameters
auto_model_log = Prophet(changepoint_prior_scale=best_params['changepoint_prior_scale'], 
                     seasonality_prior_scale=best_params['seasonality_prior_scale'], 
                     seasonality_mode=best_params['seasonality_mode'])
# Fit the model on the training dataset
auto_model_log.fit(data_log)
# Cross validation
auto_model_log_cv = cross_validation(auto_model_log, initial='333 days', period='31 days', horizon = '31 days', parallel="processes")
# Model performance metrics
auto_model_log_p = performance_metrics(auto_model_log_cv, rolling_window=1)
auto_model_log_p['mape'].values[0]

18:35:06 - cmdstanpy - INFO - Chain [1] start processing
18:35:06 - cmdstanpy - INFO - Chain [1] done processing
18:35:10 - cmdstanpy - INFO - Chain [1] start processing
18:35:10 - cmdstanpy - INFO - Chain [1] done processing
18:35:13 - cmdstanpy - INFO - Chain [1] start processing
18:35:13 - cmdstanpy - INFO - Chain [1] done processing
18:35:17 - cmdstanpy - INFO - Chain [1] start processing
18:35:17 - cmdstanpy - INFO - Chain [1] done processing
18:35:20 - cmdstanpy - INFO - Chain [1] start processing
18:35:20 - cmdstanpy - INFO - Chain [1] done processing
18:35:23 - cmdstanpy - INFO - Chain [1] start processing
18:35:23 - cmdstanpy - INFO - Chain [1] done processing
18:35:27 - cmdstanpy - INFO - Chain [1] start processing
18:35:27 - cmdstanpy - INFO - Chain [1] done processing
18:35:30 - cmdstanpy - INFO - Chain [1] start processing
18:35:30 - cmdstanpy - INFO - Chain [1] done processing
18:35:34 - cmdstanpy - INFO - Chain [1] start processing
18:35:34 - cmdstanpy - INFO - Chain [1]

{'changepoint_prior_scale': 0.03, 'seasonality_prior_scale': 7, 'seasonality_mode': 'additive', 'holidays_prior_scale': 0.02}


0.0883266223256827

In [77]:
# Predictions
test['predictions'] = np.exp(auto_model_log_cv["yhat"]).values

# Prediction vs Actual values representation
fig = go.Figure()

fig.add_trace(go.Scatter(x=test['ds'], y=test['y'], name = "actual", line_color = px.colors.qualitative.Vivid[5]))
fig.add_trace(go.Scatter(x=test['ds'], y=test['predictions'], name = "predictions",line_color = px.colors.qualitative.Vivid[3]))

fig.update_layout(
    title="ARIMA Model. December predictions vs. Actual values",
    xaxis_title="Dates",
    yaxis_title="Number of delays",
    legend_title="Leyend",
    template="plotly_dark",
    #color_discrete_sequence=px.colors.qualitative.Vivid,
    hovermode="x unified",    
)

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [78]:
# Check the MSE value
performance_arima_MSE = mean_squared_error(test['y'],test['predictions'])
print(f'The MSE for the baseline model is {performance_arima_MSE}')

# Check the MAE value
performance_arima_MAE = mean_absolute_error(test['y'],test['predictions'])
print(f'The MAE for the baseline model is {performance_arima_MAE}')

# Check the MAPE value
# performance_arima_MAPE = mean_absolute_percentage_error(test['y'],test['predictions'])
# print(f'The MAPE for the baseline model is {performance_arima_MAPE}')

The MSE for the baseline model is 16443.76549849607
The MAE for the baseline model is 99.16965331562102


El modelo logarítmico no es bueno

# FIN