<a href="https://colab.research.google.com/github/youssefHosni/Time-Series-With-Python/blob/main/Arima%20Models%20in%20Python/ARIMA_Models_In%C2%A0Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# import the important libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Folder containing the CSV datasets read throughout this notebook. The path is
# relative, so it is resolved against the notebook's working directory.
DATA_DIRECTORY='../data/forecasting'

# 1. ARMA Models

## 1.1. Introduction to stationarity

In [None]:
# Read the candy production series, using the parsed 'date' column as the index
candy = pd.read_csv(f'{DATA_DIRECTORY}/candy_production.csv', parse_dates=True, index_col='date')

# Switch matplotlib to the fivethirtyeight style (applies to later figures too)
plt.style.use('fivethirtyeight')

# Draw the whole series on a dedicated 12x10 inch axis and label it
fig, ax1 = plt.subplots(figsize=(12, 10))
candy.plot(ax=ax1)
ax1.set_title('Monthly production of candy in US')
ax1.set_xlabel('Date')
ax1.set_ylabel('Production')
plt.show()


In [None]:
# Everything up to and including 2006 is used for training; 2007 onwards is
# held out as the test set
candy_train = candy.loc[:'2006']
candy_test = candy.loc['2007':]

# Overlay both partitions on one 12x10 inch axis so the split point is visible
fig, ax = plt.subplots(figsize=(12, 10))
candy_train.plot(ax=ax)
candy_test.plot(ax=ax)
ax.set_title('train - test split of the monthly production of candy in US')
ax.set_xlabel('Date')
ax.set_ylabel('Production')
plt.show()

## 1.2. Making a time series stationary

In [None]:
from statsmodels.tsa.stattools import adfuller
# Augmented Dickey-Fuller test on the raw candy series. The whole result tuple
# is printed; its first two entries are the test statistic and the p-value.
results = adfuller(candy)
print(results)

In [None]:
# First-difference the series and discard the NaN rows that differencing creates
candy_diff = candy.diff().dropna()

# Repeat the Augmented Dickey-Fuller test on the differenced data and print
# the full result tuple
result_diff = adfuller(candy_diff)
print(result_diff)


In [None]:
# Load the earthquake dataset with the parsed 'date' column as the index
earthquake = pd.read_csv('{}/earthquakes.csv'.format(DATA_DIRECTORY), index_col='date', parse_dates=True)

In [None]:
# Take the natural log of the data (a plain log transform, not a log-return)
# and discard any rows that contain NaN afterwards
earthquake_log = np.log(earthquake).dropna()

# Augmented Dickey-Fuller test on the transformed column; the result tuple is
# left as the last expression so the notebook displays it
result_log = adfuller(earthquake_log['earthquakes_per_year'])
result_log

## 1.3. Introduction to AR, MA and ARMA models

In [None]:
from statsmodels.tsa.arima_process import arma_generate_sample

# Seed NumPy's global random generator, which arma_generate_sample draws its
# noise from by default. Without this, `y` (and every model fitted to it in
# the following cells) would change on each Restart & Run All.
np.random.seed(42)

# Lag-polynomial coefficients, each starting with the zero-lag term (the 1).
# NOTE(review): statsmodels takes the AR polynomial with the sign of the AR
# parameters flipped, so [1, -0.5] should mean an AR(1) coefficient of +0.5 —
# confirm against the arma_generate_sample documentation.
ar_coefs = [1, -0.5]
ma_coefs = [1, 0.2]

# Simulate 100 observations with the noise standard deviation scaled to 0.5
y = arma_generate_sample(ar_coefs, ma_coefs, nsample=100, scale=0.5)

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Set up an ARMA(1,1) model (d=0) for the simulated sample
model = ARIMA(endog=y, order=(1, 0, 1))

# Estimate the model parameters
results = model.fit()


# 2. Fitting the Future

## 2.1. Fitting time series models


In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Set up the same ARMA(1,1) model for the simulated sample
model = ARIMA(endog=y, order=(1, 0, 1))

# Estimate the parameters, then display the fit summary as the cell output
results = model.fit()
results.summary()

## 2.2. Forecasting 

In [None]:
# Load the Amazon closing-price data with the parsed 'date' column as the
# index, then plot it with the pandas defaults
amazon = pd.read_csv('{}/amazon_close.csv'.format(DATA_DIRECTORY), index_col='date', parse_dates=True)
amazon.plot()

In [None]:
# Augmented Dickey-Fuller test on the raw prices; prints the full result tuple
# (test statistic first, p-value second)
results = adfuller(amazon)
print(results)

In [None]:
# Set up ARIMA(1,1,1) on the raw prices; with d=1 the model differences the
# data internally
model = ARIMA(endog=amazon, order=(1, 1, 1))

# Estimate the parameters
results = model.fit()

# Display the fit summary as the cell output
results.summary()

In [None]:
# In-sample predictions from the start of the data up to observation 20
# (non-dynamic, i.e. one step ahead)
one_step_forecast = results.get_prediction(end=20)

# Point estimates of the predictions
mean_forecast = one_step_forecast.predicted_mean

# Interval bounds of the predictions, split into their two columns
confidence_intervals = one_step_forecast.conf_int()
lower_limits = confidence_intervals['lower close']
upper_limits = confidence_intervals['upper close']

# Show the first five point estimates
mean_forecast.head(5)

In [None]:
# Drop the first in-sample prediction.
# NOTE(review): presumably it is discarded because a d=1 model has no earlier
# observation to build on for the very first point — confirm.
#
# The slice is taken from `one_step_forecast` rather than from `mean_forecast`
# itself, so re-running this cell no longer removes one more point each time.
# The confidence bounds are trimmed in the same way so the shaded band in the
# plot below stays aligned with the forecast line.
mean_forecast = one_step_forecast.predicted_mean.iloc[1:]
confidence_intervals = one_step_forecast.conf_int().iloc[1:]
lower_limits = confidence_intervals.loc[:, 'lower close']
upper_limits = confidence_intervals.loc[:, 'upper close']

In [None]:
# Show the first five remaining point forecasts
mean_forecast.head(5)

In [None]:
# Observed prices
plt.plot(amazon.index, amazon, label='observed')

# Point forecasts drawn in red on top of the observations
plt.plot(mean_forecast.index, mean_forecast, label='forecast', color='r')

# Pink band spanning the lower and upper confidence limits
plt.fill_between(x=lower_limits.index, y1=lower_limits, y2=upper_limits, color='pink')

# Axis labels and legend, then render the figure
plt.xlabel('Date')
plt.ylabel('Amazon Stock Price - Close USD')
plt.legend()
plt.show()

In [None]:
# Dynamic in-sample predictions from the start of the data up to observation 20
dynamic_forecast = results.get_prediction(end=20, dynamic=True)

# Point estimates of the predictions
mean_forecast = dynamic_forecast.predicted_mean

# Interval bounds of the predictions, split into their two columns
confidence_intervals = dynamic_forecast.conf_int()
lower_limits = confidence_intervals['lower close']
upper_limits = confidence_intervals['upper close']

# Show the first five point estimates
mean_forecast.head(5)

In [None]:
# Observed prices
plt.plot(amazon.index, amazon, label='observed')

# Dynamic point forecasts drawn in red on top of the observations
plt.plot(mean_forecast.index, mean_forecast, label='forecast', color='r')

# Pink band spanning the lower and upper confidence limits
plt.fill_between(x=lower_limits.index, y1=lower_limits, y2=upper_limits, color='pink')

# Axis labels and legend, then render the figure
plt.xlabel('Date')
plt.ylabel('Amazon Stock Price - Close USD')
plt.legend()
plt.show()

## 2.3. ARIMA models for non-stationary time series

In [None]:
# Reload the Amazon closing prices so this section starts from the raw data
amazon = pd.read_csv(f'{DATA_DIRECTORY}/amazon_close.csv', parse_dates=True, index_col='date')

# Plot with the pandas defaults, then add a title and a y-axis label
amazon.plot()
plt.title(label='Amazon stock price change with time')
plt.ylabel(ylabel='Stock price')

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the raw Amazon prices
result = adfuller(amazon)

# Report the test statistic (first entry of the result tuple), 4 decimals
print('The test statistic:', round(result[0], 4))

# Report the p-value (second entry of the result tuple), 4 decimals
print("The p-value:", round(result[1], 4))


The data is not stationary, so we have to transform it (here, by differencing) before modelling.

In [None]:
# First-difference the prices and drop the NaN rows created by differencing.
# A single chained expression replaces the in-place dropna.
amazon_diff = amazon.diff().dropna()

# Augmented Dickey-Fuller test on the differenced prices
result = adfuller(amazon_diff)

# Report the test statistic (first entry of the result tuple), 4 decimals
print('The test statistic:', round(result[0], 4))

# Report the p-value (second entry of the result tuple), 4 decimals
print("The p-value:", round(result[1], 4))

The p-value is less than 0.05, so we reject the null hypothesis of non-stationarity: the differenced data is stationary.

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# ARMA(1,1) on the manually differenced prices, hence d=0 in the order
model = ARIMA(endog=amazon_diff, order=(1, 0, 1))

# Estimate the parameters and display the fit summary as the cell output
results = model.fit()
results.summary()

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# ARIMA(1,1,1) on the raw prices; with d=1 the differencing is done by the model
model = ARIMA(endog=amazon, order=(1, 1, 1))

# Estimate the parameters and display the fit summary as the cell output
results = model.fit()
results.summary()

# 3. Finding the Best Models

## 3.1. Introduction to ACF and PACF

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation of the earthquake column for lags 1-10 (lag 0 is hidden)
plot_acf(x=earthquake['earthquakes_per_year'], zero=False, lags=10)
plt.show()

In [None]:
# Partial autocorrelation of the earthquake column for lags 1-10 (lag 0 is hidden)
plot_pacf(x=earthquake['earthquakes_per_year'], zero=False, lags=10)
plt.show()

## 3.2. Intro to AIC and BIC

In [None]:
import statsmodels.api as sm

# Collects one (p, q, AIC, BIC) tuple per successfully fitted model
order_aic_bic = []

# Try every ARMA(p, q) with p and q in 0..2
for p in range(3):
    for q in range(3):
        try:
            # Create and fit the ARMA(p, q) model (d=0)
            model = sm.tsa.statespace.SARIMAX(earthquake['earthquakes_per_year'], order=(p, 0, q))
            results = model.fit()
        except Exception as err:
            # Catch only ordinary errors so that interrupting the kernel still
            # stops the loop; report the failed order together with its cause.
            print(p, q, None, None, repr(err))
        else:
            # Record the scores, then print a separator after this fit's output
            order_aic_bic.append((p, q, results.aic, results.bic))
            print("==================================================\n")

In [None]:
# Turn the collected (p, q, AIC, BIC) tuples into a table, one row per fitted
# model order
order_df = pd.DataFrame(order_aic_bic, columns=['p', 'q', 'aic','bic'])

Let's sort them by AIC and BIC

Models sorted by AIC

In [None]:
# Lowest AIC first, with a fresh 0..n-1 row index
order_df.sort_values(by='aic', ignore_index=True)

Models sorted by BIC

In [None]:
# Lowest BIC first, with a fresh 0..n-1 row index
order_df.sort_values(by='bic', ignore_index=True)

## 3.3. The model diagnostic

In [None]:
# Refit with the p and q chosen in the previous step: ARMA(1,1)
model = sm.tsa.statespace.SARIMAX(endog=earthquake['earthquakes_per_year'], order=(1, 0, 1))
results = model.fit()

# Keep the residuals of the fit for the diagnostics below
residuals = results.resid

In [None]:
# Display the residuals of the fitted model
residuals

In [None]:
# Mean absolute error: the average magnitude of the residuals, printed to
# four decimals
mae = np.abs(residuals).mean()
print(f"MAE: {round(mae, 4)}")

In [None]:
# Create the 4 diagnostics plots for the fitted model on a 10x10 inch figure
results.plot_diagnostics(figsize=(10,10))
plt.show()

In [None]:
# Summary statistics of the fitted model, displayed as the cell output
results.summary()

## 3.4. The Box-Jenkins method

## Identification 

In [None]:
# Load the CO2 dataset with the parsed 'date' column as the index, then plot
# it with the pandas defaults
co2 = pd.read_csv(f'{DATA_DIRECTORY}/co2.csv', parse_dates=True, index_col='date')
co2.plot()

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the raw CO2 series
result = adfuller(co2)

# Report the test statistic (first entry of the result tuple), 4 decimals
print('The test statistic:', round(result[0], 4))

# Report the p-value (second entry of the result tuple), 4 decimals
print("The p-value:", round(result[1], 4))

In [None]:
# First-difference the CO2 series, drop the NaN rows created by differencing,
# and plot the result
co2_diff = co2.diff().dropna()
co2_diff.plot()

# Augmented Dickey-Fuller test on the differenced series
result_diff = adfuller(co2_diff)

# Report the test statistic (first entry of the result tuple), 4 decimals
print('The test statistic:', round(result_diff[0], 4))

# Report the p-value (second entry of the result tuple), 4 decimals
print("The p-value:", round(result_diff[1], 4))

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Two stacked panels on a 12x8 inch figure
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))

# Top panel: ACF of the differenced CO2 series, lags 1-20 (lag 0 hidden)
plot_acf(co2_diff, ax=ax1, zero=False, lags=20)

# Bottom panel: PACF of the same series over the same lags
plot_pacf(co2_diff, ax=ax2, zero=False, lags=20)

plt.show()

## Estimation

In [None]:
import statsmodels.api as sm

# Collects one (p, q, AIC, BIC) tuple per successfully fitted model
order_aic_bic = []

# Try every ARIMA(p, 1, q) with p and q in 0..4
for p in range(5):
    for q in range(5):
        try:
            # Create and fit ARIMA(p, 1, q) on the raw series; the model does
            # the first differencing itself (d=1)
            model = sm.tsa.statespace.SARIMAX(co2, order=(p, 1, q))
            results = model.fit()
        except Exception as err:
            # Catch only ordinary errors so that interrupting the kernel still
            # stops the loop; report the failed order together with its cause.
            print(p, q, None, None, repr(err))
        else:
            # Record the scores, then print a separator after this fit's output
            order_aic_bic.append((p, q, results.aic, results.bic))
            print("====================================================")

In [None]:
# Turn the collected (p, q, AIC, BIC) tuples into a table, one row per fitted
# model order
order_df = pd.DataFrame(order_aic_bic, columns=['p', 'q', 'aic','bic'])

Models sorted by AIC

In [None]:
# Lowest AIC first, with a fresh 0..n-1 row index
order_df.sort_values(by='aic', ignore_index=True)

Models sorted by BIC

In [None]:
# Lowest BIC first, with a fresh 0..n-1 row index
order_df.sort_values(by='bic', ignore_index=True)

In [None]:
# Fit ARIMA(4,1,4) with a constant trend term ('c') to the CO2 series
model = sm.tsa.statespace.SARIMAX(endog=co2, order=(4, 1, 4), trend='c')
results = model.fit()

# Draw the four diagnostic plots on a 10x10 inch figure
results.plot_diagnostics(figsize=(10, 10))
plt.show()

# Display the fit summary as the cell output
results.summary()

# 4. Seasonal ARIMA Models

## 4.1. Seasonal time series

In [None]:
# Reload the candy production data, using the parsed 'date' column as the index
candy = pd.read_csv(f'{DATA_DIRECTORY}/candy_production.csv', parse_dates=True, index_col='date')

# Draw the whole series on a dedicated 12x10 inch axis and label it
fig, ax1 = plt.subplots(figsize=(12, 10))
candy.plot(ax=ax1)
ax1.set_title('Monthly production of candy in US')
ax1.set_xlabel('Date')
ax1.set_ylabel('Production')
plt.show()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Decompose the candy series assuming a cycle of 12 observations
decomp_results = seasonal_decompose(candy, period=12)

# Raise the default figure size to 10x15 inches (this setting stays in effect
# for later figures as well), then draw the decomposition
plt.rcParams.update({"figure.figsize": (10, 15)})
decomp_results.plot()
plt.show()

In [None]:
# Detrend by subtracting a 5-step rolling mean and drop the NaN rows that the
# rolling window leaves at the start. The result gets its own name so `candy`
# keeps the raw data and re-running this cell does not detrend the series a
# second time.
# NOTE(review): a 5-step window is shorter than the period of 12 used for the
# decomposition above — confirm that this window length is intended.
candy_detrended = (candy - candy.rolling(5).mean()).dropna()

# Identify seasonality from the ACF of the detrended series

# One 8x4 inch axis for the plot
fig, ax = plt.subplots(1, 1, figsize=(8, 4))

# ACF for lags 1-25 (lag 0 hidden)
plot_acf(candy_detrended, ax=ax, lags=25, zero=False)
plt.show()

## 4.2. Seasonal ARIMA model

In [None]:
# Reload the candy production data so this section starts from the raw series
candy = pd.read_csv(f'{DATA_DIRECTORY}/candy_production.csv', parse_dates=True, index_col='date')

# Draw the whole series on a dedicated 12x10 inch axis and label it
fig, ax1 = plt.subplots(figsize=(12, 10))
candy.plot(ax=ax1)
ax1.set_title('Monthly production of candy in US')
ax1.set_xlabel('Date')
ax1.set_ylabel('Production')
plt.show()

In [None]:
# Seasonal difference: subtract the value S = 12 observations earlier.
# The next cell reassigns `candy_diff`, so this result is only used for the
# plot below.
S = 12
candy_diff = candy.diff(periods=S)
candy_diff.plot()

In [None]:
# Ordinary first difference (lag 1), with the NaN rows created by differencing
# removed; this replaces the seasonal difference stored in `candy_diff`
candy_diff = candy.diff().dropna()
candy_diff.plot()

In [None]:
# Augmented Dickey-Fuller test on the first-differenced candy series
result = adfuller(candy_diff)

# Report the test statistic (first entry of the result tuple), 4 decimals
print('The test statistic:', round(result[0], 4))

# Report the p-value (second entry of the result tuple), 4 decimals
print("The p-value:", round(result[1], 4))


In [None]:
# Look for the non-seasonal model parameters in the ACF and PACF
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Two stacked panels on a 12x8 inch figure
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))

# Top panel: ACF of the differenced candy series, lags 1-20 (lag 0 hidden)
plot_acf(candy_diff, ax=ax1, zero=False, lags=20)

# Bottom panel: PACF of the same series over the same lags
plot_pacf(candy_diff, ax=ax2, zero=False, lags=20)

plt.show()

We cannot read the non-seasonal orders (p, q) off the ACF and PACF plots, so we will select them using AIC and BIC instead.

In [None]:
import statsmodels.api as sm

# Collects one (p, q, AIC, BIC) tuple per successfully fitted model
order_aic_bic = []

# Try every ARMA(p, q) with p and q in 0..4
for p in range(5):
    for q in range(5):
        try:
            # `candy_diff` is already first-differenced, so fit a plain
            # ARMA(p, q) with d=0. Passing d=1 here would difference the data
            # a second time and score a different model than the one intended.
            model = sm.tsa.statespace.SARIMAX(candy_diff, order=(p, 0, q))
            results = model.fit()
        except Exception as err:
            # Catch only ordinary errors so that interrupting the kernel still
            # stops the loop; report the failed order together with its cause.
            print(p, q, None, None, repr(err))
        else:
            # Record the scores, then print a separator after this fit's output
            order_aic_bic.append((p, q, results.aic, results.bic))
            print("======================================================")

In [None]:
# Turn the collected (p, q, AIC, BIC) tuples into a table, one row per fitted
# model order
order_df = pd.DataFrame(order_aic_bic, columns=['p', 'q', 'aic','bic'])

Models sorted by AIC

In [None]:
# Lowest AIC first, with a fresh 0..n-1 row index
order_df.sort_values(by='aic', ignore_index=True)

Models sorted by BIC

In [None]:
# Lowest BIC first, with a fresh 0..n-1 row index
order_df.sort_values(by='bic', ignore_index=True)

The best parameters to use are (p, q) = (4, 3).

In [None]:
# Seasonal ACF and PACF: look only at multiples of the 12-step period
seasonal_lags = [12, 24, 36, 48, 60, 72, 84, 96]

# Two stacked panels using the current default figure size
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1)

# Top panel: ACF at the seasonal lags
plot_acf(candy_diff, ax=ax1, lags=seasonal_lags)

# Bottom panel: PACF at the same seasonal lags
plot_pacf(candy_diff, ax=ax2, lags=seasonal_lags)
plt.show()

The ACF tails off and the PACF cuts off after a lag of 3.

In [None]:
# Fit a seasonal ARIMA model to the candy series
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Non-seasonal orders (p, d, q)
p, d, q = 4, 1, 3

# Seasonal orders (P, D, Q) and the seasonal period S
# NOTE(review): the note above this cell says the seasonal ACF tails off and
# the PACF cuts off, which usually points to a seasonal AR term, yet the
# values here are P=0 and Q=3 — confirm which one is intended.
P, D, Q, S = 0, 1, 3, 12

# Build the model from both sets of orders and estimate its parameters
model = SARIMAX(candy, order=(p, d, q), seasonal_order=(P, D, Q, S))
results = model.fit()

## 4.3. Automation and saving

In [None]:
# Search over model orders
# (pmdarima is a third-party package and must be installed in the kernel's
# environment for this import to succeed)
import pmdarima as pm

# Run the automatic order search with pmdarima's default settings and display
# the selected model as the cell output
results = pm.auto_arima(candy)
results

In [None]:
# Fit summary of the model selected by auto_arima
results.summary()

In [None]:
# Diagnostic plots of the model selected by auto_arima
results.plot_diagnostics()

In [None]:
# Settings for the seasonal order search
search_options = dict(
    seasonal=True,                # treat the series as seasonal
    m=12,                         # seasonal period
    D=1,                          # seasonal difference order
    start_P=1,                    # first value of P to try
    start_Q=1,                    # first value of Q to try
    max_P=4,                      # largest P to try
    max_Q=4,                      # largest Q to try
    information_criterion='aic',  # score used to pick the best model
    trace=True,                   # print each candidate while searching
    error_action='ignore',        # skip orders that fail to fit
    stepwise=True,
)

# Run the search on the candy series with those settings
results = pm.auto_arima(candy, **search_options)

# Print the selected model and its summary, then draw the diagnostic plots
print(results)
print(results.summary())
results.plot_diagnostics()

In [None]:
# Save the fitted model to disk and read it back with joblib
import joblib

# One relative path, used for both writing and reading
filepath = 'model.pkl'

# Write the model object to the file
joblib.dump(results, filepath)

# Read the model object back from the same file.
# NOTE: joblib.load unpickles the file, so only load files you created or trust.
loaded_model = joblib.load(filepath)