<a href="https://colab.research.google.com/github/youssefHosni/Time-Series-With-Python/blob/main/Time_Series_Data%C2%A0Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
# Install a blanket "ignore" filter so that no warnings are shown in later cell output.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Folder (relative path) from which the pd.read_csv calls below build their CSV file paths.
DATA_DIRECTORY='../data/analysis'

# 1. Correlation and Autocorrelation

## 1.1. Correlation of Two Time Series

In [None]:
# Load the table, using the parsed 'Date' column as the index
levels_csv = '{}/DJI.csv'.format(DATA_DIRECTORY)
levels = pd.read_csv(levels_csv, index_col='Date', parse_dates=['Date'])

# Row-over-row percent change of every column
changes = levels.pct_change()

# Correlation of DJI with UFO: first on the raw levels, then on the percent changes
correlation1 = round(levels['DJI'].corr(levels['UFO']), 4)
correlation2 = round(changes['DJI'].corr(changes['UFO']), 4)

print("Correlation [levels]: ", correlation1)
print("Correlation [percent changes]: ", correlation2)

In [None]:
# Plot the columns of `levels` against the Date index using the pandas default plot.
levels.plot()

## 1.2. Simple Linear Regression

In [None]:
import statsmodels.api as sm

# Load the price table (DATE as datetime index) and drop every row that has a missing value
data = pd.read_csv(
    '{}/price.csv'.format(DATA_DIRECTORY), index_col='DATE', parse_dates=['DATE']
).dropna()
x, y = data['SP500'], data['Oil']

# Correlation between the two columns
correlation = round(x.corr(y), 4)
print("The correlation between x and y is %4.2f" % correlation)

# Regressor matrix: the x values as a single column named 'x' ...
dfx = pd.DataFrame({'x': x.values})
# ... plus a constant column for the intercept
dfx1 = sm.add_constant(dfx)

# Ordinary least squares of y on [const, x]
result = sm.OLS(y.values, dfx1).fit()

# Show the fit summary; compare its R-squared with the correlation printed above
result.summary()

## 1.3. Autocorrelation

## 1.4. Autocorrelation Function 

In [None]:
# Load HRB.csv; the 'Quarter' column is parsed as dates and becomes the index.
HRB = pd.read_csv('{}/HRB.csv'.format(DATA_DIRECTORY), parse_dates=['Quarter'], index_col='Quarter')

In [None]:
# Import the acf function (numeric values) and the plot_acf function (figure) from statsmodels
from statsmodels.tsa.stattools import acf
from statsmodels.graphics.tsaplots import plot_acf

# Compute the autocorrelation values of HRB with acf's default settings
acf_array = acf(HRB)
# Uncomment to inspect the numeric autocorrelation values:
#print(acf_array)

# Plot the autocorrelation function; alpha=0.05 requests the 95% confidence band
fig = plot_acf(HRB, alpha=0.05)
# Enlarge the figure that plot_acf returned
fig.set_size_inches(18.5, 10.5)

plt.show()

# 2. Simple Time Series

## 2.1. White Noise 

In [None]:
import numpy as np

# Fixed seed so that "Restart Kernel -> Run All" reproduces exactly the same
# white-noise sample (and hence the same plots of it).
NOISE_SEED = 42

# 500 independent draws from a standard normal distribution (mean 0, std 1).
# A dedicated Generator is used so the global NumPy random state is left untouched.
noise = np.random.default_rng(NOISE_SEED).normal(loc=0, scale=1, size=500)

In [None]:
# Line plot of the raw noise values
plt.plot(noise)
plt.show()
# Autocorrelation plot of the noise, up to lag 50
plot_acf(noise, lags=50)
plt.show()

## 2.2. Random Walk

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Reload the price table (DATE as datetime index), discard rows with missing values,
# and pick out the S&P 500 column
data = pd.read_csv(
    '{}/price.csv'.format(DATA_DIRECTORY), index_col='DATE', parse_dates=['DATE']
).dropna()
SPX = data['SP500']

# Augmented Dickey-Fuller unit root test.
# Documentation: https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.adfuller.html
results = adfuller(SPX)

# Element 1 of the adfuller result is used as the p-value, rounded to 4 decimals
pvalue = round(results[1], 4)

In [None]:
# State the null hypothesis of the unit-root test, then the decision rule together with
# the rounded p-value computed in the previous cell.
print("We test H0: There is a unit root in an AR model, which implies that the data series is not stationary.")
print("We reject the H0 if p-value less than 0.05. p-value={}".format(pvalue))

## 2.3. Stationarity

In [None]:
# Transforming Nonstationary Series Into Stationary Series:
# S&P 500 prices (top panel) versus their first difference (bottom panel).
fig, axs = plt.subplots(2)
fig.suptitle('Transforming Nonstationary Series Into Stationary Series')
fig.set_size_inches(10, 10.5)

# Top: the price series as loaded
axs[0].plot(SPX)
axs[0].set_title('S&P 500 prices')

# Bottom: change from one observation to the next (the first value is NaN and is not drawn)
axs[1].plot(SPX.diff())
axs[1].set_title('S&P 500 prices first difference')

# Render explicitly, like the other plotting cells, instead of echoing the last Text object
plt.show()

In [None]:
# Transforming Nonstationary Series Into Stationary Series:
# H&R Block series (top panel) versus its lag-4 difference (bottom panel).
fig, axs = plt.subplots(2)
fig.suptitle('Transforming Nonstationary Series Into Stationary Series')
fig.set_size_inches(10, 10.5)

# Top: the series as loaded
axs[0].plot(HRB)
axs[0].set_title('Quarterly earnings for H&R Block')

# Bottom: each value minus the value four rows earlier
# (presumably the same quarter of the previous year, given quarterly rows - confirm against HRB.csv)
axs[1].plot(HRB.diff(4))
axs[1].set_title('Quarterly earnings for H&R Block seasonal difference')

In [None]:
# Transforming Nonstationary Series Into Stationary Series:
# raw series -> log -> lag-4 difference of the log, one panel each.
AMZN = pd.read_csv('{}/AMZN.csv'.format(DATA_DIRECTORY), index_col='Date', parse_dates=['Date'])
log_AMZN = np.log(AMZN)  # computed once, used by the two lower panels

fig, axs = plt.subplots(3)
fig.set_size_inches(10, 10.5)
fig.suptitle('Transforming Nonstationary Series Into Stationary Series')
top_ax, middle_ax, bottom_ax = axs

# Panel 1: series as loaded
top_ax.plot(AMZN)
top_ax.set_title('Amazon quarterly revenue')

# Panel 2: natural log of the series
middle_ax.plot(log_AMZN)
middle_ax.set_title('Log of Amazon quarterly revenue')

# Panel 3: log series minus its value four rows earlier
bottom_ax.plot(log_AMZN.diff(4))
bottom_ax.set_title('Seasonal difference of Amazon quarterly revenue log')

# 3. Autoregressive (AR) Models

In [None]:
from statsmodels.tsa.arima_process import ArmaProcess
# ArmaProcess takes lag-polynomial coefficients with the zero-lag 1 first, so the AR
# coefficient is entered with its sign flipped: [1, -0.9] is an AR(1) with Phi = +0.9.
ar = np.array([1, -0.9])
# No moving-average part: only the zero-lag coefficient
ma = np.array([1])
AR_object = ArmaProcess(ar, ma)
# Draw 1000 observations from the process; `simulated_data` is fitted in section 3.2
simulated_data = AR_object.generate_sample(nsample=1000)
plt.plot(simulated_data)

In [None]:
# import the module for simulating data
from statsmodels.tsa.arima_process import ArmaProcess

# ArmaProcess takes lag-polynomial coefficients with the zero-lag 1 first, so an AR(1)
# parameter Phi is entered with its sign flipped: Phi = +0.9 -> [1, -0.9].
# Panels 1-4 use Phi = +0.9, -0.9, +0.5, -0.5 respectively.
ar1, ar2, ar3, ar4 = [np.array([1, -phi]) for phi in (0.9, -0.9, 0.5, -0.5)]

# No moving-average part in any of the four processes: only the zero-lag coefficient.
ma1, ma2, ma3, ma4 = [np.array([1]) for _ in range(4)]

AR_object1, AR_object2, AR_object3, AR_object4 = [
    ArmaProcess(ar_coefs, ma_coefs)
    for ar_coefs, ma_coefs in zip((ar1, ar2, ar3, ar4), (ma1, ma2, ma3, ma4))
]

# Draw 1000 observations per process, in panel order (1 -> 4).
# simulated_data_1 .. simulated_data_4 are plotted again by the ACF cell that follows.
simulated_data_1, simulated_data_2, simulated_data_3, simulated_data_4 = [
    process.generate_sample(nsample=1000)
    for process in (AR_object1, AR_object2, AR_object3, AR_object4)
]

panel_samples = (simulated_data_1, simulated_data_2, simulated_data_3, simulated_data_4)
panel_titles = (
    'Simulated data with Phi = +0.9 ',
    'Simulated data with Phi = -0.9 ',
    'Simulated data with Phi = +0.5',
    'Simulated data with Phi = -0.5',
)

fig, axs = plt.subplots(4)
fig.set_size_inches(12, 14.5)
fig.suptitle('Simulated data with different AR parameters')

# One stacked panel per AR parameter
for panel_ax, panel_sample, panel_title in zip(axs, panel_samples, panel_titles):
    panel_ax.plot(panel_sample)
    panel_ax.set_title(panel_title)

# Render explicitly, like the other plotting cells
plt.show()

In [None]:
# Import the plot_acf module from statsmodels
from statsmodels.graphics.tsaplots import plot_acf

fig, axs = plt.subplots(2,2,figsize=(15,10))
fig.suptitle('Autocorrelation functions for different AR parameters')

# (target axes, simulated sample, panel title) for the four AR parameters
acf_panels = (
    (axs[0,0], simulated_data_1, 'Autocorrelation function for Phi = +0.9'),
    (axs[0,1], simulated_data_2, 'Autocorrelation function for Phi = -0.9'),
    (axs[1,0], simulated_data_3, 'Autocorrelation function for Phi = +0.5'),
    (axs[1,1], simulated_data_4, 'Autocorrelation function for Phi = -0.5'),
)

for acf_ax, acf_sample, acf_title in acf_panels:
    # alpha=1 is kept from the original cell.
    # NOTE(review): presumably chosen to suppress the confidence band; statsmodels
    # documents alpha=None for that purpose - confirm before changing.
    plot_acf(acf_sample, alpha=1, lags=20, ax=acf_ax, title=acf_title)

# plot_acf returns the Figure; leaving such a call as the cell's last expression makes
# Jupyter render the figure a second time. Ending with plt.show() avoids that.
plt.show()


## 3.2. Estimating & Forecasting AR Models

In [None]:
from statsmodels.tsa.arima.model import ARIMA
# order=(1,0,0): one autoregressive lag, no differencing, no moving-average lag.
# `simulated_data` is the single-series sample generated at the start of section 3
# (when the notebook is run top to bottom).
mod = ARIMA(simulated_data, order=(1,0,0))
result = mod.fit()
# Display the estimation summary table
result.summary()

In [None]:
# Print the fitted parameter values held in result.params (from the fit in the previous cell).
print("Fitted parameters (see const, ar.L1, and sigma2 coefficients above): {}".format(result.params))

## 3.3. Choosing the Right Model

We simulate data from known AR processes so that we can later check which model fits best.
Because the data are simulated, we know beforehand which model order should be selected.

In [None]:
# Import the modules for simulating data and for plotting the PACF
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.graphics.tsaplots import plot_pacf

# NOTE: this cell rebinds simulated_data_1 and simulated_data_2, which an earlier cell
# used for the Phi = +0.9 / -0.9 samples; from here on they hold the samples below.

# Simulate AR(1) with phi=+0.6
# (AR coefficients are entered with flipped sign after the leading 1)
ma = np.array([1])
ar = np.array([1, -0.6])
AR_object = ArmaProcess(ar, ma)
simulated_data_1 = AR_object.generate_sample(nsample=5000)

# Plot PACF for AR(1)
plot_pacf(simulated_data_1, lags=20)
plt.show()

# Simulate AR(2) with phi1=+0.6, phi2=+0.3
ma = np.array([1])
ar = np.array([1, -0.6, -0.3])
AR_object = ArmaProcess(ar, ma)
simulated_data_2 = AR_object.generate_sample(nsample=5000)

# Plot PACF for AR(2)
plot_pacf(simulated_data_2, lags=20)
plt.show()

In [None]:
# Fit a model with two autoregressive lags (no differencing, no MA lag) to simulated_data_2
mod = ARIMA(simulated_data_2, order=(2,0,0))
result = mod.fit()
# Display the estimation summary table
result.summary()

## Estimating model goodness with the Bayesian Information Criterion
We fit AR models of increasing order and look for the order with the lowest Bayesian Information Criterion (BIC).

In [None]:
# Import the module for estimating an ARMA model
from statsmodels.tsa.arima.model import ARIMA

# BIC[p] will hold the BIC of an AR(p) fit to simulated_data_1, for p = 0,...,6
BIC = np.zeros(7)
for p in range(len(BIC)):
    mod = ARIMA(simulated_data_1, order=(p, 0, 0))
    res = mod.fit()
    BIC[p] = res.bic

# Draw BIC against the AR order for p = 1,...,6 (the p = 0 entry is stored but not drawn)
orders = range(1, 7)
plt.plot(orders, BIC[1:], marker='o')
plt.xlabel('Order of AR Model')
plt.ylabel('Bayesian Information Criterion')
plt.show()

In [None]:
# Import the module for estimating an ARMA model
from statsmodels.tsa.arima.model import ARIMA

# BIC[p] will hold the BIC of an AR(p) fit to simulated_data_2, for p = 0,...,6
BIC = np.zeros(7)
for p in range(len(BIC)):
    mod = ARIMA(simulated_data_2, order=(p, 0, 0))
    res = mod.fit()
    BIC[p] = res.bic

# Draw BIC against the AR order for p = 1,...,6 (the p = 0 entry is stored but not drawn)
orders = range(1, 7)
plt.plot(orders, BIC[1:], marker='o')
plt.xlabel('Order of AR Model')
plt.ylabel('Bayesian Information Criterion')
plt.show()

# 4. Moving Average and ARMA Models

## 4.1. Describe Model

In [None]:
from statsmodels.tsa.arima_process import ArmaProcess
# No autoregressive part: only the zero-lag coefficient
ar = np.array([1])
# Moving-average lag polynomial: zero-lag 1 followed by a first-lag coefficient of 0.5
ma = np.array([1, 0.5])
# The name AR_object is reused from the AR section; here the process has only an MA part
AR_object = ArmaProcess(ar, ma)
# Draw 1000 observations; this rebinds `simulated_data`, which is fitted in section 4.2
simulated_data = AR_object.generate_sample(nsample=1000)
plt.plot(simulated_data)

## 4.2. Estimation and Forecasting of MA Model

In [None]:
from statsmodels.tsa.arima.model import ARIMA
# order=(0,0,1): no autoregressive lag, no differencing, one moving-average lag
mod = ARIMA(simulated_data, order=(0,0,1))
result = mod.fit()
# Display the estimation summary table
result.summary()