# A Step-by-Step Guide to Calculating Autocorrelation and Partial Autocorrelation

## Setup

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.tsatools import lagmat

import matplotlib.pyplot as plt

# settings
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and the
# old names were removed in later releases, so use whichever name is installed.
SEABORN_STYLE = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(SEABORN_STYLE)
# default size (width, height in inches) for every figure in the notebook
plt.rcParams["figure.figsize"] = (16, 8)

In [None]:
# Ask the inline backend for "retina" (high-resolution) figure output.
%config InlineBackend.figure_format = "retina"

## Data

In [None]:
# Read the CSV using its first column as the index, then parse that index
# into datetimes so the data behaves as a time series.
df = pd.read_csv("../data/air_passengers.csv", index_col=0)
df = df.set_axis(pd.to_datetime(df.index), axis="index")

# `y` is the single series analysed in the rest of the notebook.
y = df.loc[:, "#Passengers"]
y

In [None]:
# Two stacked panels: ACF on top, PACF (regression-based) below.
fig, ax = plt.subplots(2, 1)

# Pass the series `y`, consistent with the acf/pacf calls made on `y`
# elsewhere in the notebook.
plot_acf(y, ax=ax[0])
plot_pacf(y, ax=ax[1], method="ols")

# plot_acf/plot_pacf return the Figure; leaving that as the cell's last
# expression makes the notebook render the same plot twice, so show it
# explicitly instead.
plt.show()

## Autocorrelation Function (ACF)

In [None]:
# Reference values: statsmodels' autocorrelations of `y` for lags 0 through 10.
acf(y, nlags=10)

### Replicating it the wrong way

In [None]:
# One column per lag (0 through 10): the series shifted down by `lag` rows,
# which leaves NaN in the first `lag` rows of each column.
acf_df = pd.DataFrame({f"y_lag_{lag}": y.shift(lag) for lag in range(0, 11)})

acf_df

In [None]:
# Pearson correlation between the unshifted series and each lagged copy.
# Rows containing NaN are excluded pair by pair, so every lag is normalised
# with the moments of its own truncated sample rather than the full series.
acf_df.corr().loc[:, "y_lag_0"].to_numpy()

### Replicating it the right way

In [None]:
# benchmark: statsmodels' acf of `y` for lags 0 through 10, the values the
# manual computation is compared against
acf(y, nlags=10)

In [None]:
# replicating the acf function
# For each lag, the cross-product of the centered series with its lagged copy
# is divided by the full-sample sum of squared deviations; the full-sample
# mean `mu` is used at every lag.
mu = y.mean()

# Loop-invariant pieces, computed once instead of once per lag.
y_centered = y - mu
total_sum_of_squares = sum(y_centered ** 2)

# Lags 0 through 10; `.iloc[lag:]` drops the rows where the shifted copy is NaN.
acf_list = [
    np.dot(y_centered.iloc[lag:], y_centered.shift(lag).iloc[lag:]) / total_sum_of_squares
    for lag in range(0, 11)
]

np.array(acf_list)

In [None]:
# alternative way to write the same using sum instead of the dot product
mu = y.mean()

# Loop-invariant pieces, computed once instead of once per lag.
y_centered = y - mu
total_sum_of_squares = sum(y_centered ** 2)

# Lags 0 through 10; the element-wise product is summed explicitly, and
# `.iloc[lag:]` drops the rows where the shifted copy is NaN.
acf_list = [
    sum(y_centered.iloc[lag:] * y_centered.shift(lag).iloc[lag:]) / total_sum_of_squares
    for lag in range(0, 11)
]

np.array(acf_list)

## Partial Autocorrelation Function (PACF)

In [None]:
# Reference values: statsmodels' regression-based PACF for lags 0 through 10,
# computed on the series `y` like the other acf/pacf calls in this notebook.
pacf(y, nlags=10, method="ols")

In [None]:
N_LAGS = 10

# the partial autocorrelation at lag 0 is 1 by definition
pacf_list = [1]

# Lag matrix of `y`: column j holds the series lagged by j + 1 periods.
# NOTE(review): with lagmat's default trimming the first rows are expected to
# be zero-padded, which is why the loop below skips the first k rows.
X = pd.DataFrame(lagmat(y, N_LAGS))
# Derive the labels from N_LAGS so the cell keeps working if N_LAGS is changed.
X.columns = [f"lag_{lag + 1}" for lag in range(N_LAGS)]

for k in range(1, N_LAGS + 1):
    # Regress y on its first k lags (LinearRegression fits an intercept by
    # default), leaving out the first k observations; the coefficient on the
    # k-th lag is the partial autocorrelation at lag k.
    fitted_model = LinearRegression().fit(X.iloc[k:, :k],
                                          y.iloc[k:])
    pacf_list.append(fitted_model.coef_[-1])

np.array(pacf_list)

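Below you can see how the PACF values change when more lags are included using the inefficient method (`method="ols-inefficient"`). That method estimates all coefficients on a single common sample, so the values for the earlier lags shift as `nlags` grows.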

In [None]:
# PACF up to lag 2 with the "ols-inefficient" estimator.
pacf(y, nlags=2, method="ols-inefficient")

In [None]:
# PACF up to lag 4 with the "ols-inefficient" estimator.
pacf(y, nlags=4, method="ols-inefficient")

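For comparison's sake, we run the same two calls to the `pacf` function using the efficient method (`method="ols"`). Here the values for the lags shared by both calls should not change when more lags are requested.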

In [None]:
# PACF up to lag 2 with the "ols" estimator.
pacf(y, nlags=2, method="ols")

In [None]:
# PACF up to lag 4 with the "ols" estimator.
pacf(y, nlags=4, method="ols")