# Understanding time series forecasting

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Introducing time series

In [None]:
df = pd.read_csv("data/quarterly_earnings.csv")
df['year'] = pd.DatetimeIndex(df['date']).year
df.head()

In [None]:
fig, ax = plt.subplots()

ax.plot(df.date, df.data)
ax.set_xlabel('Date')
ax.set_ylabel('Earnings per share (USD)')

plt.xticks(np.arange(0, 85, 8), [1960, 1962, 1964, 1966, 1968, 1970, 1972, 1974, 1976, 1978, 1980])

fig.autofmt_xdate()
plt.tight_layout()
plt.show()

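**Separating a time series into its components — trend, seasonality, and residuals — is known as `decomposition`**; plotting those components makes the structure of the series easy to inspect.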


In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose, STL

# Decompose the series with STL using a seasonal period of 4 observations
# (the data file is named quarterly_earnings, so presumably one year -- confirm).
advanced_decomposition = STL(df.data, period=4).fit()

# Four stacked panels sharing one x axis: observed, trend, seasonal, residuals.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1, sharex=True)

ax1.plot(advanced_decomposition.observed)
ax1.set_ylabel('Observed')

ax2.plot(advanced_decomposition.trend)
ax2.set_ylabel('Trend')

ax3.plot(advanced_decomposition.seasonal)
ax3.set_ylabel('Seasonal')

ax4.plot(advanced_decomposition.resid)
ax4.set_ylabel('Residuals')


# Place a tick at every 8th observation and label it with every second year, 1960-1980.
plt.xticks(np.arange(0, 85, 8), [1960, 1962, 1964, 1966, 1968, 1970, 1972, 1974, 1976, 1978, 1980])

fig.autofmt_xdate()
plt.tight_layout()
plt.show()

## Bird’s-eye view of time series forecasting

<img src="images/tsf_01.png">

Forecasting project roadmap.
1. **The first step is naturally to set a goal that justifies the need for forecasting.**
2. **Then you must determine what needs to be forecast in order to achieve that goal.**
3. **Then you set the horizon of the forecast.**
4. **Once that’s done, you can gather the data and develop a forecasting model.**
5. **Then the model is deployed to production, its performance is monitored, and new data is collected in order to retrain the forecasting model and make sure it is still relevant.**

> It is important to have a goal when forecasting and to monitor the model once
it’s deployed. This will ensure the success and longevity of the project.

## How time series forecasting is different from other regression tasks

Time series forecasting can be framed as a regression problem, but it differs from
regression in time-independent scenarios in some key ways:
- **Time series have an order**
- **Time series sometimes do not have features**

> Never change the order of a time series when modeling. Shuffling the data is
not allowed.

## Defining a baseline model

- In the context of time series, **one simple statistic we can use to build a baseline is the
arithmetic mean.**

- Another possible baseline is to naively forecast the **last recorded data point**.

- Or, if we see a cyclical pattern in our data, we can **simply repeat that pattern into the
future.**

> A baseline model is a trivial solution to our forecasting problem because it only
uses heuristics, or simple statistics, such as the mean.

## Forecasting the historical mean

In [None]:
df.head()

In [None]:
fig, ax = plt.subplots()

ax.plot(df['date'], df['data'])
ax.set_xlabel('Date')
ax.set_ylabel('Earnings per share (USD)')
ax.axvspan(80, 83, color='#808080', alpha=0.2)

plt.xticks(np.arange(0, 81, 8), [1960, 1962, 1964, 1966, 1968, 1970, 1972, 1974, 1976, 1978, 1980])

fig.autofmt_xdate()
plt.tight_layout()
plt.show()

In [None]:
train = df[:-4].copy()
test = df[-4:].copy()

In [None]:
historical_mean = np.mean(train['data'])
historical_mean

In [None]:
test.loc[:, 'pred_mean'] = historical_mean
test

Next, we need to define and **calculate an error metric in order to evaluate the performance
of our forecasts on the test set.**

In [None]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. Each error is divided by the matching observed
        value, so zeros here produce infinite or NaN results.
    y_pred : array-like
        Forecast values, compared element by element with ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    # Work on plain float arrays so that lists and tuples are accepted and
    # pandas Series are compared by position instead of being aligned on
    # their index labels (misaligned labels would silently turn into NaN).
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - forecast) / actual)) * 100

In [None]:
mape_hist_mean = mape(test['data'], test['pred_mean'])
mape_hist_mean

In [None]:
fig, ax = plt.subplots()

ax.plot(train['date'], train['data'], 'g-.', label='Train')
ax.plot(test['date'], test['data'], 'b-', label='Test')
ax.plot(test['date'], test['pred_mean'], 'r--', label='Predicted')
ax.set_xlabel('Date')
ax.set_ylabel('Earnings per share (USD)')
ax.axvspan(80, 83, color='#808080', alpha=0.2)
ax.legend(loc=2)

plt.xticks(np.arange(0, 85, 8), [1960, 1962, 1964, 1966, 1968, 1970, 1972, 1974, 1976, 1978, 1980])

fig.autofmt_xdate()
plt.tight_layout()
plt.show()

## Forecasting last year’s mean

In [None]:
last_year_mean = np.mean(train['data'][-4:])
last_year_mean

In [None]:
test.loc[:, 'pred__last_yr_mean'] = last_year_mean

test

In [None]:
mape_last_year_mean = mape(test['data'], test['pred__last_yr_mean'])
mape_last_year_mean

In [None]:
fig, ax = plt.subplots()

ax.plot(train['date'], train['data'], 'g-.', label='Train')
ax.plot(test['date'], test['data'], 'b-', label='Test')
ax.plot(test['date'], test['pred__last_yr_mean'], 'r--', label='Predicted')
ax.set_xlabel('Date')
ax.set_ylabel('Earnings per share (USD)')
ax.axvspan(80, 83, color='#808080', alpha=0.2)
ax.legend(loc=2)

plt.xticks(np.arange(0, 85, 8), [1960, 1962, 1964, 1966, 1968, 1970, 1972, 1974, 1976, 1978, 1980])

fig.autofmt_xdate()
plt.tight_layout()
plt.show()

## Predicting using the last known value

In [None]:
last = train['data'].iloc[-1]
last

In [None]:
test.loc[:, 'pred_last'] = last

test

In [None]:
mape_last = mape(test['data'], test['pred_last'])
mape_last

In [None]:
fig, ax = plt.subplots()

ax.plot(train['date'], train['data'], 'g-.', label='Train')
ax.plot(test['date'], test['data'], 'b-', label='Test')
ax.plot(test['date'], test['pred_last'], 'r--', label='Predicted')
ax.set_xlabel('Date')
ax.set_ylabel('Earnings per share (USD)')
ax.axvspan(80, 83, color='#808080', alpha=0.2)
ax.legend(loc=2)

plt.xticks(np.arange(0, 85, 8), [1960, 1962, 1964, 1966, 1968, 1970, 1972, 1974, 1976, 1978, 1980])

fig.autofmt_xdate()
plt.tight_layout()
plt.show()

## Implementing the naive seasonal forecast

In [None]:
test.loc[:, 'pred_last_season'] = train['data'][-4:].values

test

In [None]:
mape_naive_seasonal = mape(test['data'], test['pred_last_season'])
mape_naive_seasonal

In [None]:
fig, ax = plt.subplots()

ax.plot(train['date'], train['data'], 'g-.', label='Train')
ax.plot(test['date'], test['data'], 'b-', label='Test')
ax.plot(test['date'], test['pred_last_season'], 'r--', label='Predicted')
ax.set_xlabel('Date')
ax.set_ylabel('Earnings per share (USD)')
ax.axvspan(80, 83, color='#808080', alpha=0.2)
ax.legend(loc=2)

plt.xticks(np.arange(0, 85, 8), [1960, 1962, 1964, 1966, 1968, 1970, 1972, 1974, 1976, 1978, 1980])

fig.autofmt_xdate()
plt.tight_layout()
plt.show()

**Keep in mind that a baseline model serves as a basis for comparison.**

> Time series forecasting starts with a baseline model that serves as a benchmark
for comparison with more complex models.

## The random walk process

In [None]:
df = pd.read_csv('data/GOOGL.csv')
df.head()

In [None]:
fig, ax = plt.subplots()

ax.plot(df['Date'], df['Close'])
ax.set_xlabel('Date')
ax.set_ylabel('Closing price (USD)')

plt.xticks(
    [4, 24, 46, 68, 89, 110, 132, 152, 174, 193, 212, 235], 
    ['May', 'June', 'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Jan 2021', 'Feb', 'Mar', 'April'])

fig.autofmt_xdate()
plt.tight_layout()
plt.show()

**A random walk is a process in which there is an equal chance of going up or down by a
random number.** This is usually observed in **financial and economic data**.

### Simulating a random walk process

In [None]:
np.random.seed(42)

In [None]:
steps = np.random.standard_normal(1000)

In [None]:
steps[0]=0

In [None]:
random_walk = np.cumsum(steps)

In [None]:
fig, ax = plt.subplots()

ax.plot(random_walk)
ax.set_xlabel('Timesteps')
ax.set_ylabel('Value')

plt.tight_layout()
plt.show()

## Identifying a random walk

In the context of time series, a **random walk is defined
as a series whose first difference is stationary and uncorrelated.**

> **This means that the process moves completely at random.**

<img src="images/tsf_02.png">

### Stationarity

A stationary time series is one whose **statistical properties do not change over time**. In
other words, it **has a constant mean, variance, and autocorrelation, and these properties
are independent of time**.

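> The augmented Dickey-Fuller (ADF) test helps us determine if a time series is stationary
by testing for the presence of a unit root. If a unit root is present, the time series
is not stationary. The null hypothesis of the test is that a unit root is present, so a
p-value below the chosen significance level (commonly 0.05), together with a large negative
ADF statistic, lets us reject the null hypothesis and treat the series as stationary.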

In [None]:
def simulate_process(is_stationary: bool, n_steps: int = 400, seed: int = 42) -> np.ndarray:
    """Simulate the recursion ``y[t + 1] = alpha * y[t] + noise``.

    Parameters
    ----------
    is_stationary : bool
        If True, use ``alpha = 0.5`` and start the series at 0.
        If False, use ``alpha = 1`` and start the series at 10.
    n_steps : int, default 400
        Number of values in the returned series.
    seed : int, default 42
        Seed passed to ``np.random.seed`` before any noise is drawn.

    Returns
    -------
    np.ndarray
        One-dimensional float array of length ``n_steps``.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")

    # Re-seed the global NumPy generator on every call so that both variants
    # are driven by exactly the same noise sequence. Note that this resets the
    # global random state as a side effect.
    np.random.seed(seed)

    alpha, start = (0.5, 0) if is_stationary else (1, 10)

    process = np.empty(n_steps)
    process[0] = start

    # Each value is the previous one scaled by alpha plus a standard normal shock.
    for i in range(1, n_steps):
        process[i] = alpha * process[i - 1] + np.random.standard_normal()

    return process

stationary = simulate_process(True)
non_stationary = simulate_process(False)

fig, ax = plt.subplots()

ax.plot(stationary, linestyle='-', label='stationary')
ax.plot(non_stationary, linestyle='--', label='non-stationary')
ax.set_xlabel('Timesteps')
ax.set_ylabel('Value')
ax.legend(loc=2)

plt.tight_layout()
plt.show()

In [None]:
import warnings
warnings.filterwarnings('ignore')

def mean_over_time(process: np.ndarray) -> np.ndarray:
    """Return the running mean of ``process``.

    Element ``i`` of the result is the mean of ``process[:i + 1]``, i.e. of
    every value observed up to and including step ``i``. The window always
    contains at least one value, so the first element is ``process[0]``
    rather than the NaN produced by averaging an empty slice.

    Parameters
    ----------
    process : array-like
        One-dimensional sequence of numeric values.

    Returns
    -------
    np.ndarray
        Float array with the same length as ``process``.
    """
    values = np.asarray(process, dtype=float)
    # Cumulative sum divided by the number of points seen so far.
    counts = np.arange(1, len(values) + 1)
    return np.cumsum(values) / counts

stationary_mean = mean_over_time(stationary)
non_stationary_mean = mean_over_time(non_stationary)

fig, ax = plt.subplots()

ax.plot(stationary_mean, label='stationary')
ax.plot(non_stationary_mean, linestyle='--', label='non-stationary')
ax.set_xlabel('Timesteps')
ax.set_ylabel('Mean')
ax.legend(loc=1)

plt.tight_layout()
plt.show()

In [None]:
def var_over_time(process: np.ndarray) -> np.ndarray:
    """Return the running (population) variance of ``process``.

    Element ``i`` of the result is ``np.var(process[:i + 1])``, i.e. the
    variance of every value observed up to and including step ``i``. The
    window always contains at least one value, so the first element is 0
    rather than the NaN produced by taking the variance of an empty slice.

    Parameters
    ----------
    process : array-like
        One-dimensional sequence of numeric values.

    Returns
    -------
    np.ndarray
        Float array with the same length as ``process``.
    """
    values = np.asarray(process, dtype=float)
    # Recomputing np.var on each growing prefix is quadratic, but it keeps
    # every value numerically identical to a direct np.var call.
    return np.array([np.var(values[:i + 1]) for i in range(len(values))], dtype=float)

stationary_var = var_over_time(stationary)
non_stationary_var = var_over_time(non_stationary)

fig, ax = plt.subplots()

ax.plot(stationary_var, label='stationary')
ax.plot(non_stationary_var, linestyle='--', label='non-stationary')
ax.set_xlabel('Timesteps')
ax.set_ylabel('Variance')
ax.legend(loc=2)

plt.tight_layout()
plt.show()

### The autocorrelation function

**We know that correlation measures the extent of a linear relationship between two
variables.**

**Autocorrelation therefore measures the linear relationship between lagged
values of a time series.** Thus, the ACF reveals how the correlation between any two values
changes as the lag increases. Here, the lag is simply the number of timesteps separating
two values.

### Putting it all together

In [None]:
fig, ax = plt.subplots()

ax.plot(random_walk)
ax.set_xlabel('Timesteps')
ax.set_ylabel('Value')

plt.tight_layout()
plt.show()

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
ADF_result = adfuller(random_walk)

print(f'ADF Statistic: {ADF_result[0]}')
print(f'p-value: {ADF_result[1]}')

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

plot_acf(random_walk, lags=20);
plt.tight_layout()
plt.show()

In [None]:
diff_random_walk = np.diff(random_walk, n=1)

In [None]:
plt.plot(diff_random_walk)
plt.title('Differenced Random Walk')
plt.xlabel('Timesteps')
plt.ylabel('Value')
plt.tight_layout()
plt.show()

In [None]:
ADF_result = adfuller(diff_random_walk)

print(f'ADF Statistic: {ADF_result[0]}')
print(f'p-value: {ADF_result[1]}')

In [None]:
plot_acf(diff_random_walk, lags=20);

plt.tight_layout()
plt.show()

**We have demonstrated that our simulated data is indeed a random walk: the series
is stationary and uncorrelated after a first-order differencing, which corresponds to
the definition of a random walk.**

## Is GOOGL a random walk?

In [None]:
df = pd.read_csv('data/GOOGL.csv')

In [None]:
fig, ax = plt.subplots()
ax.plot(df['Date'], df['Close'])
ax.set_xlabel('Date')
ax.set_ylabel('Closing price (USD)')
plt.xticks([4, 24, 46, 68, 89, 110, 132, 152, 174, 193, 212, 235],
['May', 'June', 'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 2021, 'Feb', 'Mar', 'April'])
fig.autofmt_xdate()
plt.tight_layout()
plt.show()

In [None]:
GOOGL_ADF_result = adfuller(df['Close'])

print(f'ADF Statistic: {GOOGL_ADF_result[0]}')
print(f'p-value: {GOOGL_ADF_result[1]}')

In [None]:
diff_close = np.diff(df['Close'], n=1)

In [None]:
GOOGL_diff_ADF_result = adfuller(diff_close)

print(f'ADF Statistic: {GOOGL_diff_ADF_result[0]}')
print(f'p-value: {GOOGL_diff_ADF_result[1]}')

In [None]:
plot_acf(diff_close, lags=20);

## Forecasting a random walk

**Since the values change randomly, no statistical learning model can be applied.**

### Forecasting on a long horizon

In [None]:
df = pd.DataFrame({'value': random_walk})

# Hold out everything from step 800 onwards as the test set. The slices are
# copied so that forecast columns can later be added to `test` without
# writing into a view of `df` (which raises SettingWithCopyWarning).
train = df[:800].copy()
test = df[800:].copy()

fig, ax = plt.subplots()

ax.plot(random_walk)
ax.set_xlabel('Timesteps')
ax.set_ylabel('Value')
# Shade the forecast horizon (the held-out test period).
ax.axvspan(800, 1000, color='#808080', alpha=0.2)

plt.tight_layout()

In [None]:
mean = np.mean(train.value)
test.loc[:, 'pred_mean'] = mean
test.head()

In [None]:
last_value = train.iloc[-1].value
test.loc[:, 'pred_last'] = last_value
test.head()

In [None]:
# Drift method: extend the straight line that joins the first and the last
# training observations into the forecast horizon.
n_train = len(train)
first_value = train['value'].iloc[0]

deltaX = n_train - 1                # number of steps between first and last training points
deltaY = last_value - first_value   # total change over the training set

drift = deltaY / deltaX

# 0-based positions of the test observations. Using the same origin as the
# training data makes the line pass exactly through the last training point,
# so the first forecast is one drift step beyond it (not two).
x_vals = np.arange(n_train, n_train + len(test))

pred_drift = first_value + drift * x_vals

test.loc[:, 'pred_drift'] = pred_drift

test.head()

In [None]:
fig, ax = plt.subplots()

ax.plot(train.value, 'b-')
ax.plot(test['value'], 'b-')
ax.plot(test['pred_mean'], 'r-.', label='Mean')
ax.plot(test['pred_last'], 'g--', label='Last value')
ax.plot(test['pred_drift'], 'k:', label='Drift')

ax.axvspan(800, 1000, color='#808080', alpha=0.2)
ax.legend(loc=2)

ax.set_xlabel('Timesteps')
ax.set_ylabel('Value')

plt.tight_layout()

In [None]:
from sklearn.metrics import mean_squared_error

mse_mean = mean_squared_error(test['value'], test['pred_mean'])
mse_last = mean_squared_error(test['value'], test['pred_last'])
mse_drift = mean_squared_error(test['value'], test['pred_drift'])

print(mse_mean, mse_last, mse_drift)

### Forecasting the next timestep

In [None]:
df_shift = df.shift(periods=1)

df_shift.head()

In [None]:
fig, ax = plt.subplots()

ax.plot(df, 'b-', label='actual')
ax.plot(df_shift, 'r-.', label='forecast')

ax.legend(loc=2)

ax.set_xlabel('Timesteps')
ax.set_ylabel('Value')

plt.tight_layout()

In [None]:
mse_one_step = mean_squared_error(test['value'], df_shift[800:])

mse_one_step

In [None]:
fig, ax = plt.subplots()

ax.plot(df, 'b-', label='actual')
ax.plot(df_shift, 'r-.', label='forecast')

ax.legend(loc=2)

ax.set_xlim(900, 1000)
ax.set_ylim(15, 28)

ax.set_xlabel('Timesteps')
ax.set_ylabel('Value')

plt.tight_layout()

Because a **random process takes random steps into the future, we cannot use statistical
or deep learning techniques to fit such a process: there is nothing to learn from
randomness and it cannot be predicted. Instead, we must rely on naive forecasting
methods.**

> Ultimately, you cannot predict random
movements.

**We cannot use statistical or deep learning techniques on a random walk, since it
moves at random in the future. Therefore, we must use naive forecasts.**

## Forecast the daily closing price of GOOGL

In [None]:
df = pd.read_csv('data/GOOGL.csv')

# Keep the last 5 rows as the test set. The slices are copied so that forecast
# columns can be added to `googl_test` without writing into a view of `df`.
googl_train = df[['Date', 'Close']][:-5].copy()
googl_test = df[['Date', 'Close']][-5:].copy()

# Only the last expression of a cell is displayed, so the preview goes last.
df.head()

In [None]:
# Forecast the historical mean
mean = np.mean(googl_train['Close'])
googl_test.loc[:, 'pred_mean'] = mean

# Forecast using the last known value
last_value = googl_train['Close'].iloc[-1]
googl_test.loc[:, 'pred_last'] = last_value

# Forecast using the drift: extend the straight line that joins the first and
# the last training observations.
n_train = len(googl_train)
first_value = googl_train['Close'].iloc[0]

deltaX = n_train - 1                # number of steps between first and last training points
deltaY = last_value - first_value   # subtract the initial value of the training set

drift = deltaY / deltaX

# 0-based positions of the test rows, derived from the data instead of hard-coded.
x_vals = np.arange(n_train, n_train + len(googl_test))

# Add the initial value back so the line starts at the first training point.
pred_drift = drift * x_vals + first_value

googl_test.loc[:, 'pred_drift'] = pred_drift

googl_test

In [None]:
googl_mse_mean = mean_squared_error(googl_test['Close'], googl_test['pred_mean'])
googl_mse_last = mean_squared_error(googl_test['Close'], googl_test['pred_last'])
googl_mse_drift = mean_squared_error(googl_test['Close'], googl_test['pred_drift'])

print(googl_mse_mean, googl_mse_last, googl_mse_drift)

In [None]:
fig, ax = plt.subplots()

ax.plot(googl_train['Close'], 'b-')
ax.plot(googl_test['Close'], 'b-')
ax.plot(googl_test['pred_mean'], 'r-.', label='Mean')
ax.plot(googl_test['pred_last'], 'g--', label='Last value')
ax.plot(googl_test['pred_drift'], 'k:', label='Drift')

ax.axvspan(248, 252, color='#808080', alpha=0.2)
ax.legend(loc=2)

ax.set_xlabel('Timesteps')
ax.set_ylabel('Value')

plt.xlim(230, 252)
plt.tight_layout()

In [None]:
# Forecast the next timestep
df_shift = df.shift(periods=1)

mse_one_step = mean_squared_error(googl_test['Close'], df_shift['Close'].iloc[248:])

mse_one_step

In [None]:
fig, ax = plt.subplots()

ax.plot(df['Close'], 'b-', label='actual')
ax.plot(df_shift['Close'].iloc[248:], 'r-.', label='forecast')

ax.axvspan(248, 252, color='#808080', alpha=0.2)

ax.legend(loc='best')

ax.set_xlabel('Timesteps')
ax.set_ylabel('Value')

plt.xlim(240, 252)
plt.ylim(2200, 2400)
plt.tight_layout()