In [None]:
# %%capture
# !pip install pmdarima # optuna

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as skl
import statsmodels.api as sm
from tqdm.auto import tqdm, trange
from statsmodels.tsa.arima.model import ARIMAResults, ARIMA
# import optuna
# import pmdarima as pm

from windpred import read_file, find_sequences, predict, pretrain, update, trial, train_model

# Use matplotlib for DataFrame.plot / Series.plot calls.
pd.options.plotting.backend = "matplotlib"
# Data directory. Defaults to the original Colab Drive mount; set the
# WINDPRED_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
path = os.environ.get(
    'WINDPRED_DATA_DIR',
    '/content/drive/MyDrive/Job-Vandy Research Engineer/ERDC/',
)
# Not referenced by the cells shown in this notebook.
# NOTE(review): presumably toggles reading the raw CSV files — confirm.
READ_CSV = False

## Data extraction/pre-processing

In [None]:
from windpred import read_file
# Load the dataset through the project's `read_file` helper.
# NOTE(review): `processed=True` presumably selects an already pre-processed
# copy of the data — confirm in windpred.
df = read_file(processed=True)
# Last expression of the cell, so the summary statistics are rendered as its output.
df.describe()

In [None]:
# Keep a second name for the full dataset, because `df` is rebound to the
# training window a few lines down.
# NOTE(review): re-running this cell on its own binds `dfog` to the already
# narrowed `df`; re-run the loading cell first so `dfog` is the full data again.
dfog = df # original dataframe with all data
# Testing data: rows labelled 2022-06-28 through 2022-07-07
# (assumes a sorted datetime index, where .loc label slices include both ends — TODO confirm)
dft = dfog.loc['2022-06-28':'2022-07-07']
# Training data: rows labelled 2022-03-10 through 2022-06-27
df = dfog.loc['2022-03-10':'2022-06-27']
# Sampling interval handed to find_sequences / update / predict in later cells
INTERVAL = pd.Timedelta(minutes=15)

In [None]:
# separate into contiguous sequences
from windpred import find_sequences
# NOTE(review): no interval is passed here, while the later calls pass INTERVAL
# or a 15-minute Timedelta — confirm that find_sequences' default matches.
seqs = find_sequences(df)
# Number of sequences found in the training frame
print(len(seqs))

## Time Series Analysis

In [None]:
# Sequences of the full dataset (`dfog`, i.e. before the train/test slicing) at
# INTERVAL spacing; the time-series, ACF and PACF plots below iterate over these.
seqsog = find_sequences(dfog, INTERVAL)
# Number of sequences found in the full dataset
print(len(seqsog))

In [None]:
# Stationarity
from statsmodels.tsa.stattools import kpss

# Null hypothesis: process is stationary

def kpss_test(timeseries):
    """Run a KPSS test on `timeseries` and return the results as a labelled Series.

    The test is run with a constant-only regression (`regression="c"`) and
    automatic lag selection (`nlags="auto"`).

    Returns:
        pd.Series with entries "Test Statistic", "p-value", "Lags Used" and one
        "Critical Value (<level>)" entry per critical value reported by `kpss`.
    """
    statistic, p_value, lags, critical_values = kpss(
        timeseries, regression="c", nlags="auto"
    )
    labelled = {
        "Test Statistic": statistic,
        "p-value": p_value,
        "Lags Used": lags,
    }
    # Append the critical values in the order statsmodels reports them.
    labelled.update(
        ("Critical Value (%s)" % level, threshold)
        for level, threshold in critical_values.items()
    )
    return pd.Series(labelled)

# One KPSS result per training sequence (`seqs`), computed on its Speed column
tests = [kpss_test(s.Speed) for s in seqs]
# Side-by-side table: rows are the labelled statistics, one column per sequence
pd.concat(tests, axis=1)

In [None]:
# Correlation
# Pairwise correlation between the columns of the training frame, drawn with
# matplotlib (imported at the top of the notebook). The previous version used a
# plotly `go` object that this notebook never imports, so the cell raised
# NameError on a fresh kernel.
# A rolling window of 1 leaves the values unchanged; raise it to correlate
# smoothed series instead.
corrs = df.rolling(1).mean().corr()
fig, ax = plt.subplots(figsize=(4, 4))
# origin='lower' puts the first row at the bottom of the image.
image = ax.imshow(corrs.to_numpy(), origin='lower', vmin=-1, vmax=1, cmap='RdBu_r')
ax.set_xticks(range(len(corrs.columns)))
ax.set_xticklabels(corrs.columns, rotation=90)
ax.set_yticks(range(len(corrs.index)))
ax.set_yticklabels(corrs.index)
# Annotate every cell with its coefficient, rounded to two decimals.
for (row, col), value in np.ndenumerate(corrs.to_numpy()):
    ax.text(col, row, f'{value:.2f}', ha='center', va='center', fontsize=8)
fig.colorbar(image, ax=ax)
ax.set_title('Correlation')
fig.tight_layout()
plt.show()

In [None]:
# Filtering
from statsmodels.tsa.filters import hp_filter, bk_filter
# Work on the first training sequence only
seq = seqs[0]
# Raw speed series
plt.plot(seq.Speed)
# Baxter-King band-pass filter. Per the statsmodels documentation, `low` and
# `high` are the minimum and maximum oscillation periods that are kept and K is
# the lead-lag length of the filter.
cycle = bk_filter.bkfilter(seq.Speed, low=1e1, high=1e2, K=10)
# Add the series mean back so the filtered cycle is drawn at the level of the raw data
plt.plot(seq.Speed.mean()+cycle)

In [None]:
# Timeseries plot of contiguous sequences
# Every sequence of the full dataset (`seqsog`) is drawn as its own line on one
# shared figure.
plt.figure(figsize=(12,6))
for s in seqsog:
    plt.plot(s.Speed)
plt.tight_layout()

In [None]:
# Autocorrelation, used to select order of Moving Average (MA) in ARIMA models
plt.figure(figsize=(8,4))
# Single shared axes: the ACF of every sequence is overlaid on it
ax = plt.subplot(1,1,1)
for s in seqsog:
    # Cap the lags at 100, and below the sequence length for short sequences
    sm.graphics.tsa.plot_acf(s.Speed, lags=min(100, len(s)-1), ax=ax)
# `s` is the last sequence of the loop above; its index frequency labels the x-axis
plt.xlabel(f'Time interval {s.index.freq}')
plt.tight_layout()

In [None]:
# Partial Autocorrelation, used to select order of Autoregression (AR) in ARIMAX models
plt.figure(figsize=(8,4))
# Single shared axes: the PACF of every sequence is overlaid on it
ax = plt.subplot(1,1,1)
for s in seqsog:
    # Cap the lags at 100 and below half the sequence length
    # (statsmodels limits PACF lags to under 50% of the sample — see its docs)
    sm.graphics.tsa.plot_pacf(s.Speed, lags=min(100, len(s)//2-1), ax=ax)
# `s` is the last sequence of the loop above; its index frequency labels the x-axis
plt.xlabel(f'Time interval {s.index.freq}')
plt.tight_layout()

## ARIMA models

In [None]:
# Rebuild the training sequences with agg_fn='max' at the notebook-wide
# 15-minute INTERVAL.
seqs = find_sequences(df, INTERVAL, agg_fn='max')
# Hold out the last 100 samples of the first sequence; fit on everything before them.
actual = seqs[0].Speed.iloc[-100:]
model = sm.tsa.ARIMA(seqs[0].Speed.iloc[:-100], order=(1,0,20))
result = model.fit()
# Two forecasts of the held-out tail: the default call, and one with in_sample=False.
_, predictions = predict(result, actual, None, INTERVAL)
_, predictions_recursive = predict(result, actual, None, INTERVAL, in_sample=False)

# Forecasts, ground truth and residual of the default forecast on one axes.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(predictions, label='1-Step')
ax.plot(predictions_recursive, label='Recursive')
ax.plot(actual, label='Actual')
ax.plot(predictions - actual, label='Residual', ls=':')
ax.set_title('RMSE=%.2f' % sm.tools.eval_measures.rmse(predictions, actual))
ax.legend()

In [None]:
# Train on every training sequence in order, then forecast each test sequence
# and report the mean absolute percentage error (MAPE) of the Speed forecast.
seqstrain = find_sequences(df, INTERVAL)
seqstest = find_sequences(dft, INTERVAL)
order = (1, 0, 20)
res = pretrain(ARIMA, seqstrain[0].Speed, exog=None, order=order)
for s in tqdm(seqstrain[1:], leave=False):
    res = update(res, s.Speed, None, INTERVAL)
n = 0
abserr = 0
for s in tqdm(seqstest, leave=False):
    res, pred = predict(res, s.Speed, None, INTERVAL)
    # Take the absolute value of every relative error BEFORE summing; summing
    # first lets over- and under-predictions cancel and understates the error.
    abserr += np.sum(np.abs((s.Speed - pred) / s.Speed))
    n += len(s)
# NaN instead of a ZeroDivisionError when there is no test data at all.
mape = abserr * 100 / n if n else float('nan')
print('MAPE %.2f' % mape)

In [None]:
# Forecast the dx and dy components separately, turn both the forecasts and the
# observations into heading angles, and report the percentage error of the heading.
seqstrain = find_sequences(df, INTERVAL)
seqstest = find_sequences(dft, INTERVAL)
order = (1, 0, 20)


def _forecast_component(column):
    """Train on `seqstrain[column]`, then forecast every sequence in `seqstest`.

    Returns:
        (predicted, observed): two 1-D numpy arrays covering all test sequences,
        concatenated in order.
    """
    res = pretrain(ARIMA, seqstrain[0][column], exog=None, order=order)
    for s in tqdm(seqstrain[1:], leave=False):
        res = update(res, s[column], None, INTERVAL)
    predicted, observed = [], []
    for s in tqdm(seqstest, leave=False):
        res, pred = predict(res, s[column], None, INTERVAL)
        observed.extend(s[column])
        predicted.extend(pred)
    return np.asarray(predicted), np.asarray(observed)


dxs, dx_actual = _forecast_component('dx')
dys, dy_actual = _forecast_component('dy')

heading = np.arctan2(dys, dxs)
heading_actual = np.arctan2(dy_actual, dx_actual)

# Wrap the angular difference into [-pi, pi]. Without this, two headings on
# either side of the +/-pi discontinuity count as an error of almost 2*pi
# although they point in nearly the same direction.
heading_error = np.arctan2(
    np.sin(heading_actual - heading), np.cos(heading_actual - heading)
)
# NOTE(review): dividing by the actual heading still blows up when it is close
# to zero; a mean absolute angular error may be a more robust metric.
mape = np.mean(np.abs(heading_error / heading_actual)) * 100

print('MAPE %.2f' % mape)

### ARIMA Grid Search

In [None]:
def trial(
    interval, order, df=df, seq_kwargs=None, filter_fn=lambda df: df, split=0.25,
    endog_col='Speed', exog_cols=None, model_cls=ARIMA
):
    """Fit `model_cls` on contiguous sequences of `df` and score its forecasts.

    Note: this definition replaces the `trial` imported from `windpred` in the
    notebook's import cell.

    Args:
        interval: sampling interval, passed to `find_sequences`, `update` and `predict`.
        order: model order, passed to `pretrain`.
        df: frame to evaluate. The default is whatever `df` was bound to when
            this cell was executed.
        seq_kwargs: extra keyword arguments for `find_sequences` (None means none).
        filter_fn: callable applied to `df` before it is cut into sequences.
        split: a float strictly between 0 and 1 holds out that trailing fraction
            of every sequence for testing. Any other value k uses each of the
            first k sequences in turn as the test set and trains on all others.
        endog_col: column that is forecast.
        exog_cols: optional exogenous column(s); None disables them.
        model_cls: model class handed to `pretrain`.

    Returns:
        dict with `rmse` (the model) and `rmse_persistent` (a baseline that
        predicts the previous value). Both are averages of per-test-set RMSEs
        weighted by the number of scored samples, and are NaN when nothing
        could be scored.

    NOTE(review): errors are computed on np.arccos of the values, which is NaN
    outside [-1, 1] — confirm this is intended for `endog_col='Speed'`.
    """
    seq_kwargs = {} if seq_kwargs is None else seq_kwargs
    df = filter_fn(df)
    seqs = find_sequences(df, interval, **seq_kwargs)
    fractional = isinstance(split, float) and 0 < split < 1

    def _endog_exog(frame):
        # Split a frame into the forecast target and the optional exogenous inputs.
        exog = None if exog_cols is None else frame[exog_cols]
        return frame[endog_col], exog

    rmses = []
    rmses_persistent = []
    ntest = 0        # samples scored for the model
    npersistent = 0  # consecutive pairs scored for the persistence baseline

    for i in trange(len(seqs), leave=False, desc='splits'):
        if fractional:
            # Chronological split inside the sequence: head trains, tail tests.
            ntrain = int(len(seqs[i]) * (1 - split))
            dtrain = [seqs[i].iloc[:ntrain]]
            dtest = [seqs[i].iloc[ntrain:]]
        else:
            # Leave-one-out over the first `split` sequences.
            if i == split:
                break
            dtest = [seqs[i]]
            # Train on every other sequence. The index must range over the
            # sequences themselves, otherwise split > 1 runs past the list.
            dtrain = [seqs[j] for j in range(len(seqs)) if j != i]

        endog, exog = _endog_exog(dtrain[0])
        res = pretrain(model_cls, endog, exog=exog, order=order)
        for dft in dtrain[1:]:
            endog, exog = _endog_exog(dft)
            res = update(res, endog, exog=exog, interval=interval)
        for dft in dtest:
            if dft.index[0] < res.data.dates[-1]:
                # The test data starts before the last date held by the fitted
                # results: shift its index so it begins two intervals after it.
                dft = dft.copy()
                dft.index += (res.data.dates[-1] - dft.index[0]) + (2 * interval)
                dft.index.freq = interval
            ntest += len(dft)
            endog, exog = _endog_exog(dft)
            res, pred = predict(res, endog, exog, interval)
            score = sm.tools.eval_measures.rmse(np.arccos(pred), np.arccos(endog))
            rmses.append(score * len(dft))
            if len(dft) > 1:
                # Persistence baseline: each value is "predicted" by its predecessor.
                # A one-sample test set has no pairs and would contribute NaN.
                baseline = sm.tools.eval_measures.rmse(
                    np.arccos(endog.iloc[:-1]), np.arccos(endog.iloc[1:])
                )
                rmses_persistent.append(baseline * (len(dft) - 1))
                npersistent += len(dft) - 1
            print(f'Split {i} \tRMSE: {score:.2f}')
    return dict(
        rmse=sum(rmses) / ntest if ntest else float('nan'),
        # Normalise by the number of pairs that were actually accumulated.
        rmse_persistent=(
            sum(rmses_persistent) / npersistent if npersistent else float('nan')
        ),
    )


In [None]:
# Score a (1, 0, 20) model on the training frame at 15-minute spacing; a split of
# 0.1 tests on the trailing part of every sequence and trains on the rest.
# NOTE(review): endog_col defaults to 'Speed' and trial scores through np.arccos,
# which is NaN outside [-1, 1] — confirm Speed lies in that range.
trial(pd.Timedelta(minutes=15), (1,0,20), df, split=0.1)

In [None]:
# Same evaluation on the full dataset (`dfog`) at 1-minute spacing, forecasting
# the `dx` column without exogenous inputs.
trial(pd.Timedelta(minutes=1), (1,0,20), dfog, split=0.1, endog_col='dx', exog_cols=None)

### auto-fit using pmdarima

In [None]:
%%capture
!pip install pmdarima

In [None]:
import pmdarima as pm
# Re-derive the training (`seqs`) and test (`seqst`) sequences at 15-minute spacing.
# This rebinds `seqs`, which an earlier cell built with agg_fn='max'.
seqs = find_sequences(df, pd.Timedelta(minutes=15))
seqst = find_sequences(dft, pd.Timedelta(minutes=15))
# Number of training sequences
print(len(seqs))

In [None]:
# Let pmdarima search for a non-seasonal order on the last training sequence.
# `dx` is the target; `dy`, reshaped to a single-column 2-D array, is passed as
# the second positional argument (presumably the exogenous regressors — confirm
# against the installed pmdarima version).
model = pm.auto_arima(seqs[-1].dx, seqs[-1].dy.values.reshape(-1,1), seasonal=False)
# Last expression of the cell: the fitted model's summary
model.summary()

In [None]:
# Forecast the first steps of the first test sequence and compare them with the
# observed values of the SAME column the model was fitted on (`dx`).
# The model was fitted with `dy` as a second input, so future `dy` values have
# to be supplied to predict(); pmdarima refuses to forecast without them.
horizon = min(10, len(seqst[0]))
future_dy = seqst[0].dy.values[:horizon].reshape(-1, 1)
# Passed positionally so the call does not depend on the keyword's name.
forecast = np.asarray(model.predict(horizon, future_dy))
actual_dx = seqst[0].dx.iloc[:horizon]
# Plot both against the test timestamps so the lines share one x-axis.
plt.plot(actual_dx.index, forecast, label='Forecast')
plt.plot(actual_dx, label='Actual')
plt.legend()

## LSTM (PyTorch)

In [None]:
import torch
from torch import nn
from torch import optim

class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied at every time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers.
        output_size: number of values predicted per time step.

    The defaults reproduce the originally hard-coded architecture, so
    `LSTMModel()` builds the same network as before.
    """

    def __init__(self, input_size=3, hidden_size=12, num_layers=2, dropout=0.1, output_size=3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size, hidden_size=hidden_size,
            num_layers=num_layers, dropout=dropout,
        )
        self.predictor = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return one prediction per time step of `x`.

        `x` uses nn.LSTM's default layout, (sequence, batch, input_size), because
        the LSTM is built without batch_first; the result has the same leading
        dimensions and `output_size` as its last dimension.
        """
        # Only the per-step outputs are used; the final (h, c) state is dropped.
        hidden, _ = self.lstm(x)
        pred = self.predictor(hidden)
        return pred

# Instantiate the network. Note that this rebinds `model`, which earlier cells
# used for the ARIMA / pmdarima models.
model = LSTMModel()
# Adam over all of the network's parameters with a learning rate of 1e-3
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
def prepare_lstm_inputs(dfs, length, batch_size):
    """Yield batches of fixed-length sliding windows cut from each frame in `dfs`.

    Windows never span two frames, so gaps between sequences are respected.

    Args:
        dfs: iterable of numeric DataFrames (or Series), one per contiguous sequence.
        length: number of consecutive rows in every window.
        batch_size: maximum number of windows per batch; the last batch of a
            frame may be smaller.

    Yields:
        float32 tensors of shape (length, n_windows, n_features), i.e. the
        sequence-first layout nn.LSTM uses by default. Frames shorter than
        `length` yield nothing.

    Raises:
        ValueError: if `length` or `batch_size` is not positive.
    """
    if length < 1 or batch_size < 1:
        raise ValueError('length and batch_size must be positive')
    for df in dfs:
        values = df.to_numpy(dtype=np.float32)
        if values.ndim == 1:
            # A Series is treated as a single feature column.
            values = values[:, None]
        n_windows = len(values) - length + 1
        for start in range(0, n_windows, batch_size):
            stop = min(start + batch_size, n_windows)
            # Stack along axis 1 so the batch dimension sits between time and features.
            batch = np.stack(
                [values[k:k + length] for k in range(start, stop)], axis=1
            )
            yield torch.from_numpy(batch)

## TimeseriesAI

In [None]:
%%capture
!pip install tsai

In [None]:
from tsai.basics import *

# Fetch the 'AppliancesEnergy' regression dataset; `get_regression_data` is
# presumably provided by the tsai star import above.
# NOTE(review): with split_data=False this presumably returns the unsplit X and y
# together with the split indices — confirm against the tsai documentation.
X, y, splits = get_regression_data('AppliancesEnergy', split_data=False)

# Disabled example: training and exporting a TSTPlus regressor on this data.
# tfms = [None, TSRegression()]
# batch_tfms = TSStandardize(by_sample=True)
# reg = TSRegressor(X, y, splits=splits, path='models', arch="TSTPlus", tfms=tfms, batch_tfms=batch_tfms, metrics=rmse, cbs=ShowGraph(), verbose=True)
# reg.fit_one_cycle(100, 3e-4)
# reg.export("reg.pkl")

In [None]:
# Display the second element of `splits` as the cell output
splits[1]