# LSTM Autoencoder Anomaly Detection
## WMX communication anomaly detection through DC Diff training model

**DC diff data gathered by the WMX ethercat master is used to detect anomalies in a time-series pattern.**

In this notebook, we'll build an *LSTM Autoencoder representation learning model*. (The figure below shows an example of an autoencoder model.)

<img src="https://lilianweng.github.io/posts/2018-08-12-vae/autoencoder-architecture.png" width="500">

You can find **excellent reference projects** that help understand relevant concepts and techniques at the end of this notebook.

## Preparation
### Install necessary Python libraries
Note that the Python environment where this notebook runs should already have the **PyTorch** packages installed.

(To install **PyTorch**, go to https://pytorch.org/get-started/locally/)

In [None]:
!pip install scipy
!pip install pandas
!pip install seaborn
!pip install -U scikit-learn
!pip install -q -U watermark
!pip install datasets
!pip install huggingface-hub
!pip install ipywidgets


### Versions of the installed packages

In [None]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,scipy

### Import packages and initialize them

In [None]:
import torch

import copy
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split

from torch import nn, optim

import torch.nn.functional as F
from datasets import Dataset
from datasets import load_dataset
from huggingface_hub import login


%matplotlib inline
%config InlineBackend.figure_format='retina'

# Global seaborn theme: white grid background, muted palette, enlarged fonts.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Custom six-colour palette; it replaces the 'muted' palette selected above.
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

# Default matplotlib figure size, given as (width, height).
rcParams['figure.figsize'] = 12, 8

# Seed the NumPy and PyTorch random number generators with the same value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

torch.manual_seed(RANDOM_SEED)

### Download the DC diff dataset from Hugging Face and load the dataset
Log in to Hugging Face before downloading.
(You may need a Hugging Face account to log in.)


In [None]:
# Log in to the Hugging Face Hub via huggingface_hub.login (called without
# arguments here)
login()

In [None]:
# Load the "Jake5/wmxdata" dataset through the Hugging Face `datasets` library
dataset = load_dataset("Jake5/wmxdata")

# Convert the 'train' split into a pandas DataFrame used by the cells below
hg_df = pd.DataFrame(dataset['train'])

# Report the number of rows and preview the first few of them
print(f"Total Values: {hg_df.shape[0]}")
print(hg_df.head())

### Check if CUDA is available and use the CUDA device if it is

In [None]:
# Report CUDA availability, then select the GPU when it is available and fall
# back to the CPU otherwise. `device` is read by later cells.
print(f"CUDA available={torch.cuda.is_available()}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Exploratory Data Analysis

In [None]:
# Parse the 'Timestamp' column into pandas datetimes (the column is overwritten)
hg_df['Timestamp'] = pd.to_datetime(hg_df['Timestamp'])

# Plot the first 1000 DcDiffAvg values against their timestamps
plt.plot(hg_df['Timestamp'].head(1000), hg_df['DcDiffAvg'].head(1000))

# LSTM Autoencoder

## Preprocess the dataset (normalization)

In [None]:
from copy import deepcopy as dc

def prepare_dataframe_for_lstm(df, n_steps, column='DcDiffAvg',
                               timestamp_column='Timestamp'):
    """Return a copy of ``df`` extended with lagged copies of one column.

    The timestamp column becomes the index. For every ``i`` in
    ``1..n_steps`` a column named ``'<column>(t-i)'`` is appended that holds
    the value of ``column`` observed ``i`` rows earlier. Rows containing any
    NaN are then dropped, which removes the first ``n_steps`` rows (their
    look-back window is incomplete) as well as rows with missing values.

    Args:
        df: source DataFrame; it is copied and never modified.
        n_steps: number of lagged columns to create.
        column: name of the column to lag. Defaults to ``'DcDiffAvg'``.
        timestamp_column: name of the column to use as the index. Defaults
            to ``'Timestamp'``.

    Returns:
        A new DataFrame indexed by ``timestamp_column`` with the original
        columns followed by the ``n_steps`` lagged columns.
    """
    # DataFrame.copy() is a deep copy by default, so the caller's frame is
    # left untouched by the in-place operations below.
    df = df.copy()

    df.set_index(timestamp_column, inplace=True)

    for i in range(1, n_steps + 1):
        df[f'{column}(t-{i})'] = df[column].shift(i)

    df.dropna(inplace=True)

    return df

# Number of lagged values created per row; later cells reshape the inputs to
# sequences of this length.
lookback = 10
shifted_df = prepare_dataframe_for_lstm(hg_df, lookback)
shifted_df

In [None]:
# Convert the lagged DataFrame to a NumPy array (the Timestamp index is not
# part of the resulting array)
shifted_df_as_np = shifted_df.to_numpy()

shifted_df_as_np

In [None]:
shifted_df_as_np.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training portion only, so that the minimum/maximum of
# the held-out rows does not leak into the normalization. The 0.95 ratio must
# stay equal to the train/test split ratio applied to X and y further down.
scaler = MinMaxScaler(feature_range=(-1, 1))
train_row_count = int(len(shifted_df_as_np) * 0.95)
scaler.fit(shifted_df_as_np[:train_row_count])

# Apply the training-derived scaling to every row; held-out values may
# therefore fall slightly outside the [-1, 1] range.
shifted_df_as_np = scaler.transform(shifted_df_as_np)

shifted_df_as_np

In [None]:
# Column 0 is used as the prediction target and all remaining columns as the
# model inputs.
# NOTE(review): this assumes the dataset has no columns besides Timestamp and
# DcDiffAvg, so that column 0 is DcDiffAvg and columns 1.. are exactly the
# lagged values — confirm against the dataset schema.
X = shifted_df_as_np[:, 1:]
y = shifted_df_as_np[:, 0]

X.shape, y.shape

In [None]:
# Reverse the column order of X. With the lag columns created as t-1, t-2, ...
# this puts the oldest value first and the most recent value last. The deep
# copy stores the flipped result as its own array.
X = dc(np.flip(X, axis=1))
X

In [None]:
# Row index separating training from test data: the first 95% of the rows,
# in their existing order, are used for training.
split_index = int(len(X) * 0.95)

split_index

In [None]:
# Sequential (unshuffled) split: rows before split_index form the training
# set, the remaining rows form the test set.
X_train = X[:split_index]
X_test = X[split_index:]

y_train = y[:split_index]
y_test = y[split_index:]

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Reshape the inputs to (samples, lookback, 1): one value per time step.
X_train = X_train.reshape((-1, lookback, 1))
X_test = X_test.reshape((-1, lookback, 1))

# Reshape the targets to (samples, 1).
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Convert the NumPy arrays to torch tensors and cast them to float32.
X_train = torch.tensor(X_train).float()
y_train = torch.tensor(y_train).float()
X_test = torch.tensor(X_test).float()
y_test = torch.tensor(y_test).float()

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from torch.utils.data import Dataset

class TimeSeriesDataset(Dataset):
    """Pair input windows with their targets for use with a ``DataLoader``.

    ``X`` and ``y`` are stored exactly as given and are indexed in lockstep,
    so item ``i`` is the tuple ``(X[i], y[i])``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        # The sample count is taken from the inputs alone.
        sample_count = len(self.X)
        return sample_count

    def __getitem__(self, i):
        features = self.X[i]
        target = self.y[i]
        return features, target

# Wrap the training and test tensors in dataset objects for the data loaders.
train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

In [None]:
train_dataset

In [None]:
from torch.utils.data import DataLoader

# Number of samples per mini-batch
batch_size = 16

# Training batches are shuffled; test batches keep their original order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Sanity check: take a single training batch, move it to the selected device,
# print the input and target shapes, then stop.
for _, batch in enumerate(train_loader):
    x_batch, y_batch = batch[0].to(device), batch[1].to(device)
    print(x_batch.shape, y_batch.shape)
    break

## Building an LSTM Autoencoder

In [None]:
class LSTM(nn.Module):
    """LSTM followed by a linear layer that maps the last time step to one value.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        num_stacked_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_stacked_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_stacked_layers = num_stacked_layers

        # batch_first=True: inputs and outputs are (batch, seq_len, features).
        self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers,
                            batch_first=True)

        # Projects the hidden state of the last time step to a single value.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one predicted value per sequence.

        Args:
            x: tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, 1).
        """
        batch_size = x.size(0)
        state_shape = (self.num_stacked_layers, batch_size, self.hidden_size)

        # Allocate the zero initial states directly on the input's device and
        # in its dtype, so the module does not depend on a global `device`
        # and keeps working when the model is moved or cast.
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        out, _ = self.lstm(x, (h0, c0))
        # Only the output of the final time step feeds the linear layer.
        out = self.fc(out[:, -1, :])
        return out

# One input feature per time step, hidden size 4, a single LSTM layer.
model = LSTM(1, 4, 1)
# Move the model's parameters to the selected device.
model.to(device)
model

In [None]:
def train_one_epoch():
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``train_loader``, ``loss_function``,
    ``optimizer``, ``device`` and ``epoch``. Prints the epoch number first,
    then the mean loss of each completed group of 100 batches.
    """
    report_every = 100

    model.train(True)
    print(f'Epoch: {epoch + 1}')
    window_loss = 0.0

    for step, batch in enumerate(train_loader, start=1):
        inputs = batch[0].to(device)
        targets = batch[1].to(device)

        prediction = model(inputs)
        batch_loss = loss_function(prediction, targets)
        window_loss += batch_loss.item()

        # Standard update: clear old gradients, backpropagate, apply the step.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % report_every == 0:
            # Report the mean loss of the last 100 batches and start over.
            mean_window_loss = window_loss / report_every
            print('Batch {0}, Loss: {1:.3f}'.format(step, mean_window_loss))
            window_loss = 0.0
    print()

In [None]:
def validate_one_epoch():
    """Evaluate the model on ``test_loader`` and print the mean batch loss.

    Uses the module-level ``model``, ``test_loader``, ``loss_function`` and
    ``device``. The model is switched out of training mode and no gradients
    are recorded during the forward passes.
    """
    model.train(False)
    total_loss = 0.0

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch[0].to(device)
            targets = batch[1].to(device)
            total_loss += loss_function(model(inputs), targets).item()

    # Mean of the per-batch losses over the whole test loader.
    mean_loss = total_loss / len(test_loader)

    print('Val Loss: {0:.3f}'.format(mean_loss))
    print('***************************************************')
    print()

# Training

In [None]:
# Training hyperparameters, loss (mean squared error) and optimizer (Adam).
learning_rate = 0.001
num_epochs = 10
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# `epoch` is intentionally a module-level loop variable: train_one_epoch()
# reads it to print the current epoch number.
for epoch in range(num_epochs):
    train_one_epoch()
    validate_one_epoch()

## Plot actual and predicted values

In [None]:
# Run the whole training set through the model in a single forward pass
# without recording gradients, then bring the predictions back to the CPU as
# a NumPy array.
with torch.no_grad():
    predicted = model(X_train.to(device)).to('cpu').numpy()

# Compare the first 1000 targets with the corresponding predictions.
# NOTE(review): both series are in the MinMax-scaled range rather than the
# original DcDiffAvg units, and the x-axis is the sample index rather than a
# timestamp — confirm that the axis labels below are intended.
plt.plot(y_train[:1000], label='Actual DcDiffAvg')
plt.plot(predicted[:1000], label='Predicted DcDiffAvg')
plt.xlabel('Timestamp')
plt.ylabel('DcDiffAvg')
plt.legend()
plt.show()

## Saving the model

## Loading the model if necessary

## Choosing a threshold

## Normal DC Diff

## Anomalies

## Looking at Examples

# References

- [Time Series Anomaly Detection Tutorial with PyTorch in Python | LSTM Autoencoder for ECG Data](https://www.youtube.com/watch?v=qN3n0TM4Jno)
- [Amazon Stock Forecasting in PyTorch with LSTM Neural Network (Time Series Forecasting) | Tutorial 3](https://www.youtube.com/watch?v=q_HS4s1L8UI&t=5s)
- [\[NHN Cloud make IT 2023\] 시계열 데이터 속에 숨어있는 이상 징후를 찾는 딥 러닝 기술](https://www.youtube.com/watch?v=bg2e60IZ40Q)
