In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from torch import utils
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter


import os
import sys  
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import sklearn as sk
from sklearn.preprocessing import StandardScaler, QuantileTransformer


In [None]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
    
if IN_COLAB:
    !pip install lightning 
    import lightning.pytorch as pl
    DATA_PATH = '/content/drive/MyDrive/4. MODELS/NTL/data/data.csv'

else:
    sys.path.append('/Users/ivan_zorin/Documents/AIRI/code/ntl/')
    DATA_PATH = '/Users/ivan_zorin/Documents/AIRI/data/sgcc/data.csv'
    from data.data import *


In [None]:
def get_dataset(filepath):
    """Read the consumption CSV and return it with its date columns in chronological order.

    The first CSV column is used as the index. The ``FLAG`` column is set
    aside while the remaining column labels are parsed as dates and sorted,
    and is then re-attached as the last column.

    Args:
        filepath: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        pd.DataFrame: same rows as the CSV, date-labelled columns in
        ascending order, followed by ``FLAG``.
    """
    table = pd.read_csv(filepath, index_col=0)

    # Keep the labels aside: "FLAG" is not a date and must not take part in the sort.
    flag_column = table["FLAG"].copy()
    table = table.drop(columns=["FLAG"])

    # The dates live in the column labels; transpose so they can be parsed
    # and sorted as a regular index, then transpose back.
    by_date = table.T.copy()
    by_date.index = pd.to_datetime(by_date.index)
    by_date = by_date.sort_index(axis=0)

    result = by_date.T.copy()
    result["FLAG"] = flag_column
    return result

In [5]:
# Training hyper-parameters.
N_EPOCHS = 100    # passes over the training loader
BATCH_SIZE = 16   # samples per batch for every DataLoader
LR = 10 ** -5     # Adam learning rate

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [11]:
class SGCCDataset(Dataset):
    """Map-style dataset over the consumption table, optionally restricted to one class.

    Each item is the tuple ``(label, consumption_row, consumer_id)``, where
    ``consumption_row`` is one row of the table as a 1-D NumPy array with
    missing values replaced by zero.

    Args:
        path: Location of the CSV; forwarded to ``get_dataset``.
        label: ``'normal'`` or ``0`` keeps rows with ``FLAG == 0``;
            ``'anomal'`` or ``1`` keeps rows with ``FLAG == 1``; any other
            value (default ``None``) keeps every row.
        year: Accepted but not implemented yet (see the TODO in ``__init__``).
    """

    def __init__(self, path, label=None, year=None):
        # Zero-argument super() actually runs the base-class initialiser;
        # `super(SGCCDataset).__init__()` only initialised an unbound super object.
        super().__init__()
        self.path = path
        self.label = label

        # Load the table and keep only the rows of the requested class.
        frame = self._filter_by_label(self._get_dataset(), self.label)

        # Labels are taken AFTER filtering so that labels[i] belongs to data[i].
        # (Taking them from the unfiltered table misaligned every item as soon
        # as a class was selected.)
        self.labels = frame['FLAG'].to_numpy()  # class labels
        self.data = frame.drop('FLAG', axis=1)
        self._fill_na_()  # filling NaN in consumption
        # Names of consumers; relies on the index being called 'CONS_NO'.
        self.consumers = self.data.reset_index()['CONS_NO'].to_list()

        if year:
            # TODO: slice data to have only selected year
            # transpose raw_data and pick year in index
            pass

        self.length = self.data.shape[0]
        self.data = self.data.to_numpy()

    def _filter_by_label(self, data, label):
        """Return the rows of ``data`` matching ``label``; unknown labels keep every row."""
        if label in ('normal', 0):
            return data[data['FLAG'] == 0]
        if label in ('anomal', 1):
            return data[data['FLAG'] == 1]
        return data

    def _fill_na_(self):
        """Replace missing consumption values in ``self.data`` with zeros."""
        # Reassign instead of mutating in place: self.data may be derived from
        # a filtered slice of the loaded frame.
        self.data = self.data.fillna(0)

    def _get_dataset(self):
        """Load the raw table from ``self.path``."""
        return get_dataset(self.path)

    def _get_item(self, idx):
        """Build the ``(label, consumption_row, consumer_id)`` tuple for position ``idx``."""
        return (self.labels[idx], self.data[idx, :], self.consumers[idx])

    def __getitem__(self, idx):
        return self._get_item(idx)

    def __len__(self):
        return self.length

In [12]:
class LSTMAE(nn.Module):
    """LSTM auto-encoder for sequences shaped (batch, length, features).

    The encoder is a chain of single-layer LSTMs whose widths follow
    ``[input_size] + hidden_size``; the decoder mirrors those widths in
    reverse. The final hidden state of the last encoder LSTM is the latent
    code. It is repeated along the time axis, decoded, and the decoded
    sequence is returned reversed in time.

    Args:
        input_size: Number of features per time step.
        hidden_size: Sequence with the hidden width of each encoder LSTM.
        n_lstms: Number of LSTMs per side; must equal ``len(hidden_size)``.
        activation_fn: Currently unused; kept for interface compatibility.
        **lstm_kwargs: Extra keyword arguments forwarded to every ``nn.LSTM``.
    """

    def __init__(self, input_size=1, hidden_size=(64,), n_lstms=1, activation_fn=nn.ReLU, **lstm_kwargs):
        super().__init__()
        # Immutable default plus a local copy: avoids sharing one list between instances.
        hidden_size = list(hidden_size)
        assert n_lstms == len(hidden_size)

        # self.activation_fn = activation_fn

        self.enc_dims = [input_size] + hidden_size
        self.dec_dims = self.enc_dims[::-1]

        # Encoder and decoder layers are created interleaved, in the same
        # order as before, so seeded parameter initialisation is unchanged.
        encoder, decoder = [], []
        for i in range(len(self.enc_dims) - 1):
            encoder.append(nn.LSTM(input_size=self.enc_dims[i], hidden_size=self.enc_dims[i + 1],
                                   num_layers=1, batch_first=True, **lstm_kwargs))
            decoder.append(nn.LSTM(input_size=self.dec_dims[i], hidden_size=self.dec_dims[i + 1],
                                   num_layers=1, batch_first=True, **lstm_kwargs))

        # Kept as nn.Sequential so attribute types and state_dict keys stay
        # the same; forward() iterates over the layers itself, because
        # Sequential would hand the LSTM's (output, state) tuple to the next LSTM.
        self.encoder = nn.Sequential(*encoder)
        self.decoder = nn.Sequential(*decoder)

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Args:
            x: Tensor of shape (batch, length, features).

        Returns:
            tuple: ``(z, x_hat)`` where ``z`` is the latent code, shaped
            (batch, hidden_size[-1]) for unidirectional LSTMs, and ``x_hat``
            is the decoder output flipped along the time axis.
        """
        _, seq_len, _ = x.shape  # Batch x Length x Feature size

        # encoding: every LSTM consumes the full output sequence of the previous one
        encoded = x
        hn = None
        for lstm in self.encoder:
            encoded, (hn, _) = lstm(encoded)
        latent = hn.transpose(0, 1)  # batch first, without mutating the LSTM state in place

        # decoding: expand to stretch the embedding over the length of the input
        decoded = latent.expand(-1, seq_len, -1)
        for lstm in self.decoder:
            decoded, _ = lstm(decoded)

        return (latent.squeeze(1), decoded.flip(1))


In [13]:
# One dataset per class.
normal_dataset = SGCCDataset(path=DATA_PATH, label=0)
anomal_dataset = SGCCDataset(path=DATA_PATH, label=1)

# Split the normal samples three ways: validation and the normal part of the
# test set each get as many samples as there are anomalies, the rest is training.
n_anomal = len(anomal_dataset)
n_train = len(normal_dataset) - 2 * n_anomal
train_data, val_data, test_normal_data = utils.data.random_split(
    normal_dataset,
    [n_train, n_anomal, n_anomal],
)
# Test set = held-out normal samples followed by all anomalous samples.
test_data = utils.data.ConcatDataset([test_normal_data, anomal_dataset])

# Only the training loader shuffles.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Model, optimiser and reconstruction loss.
model = LSTMAE().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.MSELoss()

# TensorBoard writer.
logger = SummaryWriter()

In [137]:
train_len = len(train_loader)
val_len = len(val_loader)
train_losses = []
train_embed = []


def _to_model_input(x):
    """Cast a batch of consumption rows to what ``model`` consumes.

    The batch is moved to ``device`` as float32 (the dtype of the model
    weights). A 2-D batch (batch, length) gets a trailing feature axis so it
    matches the (batch, length, features) layout the model unpacks.
    """
    x = x.to(device=device, dtype=torch.float32)
    if x.dim() == 2:
        x = x.unsqueeze(-1)
    return x


for epoch in range(N_EPOCHS):

    # Per-epoch accumulators: re-created every epoch so the statistics below
    # describe one epoch only, the lists stay lists (they are rebound to
    # arrays at the end of the epoch), and embeddings do not pile up in memory.
    train_losses = []
    train_embed = []

    # training step
    t = tqdm(train_loader)
    model.train()
    for ii, batch in enumerate(t):
        optimizer.zero_grad()
        _, x, _ = batch
        x = _to_model_input(x)
        z, x_hat = model(x)
        # Both tensors now live on the same device and share a shape.
        loss = loss_fn(x_hat, x)
        loss.backward()
        optimizer.step()

        # logging
        train_losses.append(loss.item())
        train_embed.append(z.detach().cpu().numpy().squeeze())
        t.set_description('Train Loss: {:.5f}'.format(loss.item()))

        step = epoch * train_len + ii
        logger.add_scalar('Loss/Train', loss.item(), step)
        # TODO add embeddings logging

    # validation step
    model.eval()
    val_losses = []
    val_embed = []
    t = tqdm(val_loader)
    with torch.no_grad():
        for ii, batch in enumerate(t):
            y, x, _ = batch
            x = _to_model_input(x)
            z, x_hat = model(x)
            loss = loss_fn(x_hat, x)

            val_losses.append(loss.item())
            val_embed.append(z.detach().cpu().numpy().squeeze())
            t.set_description('Val Loss: {:.5f}'.format(loss.item()))
            step = epoch * val_len + ii
            logger.add_scalar('Loss/Val', loss.item(), step)

    # test on normal and anomal

    # average metrics after each epoch
    train_losses = np.array(train_losses)
    val_losses = np.array(val_losses)
    train_mean, train_std = train_losses.mean(), train_losses.std()
    val_mean, val_std = val_losses.mean(), val_losses.std()
    logger.add_scalar('Loss/Train_epoch_mean', train_mean, epoch)
    logger.add_scalar('Loss/Val_epoch_mean', val_mean, epoch)
    
    
    
    

In [None]:
# TODO adapt this structure to different encoder-decoder architecture