In [2]:
%load_ext autoreload
%autoreload 2

import os

# Machine-specific locations. Each value can be overridden with an environment
# variable (NTL_PROJECT_PATH, NTL_DATA_PATH, NTL_LOG_DIR) so the notebook can run
# on another machine without editing this cell. The defaults are the ZHORES paths.

# LOCAL
# PROJECT_PATH = '/Users/ivan_zorin/Documents/DEV/code/ntl/'
# DATA_PATH = '/Users/ivan_zorin/Documents/DEV/data/sgcc/data.csv'
# LOG_DIR = '/Users/ivan_zorin/Documents/DEV/runs/debug/trainer'

# ZHORES (defaults)
PROJECT_PATH = os.environ.get('NTL_PROJECT_PATH', '/trinity/home/ivan.zorin/dev/code/ntl/')
DATA_PATH = os.environ.get('NTL_DATA_PATH', '/trinity/home/ivan.zorin/dev/data/sgcc/data.csv')
LOG_DIR = os.environ.get('NTL_LOG_DIR', '/trinity/home/ivan.zorin/dev/logs/debug/one-batch/')

In [3]:
import numpy as np 
from numpy import ndarray
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, robust_scale
from sktime.transformations.series.impute import Imputer

from functools import partial
from types import SimpleNamespace
import os
from tqdm.auto import tqdm


import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

import sys
sys.path.append(PROJECT_PATH)
from ntl.data import SGCCDataset, data_train_test_split
from ntl.data import FillNA, Scale, Reshape, ToTensor, Cutout, Diff
from ntl.models import AE2dCNN
from ntl.trainer import ArgsTrainer
from ntl.utils import fix_seed, get_date

In [4]:
fix_seed(42)

# Preprocessing steps handed to the dataset (presumably applied in list order).
transforms = [
    FillNA('drift'),
    Cutout(256),
    Scale('minmax'),
    Reshape((16, 16)),
    lambda x: x[None],  # index with None to add a leading axis
    ToTensor(),
]

# Both subsets share the transforms and the year; they differ in label and nan_ratio.
dataset_kwargs = dict(transforms=transforms, year=2016)
normal_data = SGCCDataset(DATA_PATH, label=0, nan_ratio=0.75, **dataset_kwargs)
anomal_data = SGCCDataset(DATA_PATH, label=1, nan_ratio=1.0, **dataset_kwargs)

train, test = data_train_test_split(normal_data, anomal_data)

# Only the training loader shuffles; both use the same batch size.
loader_kwargs = dict(batch_size=256)
train_loader = DataLoader(train, shuffle=True, drop_last=False, **loader_kwargs)
test_loader = DataLoader(test, shuffle=False, **loader_kwargs)


Random seed set as 42


In [15]:
def plot_x_x_hat():
    """Pick one random item from the training loader's dataset.

    Only the sampling step exists so far: the item is unpacked into `label`,
    `x` and a discarded third element, and nothing is plotted or returned.
    """
    dataset = train_loader.dataset
    idx = np.random.randint(len(dataset))
    label, x, _ = dataset[idx]
        
    
    
# plot_x_x_hat()

In [17]:
# Draw one random training item. The whole record is also kept in `sample`,
# because the following cell displays `sample` and no other cell assigns it.
idx = np.random.randint(len(train_loader.dataset))
sample = train_loader.dataset[idx]
label, x, _ = sample

In [16]:
# Display the full sample record (relies on `sample` having been assigned by a
# previously executed cell).
sample

(0,
 tensor([[[3.8368e-01, 2.8226e-01, 1.8744e-01, 2.3167e-01, 2.6857e-01,
           2.5122e-01, 2.7199e-01, 1.9062e-01, 2.5538e-01, 2.5244e-01,
           2.9668e-01, 2.2874e-01, 3.6364e-01, 2.5489e-01, 1.9624e-01,
           2.5342e-01],
          [3.3944e-01, 2.5953e-01, 2.6320e-01, 2.6588e-01, 2.9668e-01,
           3.7121e-01, 3.1525e-01, 2.5073e-01, 3.6657e-01, 3.8587e-01,
           4.3744e-01, 2.2923e-01, 3.6535e-01, 2.1188e-01, 2.1114e-01,
           2.2923e-01],
          [2.6295e-01, 2.2067e-01, 2.0381e-01, 1.3759e-01, 1.3368e-01,
           1.4296e-01, 1.4052e-01, 2.8030e-01, 2.0723e-01, 2.7542e-01,
           2.0308e-01, 2.6760e-01, 2.6466e-01, 2.9008e-01, 2.2092e-01,
           2.2898e-01],
          [2.1530e-01, 2.9497e-01, 2.7151e-01, 1.7107e-01, 2.0772e-01,
           1.4272e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
           0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
           0.0000e+00],
          [0.0000e+00, 0.0000e+00, 0.0000e+00, 0

In [5]:
# Build the model with its default constructor arguments.
model = AE2dCNN()

In [6]:
# Show the module structure (encoder / decoder layers).
model

AE2dCNN(
  (encoder): Sequential(
    (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): Conv2d(4, 16, kernel_size=(3, 3), stride=(1, 1))
    (3): ReLU()
    (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
  )
  (decoder): Sequential(
    (0): ConvTranspose2d(64, 32, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): ConvTranspose2d(32, 16, kernel_size=(3, 3), stride=(1, 1))
    (3): ReLU()
    (4): ConvTranspose2d(16, 4, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
    (6): ConvTranspose2d(4, 1, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
  )
)

In [15]:
# Count the scalar entries of every parameter that has requires_grad set.
total_params = 0
for p in model.parameters():
    if p.requires_grad:
        total_params += p.numel()
total_params

47473

### one batch over-fit

In [None]:
batch_size = 64
# Draw distinct indices. np.random.randint samples with replacement, so the
# "one batch" could contain the same item several times. The size is capped at
# the dataset length so the draw also works for a dataset smaller than the batch.
one_batch_idxs = np.random.choice(
    len(normal_data), size=min(batch_size, len(normal_data)), replace=False
)
# A single-batch loader over that fixed subset of the normal data.
loader = DataLoader(torch.utils.data.Subset(normal_data, one_batch_idxs), batch_size=batch_size)

In [4]:
# Fresh model plus everything the trainer needs for the one-batch run.
model = AE2dCNN()
loss = nn.MSELoss(reduction='none')  # unreduced, element-wise loss values
optim = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, factor=0.5, patience=2)

# Each run logs into its own sub-directory named by get_date().
run_dir = os.path.join(LOG_DIR, get_date())
logger = SummaryWriter(log_dir=run_dir)

config = SimpleNamespace(
    debug=True,
    n_debug_batches=1,
    log_step=5,
    n_epochs=100,
)

# The same loader is used for training and validation.
trainer_kwargs = dict(
    train_loader=loader,
    val_loader=loader,
    model=model,
    loss=loss,
    optim=optim,
    scheduler=scheduler,
    config=config,
    logger=logger,
)
trainer = ArgsTrainer(**trainer_kwargs)


NameError: name 'loader' is not defined

In [None]:
# Run the training steps and keep the per-epoch loss for plotting.
# The epoch count is read from `config` so it cannot drift from the trainer setup.
losses = []
for epoch in tqdm(range(config.n_epochs)):
    train_loss = trainer.train_step(epoch)
    trainer.logger.add_scalars('loss', {'train': train_loss}, epoch)
    # .item() stores a plain Python number instead of the returned object
    losses.append(train_loss.item())

In [None]:
# Curve of the losses collected by the training loop.
fig, ax = plt.subplots()
ax.plot(losses)
plt.show()

In [None]:
# Plot every training item against the model's reconstruction of it.
# torch.no_grad(): this is inference only, so no autograd graph is built per sample.
with torch.no_grad():
    for sample in train:
        x = sample[1]
        emb, x_hat = trainer.model(x.to(trainer.device))

        # Flatten both to 1-D arrays for plotting (assumes `x` is a CPU tensor — TODO confirm).
        x = x.flatten().numpy().squeeze()
        x_hat = x_hat.detach().flatten().cpu().numpy().squeeze()
        plt.figure()
        plt.plot(x, 'b--', label='x')
        plt.plot(x_hat, 'r', label='x_hat')
        plt.legend()  # tell the input and the reconstruction apart
        plt.show()

In [None]:
# One figure per training item: first-order differences of the flattened series,
# titled with the item's position in `train`.
for i, sample in enumerate(train):
    x = sample[1].flatten().numpy().squeeze()
    first_diff = np.diff(x, n=1)
    fig, ax = plt.subplots()
    ax.plot(first_diff)
    ax.set_title(i)
    plt.show()
    

#### check average values of normalized inputs

In [None]:
# Pipeline that stops after scaling: the reshape, extra-axis and tensor steps
# are left disabled.
transforms = [
    FillNA('drift'),
    Cutout(256),
    Scale('minmax'),
    # Reshape((16, 16)),
    # lambda x: x[None],
    # ToTensor()
]

# Rebuild both subsets and the split with the reduced pipeline.
dataset_kwargs = dict(transforms=transforms, year=2016)
normal_data = SGCCDataset(DATA_PATH, label=0, nan_ratio=0.75, **dataset_kwargs)
anomal_data = SGCCDataset(DATA_PATH, label=1, nan_ratio=1.0, **dataset_kwargs)

train, test = data_train_test_split(normal_data, anomal_data)

In [None]:
# Per-item summary statistics (max, min, mean, median) over the training split.
max_values, min_values, mean_values, median_values = [], [], [], []
for sample in tqdm(train):
    x = sample[1]
    per_item = (
        (max_values, x.max()),
        (min_values, x.min()),
        (mean_values, x.mean()),
        (median_values, np.median(x)),
    )
    for bucket, value in per_item:
        bucket.append(value)
    

In [None]:
def plot_hist(data, title=''):
    """Show a density-normalised histogram of `data` in a new figure.

    Args:
        data: values handed straight to the histogram call.
        title: text placed above the plot; empty by default.
    """
    fig, ax = plt.subplots()
    ax.hist(data, density=True)
    ax.set_title(title)
    plt.show()
    


In [None]:
# Only the histogram of the per-item means is drawn; the other three calls are disabled.
# plot_hist(max_values, 'max_values')
# plot_hist(min_values, 'min_values')
plot_hist(mean_values, 'mean_values')
# plot_hist(median_values, 'median_values')


In [None]:
# Inspect one fixed training item: print its mean, then plot the series.
idx = 10
sample = train[idx][1]
print(sample.mean())

fig, ax = plt.subplots()
ax.plot(sample)
plt.show()


### Transformations 

In [None]:
# Only the NaN-filling step is active; every later step is disabled so the
# remaining transforms can be applied by hand.
transforms = [
    FillNA('drift'),
    # Cutout(256),
    # Scale('minmax'),
    # Reshape((16, 16)),
    # lambda x: x[None],
    # ToTensor()
]
dataset_kwargs = dict(label=0, nan_ratio=0.75, transforms=transforms, year=2016)
data = SGCCDataset(DATA_PATH, **dataset_kwargs)

In [None]:
# Two standalone transform objects, applied by hand in the cells below.
diff_tr = Diff(1)
scale_tr = Scale('minmax')

In [None]:
# Pick a random item and keep element 1 of its record.
idx = np.random.randint(0, len(data))
x = data[idx][1]

In [None]:
# Apply the same two transforms in opposite orders so the results can be compared.
x1 = diff_tr(scale_tr(x))  # scale first, then diff
x2 = scale_tr(diff_tr(x))  # diff first, then scale

In [None]:
# Overlay the two transform orders: x1 in blue, x2 in red.
plt.figure()
plt.plot(x1, 'b', label='x1')
plt.plot(x2, 'r', label='x2')
plt.legend()  # without this call the labels passed above are never drawn
plt.show()


* normalize data to 0,1 scale | check
* fill nans | check
* cut-out some piece | check
* reshape into matrix (7x52, square, other?) | check
* use this matrix as input into AE | check


Two possible orders for the transformations.

The first one:
1. cut-out some piece
2. fill na
3. scale
4. reshape

The second one:
1. fill na
2. scale
3. cut-out some piece
4. reshape

**#TODO**

* add x and x_hat plots
* split val_loss into two -- val_normal_loss, val_anomal_loss
* add GMM on scores
