In [5]:
%load_ext autoreload
%autoreload 2

# LOCAL
# PROJECT_PATH = '/Users/ivan_zorin/Documents/DEV/code/ntl/'
# DATA_PATH = '/Users/ivan_zorin/Documents/DEV/data/sgcc/data.csv'
# LOG_DIR = '/Users/ivan_zorin/Documents/DEV/runs/debug/trainer'

# ZHORES
PROJECT_PATH = '/trinity/home/ivan.zorin/dev/code/ntl/'
DATA_PATH = '/trinity/home/ivan.zorin/dev/data/sgcc/data.csv'
LOG_DIR = '/trinity/home/ivan.zorin/dev/logs/debug/one-batch/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import numpy as np 
from numpy import ndarray
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk 
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, robust_scale
from sktime.transformations.series.impute import Imputer

from functools import partial
from types import SimpleNamespace

from tqdm.auto import tqdm


import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

import sys
sys.path.append(PROJECT_PATH)
from ntl.data import SGCCDataset, data_train_test_split
from ntl.data import FillNA, Scale, Reshape, ToTensor, Cutout
from ntl.models import AE2dCNN
from ntl.trainer import ArgsTrainer
from ntl.utils import fix_seed

In [4]:
# Fix the random seed (project helper) before anything stochastic runs.
fix_seed(42)


def _add_leading_axis(x):
    """Prepend a singleton axis to ``x`` (equivalent to ``x[None]``)."""
    return x[None]


# Preprocessing applied to every sample, in order. Cutout(256) followed by
# Reshape((16, 16)) works because 256 == 16 * 16; the extra leading axis is
# presumably the channel dimension expected by the 2-D CNN -- TODO confirm.
transforms = [
    FillNA('drift'),
    Cutout(256),
    Scale('minmax'),
    Reshape((16, 16)),
    _add_leading_axis,
    ToTensor(),
]

# Both datasets share the same transform list and year; they differ only in
# the label they select and in the `nan_ratio` argument.
dataset_kwargs = dict(transforms=transforms, year=2016)
normal_data = SGCCDataset(DATA_PATH, label=0, nan_ratio=0.75, **dataset_kwargs)
anomal_data = SGCCDataset(DATA_PATH, label=1, nan_ratio=1.0, **dataset_kwargs)

train, test = data_train_test_split(normal_data, anomal_data)

# Training batches are shuffled and the incomplete last batch is dropped;
# the test loader keeps the dataset order and every sample.
batch_size = 64
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)


Random seed set as 42


### one batch over-fit

In [6]:

# Objects needed for the one-batch over-fit run: network, objective,
# optimiser, learning-rate schedule, TensorBoard writer and trainer options.
model = AE2dCNN()

# reduction='none' makes MSELoss return the un-reduced, element-wise errors;
# any averaging is therefore left to the code that consumes this loss.
loss = nn.MSELoss(reduction='none')

optim = torch.optim.Adam(model.parameters(), lr=1e-3)

# Halve the learning rate after 2 scheduler steps without improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optim,
    factor=0.5,
    patience=2,
)

logger = SummaryWriter(LOG_DIR)

# Trainer options. NOTE(review): `debug` / `n_debug_batches` presumably limit
# each epoch to a single batch -- confirm against ArgsTrainer.
config = SimpleNamespace(
    debug=True,
    n_debug_batches=1,
    log_step=5,
    n_epochs=100,
)


In [7]:

# Gather every collaborator of the trainer in one mapping, then build it.
# The test loader doubles as the validation loader in this debug run.
trainer_kwargs = {
    'model': model,
    'loss': loss,
    'optim': optim,
    'scheduler': scheduler,
    'train_loader': train_loader,
    'val_loader': test_loader,
    'config': config,
    'logger': logger,
}
trainer = ArgsTrainer(**trainer_kwargs)




In [None]:

# Launch training with the ArgsTrainer built above (config: debug=True,
# n_debug_batches=1, n_epochs=100).
trainer.train()

#### check average values of normalized inputs

In [None]:
# Same first three preprocessing steps as the training pipeline, but without
# Reshape, the extra leading axis and ToTensor, so the scaled series can be
# inspected directly.
# NOTE: this cell rebinds `transforms`, `normal_data`, `anomal_data`, `train`
# and `test`; re-run the training data cell before building loaders again.
transforms = [
    FillNA('drift'),
    Cutout(256),
    Scale('minmax'),
]

common_kwargs = {'transforms': transforms, 'year': 2016}
normal_data = SGCCDataset(DATA_PATH, label=0, nan_ratio=0.75, **common_kwargs)
anomal_data = SGCCDataset(DATA_PATH, label=1, nan_ratio=1.0, **common_kwargs)

train, test = data_train_test_split(normal_data, anomal_data)

In [None]:
# Per-sample summary statistics of the preprocessed training data, gathered in
# a single pass over `train`.
max_values, min_values, mean_values, median_values = ([] for _ in range(4))

for sample in tqdm(train):
    # Element 1 of a sample is presumably the consumption series (element 0
    # likely being the label) -- TODO confirm against SGCCDataset.
    x = sample[1]
    per_sample_stats = (
        (max_values, x.max()),
        (min_values, x.min()),
        (mean_values, x.mean()),
        (median_values, np.median(x)),
    )
    for bucket, value in per_sample_stats:
        bucket.append(value)
    

In [None]:
def plot_hist(data, title=''):
    """Show a density-normalised histogram of ``data`` in a new figure.

    Args:
        data: Sequence of values to histogram.
        title: Text placed above the plot; empty by default.
    """
    _, ax = plt.subplots()
    ax.hist(data, density=True)
    ax.set_title(title)
    plt.show()
    


In [None]:
# Only the distribution of per-sample means is plotted; the other collected
# statistics can be inspected by un-commenting the matching line below.
# plot_hist(max_values, 'max_values')
# plot_hist(min_values, 'min_values')
plot_hist(mean_values, 'mean_values')
# plot_hist(median_values, 'median_values')


In [None]:
# Look at one training sample: print its mean and draw the series.
idx = 10
sample = train[idx][1]  # element 1 of the dataset item, as in the statistics loop

sample_mean = sample.mean()
print(sample_mean)

plt.plot(sample)



* normalize data to 0,1 scale | check
* fill nans | check
* cut-out some piece | check
* reshape into matrix (7x52, square, other?) | check
* use this matrix as input into AE | check


Two possible orders for the transformations:
The first one 
1. cut-out some piece
2. fill na
3. scale 
4. reshape

The second one 
1. fill na
2. scale
3. cut-out some piece
4. reshape
