In [1]:
%load_ext tensorboard
%load_ext autoreload
%autoreload 2

import torch
from torch import nn
import torch.nn.functional as F
from torch import utils
from torch.utils.data import DataLoader

import numpy as np
from tqdm import tqdm, trange
import os 
from datetime import datetime

# import tensorflow as tf
# import tensorboard as tb
# tf.io.gfile = tb.compat.tensorflow_stub.io.gfile

import sys
sys.path.append('/home/ivan.zorin/dev/code/ntl/')

from data.data import sgcc_train_test_split, SGCCDataset
from models.models import LSTMAE_old
from utils.utils import compute_roc_auc

In [2]:
# configs
# data_path = '/Users/ivan_zorin/Documents/DEV/data/sgcc/data.csv'

data_path = '/home/ivan.zorin/dev/data/sgcc/data.csv'
experiment_name = 'lstm_ae'
date = datetime.today().strftime('%Y-%m-%d_%H:%M:%S')
run_path = os.path.join('/home/ivan.zorin/dev/logs/', experiment_name, date)
scale = 'minmax'
nan_ratio = 0.7
batch_size = 32

input_size = 1
hidden_size = [64]
lr = 0.0001
factor = 0.5
patience = 3

N_epochs = 20
val_logging_step = 5

if torch.cuda.is_available():
    device = 'cuda'
else:
    assert False, 'cuda is not available'


# data
normal_dataset = SGCCDataset(path=data_path, label=0, scale=scale, nan_ratio=nan_ratio)
anomal_dataset = SGCCDataset(path=data_path, label=1, scale=scale)

train_data, val_data, test_normal_data = utils.data.random_split(normal_dataset, [len(normal_dataset) - 2*len(anomal_dataset), len(anomal_dataset), len(anomal_dataset)])
test_data = utils.data.ConcatDataset([test_normal_data, anomal_dataset])

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

# model and train utils
model = LSTMAE_old(input_size, hidden_size).to(device)
optim = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, mode='min', factor=factor, patience=patience, verbose=True)
loss_fn = nn.L1Loss()
logger = torch.utils.tensorboard.SummaryWriter(run_path)

In [3]:
def inspect_grad_norm(model, norm_type=2):
    name_norm = {}
    with torch.no_grad():
        for p in model.named_parameters():
            if p[1].grad is not None and p[1].requires_grad:
                name_norm[p[0]] = torch.norm(p[1], norm_type).item()
    
    return name_norm
            

In [33]:

model.train()
batch = next(iter(train_loader))

optim.zero_grad()
y, x, _ = batch
x = x.to(device)
z, x_hat = model(x)
loss = loss_fn(x, x_hat)

loss.backward()
optim.step()

In [37]:
norms = inspect_grad_norm(model)
norms

{'encoder.0.weight_ih_l0': 1.1585909128189087,
 'encoder.0.weight_hh_l0': 9.251689910888672,
 'encoder.0.bias_ih_l0': 1.153677225112915,
 'encoder.0.bias_hh_l0': 1.1564085483551025,
 'decoder.0.weight_ih_l0': 9.124003410339355,
 'decoder.0.weight_hh_l0': 1.101063847541809,
 'decoder.0.bias_ih_l0': 0.8523733019828796,
 'decoder.0.bias_hh_l0': 1.2631181478500366}

In [5]:
# training
train_len = len(train_loader)
val_len = len(val_loader)

for epoch in trange(N_epochs, total=N_epochs):
# for epoch in range(N_epochs):
    
    train_losses, val_losses = [], []
    train_embeddings, val_embeddings = [], []
    val_labels = []

    train_iterator = tqdm(train_loader, leave=False, desc='Train')
    val_iterator = tqdm(val_loader, leave=False, desc='Val')
    
    model.train()
    for i, batch in enumerate(train_iterator):
        optim.zero_grad()
        y, x, _ = batch
        x = x.to(device)
        z, x_hat = model(x)
        loss = loss_fn(x, x_hat)
        
        loss.backward()
        optim.step()
        
        train_losses.append(loss.item())
        train_embeddings.append(z.detach().cpu().numpy().squeeze())
        step = i + train_len * epoch
        logger.add_scalar('train/loss', loss.item(), step)
    
    
    train_embeddings = np.concatenate(train_embeddings)
    train_loss = sum(train_losses) / len(train_losses)
    logger.add_embedding(tag='train/embs', mat=train_embeddings, global_step=epoch)
    
    # inspect_grad_norm(model)
    # log gradient norms 
    # logger.

    model.eval()
    for i, batch in enumerate(val_iterator):
        with torch.no_grad():
            y, x, _ = batch
            x = x.to(device)
            z, x_hat = model(x)
            loss = loss_fn(x, x_hat)
            
            val_labels.append(y)
            val_losses.append(loss.item())
            val_embeddings.append(z.detach().cpu().numpy().squeeze())
            step = i + train_len * epoch
            logger.add_scalar('val/loss', loss.item(), step)
            
    val_loss = sum(val_losses) / len(val_losses)
    scheduler.step(val_loss)
    logger.add_scalars('loss', {'train': train_loss, 'val': val_loss}, epoch)
    
    val_embeddings = np.concatenate(val_embeddings)
    if epoch % val_logging_step == 0:
        logger.add_embedding(tag='val/embs', mat=val_embeddings, global_step=epoch)
    
    
    _, fig, (FPR, TPR, auc_score) = compute_roc_auc(val_losses, val_labels, pyplot=True)
    logger.add_scalar('val/auc-score', auc_score, epoch)
    logger.add_figure(tag='val/roc-auc', fig=fig, global_step=epoch)
    
    grad_norms = inspect_grad_norm(model)
    logger.add_scalars('grad_norm', grad_norms, epoch)
    
    
    
    

  0%|          | 0/20 [00:00<?, ?it/s]
[A




[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A



  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  array = numpy.asarray(array, order=order, dtype=dtype)
  0%|          | 0/20 [00:48<?, ?it/s]


ValueError: unknown format is not supported