In [11]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2   

import sys
# Extend the import search path with ../src_jobs/ (presumably where the local
# modules imported by this notebook live - confirm against the repo layout).
sys.path.append('../src_jobs/')

In [12]:
import torch
import pickle
import warnings
import numpy as np
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from torch.utils.data import DataLoader
from pathlib import Path
from itertools import repeat
from artifact import Saw, Saw_centered
from data import ArtifactDataset, CachedArtifactDataset, RealisticArtifactDataset, CenteredArtifactDataset
from sliding_window_detector import SlidingWindowTransformerDetector
from utilities import parameters_k
from datetime import datetime
import pytz

# Runtime setup: select the "high" float32 matmul precision mode and ignore
# every warning whose message matches the "does not have many workers" regex.
torch.set_float32_matmul_precision("high")
warnings.filterwarnings("ignore", ".*does not have many workers.*")

In [3]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [13]:
# Every hyperparameter is kept in one plain dictionary so that the complete
# configuration can be handed to an experiment tracker (e.g. W&B) as-is.
CONFIG = {
    # width of window
    "width": 512,
    # convolution settings
    "convolution_features": [256, 128, 64, 32],
    "convolution_width": [5, 9, 17, 33],
    "convolution_dropout": 0.0,
    # transformer settings
    "transformer_heads": 2,
    "transformer_feedforward": 128,
    "transformer_layers": 2,
    "transformer_dropout": 0,
    # loss settings
    "loss": "label",
    "loss_boost_fp": 0,
    # artifact object handed to the datasets
    "artifact": Saw_centered(),
    # Optimizer parameters: none configured here.
    # Learning-rate scheduler: none configured here.
    # parameters for study
    "batch_size": 32,  # 'values': [32, 64, 128]
    # experiment-tracking names
    "wandb_group_name": "test_setup",
    "wandb_project_name": "artifactory",
}

All settings are collected in the `CONFIG` dictionary above.

In [14]:
# Location of the cached validation set; the window width is part of the file
# name, so each width maps to its own cache file.
val_file = Path(
    f"../data/validation_mask_noLondon{CONFIG['width']}.pkl"
)

# Names of the datasets used for validation. Entries that are commented out
# are excluded from the validation set.
val_datasets = [
    "australian_electricity_demand_dataset",
    "electricity_hourly_dataset",
    "electricity_load_diagrams",
    "HouseholdPowerConsumption1",
    # "HouseholdPowerConsumption2",
    # "london_smart_meters_dataset_without_missing_values",
    "solar_10_minutes_dataset",
    "wind_farms_minutely_dataset_without_missing_values",
    "ACSF1",
    "CinCECGTorso",
    "HouseTwenty",
    "Mallat",
    "MixedShapesRegularTrain",
    "Phoneme",
    "PigArtPressure",
    "PigCVP",
    "Rock",
    "SemgHandGenderCh2",
    "mitbih",
    "ptbdb",
    "etth",
    "ettm",
]

In [5]:
# Build the detector. Apart from `window` (stored as CONFIG["width"]), every
# constructor argument is stored in CONFIG under the same name, so those
# entries are forwarded by key.
model = SlidingWindowTransformerDetector(
    window=CONFIG["width"],
    **{
        key: CONFIG[key]
        for key in (
            "convolution_features",
            "convolution_width",
            "convolution_dropout",
            "transformer_heads",
            "transformer_feedforward",
            "transformer_layers",
            "transformer_dropout",
            "loss",
            "loss_boost_fp",
        )
    },
)
# Alternative model, kept for reference:
# model = ConvolutionDetector(convolution_features=[128, 64, 32],
#                             convolution_width=[5, 9, 33],
#                             convolution_dilation=[1, 1, 1],
#                             convolution_dropout=0.0,
#                             convolution_activation="sigmoid")

# Run name: <class name>_<parameters_k(model)>_<Europe/Amsterdam timestamp>.
model_name = (
    f"{model.__class__.__name__}"
    f"_{parameters_k(model)}"
    f"_{datetime.now(pytz.timezone('Europe/Amsterdam')).strftime('%d-%m-%Y_%H:%M:%S')}"
)
CONFIG.update(wandb_run_name=model_name)

# Names of the datasets used for training. Entries that are commented out are
# excluded from training.
train_datasets = [
    "australian_electricity_demand_dataset",
    "electricity_hourly_dataset",
    "electricity_load_diagrams",
    "HouseholdPowerConsumption1",
    "HouseholdPowerConsumption2",
    # "london_smart_meters_dataset_without_missing_values",
    # "solar_10_minutes_dataset",
    # "wind_farms_minutely_dataset_without_missing_values",
]
print(model_name)

SlidingWindowTransformerDetector_528.96K_27-02-2024_11:13:18


  rank_zero_warn(


Loading data.

In [15]:
def load_series(names: list[str], split: str, width=None, data_dir="../data/processed"):
    """Load the pickled series of several datasets and build sampling weights.

    For each dataset name the file ``{data_dir}/{name}_{split}.pickle`` is
    read. Series that are not strictly longer than ``width`` are dropped and
    the remaining ones are converted to ``float32`` NumPy arrays.

    Every dataset that contributes at least one series receives the same total
    weight, split evenly among its series; the weights are then normalised to
    sum to one.

    Args:
        names: Dataset names (file name prefixes).
        split: Split suffix used in the file name, e.g. ``"TRAIN"`` or ``"ALL"``.
        width: Series must be strictly longer than this to be kept. ``None``
            (default) falls back to ``CONFIG["width"]``.
        data_dir: Directory containing the pickle files.

    Returns:
        A ``(series, weights)`` tuple: the list of ``float32`` arrays and a 1-D
        array holding one weight per series (empty when no series was kept).

    Raises:
        OSError: If one of the pickle files cannot be opened.
    """
    if width is None:
        width = CONFIG["width"]

    series = []
    weights = []
    for name in names:
        # NOTE: pickle.load can execute arbitrary code - only use trusted files.
        with open(f"{data_dir}/{name}_{split}.pickle", "rb") as f:
            raw = [a for a in pickle.load(f) if len(a) > width]
        if not raw:
            # Previously `1 / len(raw)` raised ZeroDivisionError here; a dataset
            # without any sufficiently long series is skipped instead.
            warnings.warn(
                f"Dataset '{name}' ({split}) has no series longer than {width}; skipping it."
            )
            continue
        series.extend(np.array(a).astype(np.float32) for a in raw)
        weights.extend(repeat(1 / len(raw), len(raw)))
    weights = np.array(weights)
    return series, weights / weights.sum()

In [8]:
# Training data: load the TRAIN split of the selected datasets, wrap it in the
# artifact dataset and batch it with a DataLoader.
train_data, train_weights = load_series(train_datasets, "TRAIN")
train_dataset = RealisticArtifactDataset(
    train_data,
    width=CONFIG["width"],
    padding=64,
    artifact=CONFIG["artifact"],
    weight=train_weights,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG["batch_size"],
)

NameError: name 'train_datasets' is not defined

In [16]:
# Validation data: reuse the cached validation set when its file exists;
# otherwise generate 2048 samples from the ALL split and hand `val_file` to
# the generator as its target (presumably where the cache is written).
if val_file.exists():
    val = CachedArtifactDataset(file=val_file)
else:
    val_data, val_weights = load_series(val_datasets, "ALL")
    val_gen = RealisticArtifactDataset(
        val_data,
        width=CONFIG["width"],
        padding=64,
        artifact=CONFIG["artifact"],
        weight=val_weights,
    )
    val = CachedArtifactDataset.generate(val_gen, n=2048, to=val_file)
val_loader = DataLoader(
    val,
    batch_size=CONFIG["batch_size"],
)

In [180]:
# Rebuild the training loader with pinned memory explicitly switched off,
# then show how many entries the underlying dataset's `data` holds.
train_loader = DataLoader(
    train_dataset,
    batch_size=CONFIG["batch_size"],
    pin_memory=False,
)
len(train_loader.dataset.data)

3531

Sanity check: draw one batch from the training loader and inspect its data.

In [175]:
# Pull the first batch from the training loader and display its "data" entry.
batch = next(iter(train_loader))
batch["data"]

tensor([[ 0.0326,  0.7382, -0.0911,  ...,  1.5170,  1.7919,  1.8148],
        [-0.9875, -0.9875, -0.9875,  ...,  1.6123,  1.6773,  1.8073],
        [ 0.8637,  0.6117,  0.5052,  ..., -1.2407, -1.1910, -1.4004],
        ...,
        [-0.6785, -0.6785, -0.6785,  ...,  2.2258,  2.1807,  2.1243],
        [ 0.4400, -0.1251,  0.1009,  ...,  0.7628,  0.7467,  0.5207],
        [-0.6412, -0.5803, -0.5803,  ..., -0.5803, -0.5803, -0.5803]])

In [None]:
# Shape of the batch tensor.
batch["data"].shape

torch.Size([32, 512])

Training!

In [None]:
# W&B tracking is disabled in this notebook: the run initialisation below is
# commented out and kept for reference only.
# # Initialize W&B run
# run = wandb.init(project=CONFIG["wandb_project_name"], 
#         config=CONFIG,
#         entity="hvonhue",
#         group=CONFIG["wandb_group_name"], 
#         job_type='train',
#         name=CONFIG["wandb_run_name"])

# wandb.config.type = 'baseline'

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhvonhue[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [181]:
# Callbacks: keep only the best checkpoint according to the logged
# "validation" value (lower is better) and record the learning rate per step.
checkpointcallback = ModelCheckpoint(
    monitor="validation",
    mode="min",
    save_top_k=1,
)
lr_monitor = LearningRateMonitor(logging_interval='step')

# Logger: MLflow, with model logging set to "all".
logger = MLFlowLogger(log_model="all")

# Trainer: stop after 50000 steps, validate every 1000 training batches.
trainer = Trainer(
    logger=logger,
    max_steps=50000,
    val_check_interval=1000,
    callbacks=[checkpointcallback, lr_monitor],
)

# Run the training loop.
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name         | Type           | Params
------------------------------------------------
0 | convolutions | Sequential     | 142 K 
1 | f1_score     | BinaryF1Score  | 0     
2 | accuracy     | BinaryAccuracy | 0     
------------------------------------------------
142 K     Trainable params
0         Non-trainable params
142 K     Total params
0.569     Total estimated model params size (MB)


Epoch 0: : 44it [00:07,  6.08it/s, v_num=a79a, train_loss_step=0.0633, train_accuracy_step=0.932, train_f1_score_step=0.000]

In [None]:
# Counterpart of the commented-out W&B initialisation; kept commented out
# because no W&B run is started in this notebook.
# # End Wandb run
# run.finish()