In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random


from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import h5py
import os
import sys
import pickle
import json

from sklearn.model_selection import train_test_split
from timeit import default_timer
from collections import OrderedDict
import dadaptation

import wandb
import datetime

In [3]:
# Pick the compute backend: CUDA when present, otherwise Apple MPS, otherwise CPU
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"

# Weights & Biases project name passed to every wandb.init() call in this notebook
PROJECT_NAME = 'EVCharging'

# Set the random seeds to improve reproducibility by removing stochasticity
def set_seeds(seed):
    """Seed the Python, NumPy and PyTorch generators and switch PyTorch to deterministic mode.

    Arguments:
        seed (int) value handed to every random number generator
    """
    # Same seed for every generator, in a fixed order
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False  # Keep cuDNN from auto-tuning, so the same convolution algorithm is used each run
    cudnn.deterministic = True  # Restrict cuDNN to deterministic algorithms
    torch.use_deterministic_algorithms(True)  # Ask torch itself to use deterministic algorithms

# Fix every random number generator before any data or model is created
set_seeds(seed=0)

# cuBLAS workspace setting that goes with the deterministic mode enabled above
os.environ.update({'CUBLAS_WORKSPACE_CONFIG': ":4096:8"})

In [4]:
# Hyperparameters shared by the data-preparation and training cells
config = dict(
    train_val_split=[0.80, 0.20],  # (train, validation) fractions; these must sum to 1.0
    batch_size=32,  # Num samples per batch delivered by the data loaders
    EPOCHS=200,  # Num times to iterate over the entire dataset
    LEARNING_RATE=1e-3,  # Learning rate for the optimizer
    BETA1=0.9,  # Beta1 parameter for the Adam optimizer
    BETA2=0.999,  # Beta2 parameter for the Adam optimizer
    WEIGHT_DECAY=1e-4,  # Weight decay parameter for the Adam optimizer
    accum_iter=5,  # Num batches whose gradients are accumulated before an optimizer step
)


In [5]:
class ToTensor(object):
    """Convert a numpy array to a float32 PyTorch tensor on the requested device.

    Arguments:
        device (str or torch.device, optional) target device; defaults to "cpu"
    """
    def __init__(self, device=None):
        if device is None:
            device = "cpu"
        self.device = device

    def __call__(self, data):
        """Return `data` (numpy array) as a float32 tensor placed on `self.device`."""
        # A single code path for every device: previously the CPU branch kept the numpy
        # dtype (e.g. float64) while the accelerator branch cast to float32, so the same
        # pipeline produced different dtypes depending on the machine.
        # non_blocking=True lets host-to-device copies overlap with computation.
        return torch.from_numpy(data).to(self.device, non_blocking=True, dtype=torch.float32)

# %%
def get_transforms(transform_dict):
    """
    Build the composed preprocessing pipeline described by a dictionary of transform parameters
    Arguments:
        transform_dict (OrderedDict) with optional keys:
            ToTensor (dict) if present, requires the 'device' key that indicates the PyTorch device
    Returns:
        composed_transforms (PyTorch composed transform class) containing the requested transform steps in order
    """
    # Only the 'ToTensor' entry is recognised; any other key contributes no step
    steps = [
        ToTensor(params['device'])  # Convert array to a PyTorch Tensor
        for name, params in transform_dict.items()
        if name == 'ToTensor'
    ]
    return transforms.Compose(steps)

# %%
# create a torch dataset
class EVCoordDataset(torch.utils.data.Dataset):
    """Dataset pairing each input row with the output row at the same position.

    Arguments:
        input_data (indexable) model inputs; its length defines the dataset length
        output_data (indexable) targets, indexed with the same positions as input_data
        transform (callable, optional) applied separately to the input and to the output item
    """
    def __init__(self, input_data, output_data, transform=None):
        self.input_data = input_data
        self.output_data = output_data
        self.transform = transform

    def __len__(self):
        return len(self.input_data)

    def __getitem__(self, idx):
        # Tensor indices are converted to plain Python values before indexing
        index = idx.tolist() if torch.is_tensor(idx) else idx

        sample = (self.input_data[index], self.output_data[index])
        if not self.transform:
            return sample

        # Input first, then output, each through the same transform
        return tuple(self.transform(part) for part in sample)

# %%
def make_split_artifact(run, train_rows, val_rows):
    """
    Creates a w&b artifact that contains the train and validation rows of the raw data
    Arguments:
        run (wandb run) returned from wandb.init()
        train_rows (list of ints) indices that reference the training rows in the raw_data
        val_rows (list of ints) indices that reference the validation rows in the raw_data
    """
    split_artifact = wandb.Artifact(
        'data-splits', type='dataset',
        description='Train, validation, test dataset splits')

    # Our data split artifact will only store index references to the original dataset to save space.
    # Both tables are built the same way and expose a single 'indices' column, which is the
    # column make_loaders() reads back; the validation table previously also passed a stray
    # columns=['source'] that disagreed with its own DataFrame.
    for table_name, rows in (('train-data', train_rows), ('val-data', val_rows)):
        split_artifact.add(
            wandb.Table(data=pd.DataFrame(rows, columns=['indices'])), table_name)

    run.log_artifact(split_artifact)


def make_loaders(config, input_data, output_data):
    """
    Makes data loaders using a artifact containing the dataset splits (created using the make_split_artifact() function)
    The function assumes that you have created a data-splits artifact and a data-transforms artifact
    Arguments:
        config [dict] containing keys:
            batch_size (int) amount of rows (i.e. data instances) to be delivered in a single batch
        input_data (array) model inputs; must support indexing with a numpy array of row indices
        output_data (array) targets aligned row-by-row with input_data
    Returns:
        train_loader (PyTorch DataLoader) containing the training data
        val_loader (PyTorch DataLoader) containing the validation data
    """
    with wandb.init(project=PROJECT_NAME, job_type='package-data', config=config) as run:
        # Load transforms (the file is closed as soon as it has been parsed)
        transform_dir = run.use_artifact('data-transforms:latest').download()
        with open(os.path.join(transform_dir, 'transforms.txt')) as transform_file:
            transform_dict = json.load(transform_file, object_pairs_hook=OrderedDict)
        composed_transforms = get_transforms(transform_dict)

        split_artifact = run.use_artifact('data-splits:latest')

        # Load splits
        # its a wandb.Table data type so we can use the get() method
        train_rows = split_artifact.get('train-data').get_column('indices', convert_to='numpy')
        val_rows = split_artifact.get('val-data').get_column('indices', convert_to='numpy')

        # Inputs AND outputs are restricted to the same rows. Passing the full output
        # array (as before) paired input i of the split with output i of the raw data,
        # i.e. with the wrong sample.
        train_loader = DataLoader(
            EVCoordDataset(
                input_data[train_rows], output_data[train_rows], transform=composed_transforms),
            batch_size=config['batch_size'],
            shuffle=True,
            num_workers=0
        )
        val_loader = DataLoader(
            EVCoordDataset(
                input_data[val_rows], output_data[val_rows], transform=composed_transforms),
            batch_size=config['batch_size'],
            shuffle=False,
            num_workers=0
        )

    return train_loader, val_loader

In [7]:
with wandb.init(project=PROJECT_NAME, job_type="split-data", config=config) as run:

    # Fetch the raw data artifact
    raw_data = run.use_artifact('jyyresearch/EVCharging/EV-coord-raw_data:latest', type='raw_data')

    raw_data_dir = raw_data.download('./Data/')

    # Read both arrays fully into memory from a single handle, then close the HDF5 file
    # (it was previously opened twice and never closed)
    with h5py.File(os.path.join(raw_data_dir, 'raw_data.hdf5'), 'r') as raw_file:
        input_data = raw_file['input_data'][:]
        output_data = raw_file['output_data'][:]

    # Random train/validation split of the row indices; the first fraction in
    # config['train_val_split'] sets the size of the training part
    train_val_split = config['train_val_split']
    train_val_indices = np.split(np.random.permutation(len(input_data)), [int(train_val_split[0]*len(input_data))])

    make_split_artifact(run, train_val_indices[0], train_val_indices[1])
    

[34m[1mwandb[0m: Downloading large artifact EV-coord-raw_data:latest, 669.34MB. 1 files... Done. 0:0:0


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [8]:
# Describe the preprocessing steps and publish them as a W&B artifact
with wandb.init(project=PROJECT_NAME, job_type='define-transforms', config=config) as run:
    transform_dict = OrderedDict(ToTensor={'device': DEVICE})
    # Tag every transform with its position so the order can be verified later
    for key_idx, key in enumerate(transform_dict):
        transform_dict[key]['order'] = key_idx
    # Artifact holding the transform parameters
    data_transform_artifact = wandb.Artifact(
        'data-transforms',
        type='parameters',
        description='Data preprocessing functions and parameters.',
        metadata=transform_dict,  # Optional for viewing on the web app; the data is also stored in the txt file below
    )
    # The same parameters, written as JSON text inside the artifact
    with data_transform_artifact.new_file('transforms.txt') as f:
        f.write(json.dumps(transform_dict, indent=4))
    run.log_artifact(data_transform_artifact)

# Record the transform parameters in the shared config as well
config.update(transform_dict)

train_loader, val_loader = make_loaders(config, input_data=input_data, output_data=output_data)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [11]:
# Learning-rate schedule settings, passed to the StepLR scheduler in the training cell
step_size, gamma = 30, 0.5  # scheduler steps between decays, multiplicative decay factor

In [22]:
def train(model, device, train_loader, optimizer, config):
    """
    Run one training epoch with gradient accumulation.
    Arguments:
        model (nn.Module) network whose outputs lie in [0, 1] (required by BCELoss)
        device (str or torch.device) device the batches are moved to
        train_loader (PyTorch DataLoader) yields (input, target) batches; must expose `.dataset`
        optimizer (PyTorch optimizer) stepped once every config['accum_iter'] batches
        config [dict] containing keys:
            accum_iter (int) number of batches whose gradients are accumulated per optimizer step
    Returns:
        train_loss (float) summed binary cross entropy divided by the number of dataset samples
    """
    model.train()
    train_loss = 0.0

    # binary cross entropy loss, dont average
    myloss = nn.BCELoss(reduction='sum')

    accum_iter = config['accum_iter']
    n_batches = len(train_loader)

    # Drop any gradients left over from before this call
    optimizer.zero_grad(set_to_none=True)

    for batch_idx, (input_data, output_data) in enumerate(train_loader):

        input_data = input_data.to(device)
        # Targets must live on the same device as the predictions
        output_data = output_data.to(device)

        predicted_output = model(input_data)

        # calculate loss
        loss = myloss(predicted_output, output_data)
        # Only the gradient is scaled for accumulation; the reported loss stays unscaled
        (loss / accum_iter).backward()

        # perform gradient accumulation: step on every accum_iter-th batch and on the last batch
        if ((batch_idx + 1) % accum_iter == 0) or (batch_idx + 1 == n_batches):
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

        # Accumulate the true batch loss. Adding the accum_iter-scaled value (as before)
        # under-reported the training loss by a factor of accum_iter, so it was not
        # comparable with the validation loss.
        train_loss += loss.item()

    train_loss /= len(train_loader.dataset)
    return train_loss

In [33]:
def validate(model, device, valid_loader, config):
    """
    Evaluate the model on a data loader without tracking gradients.
    Arguments:
        model (nn.Module) network whose outputs lie in [0, 1] (required by BCELoss)
        device (str or torch.device) device the batches are moved to
        valid_loader (PyTorch DataLoader) yields (input, target) batches; must expose `.dataset`
        config [dict] accepted for signature symmetry with train(); not read here
    Returns:
        valid_loss (float) summed binary cross entropy divided by the number of dataset samples
        data_list (list of tensors) input batches, in loader order
        output_list (list of tensors) target batches, in loader order
        predicted_output_list (list of tensors) model predictions, in loader order
    """
    model.eval()
    valid_loss = 0

    data_list = []
    output_list = []
    predicted_output_list = []

    # binary cross entropy loss, dont average
    myloss = nn.BCELoss(reduction='sum')

    with torch.no_grad():

        for batch_idx, (input_data, output_data) in enumerate(valid_loader):
            input_data = input_data.to(device)
            # Targets must live on the same device as the predictions
            output_data = output_data.to(device)

            predicted_output = model(input_data)

            # calculate loss
            loss = myloss(predicted_output, output_data)

            valid_loss += loss.item()

            data_list.append(input_data)
            output_list.append(output_data)
            predicted_output_list.append(predicted_output)

    valid_loss /= len(valid_loader.dataset)

    return valid_loss, data_list, output_list, predicted_output_list

In [44]:
class BaseNet(nn.Module):
    """Fully connected network: three SELU + LayerNorm hidden layers and a sigmoid output.

    Arguments:
        input_size (int) number of input features
        output_size (int) number of outputs, each mapped to (0, 1) by the sigmoid
        n_neurons (int) width of the first hidden layer; the other two hidden layers are 2*n_neurons wide
    """
    def __init__(
        self, input_size: int, output_size: int, n_neurons: int,
        ):

        super(BaseNet, self).__init__()

        self.input_size = input_size

        # Hidden layers are created and initialised one at a time, in this order, so the
        # random numbers drawn under a fixed seed are the same as before
        self.fc1 = self._lecun_linear(input_size, n_neurons)
        self.fc2 = self._lecun_linear(n_neurons, 2*n_neurons)
        self.fc3 = self._lecun_linear(2*n_neurons, 2*n_neurons)

        # Output layer keeps the default nn.Linear initialisation
        self.fc4 = nn.Linear(2*n_neurons, output_size)

        self.sigmoid = nn.Sigmoid()

        self.selu = nn.SELU(inplace=True)

        # layer normalization
        self.layer_norm1 = nn.LayerNorm(n_neurons)
        self.layer_norm2 = nn.LayerNorm(2*n_neurons)
        self.layer_norm3 = nn.LayerNorm(2*n_neurons)

    @staticmethod
    def _lecun_linear(in_features: int, out_features: int) -> nn.Linear:
        """Linear layer with LeCun-normal weights (kaiming normal, fan_in, gain of a linear unit)."""
        layer = nn.Linear(in_features, out_features)
        nn.init.kaiming_normal_(layer.weight, mode='fan_in', nonlinearity='linear')
        return layer

    def forward(self, x):
        """Return one value in (0, 1) per output unit for every row of `x`."""
        # hidden blocks: linear -> SELU -> layer norm
        x = self.layer_norm1(self.selu(self.fc1(x)))
        x = self.layer_norm2(self.selu(self.fc2(x)))
        x = self.layer_norm3(self.selu(self.fc3(x)))

        # The sigmoid is applied straight to the output logits. An extra SELU here (as
        # before) bounds the logits below at about -1.76, so the network could never
        # predict less than ~0.147 and binary targets of 0 were unreachable.
        return self.sigmoid(self.fc4(x))

In [37]:
# Inspect the shape of the raw input array (its last dimension is used as the model's input_size below)
input_data.shape

(3000, 14796)

In [45]:
# for deterministic pytorch algorithms, enable reproducibility.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# Hidden-layer widths to try; one model is trained, saved and versioned per entry
n_list = [80]

BETA1 = config["BETA1"]
BETA2 = config["BETA2"]

EPOCHS = config["EPOCHS"]
BATCH_SIZE = config["batch_size"]

MODEL_NAME = "model_BaseNet"

# Paths of the best checkpoint written for each width
LOCAL_MODEL_DIR_LIST = []

for i, width in enumerate(n_list):

    min_valid_loss = np.inf

    N_neurons_str = str(width)

    MODEL_NAME_RUN = MODEL_NAME + N_neurons_str
    LOCAL_MODEL_DIR = 'Model/' + MODEL_NAME_RUN + '.pt'
    LOCAL_MODEL_DIR_LIST.append(LOCAL_MODEL_DIR)
    # Make sure the checkpoint folder exists before the first torch.save
    os.makedirs(os.path.dirname(LOCAL_MODEL_DIR), exist_ok=True)

    today = datetime.datetime.now()

    model = BaseNet(
        input_size=input_data.shape[-1],
        output_size=output_data.shape[-1],
        n_neurons=width,
    ).to(device=DEVICE)

    params = list(model.parameters())
    # DAdaptAdam is the optimizer actually used for training (an AdamW instance was
    # previously created here and immediately overwritten, so it has been dropped).
    optimizer = dadaptation.DAdaptAdam(
        params, lr=1, log_every=5, betas=(BETA1, BETA2), growth_rate=1.01
    )
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    train_loss_list = []
    valid_loss_list = []

    data_list = []
    output_list = []
    predicted_output_list = []

    # The logged config now describes what this cell really runs: DAdaptAdam with its
    # configured lr and the summed BCE loss (it used to report AdamW, "1e-3" and "L2Loss").
    # The keys are unchanged.
    with wandb.init(
        anonymous="allow",
        project=PROJECT_NAME,
        name=today.strftime("%Y%m%d_%H%M"),
        config={
            "epochs": EPOCHS,
            "optimizer": "DAdaptAdam",
            "batch_size": BATCH_SIZE * config["accum_iter"],  # effective batch size per optimizer step
            "lr": optimizer.param_groups[0]["lr"],
            "step_size": step_size,
            "gamma": gamma,
            "width": width,
            "loss ": "BCELoss (sum)",
            "activation func": "SELU",
            "lr decay": "steplr, gamma={}".format(gamma),
            "architecture": "BaseNet",
        },
    ):

        for epoch in range(1, EPOCHS + 1):

            # get current learning rate (read before this epoch's scheduler step)
            curr_lr = optimizer.param_groups[0]["lr"]

            train_loss = train(model, DEVICE, train_loader, optimizer, config)

            scheduler.step()

            valid_loss, data_list, output_list, predicted_output_list = validate(
                model, DEVICE, val_loader, config
            )
            print(
                "Epoch: {:03d}, Train Loss: {:.7f}, Valid Loss: {:.7f}, LR: {:.7f}".format(
                    epoch, train_loss, valid_loss, curr_lr
                )
            )
            # wandb.log({'train_loss': train_loss, 'valid_loss': valid_loss})
            train_loss_list.append(train_loss)
            valid_loss_list.append(valid_loss)

            # Keep only the checkpoint with the lowest validation loss seen so far
            if valid_loss < min_valid_loss:
                print(
                    "Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...".format(
                        min_valid_loss, valid_loss
                    )
                )
                min_valid_loss = valid_loss
                best_epoch = epoch
                torch.save(model.state_dict(), LOCAL_MODEL_DIR )
                print("Saved model at epoch {}".format(epoch))

    # version control model: the training run above is closed by its `with` block
    # before this separate 'version-model' run is opened
    with wandb.init(project=PROJECT_NAME, job_type='version-model', config=config) as run:
        trained_model_at = wandb.Artifact(PROJECT_NAME + N_neurons_str, type="model", description="trained baseline for " + PROJECT_NAME)
        trained_model_at.add_file(LOCAL_MODEL_DIR, name=MODEL_NAME_RUN + '.pt')
        run.log_artifact(trained_model_at)

RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# version control model
# NOTE(review): this cell looks carried over from an FNO2D experiment. LOCAL_MODEL_IPHI_DIR
# is not defined anywhere in the visible notebook, so confirm it exists before running.
run = wandb.init(project=PROJECT_NAME, job_type='version-model', config=config)
trained_model_at = wandb.Artifact("FNO2D", type="model", description="trained baseline for FNO2D")
trained_model_at.add_file(LOCAL_MODEL_DIR, name='model_FNO2D.pt')
trained_model_at.add_file(LOCAL_MODEL_IPHI_DIR, name='model_iphi_FNO2D.pt')
# Aliases are attached when the artifact is logged; they are not an Artifact() argument
run.log_artifact(trained_model_at, aliases=['baseline', 'latest'])
run.finish()

In [None]:
# Download the versioned FNO2D weights and load them for inference
# NOTE(review): FNO2D, modes and s are not defined anywhere in the visible notebook.
# This cell looks carried over from another experiment, so confirm before running.
run = wandb.init(project=PROJECT_NAME, job_type="inference", config=config)
trained_model_at = run.use_artifact("FNO2D:latest", type="model")
model_dir = trained_model_at.download()

# load best model
model = FNO2D.FNO2d(modes, modes, width=32, in_channels=3, out_channels=1, s1=s, s2=s).cuda()
model_iphi = FNO2D.IPHI_constant(width=32).cuda()

# load_state_dict() fills the module in place and returns a report of missing/unexpected
# keys, not the module. Assigning its result back (as before) replaced both models with
# that report, so the calls are made without reassignment.
model.load_state_dict(torch.load(os.path.join(model_dir, 'model_FNO2D.pt')))
model_iphi.load_state_dict(torch.load(os.path.join(model_dir, 'model_iphi_FNO2D.pt')))
run.finish()

In [None]:
# Evaluate the loaded model on the validation loader. DEVICE is the device name defined at
# the top of this notebook; the lowercase `device` used before is not defined in it.
valid_loss, data_list, output_list, damage_list = validate(model, DEVICE, val_loader, config)