In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import dadaptation
import random


from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

import h5py
import os
import sys
import pickle
import json

from utilities3 import *
from sklearn.model_selection import train_test_split
from Adam import Adam
from timeit import default_timer
from collections import OrderedDict

# import local modules from FNO2D.py
import FNO2D


import wandb
import datetime

  warn(f"Failed to load image Python extension: {e}")


# Loading dataset into an Artifact in wandb

First step is to load the dataset from .csv files, and add it as an artifact. Artifacts are useful for:
1) Dataset versioning
2) Supports deduplication which minimizes the storage space used when generating versions of the dataset.

In [9]:
gc_path = "./Data/gc_samples_filtered.csv"
d_path = "./Data/d_samples_filtered.csv"
coordinates_path = "./Data/coordinates_n"

damage_x_path = "./Data/x_ver"
damage_y_path = "./Data/y_ver"

In [10]:
# read data from csv file
gc = pd.read_csv(gc_path, header=None)
d = pd.read_csv(d_path, header=None)

# coordinates data from csv file
coordinates = pd.read_csv(coordinates_path, header=None)

# output mesh coordinates
damage_x = pd.read_csv(damage_x_path, header=None)
damage_y = pd.read_csv(damage_y_path, header=None)


In [11]:
# concatenate damage_x and damage_y
damage_xy = np.concatenate((damage_x, damage_y), axis=1)

In [12]:
# create numpy array with shape (len(gc), gc.shape[1], 2)
input_mesh = np.zeros((len(gc), gc.shape[1], 2))
damage_mesh = np.zeros((len(gc), damage_xy.shape[0], 2))

# for each sample in input_mesh, add the coordinates
for i in range(len(gc)):
    input_mesh[i, :, :] = coordinates
    damage_mesh[i, :, :] = damage_xy
    
# add gc to the last dimension of input_mesh to have shape of (len(gc), gc.shape[1], 3)
input_data = np.concatenate((input_mesh, np.expand_dims(gc, axis=2)), axis=2)
input_data = torch.from_numpy(input_data).float()

damage_data = np.concatenate((damage_mesh, np.expand_dims(d, axis=2)), axis=2)
damage_data = torch.from_numpy(damage_data).float()

In [4]:
run = wandb.init(project="FNO2D", entity="jyyresearch", job_type="upload")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjyyresearch[0m (use `wandb login --relogin` to force relogin)


In [5]:
raw_data_at = wandb.Artifact("fracture-damage-raw-data", type="raw_data")

### Convert dataset to hdf5 format for efficient storage

In [14]:
# convert input_data and damage_data to h5py file
input_data_h5 = input_data.numpy()
damage_data_h5 = damage_data.numpy()

# save h5py file
with h5py.File("Data/input_data.h5", "w") as f:
    f.create_dataset("input_data", data=input_data_h5)
    
with h5py.File("Data/damage_data.h5", "w") as f:
    f.create_dataset("damage_data", data=damage_data_h5)

In [16]:
# add h5py file to artifact
raw_data_at.add_file("Data/input_data.h5", name="input_data")
raw_data_at.add_file("Data/damage_data.h5", name="damage_data")

<ManifestEntry digest: pa0A9QJaDxjKYPrY3FclPg==>

In [16]:
run.log_artifact(raw_data_at)
run.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [6]:
artifact = run.use_artifact('jyyresearch/FNO2D/fracture-damage-raw-data:v3', type='raw_data')
artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact fracture-damage-raw-data:v3, 423.10MB. 2 files... Done. 0:0:0


In [7]:
artifact_dir + "/input_data.h5"

'./artifacts/fracture-damage-raw-data:v3/input_data.h5'

In [11]:
input_data = h5py.File(artifact_dir + "/input_data", "r")
damage_data = h5py.File(artifact_dir + "/damage_data", "r")

In [12]:
DEVICE = 'gpu'
PROJECT_NAME = 'FNO2D'

# Set the random seeds to improve reproducibility by removing stochasticity
def set_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False # Force cuDNN to use a consistent convolution algorithm
    torch.backends.cudnn.deterministic = True # Force cuDNN to use deterministic algorithms if available
    torch.use_deterministic_algorithms(True) # Force torch to use deterministic algorithms if available

set_seeds(0)

In [13]:
config = {
    'train_val_split': [0.80, 0.20], # These must sum to 1.0
    'batch_size' : 32, # Num samples to average over for gradient updates
}

In [17]:
# Define an initial set of transforms that we think will be useful
with wandb.init(project=PROJECT_NAME, job_type='define-transforms', config=config) as run:
    transform_dict = OrderedDict()
    transform_dict['ToTensor'] = {
        'device': DEVICE
    }
    # Include an operational index to verify the order
    for key_idx, key in enumerate(transform_dict.keys()):
        transform_dict[key]['order'] = key_idx
    # Create an artifact for logging the transforms
    data_transform_artifact = wandb.Artifact(
        'data-transforms', type='parameters',
        description='Data preprocessing functions and parameters.',
        metadata=transform_dict) # Optional for viewing on the web app; the data is also stored in the txt file below
    # Log the transforms in JSON format
    with data_transform_artifact.new_file('transforms.txt') as f:
        f.write(json.dumps(transform_dict, indent=4))
    run.log_artifact(data_transform_artifact)
config.update(transform_dict)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

### define some helper functions for transforming numpy to tensors

In [18]:
class ToTensor(object):
    """Convert numpy arrays to tensor arrays
    """
    def __init__(self, device=None):
        if device is None:
            device = "cpu"
        self.device = device
    
    def __call__(self, data_tuple):
        data, labels = data_tuple
        return (torch.from_numpy(data).to(self.device), torch.from_numpy(labels).to(self.device))

In [21]:
def get_transforms(transform_dict):
    """
    Given a dictionary of transform parameters, return a list of class instances for each transform
    Arguments:
        transform_dict (OrderedDict) with optional keys:
            ToTensor (dict) if present, requires the 'device' key that indicates the PyTorch device
    Returns:
        composed_transforms (PyTorch composed transform class) containing the requested transform steps in order
    """
    transform_functions = []
    for key in transform_dict.keys():
        if key=='ToTensor': # Convert array to a PyTorch Tensor
            transform_functions.append(ToTensor(
                transform_dict[key]['device']
            ))
        
    composed_transforms = transforms.Compose(transform_functions)
    return composed_transforms

In [57]:
# train test split of input_data and damage_data. Obtain the respective indices
train_val_split = config['train_val_split']
train_val_indices = np.split(np.random.permutation(len(input_data)), [int(train_val_split[0]*len(input_data))])

# create train and test dataset from input_data and damage_data
train_dataset = torch.utils.data.TensorDataset(input_data[train_val_indices[0]], damage_data[train_val_indices[0]])
val_dataset = torch.utils.data.TensorDataset(input_data[train_val_indices[1]], damage_data[train_val_indices[1]])



In [None]:
def make_split_artifact(run, raw_data, train_rows, val_rows, test_rows):
    """
    Creates a w&b artifact that contains a singular reference table (aka a ForeignIndex table).
    The ForeignIndex table has a single column that we are naming 'source'.
    It contains references to the original table (raw_data_table) for each of the splits.
    Arguments:
        run (wandb run) returned from wandb.init()
        raw_data_table (wandb Table) that contains your original tabular data
        train_rows (list of ints) indices that reference the training rows in the raw_data_table
        val_rows (list of ints) indices that reference the validation rows in the raw_data_table
        test_rows (list of ints) indices that reference the test rows in the raw_data_table
    """
    split_artifact = wandb.Artifact(
        'data-splits', type='dataset',
        description='Train, validation, test dataset splits')

    # Our data split artifact will only store index references to the original dataset to save space
    split_artifact.add(wandb.Table(
        columns=['source'],
        data=train_rows), 'train-data')

    split_artifact.add(wandb.Table(
        columns=['source'],
        data=val_rows), 'val-data')

    run.log_artifact(split_artifact)


def make_loaders(config):
    """
    Makes data loaders using a artifact containing the dataset splits (created using the make_split_artifact() function)
    The function assumes that you have created a data-splits artifact and a data-transforms artifact
    Arguments:
        config [dict] containing keys:
            batch_size (int) amount of rows (i.e. data instances) to be delivered in a single batch
    Returns:
        train_loader (PyTorch DataLoader) containing the training data
        val_loader (PyTorch DataLoader) containing the validation data
    """
    with wandb.init(project=PROJECT_NAME, job_type='package-data', config=config) as run:
        # Load transforms
        transform_dir = run.use_artifact('data-transforms:latest').download()
        transform_dict = json.load(open(os.path.join(transform_dir, 'transforms.txt')), object_pairs_hook=OrderedDict)
        composed_transforms = get_transforms(transform_dict)

        split_artifact = run.use_artifact('data-splits:latest')
        # Reformat data to (inputs, labels)
        train_loader = DataLoader(
            train_dataset,
            batch_size=config['batch_size'],
            shuffle=True,
            num_workers=0
        )
        val_loader = DataLoader(
            val_dataset,
            batch_size=config['batch_size'],
            batch_sampler=None,
            shuffle=False,
            num_workers=0)
    
    return train_loader, val_loader