# 1. Requirements

### Using Docker

In [4]:
# TODO
# Pull one of our docker images
# docker pull gerome_v/...

# Run docker container using pulled docker image

### Using gitlab repo (supergcn_2.0 repo feature/batchload)

In [5]:
# !git checkout feature/batchload
# !git pull

/bin/sh: 1: git: not found
/bin/sh: 1: git: not found


# 2. Hydra components

1. Dataset
    - Dataset setup
        - save and specify which file paths to load.
    - EdgeCriterion
        - criterion for initial graph construction.
   
2. Model
    - torch.nn.Module
3. Trainer
    - transductive training ('full' mode) or inductive training ('batch' mode).
4. Config
    - global dictionary containing information about all the classes and configuration from user defined yaml file

**Note:** You can skip straight to the Config section (item 4 above, section 2.4 below) if you do not need to add custom components.

### 

## 2.1 Dataset setup
- Inherit from `DatasetSetupBuilder` so that Hydra knows about this dataset.

In [None]:
# ./supergcn_2.0/base/datasets/dataset_setup.py

# A dataset setup class always inherits from DatasetSetupBuilder.
class MyDataset(DatasetSetupBuilder):
    """Example dataset setup that records where the dataset files are stored.

    Inheriting from ``DatasetSetupBuilder`` is what makes the dataset known
    to Hydra. The instance attribute ``dirs`` maps each input/target name to
    the location of the corresponding file(s).
    """

    def __init__(self):
        super(MyDataset, self).__init__()

        # This dictionary will be used internally when loading the dataset.
        # You just have to specify where the files are located.
        # It must be assigned inside a method: `self` does not exist in the
        # class body, so assigning it there raises NameError.
        # NOTE(review): `folder_dir` and `general_dir` are not defined in this
        # snippet -- presumably path prefixes provided by the surrounding
        # module; confirm before use.
        # NOTE(review): assumes the base __init__ does not need `self.dirs`
        # to be set beforehand -- confirm against DatasetSetupBuilder.
        self.dirs = {
            'img': [folder_dir],
            'img_paths': [general_dir + 'MNIST_numpy/img_paths.npy'],
            'feat': [general_dir + 'MNIST_numpy/features.npy'],
            'seq': [general_dir + 'MNIST_numpy/seq_data.npy'],

            # Define targets here
            'targets': [general_dir + 'MNIST_numpy/targets.npy'],

            # Loss function
            'loss': [{'loss': 'CrossEntropyLoss', 'wts': 1.0}],
            'meta': general_dir + 'MNIST_numpy/meta_data.npy',
            'adj': general_dir + 'MNIST_numpy',

            # The 0th column in the meta data matrix will be used as adjacency
            'adjtoinput': {
                'img': [0],
                'feat': [0],
                'seq': [0],
            },
        }

    def build_dataset(self):
        """Build the dataset files (intentionally left empty in this example).

        This method is called in order to build the datasets. You can write
        code here that builds the numpy dataset and saves it to the
        filesystem.
        """
        pass

## 2.2 Graph construction

In [None]:
# `./base/configs/default.yaml` 
# Specify which columns in your meta data you want to use
# This will be used when constructing the initial graph
# Below we are using 0th, 1st, and 2nd column of the meta data
meta_columns:
    - 0 
    - 1
    - 2

# `./base/datasets/dataset_base.py`
# Define similarity metric using EdgeCriterionBuilder
class MyDatasetEdgeCriterion(EdgeCriterionBuilder):
    """Example edge criterion: connect nodes whose meta values are close."""

    def __init__(self):
        super(MyDatasetEdgeCriterion, self).__init__()

    @staticmethod
    def edge_criterion(meta_data, meta_col):
        """Return the N x N adjacency matrix for one meta-data column.

        Two nodes are connected when the absolute difference of their values
        in column ``meta_col`` does not exceed that column's threshold.

        Args:
            meta_data: 2-D array-like indexed as ``[node, column]``. Objects
                exposing ``detach()`` (tensor-like) are converted to NumPy
                first.
            meta_col: Integer column index; also selects the threshold.

        Returns:
            Boolean ``numpy.ndarray`` of shape (N, N), where N is the number
            of rows (nodes) in ``meta_data``.

        Raises:
            IndexError: If ``meta_col`` has no entry in the threshold list or
                is not a valid column of ``meta_data``.
        """
        # Thresholds you want to use for every meta information
        threshold = [2, 0, 5]  # Dummy dataset age, gender, weight

        # Normalise the input to an ndarray once. The previous version called
        # `.numpy()` on the comparison result, which fails for NumPy input
        # because ndarrays have no such method.
        if hasattr(meta_data, 'detach'):
            meta_data = meta_data.detach().cpu().numpy()
        column = np.asarray(meta_data)[:, meta_col]

        # Define whichever similarity metric you want for as long as
        # you return an N x N numpy array.
        # Broadcasting (1, N) against (N, 1) yields all pairwise differences.
        dist = np.abs(column[None, :] - column[:, None])
        return dist <= threshold[meta_col]

## 2.3 Model
- inherit from `ModelBuilder`
- Model configuration can be specified within the yaml file
- `forward` method's input is a dictionary and should also return a dictionary

In [None]:
# Example GCN model
class MyModel(ModelBuilder):
    """Example GCN model: a stack of ``tg.GCNConv`` layers sharing one activation.

    The number of layers and the activation are read from ``conf``.
    """

    def __init__(self, conf, idx):
        """Create ``conf.layers`` GCN layers and instantiate the activation.

        Args:
            conf: Model configuration; ``layers`` and ``activation`` are read
                here, and it is forwarded to ``self.set_kwargs``.
            idx: Forwarded unchanged to ``self.set_kwargs``.
        """
        super().__init__()

        # One GCNConv per configured layer; keyword arguments for each layer
        # come from set_kwargs.
        self.model_list = nn.ModuleList(
            [
                tg.GCNConv(**self.set_kwargs(conf, layer, idx))
                for layer in range(conf.layers)
            ]
        )

        # Look the activation class up by name in the global registry.
        activation_cls = Config.global_dict['model'][conf.activation.name]
        # NOTE(review): conf.activation is unpacked as a whole, so a `name`
        # entry (if present) is passed along as a keyword too -- confirm the
        # registered activation accepts it.
        self.activation = activation_cls(**conf.activation)

    def forward(self, x_dict):
        """Apply every layer, each followed by the activation.

        Args:
            x_dict: Dictionary with the keys 'input' and 'adj', and
                optionally 'adj_wts' (treated as None when missing).

        Returns:
            A new dictionary holding every entry of ``x_dict``, with 'input'
            replaced by the output of the last layer and 'adj' set to the
            adjacency that was passed in.
        """
        x = x_dict['input']
        adj = x_dict['adj']
        adj_wts = x_dict.get('adj_wts', None)

        for gnn_layer in self.model_list:
            x = self.activation(gnn_layer(x, adj, adj_wts))

        # Make sure we return all objects within the dict, plus the new output
        return {**x_dict, 'input': x, 'adj': adj}



# Example yaml configuration showing how to use the model above
# We always need DynamicBlock / AggregatorBlock pairs
MyModel:
    # -----------------------------------------
    # Pair 0
    DynamicBlock0:
        # The distributor is a list of indices specifying which inputs you want to use.
        # With the MyDataset example above, Hydra will yield a "batch" of images, a feature vector,
        # and a sequential input:
        # [{'input': image_tensor}, {'input': feat_tensor}, {'input': sequential_tensor}]

        # The distributor selects which of these input tensors will be used.
        # You can leave this blank as `distributor:` and the system will automatically generate a list [0, 1, 2].
        distributor:
            - 0        
        ParallelGNN:
            Model0:
                model: MyModel
                    
                # Which input from the distributor will be used
                order:
                    - 0
                layers: 2
                activation:
                    name: LeakyReLU
                    negative_slope: 0.2

                layer0:
                    in_channels:
                    out_channels: 16
                    K: 3

                layer1:
                    in_channels: 16
                    out_channels:
                    K: 3

    AggregatorBlock0:
        # List of lists of indices, which allows aggregating multiple
        # inputs from the previous block
        distributor:
            -
                - 0
        ParallelAgg:
            Agg0:
                
                # You can use whichever aggregator from `base.models.aggregator.py`
                # Here we are not doing anything to the inputs and we are just passing
                # them to the next block.
                agg: Pass
                order:
                    - 0


### 2.3 Trainer
- Handles training in full and batch mode.
- `KFoldModelTrainer` performs k-fold stratified cross-validation.
- just specify in `default.yaml` file

    - batchtype: 'full' # for transductive learning

    - batchtype: 'batch' # for inductive learning

- If you want to create a custom training workflow, inherit from `TrainerBuilder` (`base.trainer.train_base.TrainerBuilder`) and specify in `default.yaml` that you want to use it instead of our default `KFoldModelTrainer`:


```yaml
train:
  name: MyNewKFoldModelTrainer
```

### 2.4 Config
- If you want to use one of the datasets that ship with Hydra, and the built-in model components and default trainer are sufficient for your experiments, no custom code is needed.
- In that case you can set up your experiment using yaml files only.
    - here there are two important yaml files
       1. default.yaml
       2. model_architecture.yaml

In [None]:
### This is the default.yaml
# Folder location of user-defined classes
# You can ignore this for now
include: /workspace/supergcn_2.0/setups/

# Dataset class names
dataset:
  name: UKBBDataset # Dataset
  datatype: BatchDataset # We use the same class for full and batch mode. Ignore for now.
  edge_criterion: UKBBMEdgeCriterion # used to initialize the graph {MyDataset}EdgeCriterion
  p_missing: 0.0 # [0.0, 1.0) level of missingness, 0.0 means use all available data
  preprocess_feat: true # Set to true to standardize the data (zero-mean and unit-variance scaling)

# Model class names
model:
  name: DynamicNet  # This is the general builder class for all models no need to change this
  yaml_path:  '../base/configs/models/HydraGNN.yaml' # location of your model architecture

# User-defined transformation for image inputs
user_transforms:
  - RandomRotation:
      degrees: 1

# Trainer class name
train:
  name: KFoldModelTrainer # stratified k-fold cross-validation trainer
  tensorboard: false # whether to use tensorboard or not

#
infer:
  name:

# Arguments currently in use.
batchtype: 'full' # 'full' for transductive learning, 'batch' for inductive learning
folds: 10 # number of cross-validation folds
train_mode: classification

# Column index of meta data to use
meta_columns:
  - 0
  - 1
  - 2
multigraph: true
epochs: 600
patience: 30
supervision_rate: 1.0
num_print: 10
no_cuda: false
GPU_device: '0'
writing: false
seed: 42
batch_size: 10
label_list:
  - 0
  - 1
  - 2
lr: 0.001
weight_decay: 0.005
dropout: 0.3
alpha: 0.3