In [1]:
%load_ext tensorboard

In [26]:
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
import json
import logging
import math
import pprint
from typing import Tuple, List, Type, Union

import chemprop
import hyperopt
from hyperopt import hp
import numpy as np
import pandas as pd
import pickle
import pytorch_lightning as tl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt
from rdkit import Chem
import scipy as sp
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, max_error
import torch
from torch import Tensor
from torch.nn import ReLU, Linear, MSELoss, Dropout
from torch.nn.functional import log_softmax, relu, dropout
from torch.optim import AdamW
from torchmetrics import MeanAbsoluteError, MeanSquaredError, PearsonCorrCoef, R2Score

from torch_geometric.loader import DataLoader
from torch_geometric.nn import (
    Sequential,
    MessagePassing, GCNConv, GATConv, GATv2Conv, GINConv,
    Aggregation, global_mean_pool, global_max_pool, global_add_pool
)

# Data Prep
Each dataset has the following columns: CID, SD, SD Z-score, DR, XC50, activity, neut-smiles, num. atoms and max atomic num. 

For the base project (before any extensions), I will use only the DR value and the neut-smiles representation. For any compound with a non-null DR value, the activity is 'Active'.

# Run trials

In [49]:
def _uniform_architecture(layer_type):
    """Build a three-layer ModelArchitecture that uses `layer_type` for every layer.

    All variants share the feature sizes [133, 64, 16, 1], the activation list
    [ReLU, ReLU, None] (presumably one entry per layer -- confirm against
    ModelArchitecture) and mean pooling. Fresh lists are created on every call,
    so the returned architectures never share mutable state.
    """
    return ModelArchitecture(
        layer_types=[layer_type] * 3,
        features=[133, 64, 16, 1],
        activation_funcs=[ActivationFunction.ReLU, ActivationFunction.ReLU, None],
        pool_func=PoolingFunction.MEAN,
    )


# One architecture per graph-convolution type; only the layer type differs.
gcn_architecture = _uniform_architecture(GNNLayer.GCN)
gat_architecture = _uniform_architecture(GNNLayer.GAT)
gatv2_architecture = _uniform_architecture(GNNLayer.GATv2)

In [33]:
%tensorboard --logdir={EXPERIMENT_DIR}

Reusing TensorBoard on port 6006 (pid 21916), started 0:11:17 ago. (Use '!kill 21916' to kill it.)

## Extension 2 - HyperOpt

In [None]:
def objective(parameters):
    """Hyperopt objective: cross-validate a LitGNN and report its mean validation loss.

    The dataset is shuffled and split into a training portion and a held-out
    test portion. The training portion is cross-validated: one model is trained
    per fold with early stopping on the validation loss, and the final
    validation losses of all folds are averaged into the value that hyperopt
    minimises.

    Args:
        parameters: Point sampled from the hyperopt search space.
            NOTE(review): not consumed yet -- every trial trains the
            notebook-level `architecture`. Build the architecture from
            `parameters` before running a real search.

    Returns:
        dict: ``{'loss': <mean validation loss as float>, 'status': hyperopt.STATUS_OK}``.

    Raises:
        ValueError: If `k_folds` yields no folds, so no loss can be computed.
    """
    # EarlyStopping and the reported loss must read the same logged metric.
    # NOTE(review): this block previously monitored 'loss/val' but returned
    # 'val/loss'; 'loss/val' is assumed to be the key LitGNN logs -- confirm.
    val_loss_key = 'loss/val'

    # torch_geometric's Dataset.shuffle() returns a shuffled copy instead of
    # shuffling in place, so the returned dataset has to be kept.
    # NOTE(review): assumes DRDataset inherits that behaviour -- confirm.
    dataset = DRDataset(root=DATA_DIR).shuffle()

    # The test portion is deliberately left untouched: hyperparameters are
    # selected on the validation folds only.
    training_dataset, _ = split_dataset(dataset, TRAIN_TEST_SPLIT)

    fold_losses = []
    for train_fold, val_fold in k_folds(training_dataset, K_FOLDS):
        training_dataloader = DataLoader(train_fold, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
        validation_dataloader = DataLoader(val_fold, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)

        # NOTE(review): `architecture`, `run_dir` and `checkpoint_callback` are
        # read from notebook globals, and the same checkpoint callback object
        # is handed to every fold's Trainer -- confirm this is intended.
        model = LitGNN(architecture, DEFAULT_METRICS)

        early_stop_callback = EarlyStopping(
            monitor=val_loss_key,
            mode='min',
            patience=3,
        )

        trainer = tl.Trainer(
            default_root_dir=run_dir,
            deterministic=True,
            log_every_n_steps=1,
            max_epochs=MAX_EPOCHS,
            callbacks=[checkpoint_callback, early_stop_callback],
            enable_progress_bar=False
        )

        trainer.fit(
            model,
            training_dataloader,
            validation_dataloader,
        )

        # callback_metrics holds tensors; hyperopt expects a plain float loss.
        fold_losses.append(float(trainer.callback_metrics[val_loss_key]))

    if not fold_losses:
        raise ValueError('k_folds produced no folds; cannot compute a validation loss')

    # Report after ALL folds have been trained, not after the first one.
    return {'loss': sum(fold_losses) / len(fold_losses), 'status': hyperopt.STATUS_OK}

In [77]:
def objective(x):
    """Toy objective for exercising the search space: the loss is the sampled layer count squared.

    NOTE(review): this reuses the name `objective`, so once this cell runs it
    replaces the training-based `objective(parameters)` defined earlier in the
    notebook.
    """
    print(x)  # show every point hyperopt samples
    n_layers = x['choice']['layers']
    return {'loss': n_layers ** 2, 'status': hyperopt.STATUS_OK}


layer_names = [member.name for member in GNNLayer]


def _layer_stack_option(depth):
    """Return the search-space branch for `depth` layers, with one layer-type choice per layer."""
    return {
        'layers': depth,
        'types': [hp.choice(f'types{depth}{position}', layer_names) for position in range(depth)],
    }


# Nested space: a pooling choice plus a conditional branch per network depth (2 to 4).
search_space = {
    'pool_func': hp.choice('name', ['MAX', 'MIN']),
    'choice': hp.choice('layers', [_layer_stack_option(depth) for depth in range(2, 5)]),
}

best = hyperopt.fmin(
    fn=objective,
    space=search_space,
    algo=hyperopt.tpe.suggest,
    max_evals=3,
)


{'choice': {'layers': 2, 'types': ('GAT', 'GATv2')}, 'pool_func': 'MAX'}                                                                                                                                     
{'choice': {'layers': 4, 'types': ('GATv2', 'GIN', 'GCN', 'GATv2')}, 'pool_func': 'MAX'}                                                                                                                     
{'choice': {'layers': 2, 'types': ('GCN', 'GAT')}, 'pool_func': 'MAX'}                                                                                                                                       
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 43.48trial/s, best loss: 4.0]


In [199]:
# Read a JSON summary of the form {metric: {"mean": ..., "variance": ...}} from
# stdin and print it as a single tab-separated row.
results = json.loads(input())
print()

# Column order: loss first, then the metrics in DEFAULT_METRICS order; each
# metric contributes its mean followed by its variance.
reported_metrics = ['loss', *DEFAULT_METRICS.keys()]
result_arr = [
    str(results[metric][statistic])
    for metric in reported_metrics
    for statistic in ('mean', 'variance')
]
print("\t".join(result_arr))

 {"loss": {"mean": 0.1427062451839447, "variance": 5.5452968808822334e-05}, "mae": {"mean": 0.3190796375274658, "variance": 6.943024345673621e-05}, "rmse": {"mean": 0.3752123713493347, "variance": 9.34278141357936e-05}, "r2": {"mean": -1.9244879484176636, "variance": 0.022452671080827713}, "max_error": {"mean": 0.7781556844711304, "variance": 0.0005830924492329359}}



0.1427062451839447	5.5452968808822334e-05	0.3190796375274658	6.943024345673621e-05	0.3752123713493347	9.34278141357936e-05	-1.9244879484176636	0.022452671080827713	0.7781556844711304	0.0005830924492329359
