# MoleculeNet - ESOL Dataset

Most of the times, you want to perform multiple runs to find the best architecture of your model. Manual logging and management of the configurations is extremely inefficient. For this purpose, you can leverage multiple existing tool:
- PyTorch Lightning (Torch wrapper to log, remove boilerplate code, automate runs, multi-gpu training)
- Hydra (configuration management, perform hyperparameters sweep)
- Optuna (hyperparameters search and optimization)
- Weight and Biases/MLFlow (logging, hyperparameters search)

In [6]:
import torch.nn.functional as F
from torch import nn
from torch_geometric.nn import GINConv, global_add_pool, GCNConv, GraphConv


class GNN(nn.Module):
    def __init__(self, in_channels, hidden_dim=128, num_layers=5, dropout=0.2, gnn_type="GIN"):
        super().__init__()
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()
        self.dropout = dropout


        # first layer
        if gnn_type == "GIN":
            nn1 = nn.Sequential(nn.Linear(in_channels, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim))
            self.convs.append(GINConv(nn1))
        elif gnn_type == "GCN":
            self.convs.append(GCNConv(in_channels, hidden_dim))
        else:
            self.convs.append(GraphConv(in_channels, hidden_dim))


        self.bns.append(nn.BatchNorm1d(hidden_dim))


        for _ in range(num_layers - 1):
            if gnn_type == "GIN":
                nnk = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Linear(hidden_dim, hidden_dim))
                self.convs.append(GINConv(nnk))
            elif gnn_type == "GCN":
                self.convs.append(GCNConv(hidden_dim, hidden_dim))
            else:
                self.convs.append(GraphConv(hidden_dim, hidden_dim))
                self.bns.append(nn.BatchNorm1d(hidden_dim))

            self.dropout = dropout
            self.mlp = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1)
            )


    def forward(self, x, edge_index, batch):
        for conv, bn in zip(self.convs, self.bns):
            x = conv(x, edge_index)
            x = bn(x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)


            x = global_add_pool(x, batch)
            out = self.mlp(x).squeeze(-1)
            return out

## Lightning setup
To run with lighning, we shuold setup a LightningDataModule

In [2]:
from torch_geometric.data import DataLoader
from torch_geometric.datasets import MoleculeNet
from pytorch_lightning import LightningDataModule
from sklearn.model_selection import train_test_split


class MoleculeNetDataModule(LightningDataModule):
    def __init__(self, root, name, batch_size=64, num_workers=4, pin_memory=True, seed=42):
        super().__init__()
        self.root = root
        self.name = name
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.seed = seed


    def prepare_data(self):
        # download if needed
        MoleculeNet(self.root, name=self.name)


    def setup(self, stage=None):
        dataset = MoleculeNet(self.root, name=self.name)


        # MoleculeNet returns a dataset with .y as targets for regression tasks
        # We'll create a reproducible random split (80/10/10)
        n = len(dataset)
        idx = list(range(n))


        train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=self.seed)
        val_idx, test_idx = train_test_split(test_idx, test_size=0.5, random_state=self.seed)


        self.train_dataset = dataset[train_idx]
        self.val_dataset = dataset[val_idx]
        self.test_dataset = dataset[test_idx]


    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers, pin_memory=self.pin_memory)


    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers, pin_memory=self.pin_memory)


    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers, pin_memory=self.pin_memory)

In [24]:
import os
from omegaconf import DictConfig, OmegaConf
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger
from torchmetrics.functional import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np


class LitGNN(pl.LightningModule):
    def __init__(self, in_channels, cfg: DictConfig):
        super().__init__()
        self.save_hyperparameters(OmegaConf.to_container(cfg, resolve=True))
        self.model = GNN(in_channels, hidden_dim=cfg.model.hidden_dim, num_layers=cfg.model.num_layers, dropout=cfg.model.dropout, gnn_type=cfg.model.gnn_type)
        self.criterion = nn.MSELoss()
        self.val_preds, self.val_targets = [], []
        self.test_preds, self.test_targets = [], []


    def forward(self, x, edge_index, batch):
        return self.model(x, edge_index, batch)


    def step(self, batch, stage):
        y = batch.y.view(-1).float()
        pred = self(batch.x, batch.edge_index, batch.batch)
        loss = self.criterion(pred, y)
        return loss, pred.detach().cpu().numpy(), y.detach().cpu().numpy()


    def training_step(self, batch, batch_idx):
        loss, _, _ = self.step(batch, 'train')
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, pred, y = self.step(batch, 'val')
        self.val_preds.append(pred)
        self.val_targets.append(y)
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def on_validation_epoch_end(self):
        preds = np.concatenate(self.val_preds)
        ys = np.concatenate(self.val_targets)
        mse = torch.mean((torch.tensor(preds) - torch.tensor(ys)) ** 2).item()
        rmse = mse ** 0.5
        self.log("val_rmse", rmse, prog_bar=True)
        self.val_preds.clear()
        self.val_targets.clear()
    
    # def test_step(self, batch, batch_idx):
    #     y = batch.y.view(-1).float()
    #     pred = self(batch.x, batch.edge_index, batch.batch)
    #     loss = self.criterion(pred, y)
    #     # optionally log metrics
    #     self.log("test_loss", loss)
    #     return {"loss": loss, "pred": pred, "y": y}

    # def on_test_epoch_end(self):
    #     preds = torch.cat(self.test_preds).numpy()
    #     ys = torch.cat(self.test_targets).numpy()
    #     mse = np.mean((preds - ys) ** 2)
    #     rmse = mse ** 0.5
    #     mae = mean_absolute_error(ys, preds)
    #     r2 = r2_score(ys, preds)
    #     self.log("test_rmse", rmse)
    #     self.log("test_mae", mae)
    #     self.log("test_r2", r2)
    #     # clear memory
    #     self.test_preds.clear()
    #     self.test_targets.clear()

    def configure_optimizers(self):
        if self.hparams['optim']['optimizer'].lower() == 'adam':
            opt = torch.optim.Adam(self.parameters(), lr=self.hparams['optim']['lr'], weight_decay=self.hparams['optim']['weight_decay'])
        else:
            opt = torch.optim.SGD(self.parameters(), lr=self.hparams['optim']['lr'], weight_decay=self.hparams['optim']['weight_decay'])
        return opt

In [26]:
def main(cfg: DictConfig):

    pl.seed_everything(cfg.seed)


    # datamodule
    dm = MoleculeNetDataModule(root=cfg.data.root, name=cfg.data.name, batch_size=cfg.data.batch_size, num_workers=cfg.data.num_workers, pin_memory=cfg.data.pin_memory, seed=cfg.seed)
    dm.prepare_data()
    dm.setup()


    # infer input channels from first sample
    sample = dm.train_dataset[0]
    in_channels = sample.x.shape[1]


    # model
    lit = LitGNN(in_channels=in_channels, cfg=cfg)


    # logger (CSV logger will create metrics CSV files)
    csv_logger = CSVLogger(save_dir=cfg.logging.csv_log_dir, name=cfg.logging.name)


    # callbacks
    checkpoint_cb = ModelCheckpoint(dirpath=os.path.join(csv_logger.log_dir, "checkpoints"), save_top_k=cfg.callbacks.checkpoint_top_k, monitor='val_rmse', mode='min')
    early_stop_cb = EarlyStopping(monitor='val_rmse', patience=cfg.callbacks.early_stopping_patience, mode='min')


    trainer = pl.Trainer(max_epochs=cfg.trainer.max_epochs, logger=csv_logger, callbacks=[checkpoint_cb, early_stop_cb], precision=cfg.trainer.precision)


    trainer.fit(lit, datamodule=dm)
    # trainer.test(lit, datamodule=dm)


In [27]:
from hydra.core.global_hydra import GlobalHydra

# Reset Hydra if already initialized (for repeated runs in notebooks)
if GlobalHydra.instance().is_initialized():
    GlobalHydra.instance().clear()

from hydra import compose, initialize

initialize(config_path="../config", version_base=None)
cfg = compose(config_name="config")
main(cfg)


Seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type    | Params | Mode 
----------------------------------------------
0 | model     | GNN     | 158 K  | train
1 | criterion | MSELoss | 0      | train
----------------------------------------------
158 K     Trainable params
0         Non-trainable params
158 K     Total params
0.634     Total estimated model params size (MB)
40        Modules in train mode
0         Modules in eval mode


Epoch 29: 100%|██████████| 15/15 [00:00<00:00, 37.06it/s, v_num=11, val_loss=3.460, val_rmse=1.860, train_loss=1.100]

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 15/15 [00:00<00:00, 36.63it/s, v_num=11, val_loss=3.460, val_rmse=1.860, train_loss=1.100]
