# Run GNNs
with early stopping, validation dataset, and learning rate hyperparameter tuning

In [1]:
%cd /home/ltchen/gnnpp
import sys
import os
import pytorch_lightning as L
import torch
import torch_geometric
import json
import wandb

from typing import Tuple
from torch_geometric.nn import GATv2Conv
from torch_geometric.utils import scatter
from torch.nn import Linear, ModuleList, ReLU
from torch_geometric.loader import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar
from torch.optim import AdamW
from pytorch_lightning.loggers import WandbLogger

from models.loss import NormalCRPS
from models.model_utils import MakePositive, EmbedStations
from utils.data import (
    load_dataframes,
    load_distances,
    normalize_features_and_create_graphs,
    rm_edges,
    summary_statistics,
)
from exploration.graph_creation import *
from models.graphensemble.multigraph import *

/home/ltchen/gnnpp


In [2]:
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
DIRECTORY = os.getcwd()
SAVEPATH = os.path.join(DIRECTORY, "explored_models/gnn3_24h/models")
JSONPATH = os.path.join(DIRECTORY, "trained_models/best_24h/params.json") # change learning rates! - check moritz' BA

# with open(JSONPATH, "r") as f:
#     print(f"[INFO] Loading {JSONPATH}")
#     args_dict = json.load(f)
# config = args_dict
# learning_rates = [0.002, 0.0002, 0.00002]
#
# print(config)
# print(config['lr'])
# print(config['max_dist'])
# print(type(config))
# print(type(config['lr']))
# print(type(config['gnn_hidden']))
# print(config['gnn_hidden'])
# print(config['batch_size'])
# '''{"batch_size":8,
# "gnn_hidden":265,
# "gnn_layers":2,
# "heads":8,
# "lr":0.0002, # could also try 0.001, or 0.00005?
# "max_dist":100,
# "max_epochs": 31}'''

In [3]:
config = {
    "batch_size":8,
    "gnn_hidden":265,
    "gnn_layers":2,
    "heads":8,
    "lr_list": [0.0002, 0.00007],
    # "max_dist":100,
    "max_epochs": 50}

In [4]:
dataframes = load_dataframes(mode="hyperopt", leadtime= "24h") # load newly created dataframes
dataframes = summary_statistics(dataframes)


[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for valid


### Graph 1

In [5]:
graphs_train_rf, graphs_valid_rf = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['valid']], station_df=dataframes['stations'], attributes=["geo"], edges=[("geo", 100)], sum_stats = True)


[INFO] Normalizing features...
fit_transform
transform 1


100%|██████████| 2612/2612 [00:11<00:00, 222.73it/s]
100%|██████████| 836/836 [00:03<00:00, 251.19it/s]


In [8]:
FILENAME = "g1_train_run_24h"
PROJECTNAME = "gnn_run3p7"
for lr in config['lr_list']:
    sweep_config = {**config, "learning_rate": lr}
    with wandb.init(
                project=PROJECTNAME, id=FILENAME+f"_{lr}", config=sweep_config, tags=["earlystop", "lr_tuning"], reinit=True
        ):
        run_config = wandb.config
        print(run_config['batch_size'])

        print("[INFO] Creating data loaders...")
        g1_train_loader = DataLoader(graphs_train_rf, batch_size=run_config['batch_size'], shuffle=True)
        g1_valid_loader = DataLoader(graphs_valid_rf[0], batch_size=run_config['batch_size'], shuffle=False)
        train_loader = g1_train_loader
        valid_loader = g1_valid_loader

        print("[INFO] Creating model...")
        emb_dim=20
        in_channels = graphs_train_rf[0].x.shape[1] + emb_dim - 1
        edge_dim = graphs_train_rf[0].num_edge_features

        multigraph = Multigraph(
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=run_config['gnn_hidden'],
            out_channels_gnn=run_config['gnn_hidden'],
            num_layers_gnn=run_config['gnn_layers'],
            heads=run_config['heads'],
            hidden_channels_deepset=run_config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=run_config['learning_rate']),
        )
        torch.compile(multigraph)

        # initialize
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=7)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        # Train Model ###################################################################
        print("[INFO] Training model...")
        trainer = L.Trainer(
            max_epochs=run_config['max_epochs'],
            log_every_n_steps=1,
            accelerator="gpu",
            enable_progress_bar=True,
            logger=wandb_logger,
            refresh_rate = 0,
            callbacks=early_stop,
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

8
[INFO] Creating data loaders...
[INFO] Creating model...
[INFO] Training model...


Traceback (most recent call last):
  File "/tmp/ipykernel_3847497/3978896556.py", line 46, in <module>
    trainer = L.Trainer(
  File "/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/utilities/argparse.py", line 70, in insert_env_defaults
    return fn(self, **kwargs)
TypeError: Trainer.__init__() got an unexpected keyword argument 'refresh_rate'


TypeError: Trainer.__init__() got an unexpected keyword argument 'refresh_rate'

### Graph 2

In [9]:
g2_train_rf, g2_valid_rf = normalize_features_and_create_graphs1(df_train=dataframes['train'],
                                                                 df_valid_test=[dataframes['valid']],
                                                                 station_df=dataframes['stations'],
                                                                 attributes=["geo", "alt", "lon", "lat", "alt-orog"], edges=[("geo", 100)],
                                                                 sum_stats=True)

[INFO] Normalizing features...
fit_transform
transform 1


100%|██████████| 2612/2612 [00:11<00:00, 227.65it/s]
100%|██████████| 836/836 [00:03<00:00, 266.21it/s]


In [12]:
print("[INFO] Creating data loaders...")
g2_train_loader = DataLoader(g2_train_rf, batch_size=config['batch_size'], shuffle=True) # GRAPH
g2_valid_loader = DataLoader(g2_valid_rf[0], batch_size=config['batch_size'], shuffle=False) # GRAPH
train_loader = g2_train_loader # GRAPH
valid_loader = g2_valid_loader # GRAPH
edge_dim = g2_train_rf[0].num_edge_features #GRAPH
emb_dim = 20
in_channels = g2_train_rf[0].x.shape[1] + emb_dim - 1 # GRAPH


[INFO] Creating data loaders...


In [13]:
FILENAME = "g2_train_run_24h" # GRAPH
PROJECTNAME = "gnn_run3p7"
for lr in config['lr_list']:
    sweep_config = {**config, "learning_rate": lr}
    with wandb.init(
                project=PROJECTNAME, id=FILENAME+f"_{lr}", config=sweep_config, tags=["earlystop", "lr_tuning"], reinit=True
        ):
        run_config = wandb.config

        multigraph = Multigraph(
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=run_config['gnn_hidden'],
            out_channels_gnn=run_config['gnn_hidden'],
            num_layers_gnn=run_config['gnn_layers'],
            heads=run_config['heads'],
            hidden_channels_deepset=run_config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=run_config['learning_rate']),
        )
        torch.compile(multigraph)

        # initialize
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=7)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        # Train Model ###################################################################
        print("[INFO] Training model...")
        trainer = L.Trainer(
                max_epochs=run_config['max_epochs'],
                log_every_n_steps=1,
                accelerator="gpu",
                enable_progress_bar=True,
                logger=wandb_logger,
                callbacks=[early_stop, progress_bar],
            )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇█████
train_loss_epoch,█▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▄▄▃▂▂▂▂▃▂▂▂▁▂▂▂▂▁▂▂▁▂▁▁▁▁▂▂▁▂▂▁▁▁▁▁▁▁▂▂
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇█████
val_loss,█▃▃▂▂▂▂▁▁▁▁▁▂▂▂▂▂▁▁

0,1
epoch,18.0
train_loss_epoch,0.59088
train_loss_step,0.61253
trainer/global_step,6212.0
val_loss,0.67983


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇██
train_loss_epoch,█▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▂▂▃▂▂▂▂▂▂▂▂▁▁▂▁▂▂▁▂▁▁▁▁▁▁▂▁▁▁▂▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇██████
val_loss,█▅▅▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▂▁▂

0,1
epoch,34.0
train_loss_epoch,0.54642
train_loss_step,0.5524
trainer/global_step,11444.0
val_loss,0.69538


### Graph 3

In [15]:
g3_train_rf, g3_valid_rf = normalize_features_and_create_graphs1(df_train=dataframes['train'],
                                                                 df_valid_test=[dataframes['valid']],
                                                                 station_df=dataframes['stations'],
                                                                 attributes=["geo", "alt", "lon", "lat", "alt-orog"],
                                                                 edges=[("geo", 55), ("alt", 6.5), ("alt-orog", 2.5)],
                                                                 sum_stats=True)

[INFO] Normalizing features...
fit_transform
transform 1


100%|██████████| 2612/2612 [00:11<00:00, 222.85it/s]
100%|██████████| 836/836 [00:03<00:00, 275.20it/s]


In [16]:
print("[INFO] Creating data loaders...")
g3_train_loader = DataLoader(g3_train_rf, batch_size=config['batch_size'], shuffle=True)  # GRAPH
g3_valid_loader = DataLoader(g3_valid_rf[0], batch_size=config['batch_size'], shuffle=False)  # GRAPH
train_loader = g3_train_loader  # GRAPH
valid_loader = g3_valid_loader  # GRAPH
edge_dim = g3_train_rf[0].num_edge_features  #GRAPH
emb_dim = 20
in_channels = g3_train_rf[0].x.shape[1] + emb_dim - 1  # GRAPH

[INFO] Creating data loaders...


In [17]:
FILENAME = "g3_train_run_24h"  # GRAPH
PROJECTNAME = "gnn_run3p7"
for lr in config['lr_list']:
    sweep_config = {**config, "learning_rate": lr}
    with wandb.init(
            project=PROJECTNAME, id=FILENAME + f"_{lr}", config=sweep_config, tags=["earlystop", "lr_tuning"],
            reinit=True
    ):
        run_config = wandb.config

        multigraph = Multigraph(
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=run_config['gnn_hidden'],
            out_channels_gnn=run_config['gnn_hidden'],
            num_layers_gnn=run_config['gnn_layers'],
            heads=run_config['heads'],
            hidden_channels_deepset=run_config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=run_config['learning_rate']),
        )
        torch.compile(multigraph)

        # initialize
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=7)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        # Train Model ###################################################################
        print("[INFO] Training model...")
        trainer = L.Trainer(
            max_epochs=run_config['max_epochs'],
            log_every_n_steps=1,
            accelerator="gpu",
            enable_progress_bar=True,
            logger=wandb_logger,
            callbacks=[early_stop, progress_bar],
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory ./gnn_run3p7/g3_train_run_24h_0.0002/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
---

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train_loss_epoch,█▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▃▄▂▂▃▂▂▃▃▂▂▂▃▂▂▂▂▂▂▂▂▂▃▂▂▂▁▂▂▁▂▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇██
val_loss,█▅▄▃▂▂▂▂▁▂▁▂▂▁▁▁▂▃▂▁▁▁▁▁▁▁▂

0,1
epoch,26.0
train_loss_epoch,0.52703
train_loss_step,0.5134
trainer/global_step,8828.0
val_loss,0.70118


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇███
train_loss_epoch,█▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▇▆▅▄▄▆▅▄▃▃▄▃▅▄▂▄▂▄▂▃▄▃▃▄▁▃▃▃▁▂▂▅▂▁▂▁▃▃
trainer/global_step,▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
val_loss,█▆▅▄▄▃▃▃▃▂▂▂▂▂▁▂▂▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁

0,1
epoch,32.0
train_loss_epoch,0.56314
train_loss_step,0.48958
trainer/global_step,10790.0
val_loss,0.67225


### Graph 4

In [18]:
g4_train_rf, g4_valid_rf = normalize_features_and_create_graphs1(df_train=dataframes['train'],
                                                                 df_valid_test=[dataframes['valid']],
                                                                 station_df=dataframes['stations'],
                                                                 attributes=["geo", "alt", "lon", "lat", "alt-orog"],
                                                                 edges=[("geo", 100), ("alt", 10), ("alt-orog", 5)],
                                                                 sum_stats=True)

[INFO] Normalizing features...
fit_transform
transform 1


100%|██████████| 2612/2612 [00:11<00:00, 226.27it/s]
100%|██████████| 836/836 [00:03<00:00, 270.15it/s]


In [19]:
print("[INFO] Creating data loaders...")
g4_train_loader = DataLoader(g4_train_rf, batch_size=config['batch_size'], shuffle=True)  # GRAPH
g4_valid_loader = DataLoader(g4_valid_rf[0], batch_size=config['batch_size'], shuffle=False)  # GRAPH
train_loader = g4_train_loader  # GRAPH
valid_loader = g4_valid_loader  # GRAPH
edge_dim = g4_train_rf[0].num_edge_features  #GRAPH
emb_dim = 20
in_channels = g4_train_rf[0].x.shape[1] + emb_dim - 1  # GRAPH

[INFO] Creating data loaders...


In [20]:
FILENAME = "g4_train_run_24h"  # GRAPH
PROJECTNAME = "gnn_run3p7"
for lr in config['lr_list']:
    sweep_config = {**config, "learning_rate": lr}
    with wandb.init(
            project=PROJECTNAME, id=FILENAME + f"_{lr}", config=sweep_config, tags=["earlystop", "lr_tuning"],
            reinit=True
    ):
        run_config = wandb.config

        multigraph = Multigraph(
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=run_config['gnn_hidden'],
            out_channels_gnn=run_config['gnn_hidden'],
            num_layers_gnn=run_config['gnn_layers'],
            heads=run_config['heads'],
            hidden_channels_deepset=run_config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=run_config['learning_rate']),
        )
        torch.compile(multigraph)

        # initialize
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=7)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        # Train Model ###################################################################
        print("[INFO] Training model...")
        trainer = L.Trainer(
            max_epochs=run_config['max_epochs'],
            log_every_n_steps=1,
            accelerator="gpu",
            enable_progress_bar=True,
            logger=wandb_logger,
            callbacks=[early_stop, progress_bar],
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇████
train_loss_epoch,█▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▅▄▃▃▃▂▃▂▃▃▂▃▃▃▂▃▂▃▂▂▂▂▂▂▂▁▁▂▁▂▂▂▂▂▂▁▂▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇██
val_loss,█▆▄▃▅▂▄▂▂▂▁▁▂▂▁▁▁▂▁▁▁▁▂▁▁▂▁

0,1
epoch,26.0
train_loss_epoch,0.53507
train_loss_step,0.52631
trainer/global_step,8828.0
val_loss,0.67461


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇████
train_loss_epoch,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▅▆▅▃▅▅▃▃▅▃▃▃▃▃▂▄▃▄▂▂▃▂▃▂▂▂▃▂▂▂▂▁▁▂▂▂▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇██
val_loss,█▆▅▄▄▃▃▂▂▂▂▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁

0,1
epoch,35.0
train_loss_epoch,0.54537
train_loss_step,0.46447
trainer/global_step,11771.0
val_loss,0.68092


## Graph 5

In [21]:
g5_train_rf, g5_valid_rf = normalize_features_and_create_graphs1(df_train=dataframes['train'],
                                                                 df_valid_test=[dataframes['valid']],
                                                                 station_df=dataframes['stations'],
                                                                 attributes=["dist2"],
                                                                 edges=[("dist2", 0.005)],
                                                                 sum_stats=True)

[INFO] Normalizing features...
fit_transform
transform 1
[INFO] Loading distances from file...


100%|██████████| 2612/2612 [00:11<00:00, 226.92it/s]


[INFO] Loading distances from file...


100%|██████████| 836/836 [00:03<00:00, 267.95it/s]


In [22]:
print("[INFO] Creating data loaders...")
g5_train_loader = DataLoader(g5_train_rf, batch_size=config['batch_size'], shuffle=True)  # GRAPH
g5_valid_loader = DataLoader(g5_valid_rf[0], batch_size=config['batch_size'], shuffle=False)  # GRAPH
train_loader = g5_train_loader  # GRAPH
valid_loader = g5_valid_loader  # GRAPH
edge_dim = g5_train_rf[0].num_edge_features  #GRAPH
emb_dim = 20
in_channels = g5_train_rf[0].x.shape[1] + emb_dim - 1  # GRAPH

[INFO] Creating data loaders...


In [23]:
FILENAME = "g5_train_run_24h"  # GRAPH
PROJECTNAME = "gnn_run3"
for lr in config['lr_list']:
    sweep_config = {**config, "learning_rate": lr}
    with wandb.init(
            project=PROJECTNAME, id=FILENAME + f"_{lr}", config=sweep_config, tags=["earlystop", "lr_tuning"],
            reinit=True
    ):
        run_config = wandb.config

        multigraph = Multigraph(
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=run_config['gnn_hidden'],
            out_channels_gnn=run_config['gnn_hidden'],
            num_layers_gnn=run_config['gnn_layers'],
            heads=run_config['heads'],
            hidden_channels_deepset=run_config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=run_config['learning_rate']),
        )
        torch.compile(multigraph)

        # initialize
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        # Train Model ###################################################################
        print("[INFO] Training model...")
        trainer = L.Trainer(
            max_epochs=run_config['max_epochs'],
            log_every_n_steps=1,
            accelerator="gpu",
            enable_progress_bar=True,
            logger=wandb_logger,
            callbacks=[early_stop, progress_bar],
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory ./gnn_run3/g5_train_run_24h_0.0002/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
-----

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇█
train_loss_epoch,█▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▅▅▅▄▄▃▅▂▃▃▃▃▄▃▃▄▃▃▃▂▃▄▃▄▃▃▂▂▁▃▂▄▃▄▂▁▃▁
trainer/global_step,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇█████
val_loss,█▇▄▄▃▃▂▃▁▅▂▂▁▁▁▂▂▁▂▃▁▂▁▁▁▂▁▁

0,1
epoch,27.0
train_loss_epoch,0.54943
train_loss_step,0.49833
trainer/global_step,9155.0
val_loss,0.66176


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory ./gnn_run3/g5_train_run_24h_7e-05/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
------

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
train_loss_epoch,█▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▆▃▆▄▄▃▃▃▃▂▂▃▃▂▂▂▂▂▂▂▂▁▂▂▂▂▁▂▂▂▁▁▁▂▁▂▁▂
trainer/global_step,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
val_loss,█▇▆▅▄▄▃▃▂▃▂▂▂▁▁▂▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,46.0
train_loss_epoch,0.52845
train_loss_step,0.47425
trainer/global_step,15368.0
val_loss,0.67147
