# Run GNNs
with early stopping, validation dataset, and learning rate hyperparameter tuning

In [9]:
%cd /home/ltchen/gnnpp
import sys
import os
import pytorch_lightning as L
import torch
import torch_geometric
import json
import wandb

from typing import Tuple
from torch_geometric.nn import GATv2Conv
from torch_geometric.utils import scatter
from torch.nn import Linear, ModuleList, ReLU
from torch_geometric.loader import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar
from torch.optim import AdamW
from pytorch_lightning.loggers import WandbLogger

from models.loss import NormalCRPS
from models.model_utils import MakePositive, EmbedStations
from utils.data import (
    load_dataframes,
    load_distances,
    normalize_features_and_create_graphs,
    rm_edges,
    summary_statistics,
)
from exploration.graph_creation import *
from models.graphensemble.multigraph import *

/home/ltchen/gnnpp


In [2]:
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
DIRECTORY = os.getcwd()
SAVEPATH = os.path.join(DIRECTORY, "explored_models/gnn3_24h/models")
JSONPATH = os.path.join(DIRECTORY, "trained_models/best_24h/params.json") # change learning rates! - check moritz' BA

# with open(JSONPATH, "r") as f:
#     print(f"[INFO] Loading {JSONPATH}")
#     args_dict = json.load(f)
# config = args_dict
# learning_rates = [0.002, 0.0002, 0.00002]
#
# print(config)
# print(config['lr'])
# print(config['max_dist'])
# print(type(config))
# print(type(config['lr']))
# print(type(config['gnn_hidden']))
# print(config['gnn_hidden'])
# print(config['batch_size'])
# '''{"batch_size":8,
# "gnn_hidden":265,
# "gnn_layers":2,
# "heads":8,
# "lr":0.0002, # could also try 0.001, or 0.00005?
# "max_dist":100,
# "max_epochs": 31}'''

In [14]:
config = {
    "batch_size":8,
    "gnn_hidden":265,
    "gnn_layers":2,
    "heads":8,
    "lr_list": [0.002, 0.0002, 0.00007],
    # "max_dist":100,
    "max_epochs": 50}

In [4]:
dataframes = load_dataframes(mode="hyperopt", leadtime= "24h") # load newly created dataframes
dataframes = summary_statistics(dataframes)


[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for valid


In [6]:
graphs_train_rf, graphs_valid_rf = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['valid']], station_df=dataframes['stations'], attributes=["geo"], edges=[("geo", 100)], sum_stats = True)


[INFO] Normalizing features...
fit_transform
transform 1


100%|██████████| 2612/2612 [00:11<00:00, 224.96it/s]
100%|██████████| 836/836 [00:03<00:00, 261.03it/s]


In [None]:
FILENAME = "g1_train_run_24h"
PROJECTNAME = "gnn_run3"
for lr in config['lr_list']:
    sweep_config = {**config, "learning_rate": lr}
    with wandb.init(
                project=PROJECTNAME, id=FILENAME+f"_{lr}", config=sweep_config, tags=["earlystop", "lr_tuning"], reinit=True
        ):
        run_config = wandb.config
        print(run_config['batch_size'])

        print("[INFO] Creating data loaders...")
        g1_train_loader = DataLoader(graphs_train_rf, batch_size=run_config['batch_size'], shuffle=True)
        g1_valid_loader = DataLoader(graphs_valid_rf[0], batch_size=run_config['batch_size'], shuffle=False)
        train_loader = g1_train_loader
        valid_loader = g1_valid_loader

        print("[INFO] Creating model...")
        emb_dim=20
        in_channels = graphs_train_rf[0].x.shape[1] + emb_dim - 1
        edge_dim = graphs_train_rf[0].num_edge_features

        multigraph = Multigraph(
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=run_config['gnn_hidden'],
            out_channels_gnn=run_config['gnn_hidden'],
            num_layers_gnn=run_config['gnn_layers'],
            heads=run_config['heads'],
            hidden_channels_deepset=run_config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=run_config['learning_rate']),
        )
        torch.compile(multigraph)

        # initialize
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        # Train Model ###################################################################
        print("[INFO] Training model...")
        trainer = L.Trainer(
            max_epochs=run_config['max_epochs'],
            log_every_n_steps=1,
            accelerator="gpu",
            enable_progress_bar=True,
            logger=wandb_logger,
            refresh_rate = 0,
            callbacks=early_stop,
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

[34m[1mwandb[0m: Currently logged in as: [33mleachen01[0m ([33mleachen01-karlsruhe-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


8
[INFO] Creating data loaders...
[INFO] Creating model...


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.f

[INFO] Training model...
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


                                                                           

/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Epoch 0:  60%|██████    | 197/327 [00:06<00:04, 30.75it/s, v_num=.002, train_loss_step=1.310]

In [5]:
g2_train_rf, g2_valid_rf = normalize_features_and_create_graphs1(df_train=dataframes['train'],
                                                                 df_valid_test=[dataframes['valid']],
                                                                 station_df=dataframes['stations'],
                                                                 attributes=["geo", "alt", "lon", "lat", "alt-orog"], edges=[("geo", 100)],
                                                                 sum_stats=True)

[INFO] Normalizing features...
fit_transform
transform 1


100%|██████████| 2612/2612 [00:11<00:00, 225.82it/s]
100%|██████████| 836/836 [00:03<00:00, 255.25it/s]


In [7]:
print("[INFO] Creating data loaders...")
g2_train_loader = DataLoader(g2_train_rf, batch_size=config['batch_size'], shuffle=True) # GRAPH
g2_valid_loader = DataLoader(g2_valid_rf[0], batch_size=config['batch_size'], shuffle=False) # GRAPH
train_loader = g2_train_loader # GRAPH
valid_loader = g2_valid_loader # GRAPH
edge_dim = g2_train_rf[0].num_edge_features #GRAPH
emb_dim = 20
in_channels = g2_train_rf[0].x.shape[1] + emb_dim - 1 # GRAPH


[INFO] Creating data loaders...


In [11]:
FILENAME = "g2_train_run_24h" # GRAPH
PROJECTNAME = "gnn_run3"
for lr in config['lr_list']:
    sweep_config = {**config, "learning_rate": lr}
    with wandb.init(
                project=PROJECTNAME, id=FILENAME+f"_{lr}", config=sweep_config, tags=["earlystop", "lr_tuning"], reinit=True
        ):
        run_config = wandb.config

        multigraph = Multigraph(
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=run_config['gnn_hidden'],
            out_channels_gnn=run_config['gnn_hidden'],
            num_layers_gnn=run_config['gnn_layers'],
            heads=run_config['heads'],
            hidden_channels_deepset=run_config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=run_config['learning_rate']),
        )
        torch.compile(multigraph)

        # initialize
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        # Train Model ###################################################################
        print("[INFO] Training model...")
        trainer = L.Trainer(
                max_epochs=run_config['max_epochs'],
                log_every_n_steps=1,
                accelerator="gpu",
                enable_progress_bar=True,
                logger=wandb_logger,
                callbacks=[early_stop, progress_bar],
            )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.f

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇████
train_loss_epoch,█▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▄█▃▃▄▃▂▂▂▂▂▂▂▂▂▁▂▃▂▂▁▁▂▁▁▂▁▁▂▂▁▂▂▁▂▂▁▁▁▁
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇█
val_loss,▇▃▃█▃▃▃▃▃▃▂▃▁▂▂▁▁▁▂▁▂▂▁▁▁▁▂▂▂▂

0,1
epoch,29.0
train_loss_epoch,0.54459
train_loss_step,0.55882
trainer/global_step,9809.0
val_loss,0.68494


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇█████
train_loss_epoch,█▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train_loss_step,▅▅▆█▅▅▃▅▅▃▄▄▂▃▂▃▃▃▃▄▄▃▁▁▂▃▂▂▂▁▁▂▁▂▂▁▁▁▃▂
trainer/global_step,▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇███
val_loss,█▅▄▃▄▂▃▂▃▂▂▃▂▂▂▁▁▁▁▂▂▁▂▁▂▂▁

0,1
epoch,26.0
train_loss_epoch,0.52843
train_loss_step,0.87131
trainer/global_step,8828.0
val_loss,0.68037


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
`Trainer.fit` stopped: `max_epochs=40` reached.


0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█████
train_loss_epoch,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▅▅▅▄▅▅▅▄▃▄▄▃▂▅▃▃▄▃▄▃▂▃▃▃▃▃▃▃▄▃▄▃▃▂▂▂▃▁
trainer/global_step,▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇██████
val_loss,█▆▆▄▃▃▂▂▂▃▂▂▂▂▁▂▁▁▁▁▁▂▁▁▁▂▁▂▁▁▂▂▁▁▁▁▁▁▁▂

0,1
epoch,39.0
train_loss_epoch,0.53622
train_loss_step,0.53841
trainer/global_step,13079.0
val_loss,0.68987


In [12]:
g3_train_rf, g3_valid_rf = normalize_features_and_create_graphs1(df_train=dataframes['train'],
                                                                 df_valid_test=[dataframes['valid']],
                                                                 station_df=dataframes['stations'],
                                                                 attributes=["geo", "alt", "lon", "lat", "alt-orog"],
                                                                 edges=[("geo", 55), ("alt", 6.5), ("alt-orog", 2.5)],
                                                                 sum_stats=True)

[INFO] Normalizing features...
fit_transform
transform 1


100%|██████████| 2612/2612 [00:11<00:00, 224.57it/s]
100%|██████████| 836/836 [00:03<00:00, 273.32it/s]


In [13]:
print("[INFO] Creating data loaders...")
g3_train_loader = DataLoader(g3_train_rf, batch_size=config['batch_size'], shuffle=True)  # GRAPH
g3_valid_loader = DataLoader(g3_valid_rf[0], batch_size=config['batch_size'], shuffle=False)  # GRAPH
train_loader = g3_train_loader  # GRAPH
valid_loader = g3_valid_loader  # GRAPH
edge_dim = g3_train_rf[0].num_edge_features  #GRAPH
emb_dim = 20
in_channels = g3_train_rf[0].x.shape[1] + emb_dim - 1  # GRAPH

[INFO] Creating data loaders...


In [15]:
FILENAME = "g3_train_run_24h"  # GRAPH
PROJECTNAME = "gnn_run3"
for lr in config['lr_list']:
    sweep_config = {**config, "learning_rate": lr}
    with wandb.init(
            project=PROJECTNAME, id=FILENAME + f"_{lr}", config=sweep_config, tags=["earlystop", "lr_tuning"],
            reinit=True
    ):
        run_config = wandb.config

        multigraph = Multigraph(
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=run_config['gnn_hidden'],
            out_channels_gnn=run_config['gnn_hidden'],
            num_layers_gnn=run_config['gnn_layers'],
            heads=run_config['heads'],
            hidden_channels_deepset=run_config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=run_config['learning_rate']),
        )
        torch.compile(multigraph)

        # initialize
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        # Train Model ###################################################################
        print("[INFO] Training model...")
        trainer = L.Trainer(
            max_epochs=run_config['max_epochs'],
            log_every_n_steps=1,
            accelerator="gpu",
            enable_progress_bar=True,
            logger=wandb_logger,
            callbacks=[early_stop, progress_bar],
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇████
train_loss_epoch,█▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▇▆▄▃▂▅▆▃▄▅▄▃▃▃▃▄▂▂▃▂▂▁▄▆▂▂▄▃▂▂▃▁▂▁▃▃▂▄
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇██
val_loss,▇█▄▄▂▆▃▂▂▂▂▃▅▂▁▁▂▃▂▂▂▂▂▂▃

0,1
epoch,24.0
train_loss_epoch,0.56867
train_loss_step,0.7505
trainer/global_step,8174.0
val_loss,0.71252


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train_loss_epoch,█▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▅▄▃▅▄▆▃▃▂▃▃▄▃▃▃▂▂▂▃▄▂▃▃▄▂▃▃▂▂▂▃▂▃▁▂▁▂▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇███
val_loss,█▆▄▃▂▃▂▁▁▂▂▁▂▁▁▂▁▁▃▁▁▁▁▁▁▂▂▂▁▂▁▁

0,1
epoch,31.0
train_loss_epoch,0.48726
train_loss_step,0.3905
trainer/global_step,10463.0
val_loss,0.6815


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...


/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


0,1
epoch,▁▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇██
train_loss_epoch,█▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,███▅▆▄▅▅▄▄▃▄▂▃▃▃▄▂▂▂▂▂▂▁▂▁▂▂▁▂▁▂▁▂▂▁▁▁▂▂
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
val_loss,█▆▅▄▃▃▃▂▂▂▂▂▁▂▂▁▁▁▁▁▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,47.0
train_loss_epoch,0.50839
train_loss_step,0.43663
trainer/global_step,15695.0
val_loss,0.67632


In [16]:
print("[INFO] Creating data loaders...")
g3_train_loader = DataLoader(g3_train_rf, batch_size=config['batch_size'], shuffle=True)  # GRAPH
g3_valid_loader = DataLoader(g3_valid_rf[0], batch_size=config['batch_size'], shuffle=False)  # GRAPH
train_loader = g3_train_loader  # GRAPH
valid_loader = g3_valid_loader  # GRAPH

print("[INFO] Creating model...")
emb_dim = 20
in_channels = graphs_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = g3_train_rf[0].num_edge_features  #GRAPH

[INFO] Creating data loaders...
[INFO] Creating model...


In [19]:
FILENAME = "g3_train_run_24h"  # GRAPH
PROJECTNAME = "gnn_run3"
with wandb.init(
        project=PROJECTNAME, id=FILENAME, config=config, tags=["earlystop"], reinit=True
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)

    # initialize
    batch = next(iter(train_loader))
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    early_stop = EarlyStopping(monitor="val_loss", patience=10)

    # Train Model ###################################################################
    print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=1000,
        log_every_n_steps=1,
        accelerator="gpu",
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=early_stop,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)
wandb.finish()

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 9.9 M  | train
2 | aggr        | DeepSetAggregator | 212 K  | tr

[INFO] Training model...
                                                                           

/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 327/327 [00:10<00:00, 31.58it/s, v_num=_24h, train_loss_step=0.869]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/105 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/105 [00:00<?, ?it/s][A
Validation DataLoader 0:   1%|          | 1/105 [00:00<00:01, 53.74it/s][A
Validation DataLoader 0:   2%|▏         | 2/105 [00:00<00:01, 57.59it/s][A
Validation DataLoader 0:   3%|▎         | 3/105 [00:00<00:01, 60.13it/s][A
Validation DataLoader 0:   4%|▍         | 4/105 [00:00<00:01, 61.37it/s][A
Validation DataLoader 0:   5%|▍         | 5/105 [00:00<00:01, 62.13it/s][A
Validation DataLoader 0:   6%|▌         | 6/105 [00:00<00:01, 60.58it/s][A
Validation DataLoader 0:   7%|▋         | 7/105 [00:00<00:01, 61.20it/s][A
Validation DataLoader 0:   8%|▊         | 8/105 [00:00<00:01, 62.00it/s][A
Validation DataLoader 0:   9%|▊         | 9/105 [00:00<00:01, 62.52it/s][A
Validation DataLoader 0:  10%|▉         | 10/105 [

0,1
epoch,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇████
train_loss_epoch,█▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▇▆▃▅▅▃▅▃▃▃▂▃▃▃▂▃▃▄▄▂▃▄▂▃▃▂▂▂▂▂▂▂▂▃▂▁▂▁
trainer/global_step,▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██
val_loss,█▅▅▃▃▂▂▄▃▂▁▁▂▂▅▁▁▁▁▃▁▂▁▂▁▁

0,1
epoch,25.0
train_loss_epoch,0.53196
train_loss_step,0.7173
trainer/global_step,8501.0
val_loss,0.66347


In [20]:
config_hypertune = {"batch_size":8,
"gnn_hidden":265,
"gnn_layers":2,
"heads":8,
"lr": [0.01, 0.001, 0.0002,0.00005] ,
"max_dist":100,
"max_epochs": 40}

In [None]:
print("[INFO] Creating data loaders...")
g3_train_loader = DataLoader(g3_train_rf, batch_size=config_hypertune['batch_size'], shuffle=True)  # GRAPH
g3_valid_loader = DataLoader(g3_valid_rf[0], batch_size=config_hypertune['batch_size'], shuffle=False)  # GRAPH
train_loader = g3_train_loader  # GRAPH
valid_loader = g3_valid_loader  # GRAPH

print("[INFO] Creating model...")
emb_dim = 20
in_channels = graphs_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = g3_train_rf[0].num_edge_features  #GRAPH
FILENAME = "g3_train_run_24h"  # GRAPH
PROJECTNAME = "gnn_run3"
with wandb.init(
        project=PROJECTNAME, id=FILENAME, config=config_hypertune, tags=["earlystop"], reinit=True
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)

    # initialize
    batch = next(iter(train_loader))
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    early_stop = EarlyStopping(monitor="val_loss", patience=10)

    # Train Model ###################################################################
    print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=1000,
        log_every_n_steps=1,
        accelerator="gpu",
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=early_stop,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)
wandb.finish()

In [9]:
# hyperopt
def train():
    with wandb.init(
            project=PROJECTNAME, id=FILENAME, config=config, tags=["hyperopt"], reinit=True
    ):
        config = wandb.config

        print("[INFO] Creating data loaders...")
        g1_train_loader = DataLoader(graphs_train_rf, batch_size=config['batch_size'], shuffle=True)
        g1_valid_loader = DataLoader(graphs_valid_rf, batch_size=config['batch_size'], shuffle=False)
        train_loader = g1_train_loader
        valid_loader = g1_valid_loader

        print("[INFO] Creating model...")
        emb_dim=20
        in_channels = graphs_train_rf[0].x.shape[1] + emb_dim - 1
        edge_dim = graphs_train_rf[0].num_edge_features

        multigraph = Multigraph(
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=config['gnn_hidden'],
            out_channels_gnn=config['gnn_hidden'],
            num_layers_gnn=config['gnn_layers'],
            heads=config['heads'],
            hidden_channels_deepset=config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=config['lr']),
        )
        torch.compile(multigraph)

        # initialize
        batch = next(iter(g1_train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)

        # Train Model ###################################################################
        print("[INFO] Training model...")
        trainer = L.Trainer(
            max_epochs=1000,
            log_every_n_steps=1,
            accelerator="gpu",
            enable_progress_bar=True,
            logger=wandb_logger,
            callbacks=early_stop,
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)
wandb.agent(sweep_id, function=train, count=1)

[34m[1mwandb[0m: Currently logged in as: [33mleachen01[0m ([33mleachen01-karlsruhe-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[INFO] Creating data loaders...


Traceback (most recent call last):
  File "/tmp/ipykernel_3675734/135698725.py", line 10, in <module>
    g1_train_loader = DataLoader(graphs_train_rf, batch_size=config['batch_size'], shuffle=True)
  File "/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/torch_geometric/loader/dataloader.py", line 87, in __init__
    super().__init__(
  File "/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 357, in __init__
    batch_sampler = BatchSampler(sampler, batch_size, drop_last)
  File "/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/torch/utils/data/sampler.py", line 268, in __init__
    raise ValueError(f"batch_size should be a positive integer value, but got batch_size={batch_size}")
ValueError: batch_size should be a positive integer value, but got batch_size={'values': [8, 16, 32]}


ValueError: batch_size should be a positive integer value, but got batch_size={'values': [8, 16, 32]}

In [None]:
g2_train_rf, g2_valid_rf = normalize_features_and_create_graphs1(df_train=dataframes['train'],df_valid_test=[dataframes['valid']], attributes=["geo", "alt", "lon", "lat", "alt-orog"], edges=[("geo", 100)], sum_stats = True)

print("[INFO] Creating data loaders...")

g2_train_loader = DataLoader(g2_train_rf, batch_size=config['batch_size'], shuffle=True)
g2_valid_loader = DataLoader(g2_valid_rf, batch_size=config['batch_size'], shuffle=False)

print("[INFO] Creating model...")
emb_dim = 20

in_channels = graphs_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = graphs_train_rf[0].num_edge_features

# hyperopt
FILENAME = "g2_train_run_24h"
train_loader = g2_train_loader
valid_loader = g2_valid_loader
PROJECTNAME = "gnn_run3"
with wandb.init(
        project=PROJECTNAME, id=FILENAME, config=config, tags=["hyperopt"], reinit=True
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)

    # initialize
    batch = next(iter(g1_train_loader))
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    early_stop = EarlyStopping(monitor="val_loss", patience=10)

    # Train Model ###################################################################
    print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=1000,
        log_every_n_steps=1,
        accelerator="gpu",
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=early_stop,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)
wandb.finish()

In [None]:
# training
    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=FILENAME, monitor="train_loss", mode="min", save_top_k=1
    )
    # print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=config['max_epochs'],
        log_every_n_steps=1,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader)