In [57]:
%cd /home/ltchen/gnnpp
import sys
import os
import pytorch_lightning as L
import torch
import torch_geometric
import json
import wandb

from typing import Tuple
from torch_geometric.nn import GATv2Conv
from torch_geometric.utils import scatter
from torch.nn import Linear, ModuleList, ReLU
from torch_geometric.loader import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.optim import AdamW
from pytorch_lightning.loggers import WandbLogger

from models.loss import NormalCRPS
from models.model_utils import MakePositive, EmbedStations
from utils.data import (
    load_dataframes,
    load_distances,
    normalize_features_and_create_graphs,
    rm_edges,
    summary_statistics,
)
from exploration.graph_creation import *
from models.graphensemble.multigraph import *

/home/ltchen/gnnpp


In [58]:
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
DIRECTORY = os.getcwd()
SAVEPATH = os.path.join(DIRECTORY, "explored_models/gnn_new_attr_24h/models")
JSONPATH = os.path.join(DIRECTORY, "trained_models/best_24h/params.json")

In [59]:
with open(JSONPATH, "r") as f:
    print(f"[INFO] Loading {JSONPATH}")
    args_dict = json.load(f)
config = args_dict
print(config)
print(config['lr'])
print(config['max_dist'])
print(type(config))
print(type(config['lr']))
print(type(config['gnn_hidden']))
print(config['gnn_hidden'])
'''{"batch_size":8,
"gnn_hidden":265,
"gnn_layers":2,
"heads":8,
"lr":0.0002,
"max_dist":100,
"max_epochs": 31}'''

[INFO] Loading /home/ltchen/gnnpp/trained_models/best_24h/params.json
{'batch_size': 8, 'gnn_hidden': 265, 'gnn_layers': 2, 'heads': 8, 'lr': 0.0002, 'max_dist': 100, 'max_epochs': 31}
0.0002
100
<class 'dict'>
<class 'float'>
<class 'int'>
265


'{"batch_size":8,\n"gnn_hidden":265,\n"gnn_layers":2,\n"heads":8,\n"lr":0.0002,\n"max_dist":100,\n"max_epochs": 31}'

#### Load Graphs

In [60]:
dataframes = load_dataframes(mode="eval", leadtime= "24h") # load newly created dataframes
dataframes = summary_statistics(dataframes)

[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for test_rf
[INFO] Calculating summary statistics for test_f


In [4]:
# train = dataframes["train"][0]
# train_target = dataframes["train"][1]
# test_rf = dataframes["test_rf"][0]
# test_rf_target = dataframes["test_rf"][1]
# test_f = dataframes["test_f"][0]
# test_f_target = dataframes["test_f"][1]

# self-created
graphs_train_rf, tests = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["geo"], edges=[("geo", 100)], sum_stats = True)

graphs_test_rf, graphs_test_f = tests
graphs_test = graphs_test_rf

[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for test_rf
[INFO] Calculating summary statistics for test_f
[INFO] Normalizing features...
fit_transform
transform 1
transform 2


100%|██████████| 3448/3448 [00:16<00:00, 204.66it/s]
100%|██████████| 732/732 [00:02<00:00, 273.74it/s]
100%|██████████| 730/730 [00:02<00:00, 257.81it/s]


### Graph 1: reproduction

In [47]:
batch_size = config['batch_size']

print("[INFO] Creating data loaders...")
g1_train_loader = DataLoader(graphs_train_rf, batch_size=batch_size, shuffle=True)
g1_test_f_loader = DataLoader(graphs_test_f, batch_size=batch_size, shuffle=False)

print("[INFO] Creating model...")
emb_dim=20

in_channels = graphs_train_rf[0].x.shape[1] + emb_dim - 1

edge_dim = graphs_train_rf[0].num_edge_features
embedding_dim=emb_dim
in_channels=in_channels
hidden_channels_gnn=config['gnn_hidden']
out_channels_gnn=config['gnn_hidden']
num_layers_gnn=config['gnn_hidden']
heads=config['heads']
hidden_channels_deepset=config['gnn_hidden']
optimizer_class=AdamW
optimizer_params=dict(lr=config['lr'])

[INFO] Creating data loaders...
[INFO] Creating model...


In [6]:
wandb_run = None

try:
    wandb_run = wandb.init(project="my-project", name="safe_run")
except Exception as e:
    print(f"W&B failed to start: {e}")

[34m[1mwandb[0m: Currently logged in as: [33mleachen01[0m ([33mleachen01-karlsruhe-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [48]:
PROJECTNAME = "new_attr_graphs"
FILENAME = "g1_train_run_24h"
train_loader = g1_train_loader
with wandb.init(
        project=PROJECTNAME, id=FILENAME, config=args_dict, tags=["reproduction"], reinit=True
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)
    batch = next(iter(g1_train_loader))
    # batch = batch  # .to("cuda")
    # multigraph  # .to("cuda")
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=FILENAME, monitor="train_loss", mode="min", save_top_k=1
    )

    # print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=config['max_epochs'],
        log_every_n_steps=1,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/explored_models/gnn_new_attr_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder  

Epoch 30: 100%|██████████| 431/431 [00:15<00:00, 27.55it/s, v_num=_24h, train_loss_step=0.441, train_loss_epoch=0.504]

`Trainer.fit` stopped: `max_epochs=31` reached.


Epoch 30: 100%|██████████| 431/431 [00:15<00:00, 27.54it/s, v_num=_24h, train_loss_step=0.441, train_loss_epoch=0.504]


0,1
epoch,▁▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████
train_loss_epoch,█▄▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
train_loss_step,▆█▄▄▄▄▃▃▃▃▃▂▄▃▂▂▂▂▂▂▂▃▂▂▂▂▂▂▂▂▂▁▂▂▁▂▃▂▂▂
trainer/global_step,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇▇▇▇▇▇████

0,1
epoch,30.0
train_loss_epoch,0.50399
train_loss_step,0.441
trainer/global_step,13360.0


In [55]:
# g1_test_rf_loader = DataLoader(graphs_test_rf, batch_size=batch_size, shuffle=False)

preds_list = []
preds = trainer.predict(model=multigraph, dataloaders=[g1_test_f_loader]) # 92 x 976 x 2 forecasts with mu and sigma of 122 stations
preds = torch.cat(preds, dim=0)
preds_list.append(preds)

# targets = dataframes["test_rf"][1]
targets = dataframes["test_f"][1]
targets = torch.tensor(targets.t2m.values) - 273.15

stacked = torch.stack(preds_list)
final_preds = torch.mean(stacked, dim=0)

res = multigraph.loss_fn.crps(final_preds, targets)
print("#############################################")
print("#############################################")
print(f"final crps: {res.item()}")
print("#############################################")
print("#############################################")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 92/92 [00:01<00:00, 65.17it/s]
#############################################
#############################################
final crps: 0.6454867613345044
#############################################
#############################################


### Graph2: same edges, more attributes

In [61]:
graphs2_train_rf, tests2 = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["geo", "alt", "lon", "lat", "alt-orog"], edges=[("geo", 100)], sum_stats = True)

graphs2_test_rf, graphs2_test_f = tests2
graphs2_test = graphs2_test_rf

batch_size = config['batch_size']

print("[INFO] Creating data loaders...")
g2_train_loader = DataLoader(graphs2_train_rf, batch_size=batch_size, shuffle=True)
g2_test_f_loader = DataLoader(graphs2_test_f, batch_size=batch_size, shuffle=False)

print("[INFO] Creating model...")
emb_dim = 20

in_channels = graphs2_train_rf[0].x.shape[1] + emb_dim - 1

edge_dim = graphs2_train_rf[0].num_edge_features
embedding_dim = emb_dim
in_channels = in_channels
hidden_channels_gnn = config['gnn_hidden']
out_channels_gnn = config['gnn_hidden']
num_layers_gnn = config['gnn_hidden']
heads = config['heads']
hidden_channels_deepset = config['gnn_hidden']
optimizer_class = AdamW
optimizer_params = dict(lr=config['lr'])

[INFO] Normalizing features...
fit_transform
transform 1
transform 2


100%|██████████| 3448/3448 [00:16<00:00, 204.21it/s]
100%|██████████| 732/732 [00:02<00:00, 282.93it/s]
100%|██████████| 730/730 [00:02<00:00, 276.26it/s]

[INFO] Creating data loaders...
[INFO] Creating model...





In [69]:
PROJECTNAME = "new_attr_graphs"
FILENAME = "g2_train_run_24h"

In [9]:

train_loader = g2_train_loader
with wandb.init(
        project=PROJECTNAME, id=FILENAME, config=args_dict, tags=["reproduction"]
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)
    batch = next(iter(train_loader))
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=FILENAME, monitor="train_loss", mode="min", save_top_k=1
    )

    # print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=config['max_epochs'],
        log_every_n_steps=1,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/explored_models/gnn_new_attr_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder  

Epoch 30: 100%|██████████| 431/431 [00:15<00:00, 27.55it/s, v_num=_24h, train_loss_step=0.548, train_loss_epoch=0.499]

`Trainer.fit` stopped: `max_epochs=31` reached.


Epoch 30: 100%|██████████| 431/431 [00:15<00:00, 27.05it/s, v_num=_24h, train_loss_step=0.548, train_loss_epoch=0.499]


0,1
epoch,▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇████████
train_loss_epoch,█▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁
train_loss_step,██▅▇▄▆▅▄▅▃▄▃▃▄▅▄▅▅▃▃▆▄▃▂▄▂▃▂▁▂▃▂▁▃▂▃▃▃▂▃
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇████

0,1
epoch,30.0
train_loss_epoch,0.49887
train_loss_step,0.5477
trainer/global_step,13360.0


In [74]:
test_loader = g2_test_f_loader

CKPT_PATH = os.path.join(SAVEPATH, FILENAME+'.ckpt')

multigraph = Multigraph.load_from_checkpoint(
    CKPT_PATH,
    embedding_dim=emb_dim,
    edge_dim=edge_dim,
    in_channels=in_channels,
    hidden_channels_gnn=config['gnn_hidden'],
    out_channels_gnn=config['gnn_hidden'],
    num_layers_gnn=config['gnn_layers'],
    heads=config['heads'],
    hidden_channels_deepset=config['gnn_hidden'],
    optimizer_class=AdamW,
    optimizer_params=dict(lr=config['lr']),
)

multigraph.eval()
trainer = L.Trainer()
# trainer.fit(multigraph, ckpt_path=CKPT_PATH)

preds_list = []
preds = trainer.predict(model=multigraph, dataloaders=[test_loader]) # 92 x 976 x 2 forecasts with mu and sigma of 122 stations
# print(preds)
print(preds[0].shape)
# preds = [prediction.reshape(1, 122, 2).mean(axis=0) for prediction in
#          preds]
#ACHTUNG - reshape(1, 122, 2) mit 1 statt 5!
preds = torch.cat(preds, dim=0)
preds_list.append(preds)

targets = dataframes["test_f"][1]
targets = torch.tensor(targets.t2m.values) - 273.15

stacked = torch.stack(preds_list)
final_preds = torch.mean(stacked, dim=0)

res = multigraph.loss_fn.crps(final_preds, targets)
print("#############################################")
print("#############################################")
print(f"final crps: {res.item()}")
print("#############################################")
print("#############################################")

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 92/92 [00:01<00:00, 67.75it/s]
torch.Size([976, 2])
#############################################
#############################################
final crps: 0.6623683693510217
#############################################
#############################################


### Graph3: more edges, more attributes

In [75]:
graphs3_train_rf, tests3 = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["geo", "alt", "lon", "lat", "alt-orog"], edges=[("geo", 55), ("alt", 6.5), ("alt-orog", 2.5)], sum_stats = True)

graphs3_test_rf, graphs3_test_f = tests3
graphs3_test = graphs3_test_rf

facts_about(graphs3_train_rf[0])

[INFO] Normalizing features...
fit_transform
transform 1
transform 2


100%|██████████| 3448/3448 [00:17<00:00, 200.95it/s]
100%|██████████| 732/732 [00:02<00:00, 263.88it/s]
100%|██████████| 730/730 [00:02<00:00, 274.09it/s]

Number of nodes: 122 with feature dimension of x: 65
Number of isolated nodes: 2
Number of edges: 1482 with edge dimension: 5
Average node degree: 12.147541046142578





In [84]:
batch_size = config['batch_size']

print("[INFO] Creating data loaders...")
g3_train_loader = DataLoader(graphs3_train_rf, batch_size=batch_size, shuffle=True)
g3_test_f_loader = DataLoader(graphs3_test_f, batch_size=batch_size, shuffle=False)

print("[INFO] Creating model...")
emb_dim = 20
in_channels = graphs3_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = graphs3_train_rf[0].num_edge_features

[INFO] Creating data loaders...
[INFO] Creating model...


In [81]:
PROJECTNAME = "new_attr_graphs"
FILENAME = "g3_train_run_24h"

In [13]:

train_loader = g3_train_loader
with wandb.init(
        project=PROJECTNAME, id=FILENAME, config=args_dict, tags=["reproduction"]
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)
    batch = next(iter(train_loader))
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=FILENAME, monitor="train_loss", mode="min", save_top_k=1
    )

    # print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=config['max_epochs'],
        log_every_n_steps=1,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/explored_models/gnn_new_attr_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder  

Epoch 30: 100%|██████████| 431/431 [00:16<00:00, 26.86it/s, v_num=_24h, train_loss_step=0.521, train_loss_epoch=0.494]

`Trainer.fit` stopped: `max_epochs=31` reached.


Epoch 30: 100%|██████████| 431/431 [00:16<00:00, 26.33it/s, v_num=_24h, train_loss_step=0.521, train_loss_epoch=0.494]


0,1
epoch,▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇█████
train_loss_epoch,█▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▆▆▄▅▆▃▇▄▄▄▄▄▄▃▄▅▄▃▃▃▄▁▃▄▅▄▄▃▃▂▃▃▂▂▁▃▁▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████

0,1
epoch,30.0
train_loss_epoch,0.49358
train_loss_step,0.52067
trainer/global_step,13360.0


In [85]:
test_loader = g3_test_f_loader

CKPT_PATH = os.path.join(SAVEPATH, FILENAME+'.ckpt')

multigraph = Multigraph.load_from_checkpoint(
    CKPT_PATH,
    embedding_dim=emb_dim,
    edge_dim=edge_dim,
    in_channels=in_channels,
    hidden_channels_gnn=config['gnn_hidden'],
    out_channels_gnn=config['gnn_hidden'],
    num_layers_gnn=config['gnn_layers'],
    heads=config['heads'],
    hidden_channels_deepset=config['gnn_hidden'],
    optimizer_class=AdamW,
    optimizer_params=dict(lr=config['lr']),
)

multigraph.eval()
trainer = L.Trainer()

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [86]:
preds_list = []
preds = trainer.predict(model=multigraph, dataloaders=[test_loader]) # 92 x 976 x 2 forecasts with mu and sigma of 122 stations
# print(preds)
print(preds[0].shape)
# preds = [prediction.reshape(1, 122, 2).mean(axis=0) for prediction in
#          preds]
#ACHTUNG - reshape(1, 122, 2) mit 1 statt 5!
preds = torch.cat(preds, dim=0)
preds_list.append(preds)

targets = dataframes["test_f"][1]
targets = torch.tensor(targets.t2m.values) - 273.15

stacked = torch.stack(preds_list)
final_preds = torch.mean(stacked, dim=0)

res = multigraph.loss_fn.crps(final_preds, targets)
print("#############################################")
print("#############################################")
print(f"final crps: {res.item()}")
print("#############################################")
print("#############################################")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 92/92 [00:01<00:00, 68.95it/s]
torch.Size([976, 2])
#############################################
#############################################
final crps: 0.6568804218453907
#############################################
#############################################


In [87]:
graphs4_train_rf, tests4 = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["dist2"], edges=[("dist2", 0.005)], sum_stats = True)

graphs4_test_rf, graphs4_test_f = tests4
graphs4_test = graphs4_test_rf

facts_about(graphs4_train_rf[0])

[INFO] Normalizing features...
fit_transform
transform 1
transform 2
[INFO] Loading distances from file...


100%|██████████| 3448/3448 [00:17<00:00, 202.56it/s]


[INFO] Loading distances from file...


100%|██████████| 732/732 [00:03<00:00, 236.70it/s]


[INFO] Loading distances from file...


100%|██████████| 730/730 [00:02<00:00, 276.35it/s]

Number of nodes: 122 with feature dimension of x: 65
Number of isolated nodes: 9
Number of edges: 1356 with edge dimension: 1
Average node degree: 11.114753723144531





In [88]:
batch_size = config['batch_size']

print("[INFO] Creating data loaders...")
g4_train_loader = DataLoader(graphs4_train_rf, batch_size=batch_size, shuffle=True)
g4_test_f_loader = DataLoader(graphs4_test_f, batch_size=batch_size, shuffle=False)

print("[INFO] Creating model...")
emb_dim = 20
in_channels = graphs4_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = graphs4_train_rf[0].num_edge_features

[INFO] Creating data loaders...
[INFO] Creating model...


In [89]:
PROJECTNAME = "new_attr_graphs"
FILENAME = "g4_train_run_24h"

In [26]:
train_loader = g4_train_loader
with wandb.init(
        project=PROJECTNAME, id=FILENAME, config=args_dict, tags=["reproduction"]
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)
    batch = next(iter(train_loader))
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=FILENAME, monitor="train_loss", mode="min", save_top_k=1
    )

    # print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=config['max_epochs'],
        log_every_n_steps=1,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/explored_models/gnn_new_attr_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder  

Epoch 30: 100%|██████████| 431/431 [00:15<00:00, 27.85it/s, v_num=_24h, train_loss_step=0.468, train_loss_epoch=0.522]

`Trainer.fit` stopped: `max_epochs=31` reached.


Epoch 30: 100%|██████████| 431/431 [00:15<00:00, 27.33it/s, v_num=_24h, train_loss_step=0.468, train_loss_epoch=0.522]


0,1
epoch,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train_loss_epoch,█▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▂▂▂▁▂▂▂▂▂▂▂▁▁▂▂▁▁▁▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█

0,1
epoch,30.0
train_loss_epoch,0.52206
train_loss_step,0.46805
trainer/global_step,13360.0


In [90]:
test_loader = g4_test_f_loader

CKPT_PATH = os.path.join(SAVEPATH, FILENAME+'.ckpt')

multigraph = Multigraph.load_from_checkpoint(
    CKPT_PATH,
    embedding_dim=emb_dim,
    edge_dim=edge_dim,
    in_channels=in_channels,
    hidden_channels_gnn=config['gnn_hidden'],
    out_channels_gnn=config['gnn_hidden'],
    num_layers_gnn=config['gnn_layers'],
    heads=config['heads'],
    hidden_channels_deepset=config['gnn_hidden'],
    optimizer_class=AdamW,
    optimizer_params=dict(lr=config['lr']),
)

multigraph.eval()
trainer = L.Trainer()

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [91]:
preds_list = []
preds = trainer.predict(model=multigraph, dataloaders=[test_loader]) # 92 x 976 x 2 forecasts with mu and sigma of 122 stations
# print(preds)
print(preds[0].shape)
# preds = [prediction.reshape(1, 122, 2).mean(axis=0) for prediction in
#          preds]
#ACHTUNG - reshape(1, 122, 2) mit 1 statt 5!
preds = torch.cat(preds, dim=0)
preds_list.append(preds)

targets = dataframes["test_f"][1]
targets = torch.tensor(targets.t2m.values) - 273.15

stacked = torch.stack(preds_list)
final_preds = torch.mean(stacked, dim=0)

res = multigraph.loss_fn.crps(final_preds, targets)
print("#############################################")
print("#############################################")
print(f"final crps: {res.item()}")
print("#############################################")
print("#############################################")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 92/92 [00:01<00:00, 69.49it/s]
torch.Size([976, 2])
#############################################
#############################################
final crps: 0.6382404991725519
#############################################
#############################################


In [27]:
graphs5_train_rf, tests5 = normalize_features_and_create_graphs1(df_train=dataframes['train'],
                                                                 df_valid_test=[dataframes['test_rf'],
                                                                                dataframes['test_f']],
                                                                 station_df=dataframes['stations'],
                                                                 attributes=["geo", "alt", "lon", "lat", "alt-orog"],
                                                                 edges=[("geo", 100), ("alt", 10), ("alt-orog", 5)],
                                                                 sum_stats=True)

graphs5_test_rf, graphs5_test_f = tests5
graphs5_test = graphs5_test_rf

facts_about(graphs5_train_rf[0])


[INFO] Normalizing features...
fit_transform
transform 1
transform 2



  0%|          | 0/3448 [00:00<?, ?it/s][A
  0%|          | 17/3448 [00:00<00:20, 166.12it/s][A
  1%|          | 39/3448 [00:00<00:17, 193.99it/s][A
  2%|▏         | 59/3448 [00:00<00:17, 190.98it/s][A
  2%|▏         | 79/3448 [00:00<00:17, 189.93it/s][A
  3%|▎         | 101/3448 [00:00<00:16, 199.17it/s][A
  4%|▎         | 122/3448 [00:00<00:16, 202.37it/s][A
  4%|▍         | 144/3448 [00:00<00:16, 205.60it/s][A
  5%|▍         | 166/3448 [00:00<00:15, 209.34it/s][A
  5%|▌         | 188/3448 [00:00<00:15, 211.69it/s][A
  6%|▌         | 210/3448 [00:01<00:15, 208.22it/s][A
  7%|▋         | 231/3448 [00:01<00:16, 195.82it/s][A
  7%|▋         | 252/3448 [00:01<00:16, 198.49it/s][A
  8%|▊         | 272/3448 [00:01<00:17, 183.98it/s][A
  9%|▊         | 294/3448 [00:01<00:16, 191.63it/s][A
  9%|▉         | 314/3448 [00:01<00:16, 188.01it/s][A
 10%|▉         | 333/3448 [00:01<00:16, 184.89it/s][A
 10%|█         | 352/3448 [00:01<00:17, 176.39it/s][A
 11%|█         | 373/344

Number of nodes: 122 with feature dimension of x: 65
Number of isolated nodes: 1
Number of edges: 2804 with edge dimension: 5
Average node degree: 22.983606338500977





In [28]:
batch_size = config['batch_size']

print("[INFO] Creating data loaders...")
g5_train_loader = DataLoader(graphs5_train_rf, batch_size=batch_size, shuffle=True)
g5_test_f_loader = DataLoader(graphs5_test_f, batch_size=batch_size, shuffle=False)

print("[INFO] Creating model...")
emb_dim = 20
in_channels = graphs5_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = graphs5_train_rf[0].num_edge_features


[INFO] Creating data loaders...
[INFO] Creating model...


In [29]:
PROJECTNAME = "new_attr_graphs"
FILENAME = "g5_train_run_24h"
train_loader = g5_train_loader
with wandb.init(
        project=PROJECTNAME, id=FILENAME, config=args_dict, tags=["reproduction"]
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)
    batch = next(iter(train_loader))
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=FILENAME, monitor="train_loss", mode="min", save_top_k=1
    )

    # print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=config['max_epochs'],
        log_every_n_steps=1,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/explored_models/gnn_new_attr_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder  

Epoch 30: 100%|██████████| 431/431 [00:21<00:00, 19.89it/s, v_num=_24h, train_loss_step=0.610, train_loss_epoch=0.487]

`Trainer.fit` stopped: `max_epochs=31` reached.


Epoch 30: 100%|██████████| 431/431 [00:21<00:00, 19.62it/s, v_num=_24h, train_loss_step=0.610, train_loss_epoch=0.487]


0,1
epoch,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇█████
train_loss_epoch,█▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇██

0,1
epoch,30.0
train_loss_epoch,0.48651
train_loss_step,0.61032
trainer/global_step,13360.0


In [40]:
preds_list = []
preds = trainer.predict(model=multigraph, dataloaders=[g5_test_f_loader]) # 92 x 976 x 2 forecasts with mu and sigma of 122 stations
# print(preds)
print(preds[0].shape)
# preds = [prediction.reshape(1, 122, 2).mean(axis=0) for prediction in
#          preds]
#ACHTUNG - reshape(1, 122, 2) mit 1 statt 5!
preds = torch.cat(preds, dim=0)
preds_list.append(preds)

targets = dataframes["test_f"][1]
targets = torch.tensor(targets.t2m.values) - 273.15

stacked = torch.stack(preds_list)
final_preds = torch.mean(stacked, dim=0)

res = multigraph.loss_fn.crps(final_preds, targets)
print("#############################################")
print("#############################################")
print(f"final crps: {res.item()}")
print("#############################################")
print("#############################################")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 92/92 [00:02<00:00, 44.41it/s]
torch.Size([976, 2])
#############################################
#############################################
final crps: 0.6466054917523435
#############################################
#############################################


In [43]:
graphs6_train_rf, tests6 = normalize_features_and_create_graphs1(df_train=dataframes['train'],
                                                                 df_valid_test=[dataframes['test_rf'],
                                                                                dataframes['test_f']],
                                                                 station_df=dataframes['stations'],
                                                                 attributes=["dist3"],
                                                                 edges=[("dist3", 0.015)],
                                                                 sum_stats=True)

graphs6_test_rf, graphs6_test_f = tests6
graphs6_test = graphs6_test_rf

facts_about(graphs6_train_rf[0])

[INFO] Normalizing features...
fit_transform
transform 1
transform 2
[INFO] Loading distances from file...


100%|██████████| 3448/3448 [00:16<00:00, 204.07it/s]


[INFO] Loading distances from file...


100%|██████████| 732/732 [00:02<00:00, 273.86it/s]


[INFO] Loading distances from file...


100%|██████████| 730/730 [00:02<00:00, 263.57it/s]

Number of nodes: 122 with feature dimension of x: 65
Number of isolated nodes: 26
Number of edges: 1514 with edge dimension: 1
Average node degree: 12.409835815429688





In [44]:
batch_size = config['batch_size']

print("[INFO] Creating data loaders...")
g6_train_loader = DataLoader(graphs6_train_rf, batch_size=batch_size, shuffle=True)
g6_test_f_loader = DataLoader(graphs6_test_f, batch_size=batch_size, shuffle=False)

print("[INFO] Creating model...")
emb_dim = 20
in_channels = graphs6_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = graphs6_train_rf[0].num_edge_features

[INFO] Creating data loaders...
[INFO] Creating model...


In [45]:
PROJECTNAME = "new_attr_graphs"
FILENAME = "g6_train_run_24h"
train_loader = g6_train_loader
with wandb.init(
        project=PROJECTNAME, id=FILENAME, config=args_dict, tags=["reproduction"]
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)
    batch = next(iter(train_loader))
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=FILENAME, monitor="train_loss", mode="min", save_top_k=1
    )

    # print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=config['max_epochs'],
        log_every_n_steps=1,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/explored_models/gnn_new_attr_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder  

Epoch 30: 100%|██████████| 431/431 [00:16<00:00, 26.74it/s, v_num=_24h, train_loss_step=0.556, train_loss_epoch=0.523]

`Trainer.fit` stopped: `max_epochs=31` reached.


Epoch 30: 100%|██████████| 431/431 [00:16<00:00, 26.22it/s, v_num=_24h, train_loss_step=0.556, train_loss_epoch=0.523]


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇███████
train_loss_epoch,█▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,██▆▅▆▅▃▆▅▅▅▄▄▅▃▃▂▂▅▃▃▅▅▄▃▂▂▃▃▄▂▂▂▃▃▂▂▃▂▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
epoch,30.0
train_loss_epoch,0.52288
train_loss_step,0.55647
trainer/global_step,13360.0


In [46]:
preds_list = []
preds = trainer.predict(model=multigraph, dataloaders=[g6_test_f_loader]) # 92 x 976 x 2 forecasts with mu and sigma of 122 stations
preds = torch.cat(preds, dim=0)
preds_list.append(preds)

targets = dataframes["test_f"][1]
targets = torch.tensor(targets.t2m.values) - 273.15

stacked = torch.stack(preds_list)
final_preds = torch.mean(stacked, dim=0)

res = multigraph.loss_fn.crps(final_preds, targets)
print("#############################################")
print("#############################################")
print(f"final crps: {res.item()}")
print("#############################################")
print("#############################################")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 92/92 [00:01<00:00, 61.93it/s]
#############################################
#############################################
final crps: 0.6487236425125072
#############################################
#############################################
