In [1]:
%cd /home/ltchen/gnnpp
import sys
import os
import pytorch_lightning as L
import torch
import torch_geometric
import json
import wandb

from typing import Tuple
from torch_geometric.nn import GATv2Conv
from torch_geometric.utils import scatter
from torch.nn import Linear, ModuleList, ReLU
from torch_geometric.loader import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar
from torch.optim import AdamW
from pytorch_lightning.loggers import WandbLogger

from models.loss import NormalCRPS
from models.model_utils import MakePositive, EmbedStations
from utils.data import (
    load_dataframes,
    summary_statistics,
)
from exploration.graph_creation import *
from models.graphensemble.multigraph import *
from exploration.get_graphs_and_data import *
from exploration.explainability_utils import *

/home/ltchen/gnnpp


## 72h Leadtime Graphs
- adjust distance 2 and distance 3 metrics!!

In [36]:
leadtime = "120h"
graph_name = "g5"
# data_type = "f"

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
DIRECTORY = os.getcwd()

JSONPATH, SAVEPATH, RESULTPATH = get_json_save_result_paths(leadtime=leadtime, graph_name=graph_name)
with open(JSONPATH, "r") as f:
    print(f"[INFO] Loading {JSONPATH}")
    args_dict = json.load(f)
config = args_dict

dataframes = load_dataframes(leadtime=leadtime)
dataframes = summary_statistics(dataframes)
g_train_rf, g_valid_rf, g_test_rf, g_test_f = get_train_valid_graph_data(leadtime=leadtime, graph_name=graph_name)

[INFO] Loading /home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/params.json
[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for valid
[INFO] Calculating summary statistics for test_rf
[INFO] Calculating summary statistics for test_f
Loading precomputed graph data on g5...


  train_data = torch.load(train_path)
  valid_data = torch.load(valid_path)
  test_rf = torch.load(test_rf_path)


Successfully loaded precomputed data.


  test_f = torch.load(test_f_path)


In [37]:
g_train_loader = DataLoader(g_train_rf, batch_size=config['batch_size'], shuffle=True)
g_valid_loader = DataLoader(g_valid_rf, batch_size=config['batch_size'], shuffle=True)
g_test_f_loader = DataLoader(g_test_f, batch_size=config['batch_size'], shuffle=False)
g_test_rf_loader = DataLoader(g_test_rf, batch_size=config['batch_size'], shuffle=False)

train_loader = g_train_loader
valid_loader = g_valid_loader
test_f_loader = g_test_f_loader
test_rf_loader = g_test_rf_loader
test_loader = [test_f_loader, test_rf_loader]

emb_dim = 20
in_channels = g_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = g_train_rf[0].num_edge_features
num_nodes = g_train_rf[0].num_nodes
max_epochs = 100

In [38]:
PROJECTNAME = f"gnn_run_{leadtime}"

for i in range(0, 10):
    # FILENAME = graph_name + f"run{i}_" + leadtime
    TRAINNAME = f"{graph_name}_{leadtime}_train_run{i}"

    with wandb.init(
            project=PROJECTNAME, id=TRAINNAME, config=args_dict, tags=["final"], resume="never"
    ):
        config = wandb.config

        multigraph = Multigraph(
            num_nodes=num_nodes, #
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=config['gnn_hidden'],
            out_channels_gnn=config['gnn_hidden'],
            num_layers_gnn=config['gnn_layers'],
            heads=config['heads'],
            hidden_channels_deepset=config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=config['lr']),
        )
        torch.compile(multigraph)
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        checkpoint_callback = ModelCheckpoint(
            dirpath=SAVEPATH, filename=TRAINNAME, monitor="val_loss", mode="min", save_top_k=1
        )

        trainer = L.Trainer(
                max_epochs=max_epochs,
                log_every_n_steps=1,
                accelerator="gpu",
                devices = 1,
                enable_progress_bar=True,
                logger=wandb_logger,
                callbacks=[early_stop, progress_bar, checkpoint_callback],
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn 

0,1
epoch,▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▄▅▅▃▃▆█▅▄▄▄▃▁▄▂▁▃▅▃▄▇▃▄▃▅▂▂▄▃▄▃▃▂▃▂▃▂▃▄▂
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇█████
val_loss,█▇▃▃▂▂▂▃▁▁▂▂▂▅▂▂▂▁▁▂▁▂▁▃▄▃▃▃▂▅▃▄▄

0,1
epoch,32.0
train_loss_epoch,1.02921
train_loss_step,1.00234
trainer/global_step,10790.0
val_loss,1.27656


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 314 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      

0,1
epoch,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇█████
train_loss_epoch,█▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▆▅▆█▄▃▇▃▄▃▂▄▄▅▂▃▁▂▅▃▄▂▃▃▂▃▄▄▄▅▅▂▄▅▁▁▂▃▂▂
trainer/global_step,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇██
val_loss,█▄▄▃▅▂▁▁▁▁▂▂▂▃▁▃▂▂

0,1
epoch,17.0
train_loss_epoch,1.11384
train_loss_step,1.18005
trainer/global_step,5885.0
val_loss,1.23928


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 314 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      

0,1
epoch,▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▁▂▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇███
val_loss,█▄▄▃▂▃▂▂▄▃▂▁▁▄▄▁▁▂▂▁▁▁▁▃▅▂

0,1
epoch,25.0
train_loss_epoch,1.07194
train_loss_step,1.15164
trainer/global_step,8501.0
val_loss,1.2382


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 314 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      

0,1
epoch,▁▁▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▂▄▄▆▃▃▂▄▄█▃▁▂▄▃▂▅▂▃▆▂▁▃▁▂▃▁▃▁▄▂▁▆▂▂▁▂▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
val_loss,█▄▃▅▃▂▃▂▂▂▁▁▂▁▃▂▂▁▃▃▄▃▂▄▂▃▂▄

0,1
epoch,27.0
train_loss_epoch,1.06921
train_loss_step,0.96641
trainer/global_step,9155.0
val_loss,1.28845


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 314 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      

0,1
epoch,▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,██▁▁▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
val_loss,█▃▂▂▂▂▃▃▁▂▁▁▁▂▂▃▂▁▁▂▂▂

0,1
epoch,21.0
train_loss_epoch,1.08824
train_loss_step,1.09584
trainer/global_step,7193.0
val_loss,1.2272


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 314 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      

0,1
epoch,▁▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇███
train_loss_epoch,█▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▄▂▄▃▃▄▃▂▅▃▇▃█▂▄▂▃▄▁▂▄▂▃▅▃▃▃▅▃▇▃▁▄▂▃▃▃▃▄▂
trainer/global_step,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇███
val_loss,█▄▃▂▂▂▁▂▁▁▁▁▂▂▁▃▁▁▃▂▂▁▂▂▃▂▂

0,1
epoch,26.0
train_loss_epoch,1.07081
train_loss_step,1.71874
trainer/global_step,8828.0
val_loss,1.24504


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 314 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      

0,1
epoch,▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇█
train_loss_epoch,█▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▄▆▄▇▇▅▄▃▆▃▃▄▃▅▂▆▅▃▄▃▅▅▅█▄▄▂▃▂▃▅▂▃▅▅▅▅▃▁▃
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
val_loss,█▄▄▂▂▂▂▂▂▃▂▁▁▃▂▁▃▁▁▂▂▂▃▂▂▂▂▁

0,1
epoch,27.0
train_loss_epoch,1.06075
train_loss_step,1.09199
trainer/global_step,9155.0
val_loss,1.22923


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 314 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▇▇▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▄▅▄▅▄▄▆▄▄▂▆▂▁▅▁▃▅▅▅▆▄▂▂▂▂▄▅▂▅▄▅▃▁▄▃█▁▃▃▄
trainer/global_step,▁▁▁▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█
val_loss,█▄▄▄▂▄▂▄▄▂▂▃▁▂▂▂▅▂▂▃▁▁▃

0,1
epoch,22.0
train_loss_epoch,1.08156
train_loss_step,0.75471
trainer/global_step,7520.0
val_loss,1.25241


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 314 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▄▂▂▂▂▂▁▁▂▂▂▂▁▁▂▂▂▂▃▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▂▁▂▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇█
val_loss,█▄▃▂▃▂▃▂▃▁▁▃▁▂▁▁▁▂▂▃▂▂▃

0,1
epoch,22.0
train_loss_epoch,1.08988
train_loss_step,0.97633
trainer/global_step,7520.0
val_loss,1.25213


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 314 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      

0,1
epoch,▁▁▂▂▂▂▂▂▂▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▇▇▇▇▇▇████
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▃▂▅▄▂▂▄▅▂▃▂▃▃▂▂▂▃▄▂▃▂▄▄▂▃▂▃▂▂▃▁▂▂▃▂▁▂▁▂
trainer/global_step,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇███
val_loss,█▄▃▂▃▂▂▁▂▄▄▃▁▁▁▂▁▃▁▂▄▇▂▂

0,1
epoch,23.0
train_loss_epoch,1.0733
train_loss_step,1.05442
trainer/global_step,7847.0
val_loss,1.23353


In [39]:
data_list = ["f", "rf"]
for data, tl in zip(data_list, test_loader):
    preds_list = []
    for path in os.listdir(SAVEPATH):
        if path.endswith(".ckpt"):
            print(f"[INFO] Loading model from {path}")
            # Load Model from checkpoint

            multigraph = Multigraph.load_from_checkpoint(
                os.path.join(SAVEPATH, path),
                num_nodes=num_nodes,
                embedding_dim=emb_dim,
                edge_dim=edge_dim,
                in_channels=in_channels,
                hidden_channels_gnn=config['gnn_hidden'],
                out_channels_gnn=config['gnn_hidden'],
                num_layers_gnn=config['gnn_layers'],
                heads=config['heads'],
                hidden_channels_deepset=config['gnn_hidden'],
                optimizer_class=AdamW,
                optimizer_params=dict(lr=config['lr']),
            )
            multigraph.eval()
            batch = next(iter(train_loader))
            batch = batch.to("cuda")
            multigraph.to("cuda")
            multigraph.forward(batch)

            trainer = L.Trainer(log_every_n_steps=1, accelerator="gpu", devices=[1], enable_progress_bar=True)

            ####################################################################################################
            preds = trainer.predict(model=multigraph, dataloaders=[tl])
            preds = torch.cat(preds, dim=0)
            preds_list.append(preds)
            print()
            print(preds.shape)

    targets = dataframes[f"test_{data}"][1]
    targets = torch.tensor(targets.t2m.values) - 273.15

    stacked = torch.stack(preds_list)
    final_preds = torch.mean(stacked, dim=0)

    res = multigraph.loss_fn.crps(final_preds, targets)
    print("#############################################")
    print("#############################################")
    print(f"final crps for {data}: {res.item()}")
    print("#############################################")
    print("#############################################")

    ####################################################################################################
    os.makedirs(RESULTPATH, exist_ok=True)
    print(RESULTPATH)

    df = pd.DataFrame(np.concatenate([targets.view(-1, 1), final_preds], axis=1), columns=["t2m", "mu", "sigma"])
    df.to_csv(os.path.join(RESULTPATH, f"{data}_{graph_name}_{leadtime}_results.csv"), index=False)

    # Create Log File ###############################################################
    log_file = os.path.join(RESULTPATH, f"{data}.txt")
    with open(log_file, "w") as f:
        f.write(f"Data: {data}\n")
        f.write(f"Leadtime: {leadtime}\n")
        f.write(f"Final crps: {res.item()}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


[INFO] Loading model from g5_120h_train_run8.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 142.95it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_120h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 134.21it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_120h_train_run1.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 154.42it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_120h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 143.13it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_120h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 142.72it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_120h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 138.16it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_120h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 147.68it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_120h_train_run5.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 144.76it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_120h_train_run7.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 140.61it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_120h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 153.17it/s]

torch.Size([87600, 2])
#############################################
#############################################
final crps for f: 1.102037799315447
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


[INFO] Loading model from g5_120h_train_run8.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 148.97it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_120h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 144.06it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_120h_train_run1.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 153.73it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_120h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 146.35it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_120h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 144.09it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_120h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 134.91it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_120h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 142.20it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_120h_train_run5.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 140.89it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_120h_train_run7.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 134.31it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_120h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 138.41it/s]

torch.Size([87840, 2])
#############################################
#############################################
final crps for rf: 1.1905915546993322
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_120h/g5_120h/


- SAVEPATH for model saving
- JSONPATH for parameters
- RESULTPATH for test results (f.txt, f_results.csv, rf.txt., rf_results.csv)

# 24h Leadtime Graphs

In [3]:
leadtime = "24h"

sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
DIRECTORY = os.getcwd()
JSONPATH = os.path.join(DIRECTORY, f"trained_models/no_ensemble_{leadtime}/params.json")
with open(JSONPATH, "r") as f:
    print(f"[INFO] Loading {JSONPATH}")
    args_dict = json.load(f)
config = args_dict

[INFO] Loading /home/ltchen/gnnpp/trained_models/no_ensemble_24h/params.json


In [None]:
'''{"batch_size":8,
"gnn_hidden":256,
"gnn_layers":1,
"heads":8,
"lr":0.0001,
"max_dist":50,
"max_epochs": 23,
"remove_edges": "False",
"only_summary": "True"}'''

In [3]:
dataframes = load_dataframes(leadtime=leadtime)
dataframes = summary_statistics(dataframes)

[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for valid
[INFO] Calculating summary statistics for test_rf
[INFO] Calculating summary statistics for test_f


In [4]:
def get_train_valid_data():
    train_path = 'exploration/graphs/train-AgalloEg100-a10-o4.pt'
    valid_path = 'exploration/graphs/valid-AgalloEg100-a10-o4.pt'
    test_rf_path = 'exploration/graphs/test_rf-AgalloEg100-a10-o4.pt'
    test_f_path = 'exploration/graphs/test_f-AgalloEg100-a10-o4.pt'

    if os.path.exists(train_path) and os.path.exists(valid_path):
        print("Loading precomputed graph data...")
        try:
            train_data = torch.load(train_path)
            valid_data = torch.load(valid_path)
            test_rf = torch.load(test_rf_path)
            test_f = torch.load(test_f_path)
            print("Successfully loaded precomputed data.")
            return train_data, valid_data, test_rf, test_f
        except Exception as e:
            print(f"Error loading precomputed data: {e}")
            print("Falling back to data preparation...")
    else:
        print("Precomputed data not found.")

    print("Preparing data from scratch...")
    train_data, valid_data, test_rf, test_f = prepare_data()
    return train_data, valid_data, test_rf, test_f

def prepare_data():
    leadtime = "24h"
    dataframes = load_dataframes(leadtime=leadtime)
    dataframes = summary_statistics(dataframes)
    graph_name = "g3"
    graphs3_train_rf, tests3 = normalize_features_and_create_graphs1(df_train=dataframes['train'],
                                                                     df_valid_test=[dataframes['valid'], dataframes['test_rf'], dataframes['test_f']],
                                                                     station_df=dataframes['stations'],attributes=["geo", "alt", "lon", "lat","alt-orog"],
                                                                     edges=[("geo", 100), ("alt", 10), ("alt-orog", 4)],
                                                                     sum_stats=True)
    graphs3_valid_rf, graphs3_test_rf, graphs3_test_f = tests3
    os.makedirs('exploration/graphs', exist_ok=True)
    torch.save(graphs3_train_rf, 'exploration/graphs/train-AgalloEg100-a10-o4.pt')
    torch.save(graphs3_valid_rf, 'exploration/graphs/valid-AgalloEg100-a10-o4.pt')
    torch.save(graphs3_test_rf, 'exploration/graphs/test_rf-AgalloEg100-a10-o4.pt')
    torch.save(graphs3_test_f, 'exploration/graphs/test_f-AgalloEg100-a10-o4.pt')
    return graphs3_train_rf, graphs3_valid_rf, graphs3_test_rf, graphs3_test_f

In [19]:
prepare_data()

[INFO] Normalizing features...
fit_transform
transform 1


100%|██████████| 2612/2612 [00:11<00:00, 220.52it/s]
100%|██████████| 836/836 [00:03<00:00, 276.01it/s]


In [33]:
train, valid, testrf, testf = get_train_valid_data()
print(valid[0])

Loading precomputed graph data...


  train_data = torch.load(train_path)
  valid_data = torch.load(valid_path)
  test_rf = torch.load(test_rf_path)


Successfully loaded precomputed data.
Data(x=[120, 65], edge_index=[2, 2626], edge_attr=[2626, 5], y=[120], timestamp=2010-01-01 00:00:00, n_idx=[120])


  test_f = torch.load(test_f_path)


## Graph 1

In [5]:
graph_name = "g1"
SAVEPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}/models")
RESULTPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}")

graphs1_train_rf, tests1 = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['valid'], dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["geo"], edges=[("geo", 50)], sum_stats = True)
graphs1_valid_rf, graphs1_test_rf, graphs1_test_f = tests1

g1_train_loader = DataLoader(graphs1_train_rf, batch_size=config['batch_size'], shuffle=True)
g1_valid_loader = DataLoader(graphs1_valid_rf, batch_size=config['batch_size'], shuffle=False)
g1_test_f_loader = DataLoader(graphs1_test_f, batch_size=config['batch_size'], shuffle=False)
g1_test_rf_loader = DataLoader(graphs1_test_rf, batch_size=config['batch_size'], shuffle=False)

train_loader = g1_train_loader
valid_loader = g1_valid_loader
test_f_loader = g1_test_f_loader
test_rf_loader = g1_test_rf_loader
test_loader = [test_f_loader, test_rf_loader]

emb_dim = 20
in_channels = graphs1_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = graphs1_train_rf[0].num_edge_features
num_nodes = graphs1_train_rf[0].num_nodes
# max_epochs = max_epoch_list[graph_name]
max_epochs = 100


# embedding_dim = emb_dim
# in_channels = in_channels
# hidden_channels_gnn = config['gnn_hidden']
# out_channels_gnn = config['gnn_hidden']
# num_layers_gnn = config['gnn_hidden']
# heads = config['heads']
# hidden_channels_deepset = config['gnn_hidden']
# optimizer_class = AdamW
# optimizer_params = dict(lr=config['lr'])

[INFO] Normalizing features...
fit_transform
transform 1
transform 2
transform 3


100%|██████████| 2612/2612 [00:11<00:00, 229.52it/s]
100%|██████████| 836/836 [00:03<00:00, 255.32it/s]
100%|██████████| 732/732 [00:02<00:00, 266.51it/s]
100%|██████████| 730/730 [00:02<00:00, 248.84it/s]


In [6]:
print(SAVEPATH)
print(RESULTPATH)

/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h/models
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h


In [1]:
import wandb
if wandb.run is None:
    wandb.init(project="your-project-name")

[34m[1mwandb[0m: Currently logged in as: [33mleachen01[0m ([33mleachen01-karlsruhe-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
PROJECTNAME = "gnn_run8"

for i in range(3, 10):
    # FILENAME = graph_name + f"run{i}_" + leadtime
    TRAINNAME = f"{graph_name}_{leadtime}_train_run{i}"

    with wandb.init(
            project=PROJECTNAME, id=TRAINNAME, config=args_dict, tags=["final"], resume="never"
    ):
        config = wandb.config

        multigraph = Multigraph(
            num_nodes=num_nodes, #
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=config['gnn_hidden'],
            out_channels_gnn=config['gnn_hidden'],
            num_layers_gnn=config['gnn_layers'],
            heads=config['heads'],
            hidden_channels_deepset=config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=config['lr']),
        )
        torch.compile(multigraph)
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        checkpoint_callback = ModelCheckpoint(
            dirpath=SAVEPATH, filename=TRAINNAME, monitor="val_loss", mode="min", save_top_k=1
        )

        trainer = L.Trainer(
                max_epochs=max_epochs,
                log_every_n_steps=1,
                accelerator="gpu",
                devices = [1],
                enable_progress_bar=True,
                logger=wandb_logger,
                callbacks=[early_stop, progress_bar, checkpoint_callback],
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

[34m[1mwandb[0m: Currently logged in as: [33mleachen01[0m ([33mleachen01-karlsruhe-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h/models exists and is not empty.
LOCAL_

0,1
epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇████
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▅▆▇▃▄▅▄█▄▅▄▅▄▃▅▄▃▅▆▂▅▄▅▄▄▄▅▃▃▃▃▃▃▄▅▁▄▂▄▁
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
val_loss,█▄▄▃▃▃▂▃▂▂▂▁▁▂▁▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▂▁▂▂▁▁▁▁

0,1
epoch,38.0
train_loss_epoch,0.55642
train_loss_step,0.4572
trainer/global_step,12752.0
val_loss,0.65584


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 878 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▇▇▇▇▇▇▇███
train_loss_epoch,█▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▇▅▇▅▇▇▆▅▄▃▅▄▄▄▄▄▃▂▃▃▃▃▃▃▃▄▃▄▁▃▄▃▂▂▃▃▂▃
trainer/global_step,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇███
val_loss,█▄▃▃▃▂▂▂▂▃▂▁▂▁▂▂▁▁▁▂▁▁▂▂▁▁▁▁▂▁▂▁▁▂▁

0,1
epoch,34.0
train_loss_epoch,0.56764
train_loss_step,0.58707
trainer/global_step,11444.0
val_loss,0.65889


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 878 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇█
train_loss_epoch,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,██▃▅▄▄▅▄▄▃▅▂▃▃▃▂▃▃▃▄▃▃▂▂▂▃▃▂▃▁▂▂▂▁▃▂▂▃▂▃
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█
val_loss,█▅▄▃▃▂▂▂▂▂▂▂▂▁▂▁▂▁▁▂▁▂▂▁▂▁▂▁▁▁▂▁▁▁

0,1
epoch,33.0
train_loss_epoch,0.56943
train_loss_step,0.53146
trainer/global_step,11117.0
val_loss,0.65399


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 878 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▆▄▆▅▄▃▄▃▃▃▅▂▄▄▂▃▃▃▄▃▃▂▃▃▃▂▂▄▃▃▁▃▃▂▂▂▂▁
trainer/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇█████
val_loss,█▅▃▃▃▃▂▂▂▂▂▂▂▂▂▁▃▁▁▂▂▂▁▁▁▁▂▁▁▂▁▁▁▁▂▁▁▂▂▁

0,1
epoch,44.0
train_loss_epoch,0.54875
train_loss_step,0.41778
trainer/global_step,14714.0
val_loss,0.66027


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 878 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▄▆▄▆▅▃▃▄▂▂▁▃▂▂▂▃▂▃▂▃▃▃▂▃▄▃▃▃▄▂▂▂▁▃▂▂▃▃
trainer/global_step,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▅▄▃▃▃▂▂▂▂▂▂▂▃▁▂▁▂▂▂▂▂▁▁▃▂▂▂▂▂▁▁▁▂▂▂▁▁▂▂

0,1
epoch,45.0
train_loss_epoch,0.54721
train_loss_step,0.46216
trainer/global_step,15041.0
val_loss,0.66592


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 878 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇████████
train_loss_epoch,█▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▇██▄▆▃▃▃▅▆▅▂▁▃▇▃▆▃▃▅▅▁▃▄▄▅▂▃▄▂▄▂▃▂▄▄▁▅▃▂
trainer/global_step,▁▁▁▁▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██
val_loss,█▅▄▃▃▃▂▂▂▂▂▂▂▁▃▁▂▂▁▁▁▂▁▁▁▂▁▁▁▂▁▂▂▁

0,1
epoch,33.0
train_loss_epoch,0.57732
train_loss_step,0.48267
trainer/global_step,11117.0
val_loss,0.66059


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 878 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▇█▃▅▃▃▃▂▂▃▃▄▅▂▃▂▄▄▂▃▄▂▂▂▃▄▃▂▁▂▂▃▂▄▂▃▂▃▂▁
trainer/global_step,▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇████
val_loss,█▄▃▃▃▂▂▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▂▁▁▂▁▁▁▁▁▁▂▁▂▂▂▁▂▂

0,1
epoch,40.0
train_loss_epoch,0.55405
train_loss_step,0.57807
trainer/global_step,13406.0
val_loss,0.66599


In [8]:
data_list = ["f", "rf"]
for data, tl in zip(data_list, test_loader):
    preds_list = []
    for path in os.listdir(SAVEPATH):
        if path.endswith(".ckpt"):
            print(f"[INFO] Loading model from {path}")
            # Load Model from checkpoint

            multigraph = Multigraph.load_from_checkpoint(
                os.path.join(SAVEPATH, path),
                num_nodes=num_nodes,
                embedding_dim=emb_dim,
                edge_dim=edge_dim,
                in_channels=in_channels,
                hidden_channels_gnn=config['gnn_hidden'],
                out_channels_gnn=config['gnn_hidden'],
                num_layers_gnn=config['gnn_layers'],
                heads=config['heads'],
                hidden_channels_deepset=config['gnn_hidden'],
                optimizer_class=AdamW,
                optimizer_params=dict(lr=config['lr']),
            )
            multigraph.eval()
            batch = next(iter(train_loader))
            batch = batch.to("cuda")
            multigraph.to("cuda")
            multigraph.forward(batch)

            trainer = L.Trainer(log_every_n_steps=1, accelerator="gpu", devices=[1], enable_progress_bar=True)

            ####################################################################################################
            preds = trainer.predict(model=multigraph, dataloaders=[tl])
            preds = torch.cat(preds, dim=0)
            preds_list.append(preds)
            print()
            print(preds.shape)

    targets = dataframes[f"test_{data}"][1]
    targets = torch.tensor(targets.t2m.values) - 273.15

    stacked = torch.stack(preds_list)
    final_preds = torch.mean(stacked, dim=0)

    res = multigraph.loss_fn.crps(final_preds, targets)
    print("#############################################")
    print("#############################################")
    print(f"final crps: {res.item()}")
    print("#############################################")
    print("#############################################")

    ####################################################################################################
    os.makedirs(RESULTPATH, exist_ok=True)
    print(RESULTPATH)

    df = pd.DataFrame(np.concatenate([targets.view(-1, 1), final_preds], axis=1), columns=["t2m", "mu", "sigma"])
    df.to_csv(os.path.join(RESULTPATH, f"{data}_{graph_name}_{leadtime}_results.csv"), index=False)

    # Create Log File ###############################################################
    log_file = os.path.join(RESULTPATH, f"{data}.txt")
    with open(log_file, "w") as f:
        f.write(f"Data: {data}\n")
        f.write(f"Leadtime: {leadtime}\n")
        f.write(f"Final crps: {res.item()}")


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


[INFO] Loading model from g1_24h_train_run7.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 162.05it/s]

torch.Size([87600, 2])
[INFO] Loading model from g1_24h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 176.56it/s]

torch.Size([87600, 2])
[INFO] Loading model from g1_24h_train_run5.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 166.80it/s]

torch.Size([87600, 2])
[INFO] Loading model from g1_24h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 159.20it/s]

torch.Size([87600, 2])
[INFO] Loading model from g1_24h_train_run1.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 177.13it/s]

torch.Size([87600, 2])
[INFO] Loading model from g1_24h_train_run8.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 185.24it/s]

torch.Size([87600, 2])
[INFO] Loading model from g1_24h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 159.88it/s]

torch.Size([87600, 2])
[INFO] Loading model from g1_24h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 163.81it/s]

torch.Size([87600, 2])
[INFO] Loading model from g1_24h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 142.15it/s]

torch.Size([87600, 2])
[INFO] Loading model from g1_24h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 175.77it/s]

torch.Size([87600, 2])
#############################################
#############################################
final crps: 0.6149606924107296
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


[INFO] Loading model from g1_24h_train_run7.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 187.91it/s]

torch.Size([87840, 2])
[INFO] Loading model from g1_24h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 164.69it/s]

torch.Size([87840, 2])
[INFO] Loading model from g1_24h_train_run5.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 176.73it/s]

torch.Size([87840, 2])
[INFO] Loading model from g1_24h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 163.03it/s]

torch.Size([87840, 2])
[INFO] Loading model from g1_24h_train_run1.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 150.49it/s]

torch.Size([87840, 2])
[INFO] Loading model from g1_24h_train_run8.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 191.06it/s]

torch.Size([87840, 2])
[INFO] Loading model from g1_24h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 170.65it/s]

torch.Size([87840, 2])
[INFO] Loading model from g1_24h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 186.23it/s]

torch.Size([87840, 2])
[INFO] Loading model from g1_24h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 166.75it/s]

torch.Size([87840, 2])
[INFO] Loading model from g1_24h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 164.54it/s]

torch.Size([87840, 2])
#############################################
#############################################
final crps: 0.6189686617958247
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h


In [9]:
RESULTPATH

'/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g1_24h'

## Graph 2

In [9]:
graph_name = "g2"
SAVEPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}/models")
RESULTPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}")

graphs2_train_rf, tests2 = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['valid'], dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["geo", "alt", "lon", "lat", "alt-orog"], edges=[("geo", 50)], sum_stats = True)
graphs2_valid_rf, graphs2_test_rf, graphs2_test_f = tests2

g2_train_loader = DataLoader(graphs2_train_rf, batch_size=config['batch_size'], shuffle=True)
g2_valid_loader = DataLoader(graphs2_valid_rf, batch_size=config['batch_size'], shuffle=False)
g2_test_f_loader = DataLoader(graphs2_test_f, batch_size=config['batch_size'], shuffle=False)
g2_test_rf_loader = DataLoader(graphs2_test_rf, batch_size=config['batch_size'], shuffle=False)

train_loader = g2_train_loader
valid_loader = g2_valid_loader
test_f_loader = g2_test_f_loader
test_rf_loader = g2_test_rf_loader
test_loader = [test_f_loader, test_rf_loader]

emb_dim = 20
in_channels = graphs2_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = graphs2_train_rf[0].num_edge_features
num_nodes = graphs2_train_rf[0].num_nodes
# max_epochs = max_epoch_list[graph_name]
max_epochs = 100

# embedding_dim = emb_dim
# in_channels = in_channels
# hidden_channels_gnn = config['gnn_hidden']
# out_channels_gnn = config['gnn_hidden']
# num_layers_gnn = config['gnn_hidden']
# heads = config['heads']
# hidden_channels_deepset = config['gnn_hidden']
# optimizer_class = AdamW
# optimizer_params = dict(lr=config['lr'])

[INFO] Normalizing features...
fit_transform
transform 1
transform 2
transform 3


100%|██████████| 2612/2612 [00:11<00:00, 222.63it/s]
100%|██████████| 836/836 [00:03<00:00, 271.77it/s]
100%|██████████| 732/732 [00:02<00:00, 264.64it/s]
100%|██████████| 730/730 [00:02<00:00, 264.70it/s]


In [10]:
PROJECTNAME = "gnn_run8"

for i in range(3, 10):
    TRAINNAME = f"{graph_name}_{leadtime}_train_run{i}"

    with wandb.init(
            project=PROJECTNAME, id=TRAINNAME, config=args_dict, tags=["final"], resume="never"
    ):
        config = wandb.config

        multigraph = Multigraph(
            num_nodes=num_nodes,  #
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=config['gnn_hidden'],
            out_channels_gnn=config['gnn_hidden'],
            num_layers_gnn=config['gnn_layers'],
            heads=config['heads'],
            hidden_channels_deepset=config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=config['lr']),
        )
        torch.compile(multigraph)
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        checkpoint_callback = ModelCheckpoint(
            dirpath=SAVEPATH, filename=TRAINNAME, monitor="val_loss", mode="min", save_top_k=1
        )

        trainer = L.Trainer(
            max_epochs=max_epochs,
            log_every_n_steps=1,
            accelerator="gpu",
            devices=[1],
            enable_progress_bar=True,
            logger=wandb_logger,
            callbacks=[early_stop, progress_bar, checkpoint_callback],
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g2_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 887 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▃▃▃▂▃▂▂▃▂▂▁▃▃▂▂▂▁▂▂▁▂▂▂▂▂▁▁▂▁▂▁▂▁▂▂▂▂▁▁
trainer/global_step,▁▁▁▁▁▂▂▂▂▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇██
val_loss,█▆▄▃▃▂▂▂▂▂▂▂▃▄▁▁▁▁▂▂▁▁▂▁▁▁▁▂▁▂▁▁▁▁▂▁▁▁▁▂

0,1
epoch,41.0
train_loss_epoch,0.55585
train_loss_step,0.55841
trainer/global_step,13733.0
val_loss,0.66641


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g2_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 887 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇█
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▃▃▂▂▂▂▂▂▂▂▁▂▂▂▃▂▁▂▂▂▁▁▁▂▁▁▂▂▂▁▂▂▂▁▂▂▁▁
trainer/global_step,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
val_loss,█▅▄▄▃▂▂▂▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▂▂▁▂▁▁▂▂▁▂▂▁▁▂▂

0,1
epoch,41.0
train_loss_epoch,0.55261
train_loss_step,0.59485
trainer/global_step,13733.0
val_loss,0.66614


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g2_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 887 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▇▇▅▄▄▄▂▄▅▃▄▇▆▅▃█▃▃▄▃▂▃▅▄▃▄▂▄▁▃▂▁▁▃▁▄▄▂▂▃
trainer/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
val_loss,█▅▃▃▃▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▃▁▁▁▁▂

0,1
epoch,35.0
train_loss_epoch,0.569
train_loss_step,0.60744
trainer/global_step,11771.0
val_loss,0.66909


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g2_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 887 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▃▄▃▃▂▄▃▃▂▃▂▃▃▂▃▃▄▁▄▂▂▃▂▄▃▄▃▂▄▂▁▂▂▂▂▂▂▂
trainer/global_step,▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
val_loss,█▆▄▃▃▂▂▂▂▂▂▂▁▂▂▂▁▂▁▁▂▁▁▁▁▁▂▁▁▁▁▁▂▁▁▁▂▁▃▁

0,1
epoch,42.0
train_loss_epoch,0.54812
train_loss_step,0.50321
trainer/global_step,14060.0
val_loss,0.66395


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g2_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 887 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▆█▅▆▃▂█▅▃▁▅▂▃▅▄▄▃▄▃▂▅▃▃▄▄▂▆▅▄▄▂▃▅▂▄▄▁▃
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇██
val_loss,█▅▅▃▃▂▂▂▂▂▂▂▁▁▁▁▂▁▁▂▂▁▁▁▁▁▁▁▁▂▂▁▁▁▁▂▁

0,1
epoch,36.0
train_loss_epoch,0.5662
train_loss_step,0.48256
trainer/global_step,12098.0
val_loss,0.65238


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g2_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 887 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▅█▅▅▅▃▂▆▄▅▆▄▄▄▃▃▄▂▂▄▆▃▄▅▄▂▃▄▆▁▂▄▃▃▃▄▄▂▂▂
trainer/global_step,▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
val_loss,█▅▄▄▃▃▂▂▂▃▂▃▂▃▁▁▂▂▂▂▁▁▁▁▁▁▁▁▁▁▂▁

0,1
epoch,31.0
train_loss_epoch,0.57556
train_loss_step,0.54007
trainer/global_step,10463.0
val_loss,0.65901


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g2_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 887 K  | train
2 | aggr        | DeepSetAggregator | 197 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▃▃▂▂▂▂▂▂▂▂▂▂▁▁▂▂▂▂▃▃▂▁▂▁▂▂▁▁▁▂▂▁▂▂▁▂▂▂▂
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇███
val_loss,█▅▄▃▃▂▃▂▂▂▂▃▂▁▁▂▂▁▁▁▁▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▂▁▁

0,1
epoch,42.0
train_loss_epoch,0.5504
train_loss_step,0.54106
trainer/global_step,14060.0
val_loss,0.66237


In [12]:
data_list = ["f", "rf"]
for data, tl in zip(data_list, test_loader):
    preds_list = []
    for path in os.listdir(SAVEPATH):
        if path.endswith(".ckpt"):
            print(f"[INFO] Loading model from {path}")
            # Load Model from checkpoint

            multigraph = Multigraph.load_from_checkpoint(
                os.path.join(SAVEPATH, path),
                num_nodes=num_nodes,
                embedding_dim=emb_dim,
                edge_dim=edge_dim,
                in_channels=in_channels,
                hidden_channels_gnn=config['gnn_hidden'],
                out_channels_gnn=config['gnn_hidden'],
                num_layers_gnn=config['gnn_layers'],
                heads=config['heads'],
                hidden_channels_deepset=config['gnn_hidden'],
                optimizer_class=AdamW,
                optimizer_params=dict(lr=config['lr']),
            )
            multigraph.eval()
            batch = next(iter(train_loader))
            batch = batch.to("cuda")
            multigraph.to("cuda")
            multigraph.forward(batch)

            trainer = L.Trainer(log_every_n_steps=1, accelerator="gpu", devices=[1], enable_progress_bar=True)

            ####################################################################################################
            preds = trainer.predict(model=multigraph, dataloaders=[tl])
            preds = torch.cat(preds, dim=0)
            preds_list.append(preds)
            print()
            print(preds.shape)

    targets = dataframes[f"test_{data}"][1]
    targets = torch.tensor(targets.t2m.values) - 273.15

    stacked = torch.stack(preds_list)
    final_preds = torch.mean(stacked, dim=0)

    res = multigraph.loss_fn.crps(final_preds, targets)
    print("#############################################")
    print("#############################################")
    print(f"final crps: {res.item()}")
    print("#############################################")
    print("#############################################")

    ####################################################################################################
    os.makedirs(RESULTPATH, exist_ok=True)
    print(RESULTPATH)

    df = pd.DataFrame(np.concatenate([targets.view(-1, 1), final_preds], axis=1), columns=["t2m", "mu", "sigma"])
    df.to_csv(os.path.join(RESULTPATH, f"{data}_{graph_name}_{leadtime}_results.csv"), index=False)

    # Create Log File ###############################################################
    log_file = os.path.join(RESULTPATH, f"{data}.txt")
    with open(log_file, "w") as f:
        f.write(f"Data: {data}\n")
        f.write(f"Leadtime: {leadtime}\n")
        f.write(f"Final crps: {res.item()}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


[INFO] Loading model from g2_24h_train_run1.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 173.98it/s]

torch.Size([87600, 2])
[INFO] Loading model from g2_24h_train_run8.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 184.12it/s]

torch.Size([87600, 2])
[INFO] Loading model from g2_24h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 164.77it/s]

torch.Size([87600, 2])
[INFO] Loading model from g2_24h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 165.32it/s]

torch.Size([87600, 2])
[INFO] Loading model from g2_24h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 157.18it/s]

torch.Size([87600, 2])
[INFO] Loading model from g2_24h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 153.78it/s]

torch.Size([87600, 2])
[INFO] Loading model from g2_24h_train_run7.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 171.07it/s]

torch.Size([87600, 2])
[INFO] Loading model from g2_24h_train_run5.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 158.34it/s]

torch.Size([87600, 2])
[INFO] Loading model from g2_24h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 144.33it/s]

torch.Size([87600, 2])
[INFO] Loading model from g2_24h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 160.20it/s]

torch.Size([87600, 2])
#############################################
#############################################
final crps: 0.6126827844698789
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g2_24h


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


[INFO] Loading model from g2_24h_train_run1.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 153.28it/s]

torch.Size([87840, 2])
[INFO] Loading model from g2_24h_train_run8.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 153.28it/s]

torch.Size([87840, 2])
[INFO] Loading model from g2_24h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 170.70it/s]

torch.Size([87840, 2])
[INFO] Loading model from g2_24h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 160.78it/s]

torch.Size([87840, 2])
[INFO] Loading model from g2_24h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 168.57it/s]

torch.Size([87840, 2])
[INFO] Loading model from g2_24h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 162.60it/s]

torch.Size([87840, 2])
[INFO] Loading model from g2_24h_train_run7.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 147.87it/s]

torch.Size([87840, 2])
[INFO] Loading model from g2_24h_train_run5.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 177.91it/s]

torch.Size([87840, 2])
[INFO] Loading model from g2_24h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 197.10it/s]

torch.Size([87840, 2])
[INFO] Loading model from g2_24h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 168.35it/s]

torch.Size([87840, 2])
#############################################
#############################################
final crps: 0.6193296252351883
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g2_24h


## Graph 3

In [17]:
graph_name = "g3"
SAVEPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}/models")
RESULTPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}")

PARAMS = os.path.join(RESULTPATH, "params.json")
with open(PARAMS, "r") as f:
    print(f"[INFO] Loading {PARAMS}")
    args_dict = json.load(f)
config_g3 = args_dict

[INFO] Loading /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h/params.json


In [18]:
# same number of edges for each attribute as Graph 1 (350 x 3)
graphs3_train_rf, tests3 = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['valid'], dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["geo", "alt", "lon", "lat", "alt-orog"], edges=[("geo", 50), ("alt", 4),("alt-orog", 1.5)], sum_stats = True)
graphs3_valid_rf, graphs3_test_rf, graphs3_test_f = tests3

g3_train_loader = DataLoader(graphs3_train_rf, batch_size=config_g3['batch_size'], shuffle=True)
g3_valid_loader = DataLoader(graphs3_valid_rf, batch_size=config_g3['batch_size'], shuffle=False)
g3_test_f_loader = DataLoader(graphs3_test_f, batch_size=config_g3['batch_size'], shuffle=False)
g3_test_rf_loader = DataLoader(graphs3_test_rf, batch_size=config_g3['batch_size'], shuffle=False)

train_loader = g3_train_loader
valid_loader = g3_valid_loader
test_f_loader = g3_test_f_loader
test_rf_loader = g3_test_rf_loader
test_loader = [test_f_loader, test_rf_loader]

emb_dim = 20
in_channels = graphs3_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = graphs3_train_rf[0].num_edge_features
num_nodes = graphs3_train_rf[0].num_nodes
# max_epochs = max_epoch_list[graph_name]
max_epochs = 100

facts_about(graphs3_train_rf[0])

[INFO] Normalizing features...
fit_transform
transform 1
transform 2
transform 3


100%|██████████| 2612/2612 [00:12<00:00, 215.22it/s]
100%|██████████| 836/836 [00:03<00:00, 277.40it/s]
100%|██████████| 732/732 [00:02<00:00, 275.84it/s]
100%|██████████| 730/730 [00:02<00:00, 271.32it/s]

Number of nodes: 120 with feature dimension of x: 65
Number of isolated nodes: 4
Number of edges: 1054 with edge dimension: 5
Average node degree: 8.783333778381348





In [19]:
PROJECTNAME = "gnn_run8"

for i in range(0, 10):
    TRAINNAME = f"{graph_name}_{leadtime}_train_run{i}"

    with wandb.init(
            project=PROJECTNAME, id=TRAINNAME, config=args_dict, tags=["final"], resume="never"
    ):
        config = wandb.config

        multigraph = Multigraph(
            num_nodes=num_nodes,  #
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=config['gnn_hidden'],
            out_channels_gnn=config['gnn_hidden'],
            num_layers_gnn=config['gnn_layers'],
            heads=config['heads'],
            hidden_channels_deepset=config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=config['lr']),
        )
        torch.compile(multigraph)
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        checkpoint_callback = ModelCheckpoint(
            dirpath=SAVEPATH, filename=TRAINNAME, monitor="val_loss", mode="min", save_top_k=1
        )

        trainer = L.Trainer(
            max_epochs=max_epochs,
            log_every_n_steps=1,
            accelerator="gpu",
            devices=[1],
            enable_progress_bar=True,
            logger=wandb_logger,
            callbacks=[early_stop, progress_bar, checkpoint_callback],
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 123 K  | train
2 | aggr        | DeepSetAggregator | 12.6 K | train
3 | postprocess | MakePositive      | 0      | train
4 | loss_fn     | NormalCRPS        | 0      | train
----------------------------------------------------------
138 K     Trainable params
0         Non-trainable params
138 K     Total params
0.554     Total estimated mode

0,1
epoch,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇███
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▇██▄▆▄▃▅▆▃▃▂▄▄▄▅▅▂▄▅▄▂▂▃▃▃▁▅▄▂▆▃▄▂▄▂▃▂▁▂
trainer/global_step,▁▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇█
val_loss,█▆▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▃▂▂▃▂▁▃▁▃▁▁▁▂▂▃▂▂▂▂▁

0,1
epoch,38.0
train_loss_epoch,0.58041
train_loss_step,0.49402
trainer/global_step,12752.0
val_loss,0.66308


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 123 K  | train
2 | aggr        | DeepSetAggregator | 12.6 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▇█▅▇▅▆▅▅▆▆▃▃▃▄▂▃▂▃▄▃▂▄▄▂▄▂▂▂▃▃▂▃▂▂▁▃▅▂▂▂
trainer/global_step,▁▁▁▁▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇██
val_loss,█▅▄▃▃▃▂▂▂▂▂▂▁▂▂▂▁▂▂▁▁▁▁▁▁▁▂▁▁▂▁▁▂▁▂▃▁▂▁▂

0,1
epoch,44.0
train_loss_epoch,0.56889
train_loss_step,0.53033
trainer/global_step,14714.0
val_loss,0.6685


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 123 K  | train
2 | aggr        | DeepSetAggregator | 12.6 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▆▅▄▅▅▅█▃▆▅▄▃▄▃▃▃▄▅▃▂▃▃▃▃▂▄▃▁▃▂▂▇▂▃▄▂▃▃▂▄
trainer/global_step,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
val_loss,█▅▄▄▂▂▂▂▂▂▁▂▂▂▂▃▃▄▁▂▂▁▁▁▂▁▂▁▂▁▁▁▁▁▁▁▁▃

0,1
epoch,37.0
train_loss_epoch,0.58041
train_loss_step,0.62528
trainer/global_step,12425.0
val_loss,0.7054


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 123 K  | train
2 | aggr        | DeepSetAggregator | 12.6 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇█
train_loss_epoch,█▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇████
val_loss,█▄▃▃▂▂▂▂▂▁▁▁▁▂▂▁▁▁▄▁▁▁▁▁▁▁▁

0,1
epoch,26.0
train_loss_epoch,0.60244
train_loss_step,0.62495
trainer/global_step,8828.0
val_loss,0.66424


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 123 K  | train
2 | aggr        | DeepSetAggregator | 12.6 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,██▆▄▆▇▅▄▆▃▅▃▄▅▆▃▂▄▂▂▂▂▃▂▁▂▃▃▂▂▄▂▃▃▂▁▂▂▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇█████
val_loss,█▅▄▄▂▃▃▂▂▂▃▁▁▁▂▂▂▂▂▁▂▂▁▁▁▂▁▃▁▂▂▁▁▁▂▁▁▂▁

0,1
epoch,38.0
train_loss_epoch,0.57957
train_loss_step,0.52747
trainer/global_step,12752.0
val_loss,0.6613


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 123 K  | train
2 | aggr        | DeepSetAggregator | 12.6 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▆█▄▄▅▄▄▄▆▃▃▅▄▄▂▄▃▂▂▃▃▂▃▃▂▂▄▃▃▂▃▂▁▂▃▃▁▂▂▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇████
val_loss,█▅▄▃▃▃▂▂▂▃▂▃▂▃▂▂▁▂▂▂▂▁▃▁▂▃▂▁▂▁▁▁▂▂▂▁▂▂▁▁

0,1
epoch,42.0
train_loss_epoch,0.57455
train_loss_step,0.56505
trainer/global_step,14060.0
val_loss,0.65699


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 123 K  | train
2 | aggr        | DeepSetAggregator | 12.6 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█
train_loss_epoch,█▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▂▁▁▁▁▁▂▁▁▁▁▁▁▂▁▁▁▁▂
trainer/global_step,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█████
val_loss,█▅▄▃▃▄▂▂▂▂▁▁▃▂▁▁▁▁▁▁▂▁▁▁▂▁▁▁▁▂▂▂▁

0,1
epoch,32.0
train_loss_epoch,0.59028
train_loss_step,0.61536
trainer/global_step,10790.0
val_loss,0.66128


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 123 K  | train
2 | aggr        | DeepSetAggregator | 12.6 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▇▆▆▄▄▃▄▅▄▄▃▅▂▇▂▅▄▂▅▄▂▄▂▂▂▄▄▂▄▂▂▁▄▃▂▂▃▄
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▇▇▇▇█████
val_loss,██▅▃▃▃▂▂▃▂▁▁▁▂▁▁▂▂▄▂▁▁▂▃▂▁▂▁▂▁▁▁▁▂▂▁▂▁▁▂

0,1
epoch,41.0
train_loss_epoch,0.57462
train_loss_step,0.60774
trainer/global_step,13733.0
val_loss,0.69146


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 123 K  | train
2 | aggr        | DeepSetAggregator | 12.6 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇███
train_loss_epoch,█▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▄▄▄▄▅▃▃▂▃▃▃▂▄▂▂▄▄▃▁▃▃▃▂▂▁▂▂▂▂▂▂▄▂▃▂▂▂▁
trainer/global_step,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇██
val_loss,█▄▄▃▄▂▂▂▂▃▂▂▂▂▂▁▂▃▂▁▁▁▁▁▁▁▁▁▁▂▂▁▂▄

0,1
epoch,33.0
train_loss_epoch,0.59029
train_loss_step,0.51909
trainer/global_step,11117.0
val_loss,0.76348


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 123 K  | train
2 | aggr        | DeepSetAggregator | 12.6 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇████
train_loss_epoch,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,███▅▄▃▃▃▇▄▄▆▃▃▆▄▂▄▇▅▄▅▁▅▃▁▂▃▃▆▆▂▂▂▃▅▃▃▄▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇██
val_loss,█▄▅▃▃▃▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▂▂▁▂▂▁▁▁▁▂▁▂▁▁▂▁▁▁

0,1
epoch,40.0
train_loss_epoch,0.58065
train_loss_step,0.6358
trainer/global_step,13406.0
val_loss,0.65679


In [21]:
data_list = ["f", "rf"]
for data, tl in zip(data_list, test_loader):
    preds_list = []
    for path in os.listdir(SAVEPATH):
        if path.endswith(".ckpt"):
            print(f"[INFO] Loading model from {path}")
            # Load Model from checkpoint

            multigraph = Multigraph.load_from_checkpoint(
                os.path.join(SAVEPATH, path),
                num_nodes=num_nodes,
                embedding_dim=emb_dim,
                edge_dim=edge_dim,
                in_channels=in_channels,
                hidden_channels_gnn=config_g3['gnn_hidden'],
                out_channels_gnn=config_g3['gnn_hidden'],
                num_layers_gnn=config_g3['gnn_layers'],
                heads=config_g3['heads'],
                hidden_channels_deepset=config_g3['gnn_hidden'],
                optimizer_class=AdamW,
                optimizer_params=dict(lr=config_g3['lr']),
            )
            multigraph.eval()
            batch = next(iter(train_loader))
            batch = batch.to("cuda")
            multigraph.to("cuda")
            multigraph.forward(batch)

            trainer = L.Trainer(log_every_n_steps=1, accelerator="gpu", devices=[1], enable_progress_bar=True)

            ####################################################################################################
            preds = trainer.predict(model=multigraph, dataloaders=[tl])
            preds = torch.cat(preds, dim=0)
            preds_list.append(preds)
            print()
            print(preds.shape)

    targets = dataframes[f"test_{data}"][1]
    targets = torch.tensor(targets.t2m.values) - 273.15

    stacked = torch.stack(preds_list)
    final_preds = torch.mean(stacked, dim=0)

    res = multigraph.loss_fn.crps(final_preds, targets)
    print("#############################################")
    print("#############################################")
    print(f"final crps: {res.item()}")
    print("#############################################")
    print("#############################################")

    ####################################################################################################
    os.makedirs(RESULTPATH, exist_ok=True)
    print(RESULTPATH)

    df = pd.DataFrame(np.concatenate([targets.view(-1, 1), final_preds], axis=1), columns=["t2m", "mu", "sigma"])
    df.to_csv(os.path.join(RESULTPATH, f"{data}_{graph_name}_{leadtime}_results.csv"), index=False)

    # Create Log File ###############################################################
    log_file = os.path.join(RESULTPATH, f"{data}.txt")
    with open(log_file, "w") as f:
        f.write(f"Data: {data}\n")
        f.write(f"Leadtime: {leadtime}\n")
        f.write(f"Final crps: {res.item()}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


[INFO] Loading model from g3_24h_train_run5.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 202.35it/s]

torch.Size([87600, 2])
[INFO] Loading model from g3_24h_train_run1.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 149.86it/s]

torch.Size([87600, 2])
[INFO] Loading model from g3_24h_train_run8.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 169.47it/s]

torch.Size([87600, 2])
[INFO] Loading model from g3_24h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 236.60it/s]

torch.Size([87600, 2])
[INFO] Loading model from g3_24h_train_run7.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 202.15it/s]

torch.Size([87600, 2])
[INFO] Loading model from g3_24h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 173.69it/s]

torch.Size([87600, 2])
[INFO] Loading model from g3_24h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 241.58it/s]

torch.Size([87600, 2])
[INFO] Loading model from g3_24h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 203.09it/s]

torch.Size([87600, 2])
[INFO] Loading model from g3_24h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 215.91it/s]

torch.Size([87600, 2])
[INFO] Loading model from g3_24h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 195.53it/s]

torch.Size([87600, 2])
#############################################
#############################################
final crps: 0.6095958010435606
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


[INFO] Loading model from g3_24h_train_run5.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 226.96it/s]

torch.Size([87840, 2])
[INFO] Loading model from g3_24h_train_run1.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 182.14it/s]

torch.Size([87840, 2])
[INFO] Loading model from g3_24h_train_run8.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 217.41it/s]

torch.Size([87840, 2])
[INFO] Loading model from g3_24h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 248.94it/s]

torch.Size([87840, 2])
[INFO] Loading model from g3_24h_train_run7.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 204.32it/s]

torch.Size([87840, 2])
[INFO] Loading model from g3_24h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 183.44it/s]

torch.Size([87840, 2])
[INFO] Loading model from g3_24h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 199.54it/s]

torch.Size([87840, 2])
[INFO] Loading model from g3_24h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 189.43it/s]

torch.Size([87840, 2])
[INFO] Loading model from g3_24h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 160.28it/s]

torch.Size([87840, 2])
[INFO] Loading model from g3_24h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 184.70it/s]

torch.Size([87840, 2])
#############################################
#############################################
final crps: 0.6172392730752438
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g3_24h


## Graph 4

In [11]:
graph_name = "g4"
SAVEPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}/models")
RESULTPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}")

PARAMS = os.path.join(RESULTPATH, "params.json")
with open(PARAMS, "r") as f:
    print(f"[INFO] Loading {PARAMS}")
    args_dict = json.load(f)
config_g4 = args_dict

[INFO] Loading /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/params.json


In [12]:
graph_name = "g4"
SAVEPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}/models")
RESULTPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}")

graphs4_train_rf, tests4 = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['valid'], dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["dist2", "dist3"],edges=[("dist2", 0.003), ("dist3", 0.0074)], sum_stats = True)
graphs4_valid_rf, graphs4_test_rf, graphs4_test_f = tests4

g4_train_loader = DataLoader(graphs4_train_rf, batch_size=config_g4['batch_size'], shuffle=True)
g4_valid_loader = DataLoader(graphs4_valid_rf, batch_size=config_g4['batch_size'], shuffle=False)
g4_test_f_loader = DataLoader(graphs4_test_f, batch_size=config_g4['batch_size'], shuffle=False)
g4_test_rf_loader = DataLoader(graphs4_test_rf, batch_size=config_g4['batch_size'], shuffle=False)

train_loader = g4_train_loader
valid_loader = g4_valid_loader
test_f_loader = g4_test_f_loader
test_rf_loader = g4_test_rf_loader
test_loader = [test_f_loader, test_rf_loader]

emb_dim = 20
in_channels = graphs4_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = graphs4_train_rf[0].num_edge_features
num_nodes = graphs4_train_rf[0].num_nodes
# max_epochs = max_epoch_list[graph_name]
max_epochs = 100

facts_about(graphs4_train_rf[0])


[INFO] Normalizing features...
fit_transform
transform 1
transform 2
transform 3
[INFO] Loading distances from file...
[INFO] Loading distances from file...


100%|██████████| 2612/2612 [00:11<00:00, 220.53it/s]


[INFO] Loading distances from file...
[INFO] Loading distances from file...


100%|██████████| 836/836 [00:03<00:00, 252.60it/s]


[INFO] Loading distances from file...
[INFO] Loading distances from file...


100%|██████████| 732/732 [00:02<00:00, 270.63it/s]


[INFO] Loading distances from file...
[INFO] Loading distances from file...


100%|██████████| 730/730 [00:02<00:00, 279.02it/s]

Number of nodes: 120 with feature dimension of x: 65
Number of isolated nodes: 13
Number of edges: 764 with edge dimension: 2
Average node degree: 6.366666793823242





In [25]:
PROJECTNAME = "gnn_run8"

for i in range(0, 10):
    TRAINNAME = f"{graph_name}_{leadtime}_train_run{i}"

    with wandb.init(
            project=PROJECTNAME, id=TRAINNAME, config=args_dict, tags=["final"], resume="never"
    ):
        config = wandb.config

        multigraph = Multigraph(
            num_nodes=num_nodes,  #
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=config['gnn_hidden'],
            out_channels_gnn=config['gnn_hidden'],
            num_layers_gnn=config['gnn_layers'],
            heads=config['heads'],
            hidden_channels_deepset=config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=config['lr']),
        )
        torch.compile(multigraph)
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        checkpoint_callback = ModelCheckpoint(
            dirpath=SAVEPATH, filename=TRAINNAME, monitor="val_loss", mode="min", save_top_k=1
        )

        trainer = L.Trainer(
            max_epochs=max_epochs,
            log_every_n_steps=1,
            accelerator="gpu",
            devices=[1],
            enable_progress_bar=True,
            logger=wandb_logger,
            callbacks=[early_stop, progress_bar, checkpoint_callback],
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 309 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      | train
4 | loss_fn     | NormalCRPS        | 0      | train
----------------------------------------------------------
361 K     Trainable params
0         Non-trainable params
361 K     Total params
1.446     Total estimated mode

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train_loss_epoch,█▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▂▂▂▂▂▂▁▁▂▁▂▂▁▁▁▁▁▁▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇█
val_loss,█▅▄▃▃▂▃▂▃▂▂▁▁▂▂▂▁▂▁▂▁▁▁▁▁▂▁

0,1
epoch,26.0
train_loss_epoch,0.60099
train_loss_step,0.79984
trainer/global_step,8828.0
val_loss,0.67027


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 309 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇███
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▇▅▆▅▆▆▄▃▇▄▃▃▂▅▅▆▄▃▃▃▃▂▃▄▄▅▃▄▄▂▃▅▃▂▂▄▄▁
trainer/global_step,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
val_loss,█▆▄▃▃▃▅▂▂▅▂▂▁▂▁▁▂▁▂▂▁▂▂▂▁▁▁▂▁▁▁▁▁▂▂▁▁▁▂▁

0,1
epoch,45.0
train_loss_epoch,0.55924
train_loss_step,0.45763
trainer/global_step,15041.0
val_loss,0.68508


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 309 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▇▆▆█▃▅▆▅▆▄▆▇▇▄▄▇▆█▆▅▇▃▄▆▄▅▆▅▄▁▃▃▂▄▂▁▃▅▂
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▆▆▆▇▇▇▇▇▇██████
val_loss,█▅▄▄▄▄▃▃▂▂▂▂▂▁▂▁▁▂▁▂▂▂▁▂▁▃▁▁▁▁▁▂▂▁▂▁▁▁▂▁

0,1
epoch,42.0
train_loss_epoch,0.56601
train_loss_step,0.56625
trainer/global_step,14060.0
val_loss,0.67638


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 309 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇████
train_loss_epoch,█▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▄▃▄▃▃▄▄▃▂▂▃▂▂▃▂▃▃▂▂▃▃▂▂▂▂▂▂▁▂▁▁▃▁▃▁▁▂▂▁
trainer/global_step,▁▁▁▁▁▂▂▂▂▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇█
val_loss,█▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁

0,1
epoch,39.0
train_loss_epoch,0.57053
train_loss_step,0.56717
trainer/global_step,13079.0
val_loss,0.67754


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 309 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▂▂▁▁▁▁▂▁▁▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
val_loss,█▅▄▃▃▂▃▂▂▂▂▂▁▁▂▂▁▂▂▂▂▂▁▁▂▁▂▂▁▃▁▁▁▂▁▁▂▁▂▂

0,1
epoch,47.0
train_loss_epoch,0.55848
train_loss_step,0.52301
trainer/global_step,15695.0
val_loss,0.68118


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 309 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇████
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▂▂▂▂▂▁▂▂▁▂▁▂▁▁▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇██
val_loss,█▅▄▃▂▂▂▂▂▂▂▃▂▁▂▃▂▂▁▁▁▁▁▂▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁

0,1
epoch,41.0
train_loss_epoch,0.57145
train_loss_step,0.53239
trainer/global_step,13733.0
val_loss,0.6708


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 309 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇███
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▅▅▅▆█▄▂▅▆▃▃▄▃▄▄▂▃▅▅▅▃▃▃▃▃▄▃▂▁▂▄▄▃▁▃▃▂▂▂▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█
val_loss,█▅▄▄▃▂▃▂▂▂▂▂▂▁▁▂▁▂▂▁▁▂▂▁▂▁▂▁▁▁▁▁▁▂▁▂▂▂▁▁

0,1
epoch,49.0
train_loss_epoch,0.55444
train_loss_step,0.63022
trainer/global_step,16349.0
val_loss,0.6735


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 309 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▇▅▃▃▄▃▃▃▃▃▃▄▃▂▃▃▂▁▃▂▂▃▃▂▂▄▃▆▁▂▇▂▂▁▂▃▁▂
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
val_loss,█▅▄▃▃▂▂▂▂▂▂▂▂▂▁▃▁▁▂▂▁▁▁▁▁▂▁▂▁▁▁▁▂▁▁▁▁▁▁▁

0,1
epoch,47.0
train_loss_epoch,0.55876
train_loss_step,0.61911
trainer/global_step,15695.0
val_loss,0.67838


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 309 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇██
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▃▃▃▃▃▃▃▂▂▂▄▃▂▄▂▂▁▂▂▂▂▂▂▂▂▃▂▂▂▂▃▁▃▂▂▁▂▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇████
val_loss,█▅▄▄▃▂▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▂▁▂▂▁▁▁▂▁▁▂▁▁▂▁▂

0,1
epoch,39.0
train_loss_epoch,0.5724
train_loss_step,0.56513
trainer/global_step,13079.0
val_loss,0.67773


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 309 K  | train
2 | aggr        | DeepSetAggregator | 49.8 K | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇████
train_loss_epoch,█▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▄▄▄▂▂▂▃▂▁▂▂▂▂▃▂▁▂▂▂▂▁▂▂▁▂▂▁▂▁▃▁▁▁▂▂▂▂▁▂
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
val_loss,█▅▄▅▃▃▃▂▂▂▂▂▁▂▂▂▂▁▁▁▁▂▁▁▂▁▃▁▁▁▁▁▂▁▂▂▂▁▁▂

0,1
epoch,46.0
train_loss_epoch,0.56273
train_loss_step,0.52407
trainer/global_step,15368.0
val_loss,0.70427


In [16]:
SAVEPATH

'/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h/models'

In [18]:
data_list = ["f", "rf"]
for data, tl in zip(data_list, test_loader):
    preds_list = []
    for path in os.listdir(SAVEPATH):
        if path.endswith(".ckpt"):
            print(f"[INFO] Loading model from {path}")
            # Load Model from checkpoint

            multigraph = Multigraph.load_from_checkpoint(
                os.path.join(SAVEPATH, path),
                num_nodes=num_nodes,
                embedding_dim=emb_dim,
                edge_dim=edge_dim,
                in_channels=in_channels,
                hidden_channels_gnn=config_g4['gnn_hidden'],
                out_channels_gnn=config_g4['gnn_hidden'],
                num_layers_gnn=config_g4['gnn_layers'],
                heads=config_g4['heads'],
                hidden_channels_deepset=config_g4['gnn_hidden'],
                optimizer_class=AdamW,
                optimizer_params=dict(lr=config_g4['lr']),
            )
            multigraph.eval()
            batch = next(iter(train_loader))
            batch = batch.to("cuda")
            multigraph.to("cuda")
            multigraph.forward(batch)

            trainer = L.Trainer(log_every_n_steps=1, accelerator="gpu", devices=[1], enable_progress_bar=True)

            ####################################################################################################
            preds = trainer.predict(model=multigraph, dataloaders=[tl])
            preds = torch.cat(preds, dim=0)
            preds_list.append(preds)
            print()
            print(preds.shape)

    targets = dataframes[f"test_{data}"][1]
    targets = torch.tensor(targets.t2m.values) - 273.15

    stacked = torch.stack(preds_list)
    final_preds = torch.mean(stacked, dim=0)

    res = multigraph.loss_fn.crps(final_preds, targets)
    print("#############################################")
    print("#############################################")
    print(f"final crps: {res.item()}")
    print("#############################################")
    print("#############################################")

    ####################################################################################################
    os.makedirs(RESULTPATH, exist_ok=True)
    print(RESULTPATH)

    df = pd.DataFrame(np.concatenate([targets.view(-1, 1), final_preds], axis=1), columns=["t2m", "mu", "sigma"])
    df.to_csv(os.path.join(RESULTPATH, f"{data}_{graph_name}_{leadtime}_results.csv"), index=False)

    # Create Log File ###############################################################
    log_file = os.path.join(RESULTPATH, f"{data}.txt")
    with open(log_file, "w") as f:
        f.write(f"Data: {data}\n")
        f.write(f"Leadtime: {leadtime}\n")
        f.write(f"Final crps: {res.item()}")

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


[INFO] Loading model from g4_24h_train_run7.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 183.76it/s]

torch.Size([87600, 2])
[INFO] Loading model from g4_24h_train_run8.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 169.07it/s]

torch.Size([87600, 2])
[INFO] Loading model from g4_24h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 185.53it/s]

torch.Size([87600, 2])
[INFO] Loading model from g4_24h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 198.52it/s]

torch.Size([87600, 2])
[INFO] Loading model from g4_24h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 163.32it/s]

torch.Size([87600, 2])
[INFO] Loading model from g4_24h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 167.73it/s]

torch.Size([87600, 2])
[INFO] Loading model from g4_24h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 184.72it/s]

torch.Size([87600, 2])
[INFO] Loading model from g4_24h_train_run1.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 152.29it/s]

torch.Size([87600, 2])
[INFO] Loading model from g4_24h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 176.54it/s]

torch.Size([87600, 2])
[INFO] Loading model from g4_24h_train_run5.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 161.01it/s]

torch.Size([87600, 2])
#############################################
#############################################
final crps: 0.610728339804973
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


[INFO] Loading model from g4_24h_train_run7.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 158.52it/s]

torch.Size([87840, 2])
[INFO] Loading model from g4_24h_train_run8.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 178.71it/s]

torch.Size([87840, 2])
[INFO] Loading model from g4_24h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 168.88it/s]

torch.Size([87840, 2])
[INFO] Loading model from g4_24h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 183.87it/s]

torch.Size([87840, 2])
[INFO] Loading model from g4_24h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 149.82it/s]

torch.Size([87840, 2])
[INFO] Loading model from g4_24h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 162.22it/s]

torch.Size([87840, 2])
[INFO] Loading model from g4_24h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 182.34it/s]

torch.Size([87840, 2])
[INFO] Loading model from g4_24h_train_run1.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 174.19it/s]

torch.Size([87840, 2])
[INFO] Loading model from g4_24h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 162.70it/s]

torch.Size([87840, 2])
[INFO] Loading model from g4_24h_train_run5.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 179.16it/s]

torch.Size([87840, 2])
#############################################
#############################################
final crps: 0.6233347052191083
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g4_24h


## Graph 5

In [19]:
graph_name = "g5"
SAVEPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}/models")
RESULTPATH = os.path.join(DIRECTORY, f"leas_trained_models/sum_stats_{leadtime}/{graph_name}_{leadtime}")

PARAMS = os.path.join(RESULTPATH, "params.json")
with open(PARAMS, "r") as f:
    print(f"[INFO] Loading {PARAMS}")
    args_dict = json.load(f)
config_g5 = args_dict

[INFO] Loading /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h/params.json


In [20]:
graphs5_train_rf, tests5 = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['valid'], dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["geo", "alt", "lon", "lat", "alt-orog", "dist2", "dist3"], edges=[("geo", 50),("alt", 4), ("alt-orog", 1.5), ("dist2", 0.003), ("dist3", 0.0074)], sum_stats = True)
graphs5_valid_rf, graphs5_test_rf, graphs5_test_f = tests5

g5_train_loader = DataLoader(graphs5_train_rf, batch_size=config_g5['batch_size'], shuffle=True)
g5_valid_loader = DataLoader(graphs5_valid_rf, batch_size=config_g5['batch_size'], shuffle=False)
g5_test_f_loader = DataLoader(graphs5_test_f, batch_size=config_g5['batch_size'], shuffle=False)
g5_test_rf_loader = DataLoader(graphs5_test_rf, batch_size=config_g5['batch_size'], shuffle=False)

train_loader = g5_train_loader
valid_loader = g5_valid_loader
test_f_loader = g5_test_f_loader
test_rf_loader = g5_test_rf_loader
test_loader = [test_f_loader, test_rf_loader]

emb_dim = 20
in_channels = graphs5_train_rf[0].x.shape[1] + emb_dim - 1
edge_dim = graphs5_train_rf[0].num_edge_features
num_nodes = graphs5_train_rf[0].num_nodes
max_epochs = 100


[INFO] Normalizing features...
fit_transform
transform 1
transform 2
transform 3
[INFO] Loading distances from file...
[INFO] Loading distances from file...


100%|██████████| 2612/2612 [00:11<00:00, 221.40it/s]


[INFO] Loading distances from file...
[INFO] Loading distances from file...


100%|██████████| 836/836 [00:03<00:00, 271.56it/s]


[INFO] Loading distances from file...
[INFO] Loading distances from file...


100%|██████████| 732/732 [00:02<00:00, 255.66it/s]


[INFO] Loading distances from file...
[INFO] Loading distances from file...


100%|██████████| 730/730 [00:02<00:00, 267.08it/s]


In [21]:
PROJECTNAME = "gnn_run8"

for i in range(0, 10):
    TRAINNAME = f"{graph_name}_{leadtime}_train_run{i}"

    with wandb.init(
            project=PROJECTNAME, id=TRAINNAME, config=args_dict, tags=["final"], resume="never"
    ):
        config = wandb.config

        multigraph = Multigraph(
            num_nodes=num_nodes,  #
            embedding_dim=emb_dim,
            edge_dim=edge_dim,
            in_channels=in_channels,
            hidden_channels_gnn=config['gnn_hidden'],
            out_channels_gnn=config['gnn_hidden'],
            num_layers_gnn=config['gnn_layers'],
            heads=config['heads'],
            hidden_channels_deepset=config['gnn_hidden'],
            optimizer_class=AdamW,
            optimizer_params=dict(lr=config['lr']),
        )
        # torch.compile(multigraph)
        batch = next(iter(train_loader))
        multigraph.forward(batch)

        wandb_logger = WandbLogger(project=PROJECTNAME)
        early_stop = EarlyStopping(monitor="val_loss", patience=10)
        progress_bar = TQDMProgressBar(refresh_rate=0)

        checkpoint_callback = ModelCheckpoint(
            dirpath=SAVEPATH, filename=TRAINNAME, monitor="val_loss", mode="min", save_top_k=1
        )

        trainer = L.Trainer(
            max_epochs=max_epochs,
            log_every_n_steps=1,
            accelerator="gpu",
            devices=[1],
            enable_progress_bar=True,
            logger=wandb_logger,
            callbacks=[early_stop, progress_bar, checkpoint_callback],
        )

        trainer.fit(model=multigraph, train_dataloaders=train_loader, val_dataloaders=valid_loader)

[34m[1mwandb[0m: Currently logged in as: [33mleachen01[0m ([33mleachen01-karlsruhe-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 941 K  | train
2 | aggr        | DeepSetAggregator | 212 K  | train
3 | postprocess | MakePositive      | 0      | train
4 | loss_fn     | NormalCRPS        | 0      | train
----------------------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total params
4.624     Total estimated mode

0,1
epoch,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇█████
train_loss_epoch,█▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▅▃▄▅▄▄▃▃▆▃▄▅▄▅▂▃▃▂▄▃▂▂▁▄▄▁▄▄▁▁▃▃▃▂▁▂▂▃
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇██████
val_loss,█▅▄▄▃▂▃▂▄▁▁▂▂▁▃▁▁▂▁▁▁▂▂▁▁▂▁▁▂▁▂▁▁▂▁

0,1
epoch,34.0
train_loss_epoch,0.5518
train_loss_step,0.55819
trainer/global_step,11444.0
val_loss,0.66167


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 941 K  | train
2 | aggr        | DeepSetAggregator | 212 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▃▃▄▃▄▅▂▂▃▃▃▃▃▂▂▂▂▄▃▃▃▂▃▂▂▂▂▂▃▃▂▃▁▃▃▂▁▂
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▇███████
val_loss,█▅▄▃▃▂▂▂▂▅▂▂▂▂▁▁▁▁▁▁▂▁▁▂▁▃▃▂

0,1
epoch,27.0
train_loss_epoch,0.56258
train_loss_step,0.49346
trainer/global_step,9155.0
val_loss,0.69096


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 941 K  | train
2 | aggr        | DeepSetAggregator | 212 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇█████
train_loss_epoch,█▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▃▄▃█▅▄▄▂▃▃▂▄▄▃▄▃▃▃▄▄▄▃▄▃▁▃▂▄▃▃▁▁▂▂▃▃▂▁▃▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇█
val_loss,█▆▅▄▃▃▃▂▂▁▃▃▁▂▁▂▁▂▂▁▂▁▂▃▂▁▂

0,1
epoch,26.0
train_loss_epoch,0.57418
train_loss_step,0.56671
trainer/global_step,8828.0
val_loss,0.66624


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 941 K  | train
2 | aggr        | DeepSetAggregator | 212 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇██
train_loss_epoch,█▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▂▂▃▁▂▂▂▂▂▁▂▂▁▁▂▁▂▁▁▂▂▁▁▁▂▁▂▂▂▁▁▁▁▁▁▁▁▂
trainer/global_step,▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
val_loss,█▆▄▃▃▂▄▂▂▂▂▁▂▁▂▂▁▂▂▁▂▁▁▂▃▁▁▁▂▁▂▁▁▂▁▃▄

0,1
epoch,36.0
train_loss_epoch,0.55127
train_loss_step,0.56262
trainer/global_step,12098.0
val_loss,0.73384


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 941 K  | train
2 | aggr        | DeepSetAggregator | 212 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train_loss_epoch,█▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▄▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▇▇▇▇▇▇███
val_loss,█▆▅▃▃▂▃▃▃▂▂▂▂▂▁▂▁▁▂▂▃▁▁▁▁▃▂▁▂▄▂▁▂▂▄▁▁▁▂▂

0,1
epoch,44.0
train_loss_epoch,0.51849
train_loss_step,0.48004
trainer/global_step,14714.0
val_loss,0.67844


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 941 K  | train
2 | aggr        | DeepSetAggregator | 212 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇██████
train_loss_epoch,█▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,▆▄█▅▃▅▇▃▅▄▄▃▆▄▆▃▄▃▆▅▄▃▁▃▃▁▃▂▃▄▂▃▂▂▂▂▂▁▂▂
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█
val_loss,█▆▄▄▂▃▃▂▃▁▁▃▁▃▁▂▁▂▂▂▁▃▂▁▂▁▁▁▁▁▂▁▁▂▂▁▂

0,1
epoch,36.0
train_loss_epoch,0.53804
train_loss_step,0.53304
trainer/global_step,12098.0
val_loss,0.67865


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 941 K  | train
2 | aggr        | DeepSetAggregator | 212 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇█████
train_loss_epoch,█▃▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▅▅▃▂▂▃▃▂▂▂▂▂▂▂▂▁▂▂▂▁▁▁▃▂▁▃▁▂▂▁▁▂▁▂▁▁▂▂▁
trainer/global_step,▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇▇▇█
val_loss,█▄▃▃▂▂▄▂▂▂▁▁▂▁▁▂▂▁▂▁▂▂▁▁▂▁▁▁▂▁▂▁▂

0,1
epoch,32.0
train_loss_epoch,0.55429
train_loss_step,0.5722
trainer/global_step,10790.0
val_loss,0.67192


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 941 K  | train
2 | aggr        | DeepSetAggregator | 212 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▄▄▄▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▇▇████
train_loss_epoch,█▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇██
val_loss,█▇▄▃▃▃▃▂▂▃▂▂▄▂▂▁▄▁▁▁▂▂▂▁▂▂

0,1
epoch,25.0
train_loss_epoch,0.58486
train_loss_step,0.66497
trainer/global_step,8501.0
val_loss,0.66489


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 941 K  | train
2 | aggr        | DeepSetAggregator | 212 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train_loss_epoch,█▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆█▇▄▅▇▅▇▅▅▅▄▃▄▄▂▄▄▆▂▄▂▅▄▁▄▃▄▃▃▃▃▃▂▃▃▄▁▃
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇█████
val_loss,█▅▅▃▃▃▂▂▂▂▂▂▂▂▂▂▁▂▃▂▁▆▁▁▂▁▂▁▂▂▁▃▁▁▁▂▁▁

0,1
epoch,37.0
train_loss_epoch,0.53587
train_loss_step,0.56088
trainer/global_step,12425.0
val_loss,0.65445


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type              | Params | Mode 
----------------------------------------------------------
0 | encoder     | EmbedStations     | 2.4 K  | train
1 | conv        | ResGnn            | 941 K  | train
2 | aggr        | DeepSetAggregator | 212 K  | train
3 | postprocess | MakePositive      | 0      | 

0,1
epoch,▁▁▁▁▁▁▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train_loss_epoch,█▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▄▆▅▅▆▃▄▃▄▄▃▃▁▂▂▄▄▃▂▁▃▄▂▁▃▄▃▄▃▂▂▃▁▁▂▃▃▃▂
trainer/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇████
val_loss,█▅▄▃▄▂▂▃▂▃▂▂▃▁▂▅▁▃▁▁▁▁▂▁▂▂▁▂▁▂▂▁▂▂

0,1
epoch,33.0
train_loss_epoch,0.55087
train_loss_step,0.48892
trainer/global_step,11117.0
val_loss,0.67377


In [22]:
data_list = ["f", "rf"]
for data, tl in zip(data_list, test_loader):
    preds_list = []
    for path in os.listdir(SAVEPATH):
        if path.endswith(".ckpt"):
            print(f"[INFO] Loading model from {path}")
            # Load Model from checkpoint

            multigraph = Multigraph.load_from_checkpoint(
                os.path.join(SAVEPATH, path),
                num_nodes=num_nodes,
                embedding_dim=emb_dim,
                edge_dim=edge_dim,
                in_channels=in_channels,
                hidden_channels_gnn=config_g5['gnn_hidden'],
                out_channels_gnn=config_g5['gnn_hidden'],
                num_layers_gnn=config_g5['gnn_layers'],
                heads=config_g5['heads'],
                hidden_channels_deepset=config_g5['gnn_hidden'],
                optimizer_class=AdamW,
                optimizer_params=dict(lr=config_g5['lr']),
            )
            multigraph.eval()
            batch = next(iter(train_loader))
            batch = batch.to("cuda")
            multigraph.to("cuda")
            multigraph.forward(batch)

            trainer = L.Trainer(log_every_n_steps=1, accelerator="gpu", devices=[1], enable_progress_bar=True)

            ####################################################################################################
            preds = trainer.predict(model=multigraph, dataloaders=[tl])
            preds = torch.cat(preds, dim=0)
            preds_list.append(preds)
            print()
            print(preds.shape)

    targets = dataframes[f"test_{data}"][1]
    targets = torch.tensor(targets.t2m.values) - 273.15

    stacked = torch.stack(preds_list)
    final_preds = torch.mean(stacked, dim=0)

    res = multigraph.loss_fn.crps(final_preds, targets)
    print("#############################################")
    print("#############################################")
    print(f"final crps: {res.item()}")
    print("#############################################")
    print("#############################################")

    ####################################################################################################
    os.makedirs(RESULTPATH, exist_ok=True)
    print(RESULTPATH)

    df = pd.DataFrame(np.concatenate([targets.view(-1, 1), final_preds], axis=1), columns=["t2m", "mu", "sigma"])
    df.to_csv(os.path.join(RESULTPATH, f"{data}_{graph_name}_{leadtime}_results.csv"), index=False)

    # Create Log File ###############################################################
    log_file = os.path.join(RESULTPATH, f"{data}.txt")
    with open(log_file, "w") as f:
        f.write(f"Data: {data}\n")
        f.write(f"Leadtime: {leadtime}\n")
        f.write(f"Final crps: {res.item()}")

[INFO] Loading model from g5_24h_train_run5.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 109.43it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_24h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 113.67it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_24h_train_run7.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 118.43it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_24h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 117.82it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_24h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 113.40it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_24h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 110.69it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_24h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 120.62it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_24h_train_run8.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 119.70it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_24h_train_run1.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 120.00it/s]

torch.Size([87600, 2])
[INFO] Loading model from g5_24h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 121.08it/s]

torch.Size([87600, 2])
#############################################
#############################################
final crps: 0.6051888017527292
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


[INFO] Loading model from g5_24h_train_run5.ckpt
Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 118.29it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_24h_train_run0.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 118.66it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_24h_train_run7.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 114.12it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_24h_train_run4.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 119.16it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_24h_train_run9.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 111.61it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_24h_train_run6.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 117.69it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_24h_train_run2.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 110.13it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_24h_train_run8.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 107.01it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_24h_train_run1.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 119.95it/s]

torch.Size([87840, 2])
[INFO] Loading model from g5_24h_train_run3.ckpt


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting DataLoader 0: 100%|██████████| 92/92 [00:00<00:00, 119.67it/s]

torch.Size([87840, 2])
#############################################
#############################################
final crps: 0.6175431825760571
#############################################
#############################################
/home/ltchen/gnnpp/leas_trained_models/sum_stats_24h/g5_24h
