In [1]:
%cd /home/ltchen/gnnpp
import sys
import os
import pytorch_lightning as L
import torch
import torch_geometric
import json
import wandb

from typing import Tuple
from torch_geometric.nn import GATv2Conv
from torch_geometric.utils import scatter
from torch.nn import Linear, ModuleList, ReLU
from torch_geometric.loader import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.optim import AdamW
from pytorch_lightning.loggers import WandbLogger

from models.loss import NormalCRPS
from models.model_utils import MakePositive, EmbedStations
from utils.data import (
    load_dataframes,
    load_distances,
    normalize_features_and_create_graphs,
    rm_edges,
    summary_statistics,
)
from exploration.graph_creation import *
from models.graphensemble import *

/home/ltchen/gnnpp


In [2]:
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
DIRECTORY = os.getcwd()
SAVEPATH = os.path.join(DIRECTORY, "explored_models/gnn_new_attr_24h/models")
JSONPATH = os.path.join(DIRECTORY, "trained_models/best_24h/params.json")

In [3]:
dataframes = load_dataframes(mode="eval", leadtime= "24h")

[INFO] Data files not found, will load from zarr.
[INFO] Loading data...
[INFO] Loading all features
[INFO] Loading training data (1997-2013)
<xarray.Dataset>
Dimensions:             (station_id: 122, number: 11, time: 3449)
Coordinates: (12/13)
    model_altitude      (station_id) float32 ...
    model_land_usage    (station_id) int8 ...
    model_latitude      (station_id) float64 ...
    model_longitude     (station_id) float64 ...
    model_orography     (station_id) float64 ...
  * number              (number) int64 0 1 2 3 4 5 6 7 8 9 10
    ...                  ...
  * station_id          (station_id) int64 11101 11105 11308 ... 340 344 330
    station_land_usage  (station_id) int8 ...
    station_latitude    (station_id) float64 ...
    station_longitude   (station_id) float64 ...
    station_name        (station_id) <U20 ...
  * time                (time) datetime64[ns] 1997-01-02 ... 2014-01-01
Data variables: (12/30)
    cape                (station_id, number, time) float32

In [25]:
train_target = dataframes['train'][1]
print(train_target[train_target['station_id'] == 62]['t2m'].isna().sum())
print(train_target[train_target['station_id'] == 74].isna().sum())
print(train_target[train_target['station_id'] == 2].isna().sum())

# wie viele haben ueberhaupt keine nans
counter = 0
list = []
for i in range(122):
    if (train_target[train_target['station_id'] == i]['t2m'].isna().sum() != 0):
        list.append(i)
        counter += 1
print(counter)
print(list)

3448
time             0
station_id       0
t2m           3448
dtype: int64
time           0
station_id     0
t2m           27
dtype: int64
76
[2, 19, 30, 31, 33, 34, 35, 37, 40, 41, 42, 43, 44, 47, 49, 50, 51, 52, 54, 55, 56, 57, 58, 60, 61, 62, 65, 67, 68, 69, 70, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 111, 112, 113, 115, 118, 119, 120, 121]


In [3]:
with open(JSONPATH, "r") as f:
    print(f"[INFO] Loading {JSONPATH}")
    args_dict = json.load(f)
config = args_dict
print(config)
print(config['lr'])
print(config['max_dist'])
print(type(config))
print(type(config['lr']))
print(type(config['gnn_hidden']))
'''{"batch_size":8,
"gnn_hidden":265,
"gnn_layers":2,
"heads":8,
"lr":0.0002,
"max_dist":100,
"max_epochs": 31}'''

[INFO] Loading /home/ltchen/gnnpp/trained_models/best_24h/params.json
{'batch_size': 8, 'gnn_hidden': 265, 'gnn_layers': 2, 'heads': 8, 'lr': 0.0002, 'max_dist': 100, 'max_epochs': 31}
0.0002
100
<class 'dict'>
<class 'float'>
<class 'int'>


'{"batch_size":8,\n"gnn_hidden":265,\n"gnn_layers":2,\n"heads":8,\n"lr":0.0002,\n"max_dist":100,\n"max_epochs": 31}'

## Check GAT

In [4]:
dataframes = load_dataframes(mode="eval", leadtime= "24h") # load newly created dataframes
dataframes = summary_statistics(dataframes)
dist = load_distances(dataframes["stations"])

graphs1_train_rf, tests1 = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["geo"], edges=[("geo", 100)], sum_stats = True)

graphs1_test_rf, graphs1_test_f = tests1
graphs1_test = graphs1_test_rf


[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for test_rf
[INFO] Calculating summary statistics for test_f
[INFO] Loading distances from file...
[INFO] Normalizing features...
fit_transform
transform 1
transform 2
[INFO] Converting temperature values...


100%|██████████| 3448/3448 [00:16<00:00, 205.29it/s]
100%|██████████| 732/732 [00:02<00:00, 288.06it/s]
100%|██████████| 730/730 [00:02<00:00, 245.80it/s]


In [6]:
graphs2_train_rf, tests2 = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["geo", "alt", "lon", "lat", "alt-orog"], edges=[("geo", 100)], sum_stats = True)

graphs2_test_rf, graphs2_test_f = tests2
graphs2_test = graphs2_test_rf

[INFO] Normalizing features...
fit_transform
transform 1
transform 2
[INFO] Converting temperature values...


100%|██████████| 3448/3448 [00:16<00:00, 210.09it/s]
100%|██████████| 732/732 [00:02<00:00, 260.57it/s]
100%|██████████| 730/730 [00:02<00:00, 278.09it/s]


In [7]:
graphs2_train_rf[0].y

tensor([-268.5500, -266.7500, -269.8500, -270.5500, -266.9500, -269.8500,
        -268.8500, -271.0500, -270.0500, -268.3500, -271.9500, -270.0500,
        -271.0500, -272.0500, -269.7500, -273.6500, -270.6500, -264.2500,
        -264.4500,       nan, -264.8500, -264.3500, -264.8500, -266.2500,
        -265.2500, -266.4500, -265.7500, -267.0500, -264.5500, -267.3500,
        -267.4500, -277.7500, -276.3500, -277.7500,       nan, -275.1500,
        -267.5500, -267.2500, -274.6500, -270.4500, -265.3500, -270.4500,
        -270.2500, -272.2500, -274.3500, -270.7500, -276.8500, -272.7500,
        -275.5500, -271.8500, -273.3500, -273.2500, -271.0500, -268.8500,
        -268.0500, -266.4500, -276.4500, -271.5500, -270.7500, -273.1500,
        -274.1500, -273.2500,       nan, -268.0500, -272.8500, -269.6500,
        -267.1500, -272.0500, -278.3500, -271.4500, -269.2500, -270.1500,
        -267.7500, -266.3500,       nan, -276.3500, -268.9500, -272.7500,
        -274.3500, -271.5500, -267.950

In [6]:
batch_size = config['batch_size']

print("[INFO] Creating data loaders...")
g2_train_loader = DataLoader(graphs2_train_rf, batch_size=batch_size, shuffle=True)
g2_test_f_loader = DataLoader(graphs2_test_f, batch_size=batch_size, shuffle=False)

print("[INFO] Creating model...")
emb_dim = 20

in_channels = graphs2_train_rf[0].x.shape[1] + emb_dim - 1

edge_dim = graphs2_train_rf[0].num_edge_features
embedding_dim = emb_dim
in_channels = in_channels
hidden_channels_gnn = config['gnn_hidden']
out_channels_gnn = config['gnn_hidden']
num_layers_gnn = config['gnn_hidden']
heads = config['heads']
hidden_channels_deepset = config['gnn_hidden']
optimizer_class = AdamW
optimizer_params = dict(lr=config['lr'])


[INFO] Creating data loaders...
[INFO] Creating model...


In [None]:
PROJECTNAME = "test"
FILENAME = "test_g2_train_run_24h"
with wandb.init(
        project=PROJECTNAME, id=FILENAME, config=args_dict, tags=["reproduction"]
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)
    batch = next(iter(g2_train_loader))
    # batch = batch  # .to("cuda")
    # multigraph  # .to("cuda")
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=FILENAME, monitor="train_loss", mode="min", save_top_k=1
    )

    # print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=config['max_epochs'],
        log_every_n_steps=1,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader)

## Check equality

In [5]:
dataframes = load_dataframes(mode="eval", leadtime= "24h") # load newly created dataframes
dataframes = summary_statistics(dataframes)
dist = load_distances(dataframes["stations"])


#self-created
l_graphs_train_rf, l_tests = normalize_features_and_create_graphs1(df_train=dataframes['train'], df_valid_test=[dataframes['test_rf'], dataframes['test_f']], station_df=dataframes['stations'], attributes=["geo", "alt", "lon", "lat", "alt-orog"], edges=[("geo", 100)], sum_stats = True)

l_graphs_test_rf, l_graphs_test_f = l_tests

l_graphs_test = l_graphs_test_rf

#moritz
m_graphs_train_rf, m_tests = normalize_features_and_create_graphs(
    training_data=dataframes["train"],
    valid_test_data=[dataframes["test_rf"], dataframes["test_f"]],
    mat=dist,
    max_dist=config['max_dist'],
)
m_graphs_test_rf, m_graphs_test_f = m_tests

m_graphs_test = m_graphs_test_rf

# print(graphs_test_rf[0].x.shape) (1342, 36)



[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for test_rf
[INFO] Calculating summary statistics for test_f
[INFO] Loading distances from file...
[INFO] Normalizing features...
fit_transform
transform 1
transform 2


TypeError: max() received an invalid combination of arguments - got (out=NoneType, axis=int, ), but expected one of:
 * ()
 * (Tensor other)
 * (int dim, bool keepdim = False)
      didn't match because some of the keywords were incorrect: out, axis
 * (name dim, bool keepdim = False)
      didn't match because some of the keywords were incorrect: out, axis


In [6]:
l_train_loader = DataLoader(l_graphs_train_rf, batch_size=config['batch_size'], shuffle=True)
m_train_loader = DataLoader(m_graphs_train_rf, batch_size=config['batch_size'], shuffle=True)

In [22]:
print(l_graphs_train_rf[0])
print(m_graphs_train_rf[0])

graph1 = l_graphs_train_rf[0]
graph2 = m_graphs_train_rf[0]

print(f"edge indices are the same: {torch.equal(graph1.edge_index, graph2.edge_index)}")
print(f"targets are the same: {torch.equal(graph1.y, graph2.y)}")
print(f"targets are almost the same: {torch.allclose(graph1.y, graph2.y, atol=1e-50)}")

print(graph1.y)
print(graph2.y)
print(type(graph1))
print(type(graph2))
comparison = np.array(graph1.x) == np.array(graph2.x)
diff_indices = np.where(comparison == False)
print(np.array(graph1.x)==np.array(graph2.x))
print(diff_indices)
print("graph1[diff]:", graph1.x[diff_indices[0]][0])
print("graph2[diff]:", graph2.x[diff_indices[0]][0])
print(np.allclose(graph1.x, graph2.x, atol=1e-50))
print(torch.equal(graph1.x, graph2.x))
print(122*65)

Data(x=[122, 65], edge_index=[2, 1420], edge_attr=[1420, 1], y=[122, 1], pos=[122, 2], timestamp=1997-01-02 00:00:00, n_idx=[122])
Data(x=[122, 65], edge_index=[2, 1420], edge_attr=[1420, 1], y=[122], timestamp=1997-01-02 00:00:00, n_idx=[122])
edge indices are the same: True
targets are the same: False
targets are almost the same: False
tensor([[ 4.6000],
        [ 6.4000],
        [ 3.3000],
        [ 2.6000],
        [ 6.2000],
        [ 3.3000],
        [ 4.3000],
        [ 2.1000],
        [ 3.1000],
        [ 4.8000],
        [ 1.2000],
        [ 3.1000],
        [ 2.1000],
        [ 1.1000],
        [ 3.4000],
        [-0.5000],
        [ 2.5000],
        [ 8.9000],
        [ 8.7000],
        [    nan],
        [ 8.3000],
        [ 8.8000],
        [ 8.3000],
        [ 6.9000],
        [ 7.9000],
        [ 6.7000],
        [ 7.4000],
        [ 6.1000],
        [ 8.6000],
        [ 5.8000],
        [ 5.7000],
        [-4.6000],
        [-3.2000],
        [-4.6000],
        [    n

In [7]:
emb_dim=20
# edge_dim=l_graphs_test_f[0].num_edge_features
edge_dim = 1
in_channels = m_graphs_train_rf[0].x.shape[1] + emb_dim - 1

In [8]:
with wandb.init(
    project=PROJECTNAME, id=f"m_graph_training_run_24h", config=args_dict, tags=["final_training"], reinit=True
):
    config = wandb.config

    multigraph = Multigraph(
        embedding_dim=emb_dim,
        # edge_dim=edge_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)

    # understand what this is
    batch = next(iter(m_train_loader))
    # batch = batch  # .to("cuda")
    # multigraph  # .to("cuda")
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project=PROJECTNAME)
    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=f"m_graph_run_24h", monitor="train_loss", mode="min", save_top_k=1
    )

    # print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=config['max_epochs'],
        log_every_n_steps=1,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )

    trainer.fit(model=multigraph, train_dataloaders=m_train_loader) # trainer speichern und entweder neuladen oder
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mleachen[0m ([33mleachen_thesis[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/ltchen/.conda/envs/gnn_env4/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env4/

Epoch 30: 100%|██████████| 431/431 [00:13<00:00, 31.69it/s, v_num=_24h, train_loss_step=1.070, train_loss_epoch=1.520]

`Trainer.fit` stopped: `max_epochs=31` reached.


Epoch 30: 100%|██████████| 431/431 [00:13<00:00, 31.68it/s, v_num=_24h, train_loss_step=1.070, train_loss_epoch=1.520]


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train_loss_epoch,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▃▃▃▂▃▂▂▂▁▂▂▁▂▁▂▁▁▁▂▁▁▁▁▂▁▁▁▁▂▁▂▁▁▁▂▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▆▇▇▇▇▇▇████

0,1
epoch,30.0
train_loss_epoch,1.52217
train_loss_step,1.06947
trainer/global_step,13360.0


In [20]:
emb_dim=20
# edge_dim=l_graphs_test_f[0].num_edge_features
edge_dim = 1
in_channels = m_graphs_train_rf[0].x.shape[1] + emb_dim - 1

tensor([[ 0.0000e+00, -7.3829e-01, -7.6410e-01,  ...,  6.1836e-01,
          9.9941e-01,  3.4422e-02],
        [ 1.0000e+00, -7.3700e-01, -7.7860e-01,  ...,  1.5635e+00,
          9.9941e-01,  3.4422e-02],
        [ 2.0000e+00, -7.3185e-01, -7.3317e-01,  ...,  7.6520e-01,
          9.9941e-01,  3.4422e-02],
        ...,
        [ 1.1900e+02,  2.3386e+00,  6.4645e-01,  ..., -6.1813e-01,
          9.9941e-01,  3.4422e-02],
        [ 1.2000e+02,  4.7996e+00,  3.9940e+00,  ..., -6.0907e-01,
          9.9941e-01,  3.4422e-02],
        [ 1.2100e+02,  5.9139e+00,  4.3452e+00,  ..., -4.7565e-01,
          9.9941e-01,  3.4422e-02]])

TypeError: 'method' object is not iterable