In [1]:
%cd /home/ltchen/gnnpp
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
import pytorch_lightning as L
import torch
import torch_geometric
import json
import wandb

from typing import Tuple
from torch_geometric.nn import GATv2Conv
from torch_geometric.utils import scatter
from torch.nn import Linear, ModuleList, ReLU
#from torch.utils.data import DataLoader
from torch_geometric.loader import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.optim import AdamW
from pytorch_lightning.loggers import WandbLogger

from models.loss import NormalCRPS
from models.model_utils import MakePositive, EmbedStations
from utils.data import (
    load_dataframes,
    load_distances,
    normalize_features_and_create_graphs,
    rm_edges,
    summary_statistics,
)

/home/ltchen/gnnpp


In [2]:
DIRECTORY = os.getcwd()
SAVEPATH = os.path.join(DIRECTORY, "explored_models/gnn_24h/models")
JSONPATH = os.path.join(DIRECTORY, "trained_models/best_24h/params.json")

In [3]:
with open(JSONPATH, "r") as f:
    print(f"[INFO] Loading {JSONPATH}")
    args_dict = json.load(f)
config = args_dict
print(config)
print(config['lr'])
print(config['max_dist'])
print(type(config))
'''{"batch_size":8,
"gnn_hidden":265,
"gnn_layers":2,
"heads":8,
"lr":0.0002,
"max_dist":100,
"max_epochs": 31}'''

[INFO] Loading /home/ltchen/gnnpp/trained_models/best_24h/params.json
{'batch_size': 8, 'gnn_hidden': 265, 'gnn_layers': 2, 'heads': 8, 'lr': 0.0002, 'max_dist': 100, 'max_epochs': 31}
0.0002
100
<class 'dict'>


'{"batch_size":8,\n"gnn_hidden":265,\n"gnn_layers":2,\n"heads":8,\n"lr":0.0002,\n"max_dist":100,\n"max_epochs": 31}'

In [4]:
# load graph only for rf
dataframes = load_dataframes(mode="eval", leadtime= "24h")
dist = load_distances(dataframes["stations"])


[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Computing distances...


In [6]:
print(f"dist.shape: {dist.shape}")

dist.shape: (122, 122)


In [8]:
print(next(iter(train_loader)))


DataBatch(x=[10736, 36], edge_index=[2, 232320], edge_attr=[232320, 1], y=[976], timestamp=[8], n_idx=[10736], batch=[10736], ptr=[9])


In [15]:
# just for review to understand the difference between gnn and drn

dataframes = load_dataframes(mode="train", leadtime="24h")
dist = load_distances(dataframes["stations"])

graphs_train_rf, tests = normalize_features_and_create_graphs(
    training_data=dataframes["train"],
    valid_test_data=[dataframes["test_rf"], dataframes["test_f"]],
    mat=dist,
    max_dist=config['max_dist'],
)
graphs_test_rf, graphs_test_f = tests

# wo ist alles was hinter diesem Code kommt : pop stations, reset_index, dropnans in normalize_features_and_create_graphs
dataframes.pop("stations") # .pop("stations") => entfernt den df mit stations, wofuer brauche ich die dann überhaupt? Grafik?

# test
for X, y in dataframes.values(): # wofuer?
    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)

##
train, valid_test = normalize_features_and_create_graphs(
    training_data=dataframes["train"], valid_test_data=[dataframes["test_rf"], dataframes["test_f"]]
)

train = drop_nans(train)
(test_rf, test_f) = valid_test
test_rf = drop_nans(test_rf)
test_f = drop_nans(test_f)

DIRECTORY = os.getcwd()
SAVEPATH = os.path.join(DIRECTORY, "explored_models/drn_24h/models")

[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Computing distances...
[INFO] Normalizing features...


KeyboardInterrupt: 

## GNN Architecture

In [5]:
# gnn architecture
class DeepSetAggregator(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(DeepSetAggregator, self).__init__()

        self.input = torch.nn.Linear(in_channels, hidden_channels)
        self.hidden1 = torch.nn.Linear(hidden_channels, hidden_channels)
        self.hidden2 = torch.nn.Linear(hidden_channels, hidden_channels)
        self.output = torch.nn.Linear(hidden_channels, out_channels)
        self.relu = torch.nn.ReLU()

    def forward(self, x, index):
        x = self.input(x)
        x = self.relu(x)
        print(f"Input: {x.shape}")
        x = self.hidden1(x)
        x = self.relu(x)
        print(f"Hidden1: {x.shape}")
        x = scatter(x, index, dim=0, reduce="mean")
        print(f"scatter: {x.shape}")
        print(f"index: {index}")
        self.hidden2(x)
        x = self.relu(x)
        print(f"Hidden2: {x.shape}")
        x = self.output(x)
        return x


class ResGnn(torch.nn.Module):
    def __init__(self, in_channels: int, out_channels: int, num_layers: int, hidden_channels: int, heads: int):
        super(ResGnn, self).__init__()
        assert num_layers > 0, "num_layers must be > 0."

        # Create Layers
        self.convolutions = ModuleList()
        for _ in range(num_layers):
            self.convolutions.append(
                GATv2Conv(-1, hidden_channels, heads=heads, edge_dim=1, add_self_loops=True, fill_value=0.01)
            )
        self.lin = Linear(hidden_channels * heads, out_channels)
        self.relu = ReLU()

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor, edge_attr: torch.Tensor) -> torch.Tensor:
        x = x.float()
        edge_attr = edge_attr.float()
        for i, conv in enumerate(self.convolutions):
            if i == 0:
                # First Layer
                x = conv(x, edge_index, edge_attr)
                x = self.relu(x)
            else:
                x = x + self.relu(conv(x, edge_index, edge_attr))  # Residual Layers

        x = self.lin(x)
        return x

    @torch.no_grad()
    def get_attention(
        self, x: torch.Tensor, edge_index: torch.Tensor, edge_attr: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Runs a forward Pass for the given graph only though the ResGNN layer.
        NOTE: the data that is given to this method must first pass through the layers before this layer in the Graph

        :param torch.Tensor x: Tensor of Node Features (NxD)
        :param torch.Tensor edge_index: Tensor of Edges (2xE)
        :param torch.Tensor edge_attr: Edge Attributes (ExNum_Attr)
        :return x, edge_index_attention, attention_weights: Tensor of Node Features (NxD), Tensor of Edges with
        self loops (2xE), Tensor of Attention per edge (ExNum_Heads)
        """
        x = x.float()
        edge_attr = edge_attr.float()

        # Pass Data though Layer to get the Attention
        attention_list = []
        # Note: edge_index_attention has to be added since we have self loops now
        edge_index_attention, attention_weights = None, None

        for i, conv in enumerate(
            self.convolutions,
        ):
            if i == 0:
                # First Layer
                x, (edge_index_attention, attention_weights) = conv(
                    x, edge_index, edge_attr, return_attention_weights=True
                )
                print("attention_weights:")
                print(attention_weights)
                print("edge_index_attention")
                print(edge_index_attention)
                print(f"attention_weights.shape{attention_weights.shape}")
                print(f"type(attention_weights){type(attention_weights)}")
                attention_list.append(attention_weights)
                x = self.relu(x)
                x = self.norm(x)
            else:
                x_conv, (edge_index_attention, attention_weights) = conv(
                    x, edge_index, edge_attr, return_attention_weights=True
                )
                attention_list.append(attention_weights)
                x = x + self.relu(x_conv)  # Residual Layers
        x = self.lin(x)

        # Attention weights of first layer
        attention_weights = attention_weights.mean(dim=1)
        print("attention_weights.mean(dim=1)")
        print(attention_weights)
        print(attention_weights.shape)

        return x, edge_index_attention, attention_weights, attention_list

# gnn architecture
class ThisMultigraph(L.LightningModule):
    def __init__(
        self,
        embedding_dim,
        in_channels,
        hidden_channels_gnn,
        out_channels_gnn,
        num_layers_gnn,
        heads,
        hidden_channels_deepset,
        optimizer_class,
        optimizer_params,
    ):
        super(ThisMultigraph, self).__init__()

        self.encoder = EmbedStations(num_stations_max=122, embedding_dim=embedding_dim)

        self.conv = ResGnn(
            in_channels=in_channels,
            hidden_channels=hidden_channels_gnn,
            out_channels=out_channels_gnn,
            num_layers=num_layers_gnn,
            heads=heads,
        )

        self.aggr = DeepSetAggregator(
            in_channels=out_channels_gnn, hidden_channels=hidden_channels_deepset, out_channels=2
        )

        self.postprocess = MakePositive()
        self.loss_fn = NormalCRPS()

        self.optimizer_class = optimizer_class
        self.optimizer_params = optimizer_params

    def forward(self, data):
        x, edge_index, edge_attr, batch_id, node_idx = data.x, data.edge_index, data.edge_attr, data.batch, data.n_idx
        node_idx = node_idx + batch_id * 122  # add batch_id to node_idx to get unique node indices
        x = self.encoder(x)
        x = self.conv(x, edge_index, edge_attr)
        x = self.aggr(x, node_idx)
        x = self.postprocess(x)
        return x

    def training_step(self, batch, batch_idx):
        y_hat = self.forward(batch)
        loss = self.loss_fn.crps(mu_sigma=y_hat, y=batch.y)
        self.log(
            "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, batch_size=1
        )  # The batch size is not actually 1 but the loss is already averaged over the batch
        return loss

    def configure_optimizers(self):
        return self.optimizer_class(self.parameters(), **self.optimizer_params)

    def validation_step(self, batch, batch_idx):
        y_hat = self.forward(batch)
        loss = self.loss_fn.crps(mu_sigma=y_hat, y=batch.y)
        self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True, batch_size=1)
        return loss

    def test_step(self, batch, batch_idx, dataloader_idx=0):
        y_hat = self.forward(batch)
        loss = self.loss_fn.crps(mu_sigma=y_hat, y=batch.y)
        self.log("test_loss", loss, on_step=False, on_epoch=True, prog_bar=True, batch_size=1)
        return loss

    def initialize(self, dataloader):
        batch = next(iter(dataloader))
        self.validation_step(batch, 0)

### Check DeepSetAggregator and ResGNN outside of Multigraph

In [9]:
# DeepSetAggregator
graphs_train_rf, tests = normalize_features_and_create_graphs(
    training_data=dataframes["train"],
    valid_test_data=[dataframes["test_rf"], dataframes["test_f"]],
    mat=dist,
    max_dist=config['max_dist'],
)
graphs_test_rf, graphs_test_f = tests

graphs_test = graphs_test_rf

# print(graphs_test_rf[0].x.shape) (1342, 36)


print("[INFO] Creating data loaders...")
train_loader = DataLoader(graphs_train_rf, batch_size=config['batch_size'], shuffle=True)

print("[INFO] Creating model...")
emb_dim=20
in_channels = graphs_train_rf[0].x.shape[1] + emb_dim - 1

embedding_dim=emb_dim,
in_channels=in_channels,
hidden_channels_gnn=config['gnn_hidden'],
out_channels_gnn=config['gnn_hidden'],
num_layers_gnn=config['gnn_hidden'],
heads=config['heads'],
hidden_channels_deepset=config['gnn_hidden'],
optimizer_class=AdamW,
optimizer_params=dict(lr=config['lr']),

ds = DeepSetAggregator(
    in_channels=out_channels_gnn, hidden_channels=hidden_channels_deepset, out_channels=2
)

[INFO] Normalizing features...
[INFO] Creating graph data...


KeyboardInterrupt: 

## Train GNN

In [6]:
# train gnn
# build a graph with wandb => create multigraph - without summmary_statistics and no edges removed
with wandb.init(
    project="multigraph", id=f"training_run_24h", config=args_dict, tags=["final_training"]
):
    config = wandb.config
    #print("[INFO] Starting sweep with config: ", config)

    graphs_train_rf, tests = normalize_features_and_create_graphs(
        training_data=dataframes["train"],
        valid_test_data=[dataframes["test_rf"], dataframes["test_f"]],
        mat=dist,
        max_dist=config['max_dist'],
    )
    graphs_test_rf, graphs_test_f = tests

    graphs_test = graphs_test_rf

    # print(graphs_test_rf[0].x.shape) (1342, 36)


    print("[INFO] Creating data loaders...")
    train_loader = DataLoader(graphs_train_rf, batch_size=config['batch_size'], shuffle=True)

    print("[INFO] Creating model...")
    emb_dim=20
    in_channels = graphs_train_rf[0].x.shape[1] + emb_dim - 1 #(36 + 20 - 1) = 55

    multigraph = ThisMultigraph(
        embedding_dim=emb_dim,
        in_channels=in_channels,
        hidden_channels_gnn=config['gnn_hidden'],
        out_channels_gnn=config['gnn_hidden'],
        num_layers_gnn=config['gnn_layers'],
        heads=config['heads'],
        hidden_channels_deepset=config['gnn_hidden'],
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config['lr']),
    )
    torch.compile(multigraph)

    # understand what this is
    batch = next(iter(train_loader))
    batch = batch  # .to("cuda")
    multigraph  # .to("cuda")
    multigraph.forward(batch)

    wandb_logger = WandbLogger(project="multigraph")
    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=f"run_24h", monitor="train_loss", mode="min", save_top_k=1
    )

    print("[INFO] Training model...")
    trainer = L.Trainer(
        max_epochs=config['max_epochs'],
        log_every_n_steps=1,
        accelerator="gpu",
        devices=1,
        enable_progress_bar=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )

    trainer.fit(model=multigraph, train_dataloaders=train_loader)

[34m[1mwandb[0m: Currently logged in as: [33mleachen[0m ([33mleachen_thesis[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[INFO] Normalizing features...
[INFO] Creating graph data...
[INFO] Creating data loaders...
[INFO] Creating model...
Input: torch.Size([10736, 265])
Hidden1: torch.Size([10736, 265])
scatter: torch.Size([976, 265])
index: tensor([  0,   1,   2,  ..., 973, 974, 975])
Hidden2: torch.Size([976, 265])


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env3/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/ltchen/.conda/envs/gnn_env3/lib/python3.10/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVIC

[INFO] Training model...
Epoch 0:   0%|          | 0/431 [00:00<?, ?it/s] 

Traceback (most recent call last):
  File "/tmp/ipykernel_3183839/4149401971.py", line 64, in <module>
    trainer.fit(model=multigraph, train_dataloaders=train_loader)
  File "/home/ltchen/.conda/envs/gnn_env3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 538, in fit
    call._call_and_handle_interrupt(
  File "/home/ltchen/.conda/envs/gnn_env3/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 47, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "/home/ltchen/.conda/envs/gnn_env3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 574, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/home/ltchen/.conda/envs/gnn_env3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 981, in _run
    results = self._run_stage()
  File "/home/ltchen/.conda/envs/gnn_env3/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1025, in _run_stage
    self.fit

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.92 GiB. GPU 0 has a total capacity of 23.68 GiB of which 1.00 GiB is free. Process 3119012 has 22.17 GiB memory in use. Including non-PyTorch memory, this process has 504.00 MiB memory in use. Of the allocated memory 234.84 MiB is allocated by PyTorch, and 23.16 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# evaluate gnn


In [18]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()