# Run DRN

In [1]:
# load data first
%cd /home/ltchen/gnnpp
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import argparse
import json
import pytorch_lightning as L
import torch
import wandb

from models.drn import DRN
from models.model_utils import EmbedStations
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from sklearn.preprocessing import StandardScaler
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch import nn

from utils.data import load_dataframes, load_distances, normalize_features_and_create_graphs, summary_statistics
from utils.drn_utils import *

/home/ltchen/gnnpp


Deterministic NN with one hidden layer using MSE as loss

In [2]:
# eine station ohne embeddings (+ CRPS)
# nn mse loss with lightning
class MyDRN(L.LightningModule):
    def __init__(self, hidden_size, embedding_dim, in_feat, optimizer_class, optimizer_params):
        super(MyDRN, self).__init__()
        self.embedding = EmbedStations(num_stations_max=122, embedding_dim=embedding_dim)
        self.linear = torch.nn.Linear(in_features=in_feat, out_features=hidden_size)
        self.relu = torch.nn.ReLU()
        self.linear_t2m = torch.nn.Linear(in_features=hidden_size, out_features=1) # output t2m value

        self.loss = torch.nn.MSELoss()
        self.optimizer_class = optimizer_class
        self.optimizer_params = optimizer_params

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        print(x.shape) # (8, 65)
        x = self.embedding(x)
        print(f"After embedding: {x.shape}") # (8, 84)
        x = self.linear(x)
        print(x.shape) # (8, 64)
        x = self.relu(x)
        print(x.shape)
        x = self.linear_t2m(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y.flatten()) # why y.flatten()?
        # self.log("train_loss", loss.item(), on_step=True, on_epoch=True, prog_bar=True) # through wandb
        return loss

    def configure_optimizers(self):
        return self.optimizer_class(self.parameters(), **self.optimizer_params)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y.flatten())
        # self.log("validation_loss", loss.item(), on_epoch=True, prog:bar=True)
        return loss

    def test_step(self, batch, batch_idx, dataloader_idx): # unterschied zwischen predict und test_step?
        x, y = batch # wieso hat test_step auch y?
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y.flatten())
        # self.log("test_loss", loss.item(), on_epoch=True, prog_bar=True)
        return loss

In [3]:
dataframes = load_dataframes(mode="train", leadtime="24h") # train mode => for training nn? Wie wird das im Paper beschrieben?
dataframes = summary_statistics(dataframes)
dataframes.pop("stations") # .pop("stations") => entfernt den df mit stations, wofuer brauche ich die dann überhaupt? Grafik?

for X, y in dataframes.values(): # wofuer?
    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)

train, valid_test = normalize_features(
    training_data=dataframes["train"], valid_test_data=[dataframes["test_rf"], dataframes["test_f"]]
)

train = drop_nans(train)


[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for test_rf
[INFO] Calculating summary statistics for test_f
[INFO] Normalizing features...


In [None]:
# MyDRN train without wandb without saving
DIRECTORY = os.getcwd()
SAVEPATH = os.path.join(DIRECTORY, "explored_models/drn_24h/models")

y_scaler = StandardScaler(with_std=False) # wieso standardisiere ich varianz nicht?
y_scaler = y_scaler.fit(train[1][["t2m"]])


train_dataset = TensorDataset(
    torch.Tensor(train[0].to_numpy()), torch.Tensor(y_scaler.transform(train[1][["t2m"]]))
)

#from params.json best_24h
batch_size = 8 # grössere batch size, 1024
hidden_size=64
#lr=0.0002
lr=0.0002
# max_dist=100
max_epochs=31

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

embed_dim = 20
in_feat = train[0].shape[1] + embed_dim - 1

mydrn = MyDRN(
    hidden_size=hidden_size,
    embedding_dim=embed_dim,
    in_feat=in_feat,
    optimizer_class=AdamW,
    optimizer_params=dict(lr=lr),
)

checkpoint_callback = ModelCheckpoint(
    # dirpath=SAVEPATH, filename=f"run_{args.id}", monitor="train_loss", mode="min", save_top_k=1
    dirpath=SAVEPATH, filename=f"run_24h", monitor="train_loss", mode="min", save_top_k=1
)

trainer = L.Trainer(
    max_epochs=max_epochs,
    log_every_n_steps=1,
    accelerator="gpu",
    enable_progress_bar=True,
    enable_model_summary=True,
    callbacks=checkpoint_callback,
)



trainer.fit(model=mydrn, train_dataloaders=train_loader)



# train = drop_nans((t_train_rf, t_train_rf_target)) # error: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match # ausserdem: wieso nur fuer train drop_nans?




Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-package

Epoch 0:   0%|          | 0/49859 [00:00<?, ?it/s] torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 1/49859 [00:00<8:11:31,  1.69it/s, v_num=18]torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 2/49859 [00:00<4:09:30,  3.33it/s, v_num=18]torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 3/49859 [00:00<2:48:48,  4.92it/s, v_num=18]torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 4/49859 [00:00<2:08:28,  6.47it/s, v_num=18]torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 5/49859 [00:00<1:44:20,  7.96it/s, v_num=18]torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|   

  return F.mse_loss(input, target, reduction=self.reduction)


torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 8/49859 [00:00<1:08:01, 12.21it/s, v_num=18]torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 9/49859 [00:00<1:01:12, 13.57it/s, v_num=18]torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 10/49859 [00:00<55:46, 14.90it/s, v_num=18] torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 11/49859 [00:00<51:18, 16.19it/s, v_num=18]torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 12/49859 [00:00<47:34, 17.46it/s, v_num=18]torch.Size([8, 65])
After embedding: torch.Size([8, 84])
torch.Size([8, 64])
torch.Size([8, 64])
Epoch 0:   0%|          | 13/49859 [00:00<44:28, 18.68it/s, v_num=18]t

In [37]:
# with given models - funktioniert fuer summary statistics, aber nicht ohne? => woran liegt das?
DIRECTORY = os.getcwd()
JSONPATH = os.path.join(DIRECTORY, "trained_models/drn_24h/params.json")
SAVEPATH = os.path.join(DIRECTORY, "trained_models/drn_24h/models")

with open(JSONPATH, "r") as f:
    print(f"[INFO] Loading {JSONPATH}")
    args_dict = json.load(f)

with wandb.init(
    project="multigraph",
    # id=f"training_run_drn_{args_dict['leadtime']}_{args.id}",
    id = f"training_run_{args_dict['leadtime']}",
    config=args_dict,
    tags=["final_training"],
):
    config=wandb.config
    dataframes = load_dataframes(mode="train", leadtime=config.leadtime)
    dataframes = summary_statistics(dataframes)
    dataframes.pop("stations")

    # print(list(dataframes.values()))
    for df in dataframes.values():
        print(type(df))

    for X, y in dataframes.values():
        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)

    train, valid_test = normalize_features(
        training_data=dataframes["train"], valid_test_data=[dataframes["test_rf"], dataframes["test_f"]]
    )

    print(f"dataframes['train']: {dataframes['train']}")
    print(f"train: {train}")

    train = drop_nans(train)

    y_scaler = StandardScaler(with_std=False)
    y_scaler = y_scaler.fit(train[1][["t2m"]])

    train_dataset = TensorDataset(
        torch.Tensor(train[0].to_numpy()), torch.Tensor(y_scaler.transform(train[1][["t2m"]]))
    )

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)

    embed_dim = 20 # why 20? => embed stations - instead of station_id - map into a latent vector space
    in_channels = train[0].shape[1] + embed_dim - 1

    drn = DRN(
        in_channels=in_channels,
        hidden_channels=config.hidden_channels,
        embedding_dim=embed_dim,
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config.lr),
    )
    wandb_logger = WandbLogger(project="multigraph")
    checkpoint_callback = ModelCheckpoint(
        # dirpath=SAVEPATH, filename=f"run_{args.id}", monitor="train_loss", mode="min", save_top_k=1
        dirpath=SAVEPATH, filename=f"run_24h", monitor="train_loss", mode="min", save_top_k=1
    )
    trainer = L.Trainer(
        max_epochs=config.max_epochs,
        log_every_n_steps=1,
        accelerator="gpu",
        enable_progress_bar=True,
        enable_model_summary=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )
    trainer.fit(model=drn, train_dataloaders=train_loader)


[INFO] Loading /home/ltchen/gnnpp/trained_models/drn_24h/params.json


[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for test_rf
[INFO] Calculating summary statistics for test_f
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
[INFO] Normalizing features...


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: C

dataframes['train']: (        station_id  model_orography  station_altitude  station_latitude  \
0                0        -0.738289         -0.764101          1.382904   
1                1        -0.737002         -0.778600          1.016052   
2                2        -0.731851         -0.733171          1.571141   
3                3        -0.728793         -0.765712          1.661951   
4                4        -0.724769         -0.761846          0.884948   
...            ...              ...               ...               ...   
420651         117         0.914135          0.298485         -2.911765   
420652         118         1.443053          0.598123         -1.881973   
420653         119         2.338639          0.646451         -2.021799   
420654         120         4.799571          3.994016         -2.028314   
420655         121         5.913903          4.345205         -2.201382   

        station_longitude  cape_mean  cape_std   sd_mean    sd_std  stl1_mean

`Trainer.fit` stopped: `max_epochs=26` reached.


Epoch 25: 100%|██████████| 98/98 [00:05<00:00, 17.92it/s, v_num=_24h, train_loss_step=0.580, train_loss_epoch=0.597]


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▇▇▇▇▇▇▇████
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▄▄▄▃▃▃▂▃▃▂▃▂▂▂▂▃▂▂▂▂▂▂▃▂▂▂▂▂▁▁▂▁▂▂▁▂▁▂
trainer/global_step,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██

0,1
epoch,25.0
train_loss_epoch,0.59687
train_loss_step,0.58027
trainer/global_step,2547.0


In [None]:
# nn with mse loss without lightning
class MyNN(nn.Module):
    def __init__(self, hidden_size, embedding_dim, optimizer_class, optimizer_params):
        super().__init__()
        self.embedding = EmbedStations(num_stations_max=122, embedding_dim=embedding_dim)

        self.linear_t2m = torch.nn.Linear(in_features=in_feat, out_features=hidden_size)
        self.relu = torch.nn.ReLU()
        self.linear = torch.nn.Linear(in_features=hidden_size, out_features=1)  # output t2m value

        self.loss = torch.nn.MSELoss()
        self.optimizer_class = optimizer_class
        self.optimizer_params = optimizer_params

# build a deterministic nn: 1 hidden layer + loss = mse, with lightning
# build a deterministic nn: 1 hidden layer + loss = mse without lightning
# add wandb
# try out different kinds of hidden layers => how do I log them differently? - learn how to use wandb (Thurs)
# build a drn (mu and sigma): more hidden layers + loss = crps, with lightning
# try out different kinds of hidden layer architectures
# build drn