# Run DRN

In [1]:
from typing import Any

from pytorch_lightning.utilities.types import STEP_OUTPUT
# load data first
%cd /home/ltchen/gnnpp
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import argparse
import json
import pytorch_lightning as L
import torch
import wandb

from models.drn import DRN
from models.model_utils import EmbedStations
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
#from sklearn.preprocessing import StandardScaler
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch import nn

from utils.data import load_dataframes, summary_statistics
from utils.drn_utils import *
from models.loss import NormalCRPS

/home/ltchen/gnnpp


### Load dataframes for train, valid, test

In [2]:
dataframes = load_dataframes(mode="train", leadtime="24h") # train mode => for training nn? Wie wird das im Paper beschrieben?
dataframes = summary_statistics(dataframes) # wie sehen die daten von summary statistics aus? => wenn das nur die Daten von einer Station sind, dann über Zeitpunkte
dataframes.pop("stations") # .pop("stations") => entfernt den df mit stations, wofuer brauche ich die dann überhaupt? Grafik?

for X, y in dataframes.values(): # wofuer?
    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)

train, valid_test = normalize_features(
    training_data=dataframes["train"], valid_test_data=[dataframes["test_rf"], dataframes["test_f"]]
)

train = drop_nans(train)
(test_rf, test_f) = valid_test
test_rf = drop_nans(test_rf)
test_f = drop_nans(test_f)

DIRECTORY = os.getcwd()
SAVEPATH = os.path.join(DIRECTORY, "explored_models/drn_24h/models")
#print(train[1].isna().sum()) #drop_nans does not work without summary_statistics

[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for test_rf
[INFO] Calculating summary statistics for test_f
[INFO] Normalizing features...


In [3]:
# one station only
# train
for i in train:
    print(i.shape)
    print(i.columns)

# valid_test
for (i, j) in valid_test:
    print(i.shape)
    print(i.columns)

(398866, 65)
Index(['station_id', 'model_orography', 'station_altitude', 'station_latitude',
       'station_longitude', 'cape_mean', 'cape_std', 'sd_mean', 'sd_std',
       'stl1_mean', 'stl1_std', 'swvl1_mean', 'swvl1_std', 't2m_mean',
       't2m_std', 'tcc_mean', 'tcc_std', 'tcw_mean', 'tcw_std', 'tcwv_mean',
       'tcwv_std', 'u10_mean', 'u10_std', 'u100_mean', 'u100_std', 'v10_mean',
       'v10_std', 'v100_mean', 'v100_std', 'vis_mean', 'vis_std', 'cp6_mean',
       'cp6_std', 'mn2t6_mean', 'mn2t6_std', 'mx2t6_mean', 'mx2t6_std',
       'p10fg6_mean', 'p10fg6_std', 'slhf6_mean', 'slhf6_std', 'sshf6_mean',
       'sshf6_std', 'ssr6_mean', 'ssr6_std', 'ssrd6_mean', 'ssrd6_std',
       'str6_mean', 'str6_std', 'strd6_mean', 'strd6_std', 'tp6_mean',
       'tp6_std', 'z_mean', 'z_std', 'q_mean', 'q_std', 'u_mean', 'u_std',
       'v_mean', 'v_std', 't_mean', 't_std', 'cos_doy', 'sin_doy'],
      dtype='object')
(398866, 3)
Index(['time', 'station_id', 't2m'], dtype='object')
(89304

## One Station

In [4]:
# train
one_station_X = train[0][train[0]["station_id"]==1]
one_station_y = train[1][train[1]["station_id"]==1]

one_station_X = one_station_X.drop("station_id", axis=1)
one_station_y = one_station_y.drop("station_id", axis=1)

print(one_station_X)
print(one_station_X.shape)
print(one_station_y.shape)

# test_rf
s1_test_rf_X = test_rf[0][test_rf[0]["station_id"]==1]
s1_test_rf_y = test_rf[1][test_rf[1]["station_id"]==1]

s1_test_rf_X = s1_test_rf_X.drop("station_id", axis=1)
s1_test_rf_y = s1_test_rf_y.drop("station_id", axis=1)

# test_f
s1_test_f_X = test_f[0][test_f[0]["station_id"]==1]
s1_test_f_y = test_f[1][test_f[1]["station_id"]==1]

s1_test_f_X = s1_test_f_X.drop("station_id", axis=1)
s1_test_f_y = s1_test_f_y.drop("station_id", axis=1)

        model_orography  station_altitude  station_latitude  \
1             -0.737002           -0.7786          1.016052   
123           -0.737002           -0.7786          1.016052   
245           -0.737002           -0.7786          1.016052   
367           -0.737002           -0.7786          1.016052   
489           -0.737002           -0.7786          1.016052   
...                 ...               ...               ...   
420047        -0.737002           -0.7786          1.016052   
420169        -0.737002           -0.7786          1.016052   
420291        -0.737002           -0.7786          1.016052   
420413        -0.737002           -0.7786          1.016052   
420535        -0.737002           -0.7786          1.016052   

        station_longitude  cape_mean  cape_std   sd_mean    sd_std  stl1_mean  \
1                -0.89124  -0.164696 -0.233030 -0.138945 -0.130389  -0.465046   
123              -0.89124  -0.127184 -0.138486 -0.138945 -0.130389  -0.443237   


### One Station MSE and CRPS NNs
Station (station_id=1) with one hidden layer and loss functions MSE or CRPS

In [5]:
class MSEStationNN(L.LightningModule):
    def __init__(self, in_feat, hidden_size, optimizer_class, optimizer_params):
        super(MSEStationNN, self).__init__()
        self.linear = torch.nn.Linear(in_features=in_feat, out_features=hidden_size)
        self.relu = torch.nn.ReLU()
        self.linear_t2m = torch.nn.Linear(in_features=hidden_size, out_features=1)

        self.loss = torch.nn.MSELoss()
        self.optimizer_class = optimizer_class
        self.optimizer_params = optimizer_params

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear(x)
        x = self.relu(x)
        x = self.linear_t2m(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y.flatten())
        self.log("train_loss", loss.item(), on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return self.optimizer_class(self.parameters(), **self.optimizer_params)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y.flatten())
        return loss

    def test_step(self, batch, batch_idx, dataloader_idx): # unterschied zwischen predict und test_step?
        x, y = batch # wieso hat test_step auch y? => um score zu berechnen
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y.flatten())
        return loss


In [43]:
class CRPSStationNN(L.LightningModule):
    def __init__(self, in_feat, hidden_size, optimizer_class, optimizer_params):
        super(CRPSStationNN, self).__init__()
        self.linear = torch.nn.Linear(in_features=in_feat, out_features=hidden_size)
        self.relu = torch.nn.ReLU()
        #self.linear_t2m = torch.nn.Linear(in_features=hidden_size, out_features=2) => wieso nicht direkt 2 outputs?
        self.softplus = torch.nn.Softplus()
        self.last_linear_mu = nn.Linear(in_features=hidden_size, out_features=1)
        self.last_linear_sigma = nn.Linear(in_features=hidden_size, out_features=1)

        self.loss_fn = NormalCRPS()
        self.optimizer_class = optimizer_class
        self.optimizer_params = optimizer_params

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.linear(x)
        x = self.relu(x)
        mu = self.last_linear_mu(x)
        sigma = self.softplus(self.last_linear_sigma(x))
        res = torch.cat([mu, sigma], dim=1)
        return res

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn.crps(mu_sigma=y_hat, y=y.flatten())
        self.log("train_loss", loss.item(), on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return self.optimizer_class(self.parameters(), **self.optimizer_params)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn.crps(mu_sigma=y_hat, y=y.flatten())
        return loss

    def test_step(self, batch, batch_idx, dataloader_idx=0):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn.crps(mu_sigma=y_hat, y=y.flatten())
        print(f'test_loss: {loss}')
        return {'loss': loss}

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        x, _ = batch
        y_hat = self.forward(x)
        return y_hat


### Train One Station NN (MSE or CRPS)

In [44]:
with wandb.init(
    project="exploration",
    id = f"training_run_24h_crps",
    tags=["exploration"],
):

    y_scaler = StandardScaler(with_std=False) # wieso scalen wir überhaupt? => robuster?
    y_scaler = y_scaler.fit(one_station_y[["t2m"]])

    batch_size = 512
    hidden_size=128
    lr=0.0002
    max_epochs=31
    in_feat = one_station_X.shape[1]

    one_station_train_ds = TensorDataset(torch.Tensor(one_station_X.to_numpy()), torch.Tensor(y_scaler.transform(one_station_y[["t2m"]])))
    one_station_loader = DataLoader(one_station_train_ds, batch_size=batch_size, shuffle=True)

    s1_test_rf_ds = TensorDataset(torch.Tensor(s1_test_rf_X.to_numpy()), torch.Tensor(y_scaler.transform(s1_test_rf_y[["t2m"]])))
    s1_test_rf_loader = DataLoader(s1_test_rf_ds, batch_size=batch_size, shuffle=False)

    one_station_nn = CRPSStationNN(
        in_feat=in_feat,
        hidden_size=hidden_size,
        optimizer_class=AdamW,
        optimizer_params={"lr": lr}
    )

    wandb_logger = WandbLogger(project="one_station_crps")

    os_checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=f"run_24h", monitor="train_loss", mode="min", save_top_k=1
    )

    one_station_trainer = L.Trainer(
        max_epochs=max_epochs,
        log_every_n_steps=1,
        accelerator="gpu",
        enable_model_summary=True,
        logger=wandb_logger,
        callbacks=os_checkpoint_callback,
    )

    value = one_station_trainer.fit(model=one_station_nn, train_dataloaders=one_station_loader, val_dataloaders=s1_test_rf_loader)

    final_loss = one_station_trainer.logged_metrics["train_loss_step"]
    print("Final MSE Loss:", final_loss)

# wo finde ich den tatsaechlichen wert? => bei test, jetzt wird nur das Modell trainiert

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ltchen/gnnpp/explored_models/drn_24h/models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name              | Type       | Params | Mode 
-------

                                                                            

/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 7/7 [00:00<00:00, 82.25it/s, v_num=crps, train_loss_step=4.250]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/2 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s][A
Validation DataLoader 0:  50%|█████     | 1/2 [00:00<00:00, 95.71it/s][A
Validation DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 95.56it/s][A
Epoch 1: 100%|██████████| 7/7 [00:00<00:00, 65.82it/s, v_num=crps, train_loss_step=4.090, train_loss_epoch=4.210]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/2 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s][A
Validation DataLoader 0:  50%|█████     | 1/2 [00:00<00:00, 112.24it/s][A
Validation DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 103.46it/s][A
Epoch 2: 100%|██████████| 7/7 [00:00<00:00, 74.35it/s, v_num=crps, train_loss_step=3.830, train_loss_epoch=4.120]
Validation: |          | 0/? [00:00<?, ?it/s][A

`Trainer.fit` stopped: `max_epochs=31` reached.


Epoch 30: 100%|██████████| 7/7 [00:00<00:00, 41.98it/s, v_num=crps, train_loss_step=1.210, train_loss_epoch=1.250]
Final MSE Loss: tensor(1.2118)


0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇██
train_loss_epoch,███▇▇▇▇▆▆▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁
train_loss_step,█▇▇▇▇▇▇▆▇▆▇▆▆▇▆▅▅▅▅▆▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁
trainer/global_step,▁▁▁▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇███

0,1
epoch,30.0
train_loss_epoch,1.25029
train_loss_step,1.21183
trainer/global_step,216.0


### Validate and test one station NN (MSE and CRPS)

In [46]:
# validation and test for both mse and crps
s1_test_f_ds = TensorDataset(torch.Tensor(s1_test_f_X.to_numpy()), torch.Tensor(y_scaler.transform(s1_test_f_y[["t2m"]])))
s1_test_f_loader = DataLoader(s1_test_f_ds, batch_size=batch_size, shuffle=False)

loss = one_station_trainer.test(model=one_station_nn, dataloaders=s1_test_f_loader)
print(loss)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Testing DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]test_loss: 1.2276196479797363
Testing DataLoader 0:  50%|█████     | 1/2 [00:00<00:00, 183.27it/s]test_loss: 1.2759742736816406
Testing DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 131.92it/s]
[{}]


In [54]:
preds_list = []
preds = one_station_trainer.predict(model=one_station_nn, dataloaders=s1_test_f_loader)
preds = torch.cat(preds, dim=0)
# Reverse transform of the y_scaler (only on the mean)
preds[:, 0] = torch.Tensor(y_scaler.inverse_transform(preds[:, 0].view(-1, 1))).flatten()

preds_list.append(preds)
targets = s1_test_f_y
targets = torch.Tensor(targets.t2m.values)

stacked = torch.stack(preds_list)
final_preds = torch.mean(stacked, dim=0)

res = one_station_nn.loss_fn.crps(final_preds, targets)
print(f"final pred: {final_preds[0]}, targets: {targets[0]}")
print(res)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 188.47it/s]
final pred: tensor([278.6718,   2.0566]), targets: 275.25
tensor(1.2421)


## All stations
Deterministic NN with one hidden layer using MSE as loss and embeddings

In [3]:
# nn mse loss with lightning
class MyDRN(L.LightningModule):
    def __init__(self, hidden_size, embedding_dim, in_feat, optimizer_class, optimizer_params):
        super(MyDRN, self).__init__()
        self.embedding = EmbedStations(num_stations_max=122, embedding_dim=embedding_dim)
        self.linear = torch.nn.Linear(in_features=in_feat, out_features=hidden_size)
        self.relu = torch.nn.ReLU()
        self.linear_t2m = torch.nn.Linear(in_features=hidden_size, out_features=1) # output t2m value

        self.loss = torch.nn.MSELoss()
        self.optimizer_class = optimizer_class
        self.optimizer_params = optimizer_params

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        #print(x.shape) # (8, 65)
        x = self.embedding(x)
        #print(f"After embedding: {x.shape}") # (8, 84)
        x = self.linear(x)
        #print(x.shape) # (8, 64)
        x = self.relu(x)
        #print(x.shape)
        x = self.linear_t2m(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y.flatten()) # why y.flatten()?
        self.log("train_loss", loss.item(), on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return self.optimizer_class(self.parameters(), **self.optimizer_params)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y.flatten())
        self.log("validation_loss", loss.item(), on_epoch=True, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx, dataloader_idx=0): # unterschied zwischen predict und test_step?
        x, y = batch # wieso hat test_step auch y?
        y_hat = self.forward(x)
        loss = self.loss(y_hat, y.flatten())
        self.log("test_loss", loss.item(), on_epoch=True, prog_bar=True)
        return loss

In [4]:
# nn crps loss with lightning
class CRPSDRN(L.LightningModule):
    def __init__(self, hidden_size, embedding_dim, in_feat, optimizer_class, optimizer_params):
        super(CRPSDRN, self).__init__()
        self.embedding = EmbedStations(num_stations_max=122, embedding_dim=embedding_dim)
        self.linear = torch.nn.Linear(in_features=in_feat, out_features=hidden_size)
        self.relu = torch.nn.ReLU()
        self.softplus = torch.nn.Softplus()
        self.last_linear_mu = nn.Linear(in_features=hidden_size, out_features=1)
        self.last_linear_sigma = nn.Linear(in_features=hidden_size, out_features=1)

        self.loss_fn = NormalCRPS()
        self.optimizer_class = optimizer_class
        self.optimizer_params = optimizer_params

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x) #(8, 65)
        x = self.linear(x) #(8, 84)
        x = self.relu(x) #(8, 64)
        mu = self.last_linear_mu(x)
        sigma = self.softplus(self.last_linear_sigma(x))
        res = torch.cat([mu, sigma], dim=1)
        return res

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn.crps(mu_sigma=y_hat, y=y.flatten())
        self.log("train_loss", loss.item(), on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return self.optimizer_class(self.parameters(), **self.optimizer_params)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn.crps(mu_sigma=y_hat, y=y.flatten())
        self.log("validation_loss", loss.item(), on_epoch=True, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx, dataloader_idx=0): # unterschied zwischen predict und test_step?
        x, y = batch # wieso hat test_step auch y?
        y_hat = self.forward(x)
        loss = self.loss_fn.crps(mu_sigma=y_hat, y=y.flatten())
        self.log("test_loss", loss.item(), on_epoch=True, prog_bar=True)
        return loss

### Train All_station NN

In [5]:
with wandb.init(
    project="exploration",
    id = f"training_run_24h_crps",
    tags=["exploration"],
):

    y_scaler = StandardScaler(with_std=False) # wieso scalen wir überhaupt? => robuster?
    y_scaler = y_scaler.fit(train[1][["t2m"]])

    batch_size = 2048
    hidden_size=128
    lr=0.0002
    max_epochs=31
    embed_dim = 20
    in_feat = train[0].shape[1] + embed_dim - 1


    train_dataset = TensorDataset(torch.Tensor(train[0].to_numpy()), torch.Tensor(y_scaler.transform(train[1][["t2m"]])))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_rf_dataset = TensorDataset(torch.Tensor(test_rf[0].to_numpy()), torch.Tensor(y_scaler.transform(test_rf[1][["t2m"]])))
    test_rf_loader = DataLoader(test_rf_dataset, batch_size=batch_size, shuffle=False)

    test_f_dataset = TensorDataset(torch.Tensor(test_f[0].to_numpy()), torch.Tensor(y_scaler.transform(test_f[1][["t2m"]])))
    test_f_loader = DataLoader(test_f_dataset, batch_size=batch_size, shuffle=False)

    mydrn = CRPSDRN(
        hidden_size=hidden_size,
        embedding_dim=embed_dim,
        in_feat=in_feat,
        optimizer_class=AdamW,
        optimizer_params=dict(lr=lr),
)

    wandb_logger = WandbLogger(project="all_station_crps")

    checkpoint_callback = ModelCheckpoint(
        dirpath=SAVEPATH, filename=f"run_24h", monitor="train_loss", mode="min", save_top_k=1
    )

    trainer = L.Trainer(
        max_epochs=max_epochs,
        log_every_n_steps=10,
        accelerator="gpu",
        enable_progress_bar=True,
        enable_model_summary=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
)

    trainer.fit(model=mydrn, train_dataloaders=train_loader, val_dataloaders=test_rf_loader)

    final_loss = trainer.logged_metrics["train_loss_step"]
    print("Final MSE Loss:", final_loss)

[34m[1mwandb[0m: Currently logged in as: [33mleachen[0m ([33mleachen_thesis[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A5000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.fin

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


                                                                           

/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 195/195 [00:05<00:00, 32.54it/s, v_num=crps, train_loss_step=1.730]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/38 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/38 [00:00<?, ?it/s][A
Validation DataLoader 0:   3%|▎         | 1/38 [00:00<00:00, 109.05it/s][A
Validation DataLoader 0:   5%|▌         | 2/38 [00:00<00:00, 76.51it/s] [A
Validation DataLoader 0:   8%|▊         | 3/38 [00:00<00:00, 74.52it/s][A
Validation DataLoader 0:  11%|█         | 4/38 [00:00<00:00, 73.26it/s][A
Validation DataLoader 0:  13%|█▎        | 5/38 [00:00<00:00, 72.78it/s][A
Validation DataLoader 0:  16%|█▌        | 6/38 [00:00<00:00, 72.62it/s][A
Validation DataLoader 0:  18%|█▊        | 7/38 [00:00<00:00, 72.65it/s][A
Validation DataLoader 0:  21%|██        | 8/38 [00:00<00:00, 72.61it/s][A
Validation DataLoader 0:  24%|██▎       | 9/38 [00:00<00:00, 72.66it/s][A
Validation DataLoader 0:  26%|██▋       | 10/38 [00:00<00:0

`Trainer.fit` stopped: `max_epochs=31` reached.


Epoch 30: 100%|██████████| 195/195 [00:07<00:00, 25.94it/s, v_num=crps, train_loss_step=0.625, validation_loss=0.652, train_loss_epoch=0.658]
Final MSE Loss: tensor(0.6251)


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
train_loss_epoch,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▂▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▆▆▆▆▆▇▇▇██
validation_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,30.0
train_loss_epoch,0.6578
train_loss_step,0.66634
trainer/global_step,6044.0
validation_loss,0.65187


In [19]:
# MyDRN train without wandb without saving

y_scaler = StandardScaler(with_std=False)
y_scaler = y_scaler.fit(train[1][["t2m"]])


train_dataset = TensorDataset(
    torch.Tensor(train[0].to_numpy()), torch.Tensor(y_scaler.transform(train[1][["t2m"]]))
)

#from params.json best_24h
batch_size = 2048
hidden_size=128
lr=0.0002
max_epochs=31

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

embed_dim = 20
in_feat = train[0].shape[1] + embed_dim - 1

mydrn = MyDRN(
    hidden_size=hidden_size,
    embedding_dim=embed_dim,
    in_feat=in_feat,
    optimizer_class=AdamW,
    optimizer_params=dict(lr=lr),
)

checkpoint_callback = ModelCheckpoint(
    # dirpath=SAVEPATH, filename=f"run_{args.id}", monitor="train_loss", mode="min", save_top_k=1
    dirpath=SAVEPATH, filename=f"run_24h", monitor="train_loss", mode="min", save_top_k=1
)

trainer = L.Trainer(
    max_epochs=max_epochs,
    log_every_n_steps=50,
    accelerator="gpu",
    enable_progress_bar=True,
    enable_model_summary=True,
    callbacks=checkpoint_callback,
)

trainer.fit(model=mydrn, train_dataloaders=train_loader)


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-package

Epoch 0:   3%|▎         | 5/195 [00:00<00:05, 32.69it/s, v_num=28, train_loss_step=42.50]

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1:   3%|▎         | 5/195 [00:00<00:05, 34.61it/s, v_num=28, train_loss_step=41.90, train_loss_epoch=41.20]  

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 30: 100%|██████████| 195/195 [00:05<00:00, 37.23it/s, v_num=28, train_loss_step=39.70, train_loss_epoch=41.20]

`Trainer.fit` stopped: `max_epochs=31` reached.


Epoch 30: 100%|██████████| 195/195 [00:05<00:00, 37.21it/s, v_num=28, train_loss_step=39.70, train_loss_epoch=41.20]


In [37]:
# with given models - funktioniert fuer summary statistics, aber nicht ohne? => woran liegt das?
DIRECTORY = os.getcwd()
JSONPATH = os.path.join(DIRECTORY, "trained_models/drn_24h/params.json")
SAVEPATH = os.path.join(DIRECTORY, "trained_models/drn_24h/models")

with open(JSONPATH, "r") as f:
    print(f"[INFO] Loading {JSONPATH}")
    args_dict = json.load(f)

with wandb.init(
    project="multigraph",
    # id=f"training_run_drn_{args_dict['leadtime']}_{args.id}",
    id = f"training_run_{args_dict['leadtime']}",
    config=args_dict,
    tags=["final_training"],
):
    config=wandb.config
    dataframes = load_dataframes(mode="train", leadtime=config.leadtime)
    dataframes = summary_statistics(dataframes)
    dataframes.pop("stations")

    # print(list(dataframes.values()))
    for df in dataframes.values():
        print(type(df))

    for X, y in dataframes.values():
        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)

    train, valid_test = normalize_features(
        training_data=dataframes["train"], valid_test_data=[dataframes["test_rf"], dataframes["test_f"]]
    )

    print(f"dataframes['train']: {dataframes['train']}")
    print(f"train: {train}")

    train = drop_nans(train)

    y_scaler = StandardScaler(with_std=False)
    y_scaler = y_scaler.fit(train[1][["t2m"]])

    train_dataset = TensorDataset(
        torch.Tensor(train[0].to_numpy()), torch.Tensor(y_scaler.transform(train[1][["t2m"]]))
    )

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)

    embed_dim = 20 # why 20? => embed stations - instead of station_id - map into a latent vector space
    in_channels = train[0].shape[1] + embed_dim - 1

    drn = DRN(
        in_channels=in_channels,
        hidden_channels=config.hidden_channels,
        embedding_dim=embed_dim,
        optimizer_class=AdamW,
        optimizer_params=dict(lr=config.lr),
    )
    wandb_logger = WandbLogger(project="multigraph")
    checkpoint_callback = ModelCheckpoint(
        # dirpath=SAVEPATH, filename=f"run_{args.id}", monitor="train_loss", mode="min", save_top_k=1
        dirpath=SAVEPATH, filename=f"run_24h", monitor="train_loss", mode="min", save_top_k=1
    )
    trainer = L.Trainer(
        max_epochs=config.max_epochs,
        log_every_n_steps=1,
        accelerator="gpu",
        enable_progress_bar=True,
        enable_model_summary=True,
        logger=wandb_logger,
        callbacks=checkpoint_callback,
    )
    trainer.fit(model=drn, train_dataloaders=train_loader)


[INFO] Loading /home/ltchen/gnnpp/trained_models/drn_24h/params.json


[INFO] Dataframes exist. Will load pandas dataframes.
[INFO] Calculating summary statistics for train
[INFO] Calculating summary statistics for test_rf
[INFO] Calculating summary statistics for test_f
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
[INFO] Normalizing features...


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/home/ltchen/.conda/envs/gnn_env/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: C

dataframes['train']: (        station_id  model_orography  station_altitude  station_latitude  \
0                0        -0.738289         -0.764101          1.382904   
1                1        -0.737002         -0.778600          1.016052   
2                2        -0.731851         -0.733171          1.571141   
3                3        -0.728793         -0.765712          1.661951   
4                4        -0.724769         -0.761846          0.884948   
...            ...              ...               ...               ...   
420651         117         0.914135          0.298485         -2.911765   
420652         118         1.443053          0.598123         -1.881973   
420653         119         2.338639          0.646451         -2.021799   
420654         120         4.799571          3.994016         -2.028314   
420655         121         5.913903          4.345205         -2.201382   

        station_longitude  cape_mean  cape_std   sd_mean    sd_std  stl1_mean

`Trainer.fit` stopped: `max_epochs=26` reached.


Epoch 25: 100%|██████████| 98/98 [00:05<00:00, 17.92it/s, v_num=_24h, train_loss_step=0.580, train_loss_epoch=0.597]


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▇▇▇▇▇▇▇████
train_loss_epoch,█▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_step,█▆▄▄▄▃▃▃▂▃▃▂▃▂▂▂▂▃▂▂▂▂▂▂▃▂▂▂▂▂▁▁▂▁▂▂▁▂▁▂
trainer/global_step,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██

0,1
epoch,25.0
train_loss_epoch,0.59687
train_loss_step,0.58027
trainer/global_step,2547.0


In [51]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()
