In [30]:
import pandas as pd
import xarray as xr
import numpy as np
import yaml

import torch
import torch.nn as nn
import torch.nn.functional as F

In [31]:
def load_global(config="global"):
    """Load ``configs/<config>.yaml`` and return its parsed YAML content.

    Args:
        config: Base name (without extension) of a file inside the ``configs/``
            directory, resolved relative to the current working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.

    Raises:
        FileNotFoundError: Propagated from ``open`` when the file does not exist.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform locale.
    with open(f"configs/{config}.yaml", "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
    
# Read both YAML configs in one pass: `cfg` from configs/global.yaml,
# `cfm` from configs/datapp_de.yaml.
cfg, cfm = (load_global(config=name) for name in ("global", "datapp_de"))

In [17]:
# Load the target dataset and keep only the rows whose "Date" string
# ends with the 12:00:00 timestamp.
target_ds = (
    pd.read_csv(cfm["target_data_raw"])
    .loc[lambda frame: frame["Date"].str.endswith("12:00:00")]
)

In [3]:
class CNNRegionPool(nn.Module):
    """CNN backbone followed by weighted pooling of grid cells into regions.

    The backbone maps ``[B, n_in_ch, H, W]`` to a feature map whose spatial size
    is halved once (rounded up) by the stride-2 convolution. The flattened map is
    pooled into one feature vector per region with the fixed matrix ``W_region``,
    and a small MLP shared by all regions turns each vector into a scalar.

    Args:
        n_in_ch: Number of input channels.
        n_regions: Number of regions; must equal ``W_region.shape[0]``.
        H, W: Input grid size. Stored on the module; ``forward`` does not read them.
        hidden: Channel count of the backbone output.
        head_hidden: Width of the hidden layer in the per-region head.
        W_region: Required pooling matrix of shape ``[n_regions, L']`` (dense or
            sparse COO), where ``L'`` is the number of cells in the backbone
            output grid (``16 * 20`` for a ``32 x 40`` input).

    Raises:
        AssertionError: If ``W_region`` is None.
        ValueError: If ``W_region`` is not 2-D or its row count differs from
            ``n_regions``.
    """

    def __init__(self, n_in_ch=15, n_regions=38, H=32, W=40, hidden=64, head_hidden=64, W_region=None):
        super().__init__()
        self.H, self.W = H, W
        assert W_region is not None, "Provide region pooling matrix W_region [n_regions, H*W]"
        # einsum needs a dense operand, so densify sparse input once up front.
        if W_region.is_sparse:
            W_region = W_region.to_dense()
        if W_region.dim() != 2:
            raise ValueError(
                f"W_region must be 2-D [n_regions, L'], got shape {tuple(W_region.shape)}"
            )
        # A mismatch here would otherwise surface late (or broadcast silently) when
        # region_bias is added in forward().
        if W_region.size(0) != n_regions:
            raise ValueError(
                f"W_region has {W_region.size(0)} rows but n_regions={n_regions}"
            )
        self.register_buffer("W_region", W_region)  # [R, L'], not a parameter

        # Lightweight CNN backbone
        self.backbone = nn.Sequential(
            nn.Conv2d(n_in_ch, 32, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(32), nn.ReLU(inplace=True),

            nn.Conv2d(32, 64, kernel_size=3, padding=1, stride=2, bias=False),  # -> [64, 16, 20]
            nn.BatchNorm2d(64), nn.ReLU(inplace=True),

            # A 3x3 kernel with dilation=2 spans 5 cells, so padding=2 is required to
            # keep the 16x20 grid; padding=1 would shrink it to 14x18.
            nn.Conv2d(64, hidden, kernel_size=3, padding=2, dilation=2, bias=False),
            nn.BatchNorm2d(hidden), nn.ReLU(inplace=True),
        )

        # Per-region head (shared weights across regions)
        self.head = nn.Sequential(
            nn.Linear(hidden, head_hidden),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(head_hidden, 1),
        )
        # Learned additive offset, one scalar per region.
        self.region_bias = nn.Parameter(torch.zeros(n_regions))

    def forward(self, x):
        """Predict one value per region.

        Args:
            x: Input tensor ``[B, n_in_ch, H, W]``.

        Returns:
            Tensor ``[B, n_regions]``.

        Raises:
            ValueError: If the backbone output grid has a different number of
                cells than ``W_region`` has columns.
        """
        feat = self.backbone(x)              # [B, hidden, H', W']
        H2, W2 = feat.shape[-2:]
        feat = feat.flatten(2)               # [B, hidden, L'] with L' = H' * W'
        if self.W_region.size(1) != feat.size(-1):
            raise ValueError(
                f"W_region has {self.W_region.size(1)} columns but the backbone output "
                f"grid is {H2}x{W2} = {feat.size(-1)} cells; build W_region for that grid"
            )
        # Weighted sum over grid cells for every region.
        pooled = torch.einsum("rl,bcl->brc", self.W_region, feat)  # [B, R, hidden]

        # Apply shared MLP to each region vector
        out = self.head(pooled)              # [B, R, 1]
        out = out.squeeze(-1) + self.region_bias  # [B, R]
        return out


In [None]:
# Example placeholder (uniform weights over a toy mask) so the model can be built
# end to end. Replace `indices` / `values` with the real (region, cell) overlaps
# and their area fractions.
n_regions, n_cells = 39, 16 * 20  # n_cells must match the backbone output grid

# Toy mask: deal the flattened grid cells out to the regions round-robin.
cell_ids = torch.arange(n_cells)
region_ids = cell_ids % n_regions

# torch.sparse_coo_tensor expects indices shaped [2, nnz]:
# row 0 holds region_id, row 1 holds flat_cell_id.
indices = torch.stack([region_ids, cell_ids])
# Uniform weight inside each region, so every row of W sums to 1.
cells_per_region = torch.bincount(region_ids, minlength=n_regions)
values = 1.0 / cells_per_region[region_ids].float()

W = torch.sparse_coo_tensor(indices, values, size=(n_regions, n_cells))
model = CNNRegionPool(n_in_ch=15, n_regions=n_regions, H=32, W=40, hidden=64, head_hidden=64, W_region=W)


In [6]:
# Read the raw target table from the path stored under "target_data_raw" in `cfm`.
# Unlike `target_ds`, no row filter is applied here.
data_file = cfm["target_data_raw"]
df = pd.read_csv(data_file)

In [12]:
# Every column name except the first one.
# Presumably the first column is the date and the rest are regions - TODO confirm.
regions = df.columns[1:].tolist()

In [21]:
# Display how many entries `regions` holds.
len(regions)

NameError: name 'regions' is not defined

In [33]:
# Open the processed ERA5 u-component-of-wind NetCDF file as an xarray Dataset.
ds = xr.open_dataset("data/processed/era5_de_u_component_of_wind.nc")

# Convert the Dataset to a pandas DataFrame.
# NOTE(review): this rebinds `df`, which an earlier cell used for the target CSV.
df = ds.to_dataframe()

In [11]:
import pandas as pd
import xarray as xr
import numpy as np

In [16]:
# Load the feature table (df_X) and the label table (df_y) from data/processed.
df_X = pd.read_parquet("data/processed/de_uvtzq_scf_NUTS2_features.parquet")
df_y = pd.read_parquet("data/processed/de_uvtzq_scf_NUTS2_labels.parquet")

In [18]:
# Count non-finite entries. Output order: inf in df_X, inf in df_y,
# then NaN in df_X, NaN in df_y.
for check in (np.isinf, np.isnan):
    for frame in (df_X, df_y):
        print(check(frame.values).sum())

0
0
0
15341


In [25]:
# Write df_y back to disk.
# NOTE(review): this is the same path df_y was read from, so the labels file
# on disk is replaced by this call.
df_y.to_parquet("data/processed/de_uvtzq_scf_NUTS2_labels.parquet")