In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell executes (handy while editing local packages).
%load_ext autoreload
%autoreload 2

In [None]:
import torch
from bsd_dataset import get_dataset, regions, DatasetRequest

In [3]:
# Report how many CUDA devices PyTorch can see; stays silent when CUDA is unavailable.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    visible_gpus = torch.cuda.device_count()
    print(visible_gpus)

8


This is the simplest experiment setup.
- Training period: 1981-2010.
- Validation period: 2011-2012.
- Testing period: 2013-2014.
- Train/validation/testing region: South America.
- Input is low-resolution precipitation (with its latitude and longitude grids) from the CMIP6 GFDL-ESM4 model.
- Target is high-resolution precipitation (with its latitude and longitude grids) from CHIRPS at 0.25° resolution.

In [14]:
# Predictor request: precipitation from the CMIP6 projections, GFDL-ESM4 model.
cmip6_precipitation = DatasetRequest(
    dataset='projections-cmip6',
    model='gfdl_esm4',
    variable='precipitation',
)
# get_dataset is given a list of input requests; only one is used here.
input_datasets = [cmip6_precipitation]

# Target request: CHIRPS at 0.25 resolution.
target_dataset = DatasetRequest(
    dataset='chirps',
    resolution=0.25,
)

In [24]:
# Build the dataset using South America for the train, validation, and test splits.
# Date ranges are given as (start, end) string pairs, one pair per split.
# NOTE(review): `root` is an absolute path on the authoring machine; adjust it
# (and the download/extract flags) for your environment.
dataset = get_dataset(
    input_datasets,
    target_dataset,
    train_region=regions.SouthAmerica,
    val_region=regions.SouthAmerica,
    test_region=regions.SouthAmerica,
    train_dates=('1981-01-01', '2010-12-31'),
    val_dates=('2011-01-01', '2012-12-31'),
    test_dates=('2013-01-01', '2014-12-31'),
    download=False,  # CHANGE ME (as needed)
    extract=True,   # CHANGE ME (as needed)
    root='/home/data/BSDD/data',
    # Use the first GPU when one is visible; otherwise fall back to the CPU
    # rather than failing on a hard-coded CUDA device.
    # NOTE(review): assumes get_dataset accepts 'cpu' as a device — confirm.
    device='cuda:0' if torch.cuda.is_available() else 'cpu',
)

In [25]:
# Retrieve the three splits in a fixed order: train, validation, test.
train_dataset, val_dataset, test_dataset = (
    dataset.get_split(split_name) for split_name in ('train', 'val', 'test')
)

In [34]:
# Emit one line per split with its sample count.
print(
    f'Training samples: {len(train_dataset)}',
    f'Validation samples: {len(val_dataset)}',
    f'Testing samples: {len(test_dataset)}',
    sep='\n',
)

Training samples: 10950
Validation samples: 730
Testing samples: 730


In [35]:
print('SIZE ON DISK')
!ls -halt /home/data/BSDD/data | grep .npz

SIZE ON DISK
-rw-r--r-- 1 jason.jewik mintgrp 188M May 28 00:01 test_y.npz
-rw-r--r-- 1 jason.jewik mintgrp 188M May 28 00:01 val_y.npz
-rw-r--r-- 1 jason.jewik mintgrp 2.8G May 28 00:01 train_y.npz
-rw-r--r-- 1 jason.jewik mintgrp  11M May 28 00:01 test_x.npz
-rw-r--r-- 1 jason.jewik mintgrp  11M May 28 00:01 val_x.npz
-rw-r--r-- 1 jason.jewik mintgrp 151M May 28 00:01 train_x.npz


All tensors are laid out as latitude by longitude; the input tensor additionally has a leading dimension of size 1.

In [33]:
# Each sample unpacks into an input tensor, a target tensor, and an `info`
# mapping of auxiliary tensors.
x, y, info = train_dataset[0]

# Report each tensor's shape together with its own device.
print(f'Input shape: {x.shape} ({x.device})')
print(f'Target shape: {y.shape} ({y.device})')

Input shape: torch.Size([1, 75, 48]) (cuda:0)
Target shape: torch.Size([280, 240]) (cuda:0)


In [27]:
# Summarize every auxiliary tensor in `info`: its key, shape, and device.
print('INFO SUMMARY')
for k in info:
    v = info[k]
    print(f' - {k} shape: {v.shape} ({v.device})')

INFO SUMMARY
 - x_lat shape: torch.Size([75, 48]) (cuda:0)
 - x_lon shape: torch.Size([75, 48]) (cuda:0)
 - y_lat shape: torch.Size([75, 48]) (cuda:0)
 - y_lon shape: torch.Size([75, 48]) (cuda:0)
 - y_mask shape: torch.Size([280, 240]) (cuda:0)


Latitudes and longitudes are provided unnormalized: latitudes are in the range \[-90, 90\] and longitudes are in the range \[0, 360\]. The functions defined at the end of this notebook perform the normalization (I will eventually migrate this into the dataset itself).

In [28]:
# Display the unnormalized latitude grid that accompanies the input sample.
info['x_lat']

tensor([[-54.5000, -54.5000, -54.5000,  ..., -54.5000, -54.5000, -54.5000],
        [-53.5000, -53.5000, -53.5000,  ..., -53.5000, -53.5000, -53.5000],
        [-52.5000, -52.5000, -52.5000,  ..., -52.5000, -52.5000, -52.5000],
        ...,
        [ 17.5000,  17.5000,  17.5000,  ...,  17.5000,  17.5000,  17.5000],
        [ 18.5000,  18.5000,  18.5000,  ...,  18.5000,  18.5000,  18.5000],
        [ 19.5000,  19.5000,  19.5000,  ...,  19.5000,  19.5000,  19.5000]],
       device='cuda:0', dtype=torch.float64)

In [29]:
# Display the unnormalized longitude grid that accompanies the input sample.
info['x_lon']

tensor([[270.6250, 271.8750, 273.1250,  ..., 326.8750, 328.1250, 329.3750],
        [270.6250, 271.8750, 273.1250,  ..., 326.8750, 328.1250, 329.3750],
        [270.6250, 271.8750, 273.1250,  ..., 326.8750, 328.1250, 329.3750],
        ...,
        [270.6250, 271.8750, 273.1250,  ..., 326.8750, 328.1250, 329.3750],
        [270.6250, 271.8750, 273.1250,  ..., 326.8750, 328.1250, 329.3750],
        [270.6250, 271.8750, 273.1250,  ..., 326.8750, 328.1250, 329.3750]],
       device='cuda:0', dtype=torch.float64)

In [30]:
def normalize_latitudes(lats):
    """Rescale latitudes from the range [-90, 90] to [0, 1].

    Args:
        lats: A latitude value, or any object supporting arithmetic with
            numbers (the notebook passes a torch tensor).

    Returns:
        The input shifted so that -90 maps to 0, then divided by the
        180-wide span so that 90 maps to 1.
    """
    lowest_latitude, latitude_span = -90, 180
    offset_from_south = lats - lowest_latitude
    return offset_from_south / latitude_span

def normalize_longitudes(lons):
    """Rescale longitudes from the range [0, 360] to [0, 1].

    Args:
        lons: A longitude value, or any object supporting division by a
            number (the notebook passes a torch tensor).

    Returns:
        The input divided by 360, so 0 maps to 0 and 360 maps to 1.
    """
    full_circle = 360
    return lons / full_circle

In [31]:
# Display the input latitude grid after rescaling to [0, 1].
normalize_latitudes(info['x_lat'])

tensor([[0.1972, 0.1972, 0.1972,  ..., 0.1972, 0.1972, 0.1972],
        [0.2028, 0.2028, 0.2028,  ..., 0.2028, 0.2028, 0.2028],
        [0.2083, 0.2083, 0.2083,  ..., 0.2083, 0.2083, 0.2083],
        ...,
        [0.5972, 0.5972, 0.5972,  ..., 0.5972, 0.5972, 0.5972],
        [0.6028, 0.6028, 0.6028,  ..., 0.6028, 0.6028, 0.6028],
        [0.6083, 0.6083, 0.6083,  ..., 0.6083, 0.6083, 0.6083]],
       device='cuda:0', dtype=torch.float64)

In [32]:
# Display the input longitude grid after rescaling to [0, 1].
normalize_longitudes(info['x_lon'])

tensor([[0.7517, 0.7552, 0.7587,  ..., 0.9080, 0.9115, 0.9149],
        [0.7517, 0.7552, 0.7587,  ..., 0.9080, 0.9115, 0.9149],
        [0.7517, 0.7552, 0.7587,  ..., 0.9080, 0.9115, 0.9149],
        ...,
        [0.7517, 0.7552, 0.7587,  ..., 0.9080, 0.9115, 0.9149],
        [0.7517, 0.7552, 0.7587,  ..., 0.9080, 0.9115, 0.9149],
        [0.7517, 0.7552, 0.7587,  ..., 0.9080, 0.9115, 0.9149]],
       device='cuda:0', dtype=torch.float64)