In [2]:
# Convenience shim: if the kernel was started alongside this notebook, switch
# to the sibling notebook-pipeline directory (assumed to have been created as
# per 01.cli_demonstration.ipynb) before doing anything else.
import os

started_beside_notebook = os.path.exists("pytorch_example.ipynb")
if started_beside_notebook:
    os.chdir("../notebook-pipeline")

print("Running in {}".format(os.getcwd()))

%matplotlib inline

Running in /data/hpcdata/users/rychan/notebooks/notebook-pipeline


In [3]:
import numpy as np
import pandas as pd
import os
import random
import torch

# We also set the logging level so that we get some feedback from the API
import logging
logging.basicConfig(level=logging.INFO)

In [4]:
# Report the installed PyTorch version and whether CUDA / cuDNN are usable.
torch_checks = (
    ('A', torch.__version__),
    ('B', torch.cuda.is_available()),
    ('C', torch.backends.cudnn.enabled),
)
for tag, status in torch_checks:
    print(tag, status)

A 2.0.1+cu117
B True
C True


In [24]:
# Prefer the GPU, but fall back to the CPU so this cell still runs on a
# machine without CUDA instead of raising when the properties are queried.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print('D', torch.cuda.get_device_properties(device))
else:
    print('D', 'CUDA not available, using CPU')

D _CudaDeviceProperties(name='NVIDIA A2', major=8, minor=6, total_memory=14938MB, multi_processor_count=10)


In [None]:
!nvidia-smi

Thu Jul 20 16:44:16 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A2           On   | 00000000:98:00.0 Off |                    0 |
|  0%   48C    P0    20W /  60W |  14859MiB / 15356MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Dataset creation

This assumes we have already run [03.library_usage](03.library_usage.ipynb), so that the `loader.notebook_api_data.json` file exists in the current directory.

In [6]:
from icenet.data.loaders import IceNetDataLoaderFactory

# Settings identifying the loader implementation and the configuration it reads.
implementation = "dask"
loader_config = "loader.notebook_api_data.json"
dataset_name = "pytorch_notebook"
lag = 1

# Keyword options forwarded unchanged to the factory call below.
loader_options = dict(
    n_forecast_days=7,
    north=False,
    south=True,
    output_batch_size=4,
    generate_workers=8,
)

loader_factory = IceNetDataLoaderFactory()
dl = loader_factory.create_data_loader(
    implementation, loader_config, dataset_name, lag, **loader_options)

INFO:root:Loading configuration loader.notebook_api_data.json


In [7]:
dl

<icenet.data.loaders.dask.DaskMultiWorkerLoader at 0x7fe80f949940>

In [8]:
dl._config

{'sources': {'era5': {'name': 'notebook_api_data',
   'implementation': 'IceNetERA5PreProcessor',
   'anom': ['tas', 'zg500', 'zg250'],
   'abs': ['uas', 'vas'],
   'dates': {'train': ['2020_01_01',
     '2020_01_02',
     '2020_01_03',
     '2020_01_04',
     '2020_01_05',
     '2020_01_06',
     '2020_01_07',
     '2020_01_08',
     '2020_01_09',
     '2020_01_10',
     '2020_01_11',
     '2020_01_12',
     '2020_01_13',
     '2020_01_14',
     '2020_01_15',
     '2020_01_16',
     '2020_01_17',
     '2020_01_18',
     '2020_01_19',
     '2020_01_20',
     '2020_01_21',
     '2020_01_22',
     '2020_01_23',
     '2020_01_24',
     '2020_01_25',
     '2020_01_26',
     '2020_01_27',
     '2020_01_28',
     '2020_01_29',
     '2020_01_30',
     '2020_01_31',
     '2020_02_01',
     '2020_02_02',
     '2020_02_03',
     '2020_02_04',
     '2020_02_05',
     '2020_02_06',
     '2020_02_07',
     '2020_02_08',
     '2020_02_09',
     '2020_02_10',
     '2020_02_11',
     '2020_02_12',
   

We generate a config-only dataset (no cached data is produced), which will be saved in `dataset_config.pytorch_notebook.json`.

In [9]:
dl.write_dataset_config_only()

INFO:root:Writing dataset configuration without data generation
INFO:root:91 train dates in total, NOT generating cache data.
INFO:root:21 val dates in total, NOT generating cache data.
INFO:root:2 test dates in total, NOT generating cache data.
INFO:root:Writing configuration to ./dataset_config.pytorch_notebook.json


We can now create the IceNetDataSet object:

In [10]:
from icenet.data.dataset import IceNetDataSet

dataset_config = "dataset_config.pytorch_notebook.json"
dataset = IceNetDataSet(dataset_config, batch_size=4)

INFO:root:Loading configuration dataset_config.pytorch_notebook.json


In [11]:
dataset._config

{'identifier': 'pytorch_notebook',
 'implementation': 'DaskMultiWorkerLoader',
 'channels': ['uas_abs_1',
  'vas_abs_1',
  'siconca_abs_1',
  'tas_anom_1',
  'zg250_anom_1',
  'zg500_anom_1',
  'cos_1',
  'land_1',
  'sin_1'],
 'counts': {'train': 91, 'val': 21, 'test': 2},
 'dtype': 'float32',
 'loader_config': '/data/hpcdata/users/rychan/notebooks/notebook-pipeline/loader.notebook_api_data.json',
 'missing_dates': [],
 'n_forecast_days': 7,
 'north': False,
 'num_channels': 9,
 'shape': [432, 432],
 'south': True,
 'dataset_path': False,
 'loss_weight_days': True,
 'output_batch_size': 4,
 'var_lag': 1,
 'var_lag_override': {}}

In [12]:
dataset.loader_config

'/data/hpcdata/users/rychan/notebooks/notebook-pipeline/loader.notebook_api_data.json'

## Custom PyTorch Dataset

In [47]:
from torch.utils.data import Dataset, DataLoader

class IceNetDataSetPyTorch(Dataset):
    """Map-style PyTorch ``Dataset`` that serves samples from an IceNet loader.

    Item ``idx`` is whatever the IceNet data loader's ``generate_sample``
    returns for the ``idx``-th date of the selected split; the value is
    passed through unchanged.
    """

    # Splits that may be requested through ``mode``.
    _VALID_MODES = ("train", "val", "test")

    def __init__(self,
                 configuration_path: str,
                 mode: str,
                 batch_size: int = 4,
                 shuffling: bool = False):
        """Build the IceNet dataset and loader for one split.

        :param configuration_path: dataset configuration file, forwarded to
            ``IceNetDataSet``.
        :param mode: which split to serve: ``"train"``, ``"val"`` or ``"test"``.
        :param batch_size: forwarded to ``IceNetDataSet``.
        :param shuffling: forwarded to ``IceNetDataSet``.
        :raises ValueError: if ``mode`` is not one of the three splits.
        """
        # Validate first so that a mistyped mode fails immediately, before
        # any configuration is loaded or any loader is constructed.
        if mode not in self._VALID_MODES:
            raise ValueError("mode must be either 'train', 'val' or 'test'")
        self._mode = mode

        self._ds = IceNetDataSet(configuration_path=configuration_path,
                                 batch_size=batch_size,
                                 shuffling=shuffling)
        self._dl = self._ds.get_data_loader()

        # Date strings for this split, kept in their configured
        # underscore-separated form (e.g. '2020_01_01').
        self._dates = self._dl._config["sources"]["osisaf"]["dates"][self._mode]

    def __len__(self):
        """Return the number of samples recorded for the selected split."""
        # NOTE(review): this count is assumed to equal len(self._dates), which
        # is what __getitem__ indexes -- confirm the two always agree.
        return self._ds._counts[self._mode]

    def __getitem__(self, idx):
        """Generate and return the sample for the ``idx``-th date of the split."""
        # Dates are stored as 'YYYY_MM_DD'; swap the separators so that
        # pandas parses them as a calendar date.
        sample_date = pd.Timestamp(self._dates[idx].replace('_', '-'))
        return self._dl.generate_sample(date=sample_date)

In [48]:
ds_torch = IceNetDataSetPyTorch(configuration_path=dataset_config,
                                mode="train")

INFO:root:Loading configuration dataset_config.pytorch_notebook.json
INFO:root:Loading configuration /data/hpcdata/users/rychan/notebooks/notebook-pipeline/loader.notebook_api_data.json


In [49]:
# Use the built-in len(), which calls the dataset's __len__.
len(ds_torch)

91

In [50]:
ds_torch._dates[0]

'2020_01_01'

In [53]:
# Subscript the dataset directly; this calls the dataset's __getitem__.
ds_torch[0]

(array([[[ 0.5269795 ,  0.49944958,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.5254056 ,  0.4970613 ,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.5229517 ,  0.49159   ,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         ...,
         [ 0.45743546,  0.5098583 ,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.45778623,  0.50784564,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.45920837,  0.5058264 ,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919]],
 
        [[ 0.5222138 ,  0.4954434 ,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.52211976,  0.49204698,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.5185773 ,  0.48746935,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         ...,
         [ 0.44983226,  0.5089980

In [54]:
ds_torch._dl.generate_sample(date=pd.Timestamp(ds_torch._dates[0].replace('_', '-')))

(array([[[ 0.5269795 ,  0.49944958,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.5254056 ,  0.4970613 ,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.5229517 ,  0.49159   ,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         ...,
         [ 0.45743546,  0.5098583 ,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.45778623,  0.50784564,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.45920837,  0.5058264 ,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919]],
 
        [[ 0.5222138 ,  0.4954434 ,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.52211976,  0.49204698,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         [ 0.5185773 ,  0.48746935,  0.        , ..., -0.9999424 ,
           1.        , -0.01072919],
         ...,
         [ 0.44983226,  0.5089980

## Generating PyTorch DataLoaders

In [56]:
# One dataset per split, built in train / val / test order from the same
# dataset configuration.
train_dataset, val_dataset, test_dataset = (
    IceNetDataSetPyTorch(configuration_path=dataset_config, mode=split)
    for split in ("train", "val", "test")
)

INFO:root:Loading configuration dataset_config.pytorch_notebook.json
INFO:root:Loading configuration /data/hpcdata/users/rychan/notebooks/notebook-pipeline/loader.notebook_api_data.json
INFO:root:Loading configuration dataset_config.pytorch_notebook.json
INFO:root:Loading configuration /data/hpcdata/users/rychan/notebooks/notebook-pipeline/loader.notebook_api_data.json
INFO:root:Loading configuration dataset_config.pytorch_notebook.json
INFO:root:Loading configuration /data/hpcdata/users/rychan/notebooks/notebook-pipeline/loader.notebook_api_data.json


In [57]:
# Shuffle only the training samples. Validation and test batches are served
# in dataset order so that evaluation is repeatable from run to run.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

## Iterating through DataLoaders

In [61]:
len(train_dataloader)

23

In [58]:
train_features, train_labels, sample_weights = next(iter(train_dataloader))

ERROR:root:Issue selecting data for non-prediction sample, please review siconca ground-truth: dates [Timestamp('2020-03-20 00:00:00'), Timestamp('2020-03-21 00:00:00'), Timestamp('2020-03-22 00:00:00'), Timestamp('2020-03-23 00:00:00'), Timestamp('2020-03-24 00:00:00'), Timestamp('2020-03-25 00:00:00'), Timestamp('2020-03-26 00:00:00'), Timestamp('2020-03-27 00:00:00'), Timestamp('2020-03-28 00:00:00'), Timestamp('2020-03-29 00:00:00'), Timestamp('2020-03-30 00:00:00'), Timestamp('2020-03-31 00:00:00'), Timestamp('2020-04-01 00:00:00'), Timestamp('2020-04-02 00:00:00'), Timestamp('2020-04-03 00:00:00'), Timestamp('2020-04-04 00:00:00'), Timestamp('2020-04-05 00:00:00'), Timestamp('2020-04-06 00:00:00'), Timestamp('2020-04-07 00:00:00'), Timestamp('2020-04-08 00:00:00'), Timestamp('2020-04-09 00:00:00'), Timestamp('2020-04-10 00:00:00'), Timestamp('2020-04-11 00:00:00'), Timestamp('2020-04-12 00:00:00'), Timestamp('2020-04-13 00:00:00'), Timestamp('2020-04-14 00:00:00'), Timestamp('202

RuntimeError: "not all values found in index 'time'"