# Creating datasets from uploaded data

This notebook demonstrates creating an Azure ML Datastore and assorted Azure ML Datasets for the uploaded data.

We will create datasets both for individual events and for all the events combined. The main focus is on the merged datasets (radar and model data together), but we also create separate datasets for each source, especially for gridded data. Making the gridded data nicely presentable requires extra code, so that is left for future issues.

In [1]:
import os
import pathlib

In [2]:
import azureml
import azureml.core 


### Define parameters and set up workspace

In [3]:
# Get the existing workspace. No workspace name or subscription is given here;
# from_config() is expected to pick them up from the SDK's workspace config file.
prd_workspace = azureml.core.Workspace.from_config()

If `do_unregister=True`, all existing versions of a dataset are unregistered before it is registered again. Otherwise a new version of the existing dataset is created.

In [4]:
# True: unregister all versions of each dataset before re-registering it.
# False: keep existing versions and register a new version instead
# (the registration cells pass create_new_version=not do_unregister).
do_unregister = True

In [5]:
# Naming building blocks used to construct blob path filters and dataset names.
prd_prefix = 'prd'
radar_filter = 'radar'
# NOTE(review): not referenced in the visible part of the notebook - confirm it is still needed.
mogreps_g_short_filter = 'mg' 
# Underscore form is used in file/dataset name prefixes; hyphenated form is used as a directory name.
mogreps_g_filter  = 'mogreps_g'
mogreps_g_title  = 'mogreps-g'
merged_filter = 'merged'
# File name / dataset name prefixes for each data source.
merged_prefix = f'{prd_prefix}_{merged_filter}'
radar_prefix = f'{prd_prefix}_{radar_filter}'
mogreps_g_prefix = f'{prd_prefix}_{mogreps_g_filter}'
# Trailing parts of the glob patterns for tabular and gridded files.
tabular_suffix = 'csv'
gridded_suffix = 'nc'
# Per-event dataset name; filled in with str.format(event_name=...).
dataset_merged_name_template = merged_prefix + '_{event_name}'
# NOTE(review): not referenced in the visible part of the notebook - presumably the
# top-level directory inside the blob container; confirm.
prd_root_blob_dir = prd_prefix

In [6]:
# Names of the Azure ML datastore and of the blob container / storage account behind it.
datastore_name = 'precip_rediagnosis_train202209'
container_name = 'training202209'
az_blob_account_name = 'preciprediagnosisstorage'

# The storage account key is a secret and must NEVER be written into this notebook
# or committed to version control. Copy one of the account keys from the storage
# account in the Azure Portal and export it before starting the kernel, e.g.
#     export AZ_BLOB_ACCOUNT_KEY='<key>'
# The key is only needed when the datastore has to be registered for the first time;
# if the variable is unset the value is None.
# SECURITY: an account key was previously committed in this cell. Treat that key as
# compromised and rotate it in the Azure Portal.
az_blob_account_key = os.environ.get('AZ_BLOB_ACCOUNT_KEY')


In [7]:
# Get-or-create: reuse the datastore if it is already registered in the workspace,
# otherwise register the blob container under that name and then fetch it.
try:
    prd_datastore = azureml.core.datastore.Datastore.get(prd_workspace, datastore_name)
    print('existing datastore retrieved')
except azureml.exceptions.AzureMLException:
    # Lookup failed, so register the blob container as a new datastore.
    blob_registration_options = dict(
        workspace=prd_workspace,
        datastore_name=datastore_name,
        container_name=container_name,
        account_name=az_blob_account_name,
        account_key=az_blob_account_key,
    )
    ret_val = azureml.core.datastore.Datastore.register_azure_blob_container(
        **blob_registration_options
    )
    print('new datastore created')
    print(ret_val)
    # Fetch the freshly registered datastore by name.
    prd_datastore = azureml.core.datastore.Datastore.get(prd_workspace, datastore_name)
prd_datastore

new datastore created
{
  "name": "precip_rediagnosis_train202209",
  "container_name": "training202209",
  "account_name": "preciprediagnosisstorage",
  "protocol": "https",
  "endpoint": "core.windows.net"
}


{
  "name": "precip_rediagnosis_train202209",
  "container_name": "training202209",
  "account_name": "preciprediagnosisstorage",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

### Create all event file datasets 
Create file datasets, one per kind of data source, each combining the files from all events.

In [8]:
# Maps each dataset name to register -> its description and the glob filter
# (relative to the datastore root) selecting the files it contains.
# The '*' directory level in each filter matches the per-event directories.
all_event_file_datasets_dict = {
    f'{merged_prefix}_all_events_files': {
        'description': 'A file dataset containing the merged dataframe for each of the events in the current training set.',
        # Use the shared suffix constant, as the other tabular filters do.
        'filter': f'{prd_prefix}/*/{merged_prefix}*{tabular_suffix}',
    },
    f'{radar_prefix}_tabular_files': {
        'description': 'File dataset with all files from all events of tabular radar data.',
        'filter': f'{prd_prefix}/*/{radar_filter}/{radar_prefix}*{tabular_suffix}',
    },
    f'{radar_prefix}_gridded_files': {
        # Previously a copy of the tabular description; this dataset selects the gridded files.
        'description': 'File dataset with all files from all events of gridded radar data.',
        'filter': f'{prd_prefix}/*/{radar_filter}/{radar_prefix}*{gridded_suffix}',
    },
    f'{mogreps_g_prefix}_tabular_files': {
        'description': 'File dataset with all files from all events of tabular mogreps-g data',
        # The directory uses the hyphenated name, the file prefix the underscore form.
        'filter': f'{prd_prefix}/*/{mogreps_g_title}/{mogreps_g_prefix}*{tabular_suffix}',
    },
}


In [9]:
# Sanity check: show one entry to confirm the filter string was built as expected.
all_event_file_datasets_dict['prd_mogreps_g_tabular_files']

{'description': 'File dataset with all files from all events of tabular mogreps-g data',
 'filter': 'prd/*/mogreps-g/prd_mogreps_g*csv'}

Notebook includes an option to unregister the dataset if required. If this flag is `False`, then a new version is created.

In [10]:
def do_unregister_dataset(workspace, dataset_name):
    """Unregister every version of a named dataset, on a best-effort basis.

    Looks the dataset up by name in ``workspace`` and unregisters all of its
    versions. A failure (for example because no dataset with that name is
    registered) is reported on stdout together with the underlying error and
    does not stop the notebook.

    Parameters
    ----------
    workspace
        The Azure ML workspace to look the dataset up in.
    dataset_name : str
        Registered name of the dataset to remove.

    Returns
    -------
    None
    """
    try:
        ds_to_remove = azureml.core.Dataset.get_by_name(workspace, dataset_name)
        ds_to_remove.unregister_all_versions()
    except Exception as err:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt and
        # SystemExit still propagate; include the error so that a real failure
        # (e.g. authentication) is not mistaken for a missing dataset.
        print(f'no dataset to unregister with name {dataset_name} '
              f'({type(err).__name__}: {err})')
    else:
        print(f'dataset {dataset_name} unregistered')


In [11]:
# Register one file dataset per entry of all_event_file_datasets_dict.
for ds_name, ds_config in all_event_file_datasets_dict.items():
    print(ds_name)
    if do_unregister:
        # Remove all existing versions first so the dataset is registered afresh.
        do_unregister_dataset(prd_workspace, ds_name)
    # Use the Dataset.File factory from the explicitly imported azureml.core
    # (the same entry point used for the per-event datasets) rather than going
    # through azureml.data, which this notebook never imports directly.
    current_ds = azureml.core.Dataset.File.from_files(
        path=(prd_datastore, ds_config['filter']),
    )
    current_ds.register(
        workspace=prd_workspace,
        name=ds_name,
        description=ds_config['description'],
        # Only add a new version when the old ones were kept.
        create_new_version=not do_unregister,
    )

prd_merged_all_events_files
dataset prd_merged_all_events_files unregistered
prd_radar_tabular_files
dataset prd_radar_tabular_files unregistered
prd_radar_gridded_files
dataset prd_radar_gridded_files unregistered
prd_mogreps_g_tabular_files
dataset prd_mogreps_g_tabular_files unregistered


In [12]:
# Fetch the merged all-events file dataset registered above, looked up by its
# registered name, and display its definition.
all_data_ds = azureml.core.Dataset.get_by_name(prd_workspace, f'{merged_prefix}_all_events_files')
all_data_ds

{
  "source": [
    "('precip_rediagnosis_train202209', 'prd/*/prd_merged*csv')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "1a7d41d0-e45f-46da-a1cf-7fbfca9162f0",
    "name": "prd_merged_all_events_files",
    "version": 1,
    "description": "A file dataset containing the merged dataframe for each of the events in the current training set.",
    "workspace": "Workspace.create(name='precip_rediagnosis', subscription_id='07efdc52-cd27-48ed-9443-3aad2b6b777b', resource_group='precip_rediagnosis')"
  }
}

In [13]:
# Derive the event names from the dataset's file paths: each event is the name of
# the directory directly containing a merged file. dict.fromkeys drops duplicates
# while keeping first-seen order, so an event with more than one matching file is
# listed (and later registered) only once.
current_event_list = list(dict.fromkeys(
    pathlib.Path(file_path).parent.name for file_path in all_data_ds.to_path()
))

In [14]:
# Show the event names found in the datastore.
current_event_list

['202002_storm_ciara',
 '202002_storm_dennis',
 '202008_storm_ellen',
 '202008_storm_francis',
 '202010_nswws_amber_oct',
 '202012_nswws_amber_dec',
 '202102_nswws_amber_feb',
 '202110_nswws_amber_oct',
 '202112_storm_barra',
 '2022_storm_eunice_franklin']

In [15]:
# Number of entries in the event list.
len(current_event_list)

10

Create a dataset with all data

In [16]:
# Register a single tabular dataset covering the merged CSV files of all events.
all_merged_ds_name = f'{merged_prefix}_all_events'
if do_unregister:
    # Remove all existing versions first so the dataset is registered afresh.
    do_unregister_dataset(prd_workspace, all_merged_ds_name)
# Build the tabular dataset through the Dataset.Tabular factory from the explicitly
# imported azureml.core, rather than reaching it through the FileDataset class.
all_merged_ds = azureml.core.Dataset.Tabular.from_delimited_files(
    path=(prd_datastore, f'{prd_prefix}/*/{merged_prefix}*{tabular_suffix}'),
)
all_merged_ds = all_merged_ds.register(
    workspace=prd_workspace,
    name=all_merged_ds_name,
    description='Precip rediagnosis dataset for currently available training events.',
    # Only add a new version when the old ones were kept.
    create_new_version=not do_unregister,
)
all_merged_ds

dataset prd_merged_all_events unregistered


{
  "source": [
    "('precip_rediagnosis_train202209', 'prd/*/prd_merged*csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "5b6ecaae-acd2-4dea-8a9b-24e2c13df826",
    "name": "prd_merged_all_events",
    "version": 1,
    "description": "Precip rediagnosis dataset for currently available training events.",
    "workspace": "Workspace.create(name='precip_rediagnosis', subscription_id='07efdc52-cd27-48ed-9443-3aad2b6b777b', resource_group='precip_rediagnosis')"
  }
}

In [17]:
# Load the registered all-events tabular dataset into a pandas DataFrame to
# check that it can be read back.
merged_df = all_merged_ds.to_pandas_dataframe()

First partition columns (ordered): ['realization', 'latitude', 'longitude', 'forecast_period', 'forecast_reference_time', 'time', 'cloud_area_fraction', 'surface_altitude', 'air_pressure_at_sea_level', 'rainfall_rate', 'convective_rainfall_rate', 'lwe_snowfall_rate', 'lwe_convective_snowfall_rate', 'cloud_volume_fraction_in_atmosphere_layer_5.0', 'cloud_volume_fraction_in_atmosphere_layer_10.0', 'cloud_volume_fraction_in_atmosphere_layer_20.0', 'cloud_volume_fraction_in_atmosphere_layer_30.0', 'cloud_volume_fraction_in_atmosphere_layer_50.0', 'cloud_volume_fraction_in_atmosphere_layer_75.0', 'cloud_volume_fraction_in_atmosphere_layer_100.0', 'cloud_volume_fraction_in_atmosphere_layer_150.0', 'cloud_volume_fraction_in_atmosphere_layer_200.0', 'cloud_volume_fraction_in_atmosphere_layer_250.0', 'cloud_volume_fraction_in_atmosphere_layer_300.0', 'cloud_volume_fraction_in_atmosphere_layer_400.0', 'cloud_volume_fraction_in_atmosphere_layer_500.0', 'cloud_volume_fraction_in_atmosphere_layer_6

In [18]:
# List the columns whose name contains both 'fraction' and 'instant'.
[c1 for c1 in  merged_df.columns if 'fraction' in c1 and 'instant' in c1]

['fraction_sum_instant',
 'radar_fraction_in_band_instant_0.0',
 'radar_fraction_in_band_instant_0.25',
 'radar_fraction_in_band_instant_2.5',
 'radar_fraction_in_band_instant_7.0',
 'radar_fraction_in_band_instant_10.0']

In [19]:
# Dataset type label -> factory function that builds a dataset of that type from paths.
# The label is also appended to the registered dataset name in the per-event loop.
ds_type_dict = {
    'files': azureml.core.dataset.Dataset.File.from_files,
    'tabular': azureml.core.dataset.Dataset.Tabular.from_delimited_files,
}

In [20]:
# For every event, register the merged data once per dataset type in ds_type_dict.
# Result: event_datasets[event_name][ds_type] -> registered dataset.
event_datasets = {}
for event_name in current_event_list:
    print(event_name)
    event_datasets[event_name] = {}
    event_ds_name = dataset_merged_name_template.format(event_name=event_name)
    # Use prd_prefix for the top-level directory, like the all-events filters,
    # instead of a hard-coded 'prd'.
    event_paths = [
        (prd_datastore, f'{prd_prefix}/{event_name}/{merged_prefix}*{tabular_suffix}'),
    ]

    for ds_type, ds_func in ds_type_dict.items():
        dataset_options = {
            'workspace': prd_workspace,
            'name': f'{event_ds_name}_{ds_type}',
            'description': f'{ds_type} Dataset for merged data for {event_name}.',
            # Only add a new version when the old ones were kept.
            'create_new_version': not do_unregister,
        }
        if do_unregister:
            do_unregister_dataset(prd_workspace, dataset_options['name'])
        # Known issue (from the original author): the tabular variant does not yet
        # work entirely correctly when the data is read back in.
        event_datasets[event_name][ds_type] = ds_func(
            path=event_paths).register(**dataset_options)

202002_storm_ciara
dataset prd_merged_202002_storm_ciara_files unregistered
dataset prd_merged_202002_storm_ciara_tabular unregistered
202002_storm_dennis
dataset prd_merged_202002_storm_dennis_files unregistered
dataset prd_merged_202002_storm_dennis_tabular unregistered
202008_storm_ellen
dataset prd_merged_202008_storm_ellen_files unregistered
dataset prd_merged_202008_storm_ellen_tabular unregistered
202008_storm_francis
dataset prd_merged_202008_storm_francis_files unregistered
dataset prd_merged_202008_storm_francis_tabular unregistered
202010_nswws_amber_oct
dataset prd_merged_202010_nswws_amber_oct_files unregistered
dataset prd_merged_202010_nswws_amber_oct_tabular unregistered
202012_nswws_amber_dec
dataset prd_merged_202012_nswws_amber_dec_files unregistered
dataset prd_merged_202012_nswws_amber_dec_tabular unregistered
202102_nswws_amber_feb
dataset prd_merged_202102_nswws_amber_feb_files unregistered
dataset prd_merged_202102_nswws_amber_feb_tabular unregistered
202110_nsw