# Creating datasets from uploaded data

This notebook demonstrates creating an Azure ML Datastore and assorted Azure ML Datasets for the uploaded data.

We will create datasets both for individual events and for all the data combined. The main focus will be on the merged datasets (radar and model together), but we will also create separate datasets, especially for gridded data. That will require extra code to make it nicely presentable, so we will leave that for future issues.

In [1]:
import os
import pathlib

In [2]:
import azureml
import azureml.core 


### Define parameters and set up workspace

In [3]:
# Get the existing workspace. No workspace details are passed here:
# from_config() is expected to pick them up from a local workspace config file.
prd_workspace = azureml.core.Workspace.from_config()

If `do_unregister=True`, then the old version will be removed to register a new dataset. Otherwise a new version number of the existing dataset will be created.

In [4]:
# True:  remove all registered versions of a dataset before registering it again.
# False: keep the existing registration and request a new version instead.
do_unregister = True

In [16]:
# Common prefix for dataset names; also the root directory in blob storage.
prd_prefix = 'prd'
prd_root_blob_dir = prd_prefix

# Name fragments identifying each data source.
radar_filter = 'radar'
merged_filter = 'merged'
mogreps_g_filter = 'mogreps_g'
mogreps_g_short_filter = 'mg'
# Hyphenated form, used as the mogreps-g directory name in path filters.
mogreps_g_title = 'mogreps-g'

# Per-source prefixes: "<prd_prefix>_<source>".
merged_prefix = '_'.join((prd_prefix, merged_filter))
radar_prefix = '_'.join((prd_prefix, radar_filter))
mogreps_g_prefix = '_'.join((prd_prefix, mogreps_g_filter))

# File extensions for the two storage formats.
tabular_suffix = 'csv'
gridded_suffix = 'nc'

# Filled in later with .format(event_name=...) to name per-event merged datasets.
dataset_merged_name_template = f'{merged_prefix}_{{event_name}}'

In [6]:
# Settings used to look up (or, if missing, register) the blob datastore.
datastore_name = 'precip_rediagnosis_train202208'
container_name = 'training202208'
az_blob_account_name = 'preciprediagnosisstorage'
# Do not paste the real storage key into the notebook: supply it through the
# AZ_BLOB_ACCOUNT_KEY environment variable. The placeholder is kept as the
# fallback so behaviour is unchanged when the variable is not set.
az_blob_account_key = os.environ.get('AZ_BLOB_ACCOUNT_KEY', 'INSERT_KEY')


In [7]:
# Reuse the datastore if it is already registered in the workspace; otherwise
# register the blob container under `datastore_name`.
try:
    # retrieve an existing datastore in the workspace by name
    prd_datastore = azureml.core.datastore.Datastore.get(prd_workspace, datastore_name)
    print('existing datastore retrieved')
except azureml.exceptions.AzureMLException:
    # The registration call returns the new datastore object, so it is used
    # directly instead of fetching it again by name.
    prd_datastore = azureml.core.datastore.Datastore.register_azure_blob_container(
        workspace=prd_workspace,
        datastore_name=datastore_name,
        container_name=container_name,
        account_name=az_blob_account_name,
        account_key=az_blob_account_key,
    )
    print('new datastore created')
    print(prd_datastore)
prd_datastore

existing datastore retrieved


{
  "name": "precip_rediagnosis_train202208",
  "container_name": "training202208",
  "account_name": "preciprediagnosisstorage",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

### Create all event file datasets 
Create file datasets, one per data source, each combining the files from all events.

In [17]:
# Configuration for the all-event file datasets, keyed by registered dataset name.
# Each entry has a human-readable 'description' and a glob 'filter' (relative to
# the datastore root) selecting the files that belong to the dataset.
all_event_file_datasets_dict = {
    f'{merged_prefix}_all_events_files': {
        'description': 'A file dataset containing the merged dataframe for each of the events in the current training set.',
        'filter': f'{prd_prefix}/*/{merged_prefix}*{tabular_suffix}',
    },
    f'{radar_prefix}_tabular_files': {
        'description': 'File dataset with all files from all events of tabular radar data.',
        'filter': f'{prd_prefix}/*/{radar_filter}/{radar_prefix}*{tabular_suffix}',
    },
    f'{radar_prefix}_gridded_files': {
        # This entry selects the gridded (.nc) files, so describe it as gridded.
        'description': 'File dataset with all files from all events of gridded radar data.',
        'filter': f'{prd_prefix}/*/{radar_filter}/{radar_prefix}*{gridded_suffix}',
    },
    f'{mogreps_g_prefix}_tabular_files': {
        'description': 'File dataset with all files from all events of tabular mogreps-g data',
        'filter': f'{prd_prefix}/*/{mogreps_g_title}/{mogreps_g_prefix}*{tabular_suffix}',
    },
}


In [19]:
# Inspect the mogreps-g tabular entry, building the key the same way the dict does.
all_event_file_datasets_dict[f'{mogreps_g_prefix}_tabular_files']

{'description': 'File dataset with all files from all events of tabular mogreps-g data',
 'filter': 'prd/*/mogreps-g/prd_mogreps_g*csv'}

The notebook includes an option to unregister a dataset before registering it again. If the `do_unregister` flag is `False`, a new version of the existing dataset is created instead.

In [35]:
def do_unregister_dataset(workspace, dataset_name):
    """Unregister all versions of a named dataset, if it can be found.

    This is a best-effort helper: any ordinary error raised while looking up
    or unregistering the dataset is reported as "no dataset to unregister"
    rather than propagated.

    Args:
        workspace: Workspace in which to look up the dataset.
        dataset_name: Registered name of the dataset to remove.

    Returns:
        None. The outcome is reported by printing a message.
    """
    try:
        ds_to_remove = azureml.core.Dataset.get_by_name(workspace, dataset_name)
        ds_to_remove.unregister_all_versions()
        print(' dataset unregistered')
    except Exception:
        # Catch Exception rather than using a bare `except`, so that
        # KeyboardInterrupt / SystemExit still stop the notebook.
        print('no dataset to unregister')


In [41]:
# Register one file dataset per entry in all_event_file_datasets_dict.
for ds_name, ds_config in all_event_file_datasets_dict.items():
    print(ds_name)
    if do_unregister:
        do_unregister_dataset(prd_workspace, ds_name)
    # Build the file dataset from the (datastore, glob) pair for this entry.
    current_ds = azureml.core.Dataset.File.from_files(
        path=(prd_datastore, ds_config['filter']))
    current_ds.register(
        workspace=prd_workspace,
        name=ds_name,
        description=ds_config['description'],
        # When the old registration was not removed, ask for a new version of
        # the existing dataset instead.
        create_new_version=not do_unregister,
    )

prd_merged_all_events_files
no dataset to unregister
prd_radar_tabular_files
 dataset unregistered
prd_radar_gridded_files
 dataset unregistered
prd_mogreps_g_tabular_files
 dataset unregistered


In [25]:
 all_event_file_datasets_dict[]

{'description': 'A file dataset containing the merged dataframe for each of the events in the current training set.',
 'filter': 'prd/*/prd_merged*csv'}

In [27]:
# Fetch the registered merged all-events file dataset back from the workspace.
all_data_ds = azureml.core.Dataset.get_by_name(
    workspace=prd_workspace,
    name=f'{merged_prefix}_all_events_files',
)
all_data_ds

{
  "source": [
    "('precip_rediagnosis_train202208', 'prd/*/prd_merged*csv')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "11e82d57-2a43-431e-ab29-b8e285144752",
    "name": "prd_merged_all_events_files",
    "version": 1,
    "description": "A file dataset containing the merged dataframe for each of the events in the current training set.",
    "workspace": "Workspace.create(name='precip_rediagnosis', subscription_id='07efdc52-cd27-48ed-9443-3aad2b6b777b', resource_group='precip_rediagnosis')"
  }
}

In [28]:
# Derive the event names from the name of the directory that directly contains
# each merged file. dict.fromkeys() drops repeats while keeping first-seen
# order, so an event with more than one matching file is listed only once.
current_event_list = list(dict.fromkeys(
    pathlib.Path(file_path).parent.name for file_path in all_data_ds.to_path()
))

In [29]:
# Show the event names extracted from the merged file paths.
current_event_list

['202002_storm_ciara',
 '202002_storm_dennis',
 '202008_storm_ellen',
 '202008_storm_francis',
 '202010_nswws_amber_oct',
 '202012_nswws_amber_dec',
 '202102_nswws_amber_feb',
 '202110_nswws_amber_oct',
 '202112_storm_barra',
 '2022_storm_eunice_franklin']

In [30]:
# Number of entries in the event list.
len(current_event_list)

10

Create a dataset with all data

In [34]:
# Register a single tabular dataset covering the merged CSV files of all events.
all_merged_ds_name = f'{merged_prefix}_all_events'
if do_unregister:
    do_unregister_dataset(prd_workspace, all_merged_ds_name)
all_merged_ds = azureml.core.Dataset.Tabular.from_delimited_files(
    path=(prd_datastore, f'{prd_prefix}/*/{merged_prefix}*{tabular_suffix}'))
# register() returns the registered dataset; keep that so the displayed object
# carries the registration details.
all_merged_ds = all_merged_ds.register(
    workspace=prd_workspace,
    name=all_merged_ds_name,
    description='Precip rediagnosis dataset for currently available training events.',
    create_new_version=not do_unregister,
)
all_merged_ds

 dataset unregistered


{
  "source": [
    "('precip_rediagnosis_train202208', 'prd/*/prd_merged*csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "65ddbd5e-a4b8-4a6c-94b2-d425954975d2",
    "name": "prd_merged_all_events",
    "version": 1,
    "description": "Precip rediagnosis dataset for currently available training events.",
    "workspace": "Workspace.create(name='precip_rediagnosis', subscription_id='07efdc52-cd27-48ed-9443-3aad2b6b777b', resource_group='precip_rediagnosis')"
  }
}

In [42]:
# Load the registered all-events tabular dataset into a pandas DataFrame for inspection.
# NOTE(review): this presumably loads every record into memory — confirm that is
# acceptable for the full training set.
all_merged_ds.to_pandas_dataframe()

Unnamed: 0,realization,latitude,longitude,forecast_period,forecast_reference_time,time,cloud_area_fraction,surface_altitude,air_pressure_at_sea_level,rainfall_rate,...,fraction_sum_agg,fraction_sum_instant,radar_fraction_in_band_aggregate_3hr_0.25,radar_fraction_in_band_aggregate_3hr_2.5,radar_fraction_in_band_aggregate_3hr_7.0,radar_fraction_in_band_aggregate_3hr_10.0,radar_fraction_in_band_instant_0.25,radar_fraction_in_band_instant_2.5,radar_fraction_in_band_instant_7.0,radar_fraction_in_band_instant_10.0
0,0.0,49.40625,-5.484375,0 days 06:00:00,2020-02-07 12:00:00,2020-02-07 18:00:00,0.984375,0.0,101050.0,0.000000,...,1.004706,1.004706,0.807059,0.197647,0.0000,0.0000,0.978824,0.025882,0.0000,0.0000
1,1.0,49.40625,-5.484375,0 days 06:00:00,2020-02-07 12:00:00,2020-02-07 18:00:00,1.000000,0.0,101036.0,0.536442,...,1.004706,1.004706,0.807059,0.197647,0.0000,0.0000,0.978824,0.025882,0.0000,0.0000
2,2.0,49.40625,-5.484375,0 days 06:00:00,2020-02-07 12:00:00,2020-02-07 18:00:00,0.984375,0.0,101104.0,0.000000,...,1.004706,1.004706,0.807059,0.197647,0.0000,0.0000,0.978824,0.025882,0.0000,0.0000
3,3.0,49.40625,-5.484375,0 days 06:00:00,2020-02-07 12:00:00,2020-02-07 18:00:00,0.843750,0.0,101005.0,0.000000,...,1.004706,1.004706,0.807059,0.197647,0.0000,0.0000,0.978824,0.025882,0.0000,0.0000
4,4.0,49.40625,-5.484375,0 days 06:00:00,2020-02-07 12:00:00,2020-02-07 18:00:00,1.000000,0.0,101038.0,0.000000,...,1.004706,1.004706,0.807059,0.197647,0.0000,0.0000,0.978824,0.025882,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2776207,,58.78125,1.546875,0.0,NaT,NaT,0.000000,1.0,1.0,1.000000,...,22.250000,21.937500,21.687500,21.375000,20.8125,20.3125,22.687500,24.687500,24.0000,23.2500
2776208,,58.78125,1.546875,0.0,NaT,NaT,0.000000,1.0,1.0,1.000000,...,22.437500,21.312500,20.312500,19.937500,19.9375,19.8125,20.250000,22.875000,24.5000,24.8125
2776209,,58.78125,1.546875,0.0,NaT,NaT,0.000000,1.0,1.0,1.000000,...,18.500000,17.875000,17.562500,17.687500,17.8750,18.1250,18.812500,21.625000,23.0000,23.6250
2776210,,58.78125,1.546875,0.0,NaT,NaT,0.000000,1.0,1.0,1.000000,...,22.625000,20.875000,19.875000,19.750000,20.0000,20.0000,20.187500,23.125000,24.8125,25.1875


In [None]:
#TODO: we should write out the event config with each dataset
# TODO: We should decide what events are there by scanning for config files
# TODO: we should also create datasets for radar tabular, mogreps-g tabular, radar gridded and mogreps-g gridded




In [None]:
#TODO: do we need to unregister datasets first? If so, we should do it just before registering. Possibly have a flag to create a new version or unregister.


In [39]:
# For every event, register two datasets over that event's merged CSV files:
# a tabular dataset and a file dataset (named with a '_files' suffix).
# event_datasets maps event name -> {'tabular': ..., 'files': ...}.
event_datasets = {}
for event_name in current_event_list:
    print(event_name)
    event_datasets[event_name] = {}
    event_ds_name = dataset_merged_name_template.format(event_name=event_name)
    event_ds_name_files = event_ds_name + '_files'
    event_paths = [
        (prd_datastore, f'{prd_root_blob_dir}/{event_name}/{merged_prefix}*{tabular_suffix}'),
    ]
    if do_unregister:
        do_unregister_dataset(prd_workspace, event_ds_name)
        do_unregister_dataset(prd_workspace, event_ds_name_files)

    # tabular dataset (not working entirely correctly yet when reading in)
    event_tabular_ds = azureml.core.Dataset.Tabular.from_delimited_files(path=event_paths)
    event_datasets[event_name]['tabular'] = event_tabular_ds.register(
        workspace=prd_workspace,
        name=event_ds_name,
        description=f'Tabular Dataset for merged data for {event_name}.',  # TODO: this info should come from config written out with data.
        create_new_version=not do_unregister,
    )

    # file dataset
    event_files_ds = azureml.core.Dataset.File.from_files(path=event_paths)
    event_datasets[event_name]['files'] = event_files_ds.register(
        workspace=prd_workspace,
        name=event_ds_name_files,
        # Describe this one as a file dataset, not a tabular one.
        description=f'File Dataset for merged data for {event_name}.',
        create_new_version=not do_unregister,
    )
    

202002_storm_ciara
 dataset unregistered
no dataset to unregister
202002_storm_dennis
 dataset unregistered
no dataset to unregister
202008_storm_ellen
 dataset unregistered
no dataset to unregister
202008_storm_francis
 dataset unregistered
no dataset to unregister
202010_nswws_amber_oct
 dataset unregistered
no dataset to unregister
202012_nswws_amber_dec
 dataset unregistered
no dataset to unregister
202102_nswws_amber_feb
 dataset unregistered
no dataset to unregister
202110_nswws_amber_oct
 dataset unregistered
no dataset to unregister
202112_storm_barra
 dataset unregistered
no dataset to unregister
2022_storm_eunice_franklin
 dataset unregistered
no dataset to unregister
