# Creating datasets from uploaded data

This notebook demonstrates creating a Azure ML Datastore and assorted AzureML Datasets for the uploaded data.

We will create datasets both for inidividual events and datasets combining all the data. The main focus will be on the merged datasets (radar and model together), but we will also create separate  datasets as well, especially for gridded data, but that will require creating extra coding to make it nicely presentable, so will leave that for future issues.

In [1]:
import pathlib

In [2]:
import azureml
import azureml.core 


### Define parameters and set up workspace

In [3]:
# get existing workspace
workspace = azureml.core.Workspace.from_config()

In [43]:
prd_prefix = 'prd'
merged_prefix = f'{prd_prefix}_merged'
radar_prefix = f'{prd_prefix}_radar'
mogreps_g_prefix = f'{prd_prefix}_mg'
tabular_suffix = 'csv'
gridded_suffix = 'nc'
dataset_merged_name_template = merged_prefix + '_{event_name}'
prd_root_blob_dir = prd_prefix

In [5]:
datastore_name = 'precip_rediagnosis_train202208'
container_name = 'training202208'
az_blob_account_name = 'preciprediagnosisstorage'
az_blob_account_key = 'INSERT_KEY'


In [6]:
ret_val = azureml.core.datastore.Datastore.register_azure_blob_container(
    workspace=workspace,
    datastore_name=datastore_name, 
    container_name=container_name,
    account_name=az_blob_account_name,
    account_key=az_blob_account_key,
)

In [9]:
    # retrieve an existing datastore in the workspace by name
datastore = azureml.core.datastore.Datastore.get(workspace, datastore_name)


In [12]:
all_data_ds = azureml.data.FileDataset.File.from_files((datastore, f'{prd_prefix}/*/{merged_prefix}*csv'))
all_data_ds

{
  "source": [
    "('precip_rediagnosis_train202208', 'prd/*/prd*csv')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}

In [33]:
current_event_list = [pathlib.Path(p1).parents[0].name for p1 in all_data_ds.to_path()]

In [34]:
current_event_list

['202002_storm_ciara',
 '202002_storm_dennis',
 '202010_nswws_amber_oct',
 '202012_nswws_amber_dec',
 '202102_nswws_amber_feb',
 '202110_nswws_amber_oct',
 '202112_storm_barra',
 '2022_storm_eunice_franklin']

Create a dataset with all data

In [46]:
all_merged_ds = azureml.data.FileDataset.Tabular.from_delimited_files((datastore, f'{prd_prefix}/*/{merged_prefix}*csv'))
all_merged_ds = event_datasets[event_name].register(
        workspace=workspace,
        name=f'{merged_prefix}_all_events',
        description=f'Precip rediagnosis dataset for currently asvailable training events.' 
    )
all_merged_ds



{
  "source": [
    "('precip_rediagnosis_train202208', 'prd/2022_storm_eunice_franklin/prd_merged*csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "c1283043-a993-47a9-abf3-3bd929021e2f",
    "name": "prd_merged_all_events",
    "version": 1,
    "description": "Precip rediagnosis dataset for currently asvailable training events.",
    "workspace": "Workspace.create(name='precip_rediagnosis', subscription_id='07efdc52-cd27-48ed-9443-3aad2b6b777b', resource_group='precip_rediagnosis')"
  }
}

In [48]:
all_merged_ds.to_pandas_dataframe()

Unnamed: 0,time,latitude,longitude,radar_max_rain_aggregate_3hr,radar_mean_rain_aggregate_3hr,radar_max_rain_instant,radar_mean_rain_instant,fraction_sum_agg,fraction_sum_instant,radar_fraction_in_band_aggregate_3hr_0.25,...,wind_speed_2750.0,wind_speed_3000.0,wind_speed_3250.0,wind_speed_3500.0,wind_speed_3750.0,wind_speed_4000.0,wind_speed_4500.0,wind_speed_5000.0,wind_speed_5500.0,wind_speed_6000.0
0,2022-02-18 12:00:00,49.40625,-5.484375,1.244792,0.171409,14.46875,0.25875,1.0,1.004706,0.943529,...,29.6250,29.6875,29.5625,29.3750,29.3125,29.5625,32.3750,36.6875,42.1875,51.4375
1,2022-02-18 12:00:00,49.40625,-5.484375,1.244792,0.171409,14.46875,0.25875,1.0,1.004706,0.943529,...,29.2500,29.0625,28.9375,29.3125,30.3750,32.1875,37.5625,43.0625,49.2500,54.0625
2,2022-02-18 12:00:00,49.40625,-5.484375,1.244792,0.171409,14.46875,0.25875,1.0,1.004706,0.943529,...,32.5625,33.3750,34.6250,36.1250,37.6875,39.3750,40.6875,40.4375,42.8750,52.0625
3,2022-02-18 12:00:00,49.40625,-5.484375,1.244792,0.171409,14.46875,0.25875,1.0,1.004706,0.943529,...,31.0625,32.0000,33.1250,34.2500,35.5625,37.1250,40.0000,40.9375,43.7500,52.3750
4,2022-02-18 12:00:00,49.40625,-5.484375,1.244792,0.171409,14.46875,0.25875,1.0,1.004706,0.943529,...,32.8750,32.6875,33.1875,33.8125,34.1875,34.1875,35.0000,35.5000,40.0000,52.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320539,2022-02-21 06:00:00,58.78125,1.546875,0.000000,0.000000,0.00000,0.00000,1.0,1.000000,1.000000,...,13.5625,13.3125,13.1250,13.0000,13.0000,13.0625,13.1875,13.3750,15.1250,17.9375
320540,2022-02-21 06:00:00,58.78125,1.546875,0.000000,0.000000,0.00000,0.00000,1.0,1.000000,1.000000,...,12.6875,12.5000,12.1875,12.0000,12.0000,12.4375,13.4375,13.8125,14.8125,15.9375
320541,2022-02-21 06:00:00,58.78125,1.546875,0.000000,0.000000,0.00000,0.00000,1.0,1.000000,1.000000,...,11.1250,11.1875,11.3750,11.6875,12.3125,13.0000,14.1875,15.6875,15.4375,13.1875
320542,2022-02-21 06:00:00,58.78125,1.546875,0.000000,0.000000,0.00000,0.00000,1.0,1.000000,1.000000,...,14.4375,14.6250,14.8750,15.1250,15.3125,15.5000,15.8750,16.4375,16.8750,17.1875


In [None]:
#TODO: we should write out the event config with each dataset
# TODO: We should decide what events are there by scanning for config files
# TODO: we should also create datasets for radar tabular, mogepres-g tabular, rasdar gridded and mogreps-g gridded




In [44]:
event_datasets = {}
for event_name in current_event_list:
    print(event_name)
    event_paths = [
        (datastore, f'prd/{event_name}/{merged_prefix}*{tabular_suffix}'),
    ]
    event_datasets[event_name] = azureml.core.dataset.Dataset.Tabular.from_delimited_files(
        path=event_paths)
    event_datasets[event_name] = event_datasets[event_name].register(
        workspace=workspace,
        name=dataset_merged_name_template.format(
            event_name=event_name),
        description=f'Dataset for {event_name}.' # TODO: this info should come from config written out with data.
    )

202002_storm_ciara
202002_storm_dennis
202010_nswws_amber_oct
202012_nswws_amber_dec
202102_nswws_amber_feb
202110_nswws_amber_oct
202112_storm_barra
2022_storm_eunice_franklin
