# Creating datasets from uploaded data

This notebook demonstrates creating a Azure ML Datastore and assorted AzureML Datasets for the uploaded data.

We should be 

In [6]:
import pathlib

In [2]:
import azureml
import azureml.core 


In [4]:
# get existing workspace
workspace = azureml.core.Workspace.from_config()

In [19]:
datastore_name = 'precip_rediagnosis_train202208'
container_name = 'training202208'
az_blob_account_name = 'preciprediagnosisstorage'
az_blob_account_key = 'INSERT_KEY_HERE'


In [20]:
ret_val = azureml.core.datastore.Datastore.register_azure_blob_container(
    workspace=workspace,
    datastore_name=datastore_name, 
    container_name=container_name,
    account_name=az_blob_account_name,
    account_key=az_blob_account_key,
)

In [21]:
ret_val

{
  "name": "precip_rediagnosis_train202208",
  "container_name": "training202208",
  "account_name": "preciprediagnosisstorage",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [23]:
    # retrieve an existing datastore in the workspace by name
datastore = azureml.core.datastore.Datastore.get(workspace, datastore_name)


In [None]:
#TODO: read in the event configs

# for each config, create a datatset as shown below
# you may need to search for the merged filename
# maybe this could be handled by an intake catalog
# otherwise the list of files could be constructed by a utility from the config.

In [26]:
# create a TabularDataset from 3 file paths in datastore
storm_barra_datastore_paths = [
    (datastore, 'prd/202112_storm_barra/prd_merged_20211206T1800Z_20211209T0000Z.csv'),
]


In [27]:
prd_storm_barra_ds = azureml.core.dataset.Dataset.Tabular.from_delimited_files(path=storm_barra_datastore_paths)
prd_storm_barra_ds

{
  "source": [
    "('precip_rediagnosis_train202208', 'prd/202112_storm_barra/prd_merged_20211206T1800Z_20211209T0000Z.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

In [28]:
prd_storm_barra_ds = prd_storm_barra_ds.register(workspace=workspace,
                                 name='prd_202112_storm_barra',
                                 description='Dataset for Dec 2021 storm barra.')

In [30]:
prd_storm_barra_df = prd_storm_barra_ds.to_pandas_dataframe()
prd_storm_barra_df

Unnamed: 0,realization,latitude,longitude,forecast_period,forecast_reference_time,time,cloud_area_fraction,rainfall_rate,lwe_snowfall_rate,surface_altitude,...,fraction_sum_agg,fraction_sum_instant,radar_fraction_in_band_aggregate_3hr_0.25,radar_fraction_in_band_aggregate_3hr_2.5,radar_fraction_in_band_aggregate_3hr_7.0,radar_fraction_in_band_aggregate_3hr_10.0,radar_fraction_in_band_instant_0.25,radar_fraction_in_band_instant_2.5,radar_fraction_in_band_instant_7.0,radar_fraction_in_band_instant_10.0
0,0,49.40625,-5.484375,0 days 06:00:00,2021-12-06 12:00:00,2021-12-06 18:00:00,0.875000,0.000000,0.0,0.0,...,1.0,1.0,0.997647,0.002353,0.0,0.0,1.0,0.0,0.0,0.0
1,1,49.40625,-5.484375,0 days 06:00:00,2021-12-06 12:00:00,2021-12-06 18:00:00,0.968750,0.000000,0.0,0.0,...,1.0,1.0,0.997647,0.002353,0.0,0.0,1.0,0.0,0.0,0.0
2,2,49.40625,-5.484375,0 days 06:00:00,2021-12-06 12:00:00,2021-12-06 18:00:00,1.000000,0.000000,0.0,0.0,...,1.0,1.0,0.997647,0.002353,0.0,0.0,1.0,0.0,0.0,0.0
3,3,49.40625,-5.484375,0 days 06:00:00,2021-12-06 12:00:00,2021-12-06 18:00:00,0.625000,0.000000,0.0,0.0,...,1.0,1.0,0.997647,0.002353,0.0,0.0,1.0,0.0,0.0,0.0
4,4,49.40625,-5.484375,0 days 06:00:00,2021-12-06 12:00:00,2021-12-06 18:00:00,0.531250,0.000000,0.0,0.0,...,1.0,1.0,0.997647,0.002353,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265279,30,58.78125,1.546875,0 days 06:00:00,2021-12-08 18:00:00,NaT,1.000000,0.315160,0.0,0.0,...,1.0,1.0,0.368421,0.631579,0.0,0.0,1.0,0.0,0.0,0.0
265280,31,58.78125,1.546875,0 days 06:00:00,2021-12-08 18:00:00,NaT,1.000000,0.781193,0.0,0.0,...,1.0,1.0,0.368421,0.631579,0.0,0.0,1.0,0.0,0.0,0.0
265281,32,58.78125,1.546875,0 days 06:00:00,2021-12-08 18:00:00,NaT,1.000000,0.838190,0.0,0.0,...,1.0,1.0,0.368421,0.631579,0.0,0.0,1.0,0.0,0.0,0.0
265282,33,58.78125,1.546875,0 days 06:00:00,2021-12-08 18:00:00,NaT,1.000000,0.479445,0.0,0.0,...,1.0,1.0,0.368421,0.631579,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
prd_storm_barra_df.time.value_counts()

2021-12-08 06:00:00    26712
2021-12-08 18:00:00    26712
2021-12-07 06:00:00    26712
2021-12-07 18:00:00    26712
2021-12-06 18:00:00    26712
2021-12-08 12:00:00    26712
2021-12-07 12:00:00    24876
2021-12-07 00:00:00      516
2021-12-08 00:00:00      516
2021-12-09 00:00:00      300
Name: time, dtype: int64