In [29]:
#authentication
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Workspace

ia = InteractiveLoginAuthentication(tenant_id='16b3c013-d300-468d-ac64-7eda0820b6d3')

# You can find tenant id under azure active directory->properties
ws = Workspace.get(name='Prod',
                     subscription_id='fe38c376-b42a-4741-9e7c-f5d7c31e5873',
                     resource_group='ProdRG',auth=ia)

## ** Create Tabular Dataset from WEB URL (CSV files) and Use Unregistered Dataset Directly in Training Script**

In [30]:
#creates an unregistered TabularDataset from a WEB URL.

from azureml.core.dataset import Dataset

web_path ='https://dprepdata.blob.core.windows.net/demo/Titanic.csv'
titanic_ds = Dataset.Tabular.from_delimited_files(path=web_path)
titanic_ds.take(3).to_pandas_dataframe()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [31]:
#Set data schema
#when you create a TabularDataset, column data types are inferred automatically. 
#If the inferred types don't match your expectations, you can update your dataset schema by specifying column types 

from azureml.data.dataset_factory import DataType

titanic_ds = Dataset.Tabular.from_delimited_files(path=web_path, set_column_types={'Survived': DataType.to_bool()})

# preview the first 3 rows of titanic_ds
titanic_ds.take(3).to_pandas_dataframe()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,False,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,True,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,True,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [None]:
'''import os
# Create a folder for the script files
script_folder = 'workshop_examples'
os.makedirs(script_folder, exist_ok=True)

print(script_folder)'''

In [None]:
'''
%%writefile $script_folder/train_titanic.py
#access data in training script

import argparse
from azureml.core import Dataset, Run

parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str)
args = parser.parse_args()

run = Run.get_context()
ws = run.experiment.workspace

# get the input dataset by ID
dataset = Dataset.get_by_id(ws, id=args.input_data)

# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
'''

In [None]:
'''
# define compute target, add the code
# define environment as myenv., add the code
from azureml.core import ScriptRunConfig

src = ScriptRunConfig(source_directory=script_folder,
                      script='train_titanic.py',
                      # pass dataset as an input with friendly name 'titanic'
                      arguments=['--input-data', titanic_ds.as_named_input('titanic')],# titanic_ds is given without registering as a dataset.
                      compute_target=compute_target,
                      environment=myenv)
                             
# Submit the run configuration for your training run
run = experiment.submit(src)
run.wait_for_completion(show_output=True)
'''

## Create Tabular Dataset from Datastore (PARQUET files) and Register

In [39]:
#creates an unregistered TabularDataset from a DATASTORE  + PARQUET 

from azureml.core import Workspace, Datastore, Dataset

datastore_name = ws.get_default_datastore()
weather_ds = 'weather-data-florida'

#load data to blob storage from local folder (weather-data) to be able to create a dataset
datastore_name.upload('weather_data', weather_ds, overwrite=True, show_progress=True)

datastore_path = [(datastore_name, weather_ds + '/*/*/data.parquet')]
dataset        = Dataset.Tabular.from_parquet_files(path=datastore_path, partition_format = weather_ds + '/{partition_time:yyyy/MM}/data.parquet')
# **** To read files in .csv or .tsv format, use  from_delimited_files() method. *****
#dataset= Dataset.Tabular.from_delimited_files(path=datastore_path)


Uploading an estimated of 12 files
Uploading weather_data/2019/01/.amlignore
Uploaded weather_data/2019/01/.amlignore, 1 files out of an estimated total of 12
Uploading weather_data/2019/01/.amlignore.amltmp
Uploaded weather_data/2019/01/.amlignore.amltmp, 2 files out of an estimated total of 12
Uploading weather_data/2019/01/data.parquet
Uploaded weather_data/2019/01/data.parquet, 3 files out of an estimated total of 12
Uploading weather_data/2019/02/data.parquet
Uploaded weather_data/2019/02/data.parquet, 4 files out of an estimated total of 12
Uploading weather_data/2019/03/data.parquet
Uploaded weather_data/2019/03/data.parquet, 5 files out of an estimated total of 12
Uploading weather_data/2019/04/data.parquet
Uploaded weather_data/2019/04/data.parquet, 6 files out of an estimated total of 12
Uploading weather_data/2019/05/data.parquet
Uploaded weather_data/2019/05/data.parquet, 7 files out of an estimated total of 12
Uploading weather_data/2019/06/data.parquet
Uploaded weather_da

In [33]:
dataset.to_pandas_dataframe().head()

Unnamed: 0,usaf,wban,datetime,latitude,longitude,elevation,windAngle,windSpeed,temperature,seaLvlPressure,...,precipDepth,snowDepth,stationName,countryOrRegion,p_k,year,day,version,__index_level_0__,partition_time
0,720735,73805,2019-01-01 00:00:00,30.349,-85.788,21.0,140.0,5.1,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390756,2019-01-01
1,720735,73805,2019-01-01 00:39:00,30.349,-85.788,21.0,150.0,5.7,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390757,2019-01-01
2,720735,73805,2019-01-01 00:53:00,30.349,-85.788,21.0,150.0,4.6,21.1,1019.5,...,0.0,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390758,2019-01-01
3,720735,73805,2019-01-01 01:01:00,30.349,-85.788,21.0,150.0,4.6,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390759,2019-01-01
4,720735,73805,2019-01-01 01:53:00,30.349,-85.788,21.0,140.0,4.1,21.1,1019.6,...,0.0,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390760,2019-01-01


### Wrangle Data: Tabular Datasets
* keep_columns()
* drop_columns()
* filter()


In [34]:
from datetime import datetime
tsd=dataset.filter(dataset['datetime']<datetime(2019,1,2))
#tsd = tsd.time_after(datetime(2019, 1, 1)).time_before(datetime(2019, 1, 10))
tsd.to_pandas_dataframe().head(5)

Method filter: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Unnamed: 0,usaf,wban,datetime,latitude,longitude,elevation,windAngle,windSpeed,temperature,seaLvlPressure,...,precipDepth,snowDepth,stationName,countryOrRegion,p_k,year,day,version,__index_level_0__,partition_time
0,720735,73805,2019-01-01 00:00:00,30.349,-85.788,21.0,140.0,5.1,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390756,2019-01-01
1,720735,73805,2019-01-01 00:39:00,30.349,-85.788,21.0,150.0,5.7,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390757,2019-01-01
2,720735,73805,2019-01-01 00:53:00,30.349,-85.788,21.0,150.0,4.6,21.1,1019.5,...,0.0,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390758,2019-01-01
3,720735,73805,2019-01-01 01:01:00,30.349,-85.788,21.0,150.0,4.6,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390759,2019-01-01
4,720735,73805,2019-01-01 01:53:00,30.349,-85.788,21.0,140.0,4.1,21.1,1019.6,...,0.0,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390760,2019-01-01


In [35]:
tsd2 = tsd.keep_columns(columns=['snowDepth', 'datetime', 'partition_time'], validate=False)
tsd2.to_pandas_dataframe().tail()

Unnamed: 0,datetime,snowDepth,partition_time
154,2019-01-01 20:00:00,,2019-01-01
155,2019-01-01 21:00:00,,2019-01-01
156,2019-01-01 21:00:00,,2019-01-01
157,2019-01-01 22:00:00,,2019-01-01
158,2019-01-01 23:00:00,,2019-01-01


In [36]:
tsd2 = tsd.drop_columns(columns=['wban', 'snowDepth'])
tsd2.take(5).to_pandas_dataframe()

Unnamed: 0,usaf,datetime,latitude,longitude,elevation,windAngle,windSpeed,temperature,seaLvlPressure,cloudCoverage,...,precipTime,precipDepth,stationName,countryOrRegion,p_k,year,day,version,__index_level_0__,partition_time
0,720735,2019-01-01 00:00:00,30.349,-85.788,21.0,140.0,5.1,21.1,,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390756,2019-01-01
1,720735,2019-01-01 00:39:00,30.349,-85.788,21.0,150.0,5.7,21.1,,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390757,2019-01-01
2,720735,2019-01-01 00:53:00,30.349,-85.788,21.0,150.0,4.6,21.1,1019.5,,...,1.0,0.0,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390758,2019-01-01
3,720735,2019-01-01 01:01:00,30.349,-85.788,21.0,150.0,4.6,21.1,,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390759,2019-01-01
4,720735,2019-01-01 01:53:00,30.349,-85.788,21.0,140.0,4.1,21.1,1019.6,,...,1.0,0.0,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390760,2019-01-01


### Register Tabular Dataset

You can register a new dataset under the same name by creating a new version. A dataset version is a way to bookmark the state of your data so that you can apply a specific version of the dataset for experimentation or future reproduction.

In [44]:
#Assign "datetime" column as timestamp and "partition_time" from folder path as partition_timestamp 
# for Tabular Dataset to activate Time Series related APIs. The column to be assigned should be a Date type.
#dataset = dataset.with_timestamp_columns(timestamp='datetime', partition_timestamp='partition_time')

# register dataset to Workspace
registered_ds = dataset.register(workspace=ws, 
                            name=weather_ds, 
                            create_new_version=True, 
                            description='Data for Tabular Dataset- time-series.', 
                            tags={ 'type': 'TabularDataset' })

In [45]:
registered_ds.to_pandas_dataframe().head()

Unnamed: 0,usaf,wban,datetime,latitude,longitude,elevation,windAngle,windSpeed,temperature,seaLvlPressure,...,precipDepth,snowDepth,stationName,countryOrRegion,p_k,year,day,version,__index_level_0__,partition_time
0,720735,73805,2019-01-01 00:00:00,30.349,-85.788,21.0,140.0,5.1,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390756,2019-01-01
1,720735,73805,2019-01-01 00:39:00,30.349,-85.788,21.0,150.0,5.7,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390757,2019-01-01
2,720735,73805,2019-01-01 00:53:00,30.349,-85.788,21.0,150.0,4.6,21.1,1019.5,...,0.0,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390758,2019-01-01
3,720735,73805,2019-01-01 01:01:00,30.349,-85.788,21.0,150.0,4.6,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390759,2019-01-01
4,720735,73805,2019-01-01 01:53:00,30.349,-85.788,21.0,140.0,4.1,21.1,1019.6,...,0.0,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390760,2019-01-01


### Reload the Dataset from Workspace

In [46]:
# get dataset by dataset name
tsd = Dataset.get_by_name(ws, name=weather_ds)
#get_by_id() 

tsd.to_pandas_dataframe().head(5)
#returns the latest version of the dataset

Unnamed: 0,usaf,wban,datetime,latitude,longitude,elevation,windAngle,windSpeed,temperature,seaLvlPressure,...,precipDepth,snowDepth,stationName,countryOrRegion,p_k,year,day,version,__index_level_0__,partition_time
0,720735,73805,2019-01-01 00:00:00,30.349,-85.788,21.0,140.0,5.1,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390756,2019-01-01
1,720735,73805,2019-01-01 00:39:00,30.349,-85.788,21.0,150.0,5.7,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390757,2019-01-01
2,720735,73805,2019-01-01 00:53:00,30.349,-85.788,21.0,150.0,4.6,21.1,1019.5,...,0.0,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390758,2019-01-01
3,720735,73805,2019-01-01 01:01:00,30.349,-85.788,21.0,150.0,4.6,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390759,2019-01-01
4,720735,73805,2019-01-01 01:53:00,30.349,-85.788,21.0,140.0,4.1,21.1,1019.6,...,0.0,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390760,2019-01-01


### Create a dataset from pandas dataframe

In [53]:
from azureml.core import Workspace, Datastore, Dataset
import pandas as pd

pandas_df = pd.read_csv('https://ml.azure.com/fileexplorerAzNB?wsid=/subscriptions/fe38c376-b42a-4741-9e7c-f5d7c31e5873/resourcegroups/ProdRG/providers/Microsoft.MachineLearningServices/workspaces/Prod&tid=16b3c013-d300-468d-ac64-7eda0820b6d3&activeFilePath=Users/yelizkilinc/automl-workshop/Data%20Basics/workshop_examples/Data/inference_data')

datastore = Datastore.get(ws, 'workspaceblobstore')
dataset = Dataset.Tabular.register_pandas_dataframe(pandas_df, datastore, "dataset_from_pandas_df", show_progress=True)
dataset.take(3).to_pandas_dataframe()

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/df416d89-5001-493c-9301-86bbb57c81a7/
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: '<!doctype html><html lang="en" class="notranslate" translate="no"><head><meta charset="utf-8"/><meta name="Description" content="Azure Machine Learning Studio is a GUI-based integrated development environment for constructing and operationalizing Machine Learning workflow on Azure."/><link rel="shortcut icon" href="/favicon.ico"/><title>Microsoft Azure Machine Learning Studio</title><script>window.App_Version="WorkspacePortal-2022-Oct-10-2022-10-13.1"' -> '<!doctype html><html lang="en" class="notranslate" translate="no"><head><meta charset="utf-8"/><meta name="Description" content="Azure Machine Learning Studio is a GUI-based integrated development environment for constructing and operationalizing Machine Learning

DatasetValidationError: DatasetValidationError:
	Message: Failed to validate the data.
The Dataflow produced no records.| session_id=995f453d-1638-4dc7-9737-8fa9d8296a51
	InnerException None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "Failed to validate the data.\nThe Dataflow produced no records.| session_id=995f453d-1638-4dc7-9737-8fa9d8296a51"
    }
}

Method filter: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


Unnamed: 0,usaf,wban,datetime,latitude,longitude,elevation,windAngle,windSpeed,temperature,seaLvlPressure,...,precipDepth,snowDepth,stationName,countryOrRegion,p_k,year,day,version,__index_level_0__,partition_time
2019-01-01 00:00:00,720735,73805,2019-01-01 00:00:00,30.349,-85.788,21.0,140.0,5.1,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390756,2019-01-01
2019-01-01 00:39:00,720735,73805,2019-01-01 00:39:00,30.349,-85.788,21.0,150.0,5.7,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390757,2019-01-01
2019-01-01 00:53:00,720735,73805,2019-01-01 00:53:00,30.349,-85.788,21.0,150.0,4.6,21.1,1019.5,...,0.0,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390758,2019-01-01
2019-01-01 01:01:00,720735,73805,2019-01-01 01:01:00,30.349,-85.788,21.0,150.0,4.6,21.1,,...,,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390759,2019-01-01
2019-01-01 01:53:00,720735,73805,2019-01-01 01:53:00,30.349,-85.788,21.0,140.0,4.1,21.1,1019.6,...,0.0,,NORTHWEST FLORIDA BEACHES INTL ARPT,US,720735-73805,2019,1,1.0,2390760,2019-01-01


## **Create File Dataset **

Use from_files() method to load files in any format and create un registered filedataset.

In [27]:
#Represents a collection of file references in datastores or public URLs to use in Azure Machine Learning.

from azureml.core.dataset import Dataset

web_paths = [
            'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
            'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
            'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
            'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
            ]

mnist_ds = Dataset.File.from_files(path = web_paths)
mnist_ds.to_path() # list the files

['/http%3A/%2Fyann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
 '/http%3A/%2Fyann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
 '/http%3A/%2Fyann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
 '/http%3A/%2Fyann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz']

In [28]:

mnist_file_dataset = mnist_ds.register(workspace=ws,
                                        name='mnist_opendataset',
                                        description='mnist training and test dataset',
                                        create_new_version=True)

In [18]:
#Create dataset from datastore

{
  "source": [
    "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
    "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
    "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
    "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz"
  ],
  "definition": [
    "GetFiles"
  ]
}

In [None]:
#Access registered filedataset 

In [None]:
# ??? Create adls gen2 storage ??
'''# Create Azure Data Lake Storage Gen2 datastore without credentials.
adls2_dstore = Datastore.register_azure_data_lake_gen2(workspace=ws, 
                                                       datastore_name='credentialless_adls2', 
                                                       filesystem='tabular', 
                                                       account_name='myadls2')'''

In [None]:
# retrieve an existing datastore in the workspace by name 
dstore = Datastore.get(workspace, datastore_name) 
print(dstore)