# Tabular dataset with pipeline - light version to share, 19.11.2020

In [None]:
# Core Azure ML SDK objects used throughout the notebook
import azureml.core
from azureml.core import Datastore, Dataset, Workspace, Experiment, RunConfiguration
from azureml.core.model import Model
from azureml.core.runconfig import CondaDependencies
# Pipeline building blocks: step types, the pipeline itself and intermediate data
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from azureml.pipeline.core import Pipeline, PipelineData
import os
# Bare expression: as the last line of a notebook cell it displays the SDK version
azureml.core.VERSION

In [None]:
# Write the workspace connection details to a local config file so that
# later cells can simply call Workspace.from_config().
from azureml.core import Workspace

# Coordinates of the target workspace (replace with your own values)
subscription_id = '8b2f4    '
resource_group  = 'HelenMachineLearning'
workspace_name  = 'HelenMachineLearning'

try:
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    ws.write_config()
except Exception as err:
    # Still best-effort (the notebook keeps going), but Ctrl-C / SystemExit are
    # no longer swallowed and the real cause (authentication failure, wrong
    # subscription, missing workspace, ...) is shown next to the generic message.
    print('Workspace not found')
    print('Reason:', type(err).__name__, err)
else:
    print('Library configuration succeeded')

In [None]:
# Workspace overview
# Load the workspace from the saved configuration and fetch its default datastore
ws = Workspace.from_config()
datastore = ws.get_default_datastore()

# Printed one value per line: workspace name, resource group, location,
# subscription id and the name of the default datastore
workspace_summary = (ws.name, ws.resource_group, ws.location, ws.subscription_id, datastore.name)
print(*workspace_summary, sep = '\n')

# Handle to the datastore registered under the name 'helen_blobstore';
# the bare expression below displays it when it is the last line of the cell
helen_datastore = Datastore.get(datastore_name='helen_blobstore', workspace=ws)
helen_datastore

In [None]:
# Attach Azure ML Compute: a cluster of low-cost nodes.
# Reuses the cluster named `compute_name` when it already exists in the
# workspace, otherwise provisions a new one and waits until it is ready.
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "automl-compute")
# Environment variables are always strings, so the node counts are converted
# explicitly; otherwise an overridden value would reach the SDK as str.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

# Read the workspace's compute targets once and reuse the result
existing_targets = ws.compute_targets

if compute_name in existing_targets:
    compute_target = existing_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # The name is taken by something that is not an AmlCompute cluster.
        # Keep going as before, but make the mismatch visible.
        print('WARNING: compute target ' + compute_name
              + ' exists but is not an AmlCompute cluster: '
              + type(compute_target).__name__)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

# Datastore - registering at once

In [None]:
# Register an Azure blob container as a datastore of the workspace

from azureml.core import Workspace, Experiment, Datastore, Dataset

# Name under which the datastore is registered in the workspace
blob_datastore_name = 'helen_blobstore'

# Connection settings: each is read from the environment, with the literal
# on the right used as the fallback when the variable is not set.
# NOTE(review): prefer supplying the account key only through BLOB_ACCOUNT_KEY
# rather than editing a real key into this fallback.
container_name = os.environ.get("BLOB_CONTAINER", "helenml")       # Azure blob container
account_name = os.environ.get("BLOB_ACCOUNTNAME", "storagehelen")  # storage account name
account_key = os.environ.get("BLOB_ACCOUNT_KEY", "as== ")          # storage account key

helen_datastore = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name=blob_datastore_name,
    container_name=container_name,
    account_name=account_name,
    account_key=account_key,
)

In [None]:
# List all datastores registered in the current workspace
datastores = ws.datastores
# The loop variable is deliberately NOT called `datastore`: that name holds the
# workspace's default datastore from an earlier cell and would be overwritten
# with whichever entry happens to be listed last.
for name, registered_store in datastores.items():
    print(
        name,
        registered_store.datastore_type,
        # NOTE(review): storage-account attributes are assumed to be absent on
        # some datastore types (e.g. database-backed ones) - print None for
        # those instead of aborting the listing with AttributeError.
        getattr(registered_store, 'account_name', None),
        getattr(registered_store, 'container_name', None),
    )


# Tabular dataset - registering at once

In [None]:
# Show what is in the local data folder.
# NOTE: despite its name, `script_folder` points at the data directory here;
# a later cell reassigns it to the script directory.
script_folder = "./helen/data/"
os.listdir(path=script_folder)

In [None]:
# One-off upload of the local data files to the 'helen_blobstore' datastore
helen_datastore = Datastore.get(workspace=ws, datastore_name='helen_blobstore')

# Options shared by both uploads: same target folder, overwrite existing
# blobs, show progress while uploading
upload_options = dict(target_path='/helen/data', overwrite=True, show_progress=True)

# Data file first, then the labels file - one upload_files call per file
helen_datastore.upload_files(files=['./helen/data/diabetes_data.txt'], **upload_options)
helen_datastore.upload_files(files=['./helen/data/diabetes_labels.txt'], **upload_options)

In [None]:
# One-off registration of the uploaded files as tabular datasets

datastore = ws.get_default_datastore()

helen_datastore = Datastore.get(workspace=ws, datastore_name='helen_blobstore')


def _register_space_delimited(blob_path, dataset_name, dataset_description):
    """Build a TabularDataset from a space-separated file and register it.

    The file is read from `helen_datastore` at `blob_path`; the dataset is
    registered in `ws` under `dataset_name` with create_new_version=True.
    Returns the object returned by `register`.
    """
    tabular = Dataset.Tabular.from_delimited_files(
        path=[(helen_datastore, blob_path)],
        separator=' ')
    return tabular.register(workspace=ws,
                            name=dataset_name,
                            description=dataset_description,
                            create_new_version=True)


# diabetes data
diabetes_data = _register_space_delimited(
    '/helen/data/diabetes_data.txt', 'diabetes_data', 'diabetes data')

# diabetes labels
diabetes_labels = _register_space_delimited(
    '/helen/data/diabetes_labels.txt', 'diabetes_labels', 'diabetes labels')


# Tabular dataset - accessing it in script

In [None]:
# Fetch the already-registered 'diabetes_data' dataset by its name
diabetes_data = Dataset.get_by_name(name='diabetes_data', workspace=ws)

# Load it into a pandas DataFrame and preview the first ten rows
df = diabetes_data.to_pandas_dataframe()
df.head(n=10)

In [None]:
# Fetch the already-registered 'diabetes_labels' dataset by its name
diabetes_labels = Dataset.get_by_name(name='diabetes_labels', workspace=ws)

# Load it into a pandas DataFrame and preview the first ten rows
df = diabetes_labels.to_pandas_dataframe()
df.head(n=10)

# Scripts

In [None]:
# Local folder that holds the pipeline step scripts; created if it is missing
script_folder = './helen/script'
os.makedirs(name=script_folder, exist_ok=True)
# Show the folder's current contents
os.listdir(path=script_folder)

In [None]:
%%writefile ./helen/script/diabetes_prep.py

# simple read and train

from azureml.core import Dataset, Run
import os
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from matplotlib import pyplot as plot
from azureml.core import Workspace, Datastore

##########################################
##########################################
# AML content - start
##########################################
##########################################

print ('HELEN PREP STEP ')
output_dir='./helen/output'
os.makedirs ('./helen/output',exist_ok=True)
run = Run.get_context()

##########################################
# get the input dataset by name
##########################################

dataset = run.input_datasets['diabetes_data']
# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
x_array=df.to_numpy()


dataset = run.input_datasets['diabetes_labels']
# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
y_array=df.to_numpy()
##########################################
##########################################


run.log('data cnt',df.count())

##########################################
##########################################
# AML content - end
##########################################
##########################################


# My regural python code
y=y_array
X=x_array
columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data = {
   "train":{"X": X_train, "y": y_train},        
   "test":{"X": X_test, "y": y_test}
}
reg = Ridge(alpha = 0.03)
reg.fit(data['train']['X'], data['train']['y'])
preds = reg.predict(data['test']['X'])
print('Mean Squared Error is', mean_squared_error(preds, data['test']['y']))



##########################################
##########################################
# AML content - start
##########################################
##########################################

# Log mse in Azure ML logs
run.log('mse', mean_squared_error(preds, data['test']['y']))

# Save the model to the outputs directory for capture
model_file = 'diabetes_helen.pkl'
model_file_name=os.path.join(output_dir, model_file)
joblib.dump(value = reg, filename = model_file_name);
print(run.get_file_names())

# upload the model file explicitly into artifacts Azure ML artifacts
run.upload_file(name = model_file_name, path_or_stream = model_file_name)

# register model in Azure ML Resitry 
model = run.register_model(model_name='helen_test',model_path=model_file_name)
print(model.name, model.id, model.version, sep='\t')

for a in range (len(preds)):
    run.log_row("Error: Estimate  - Actual", x=a, y=abs (float (preds[a]) - float(y_test[a])))
    

# Creating file to oputput
num_rows, num_cols = X_test.shape
pred = preds.reshape((num_rows, 1))
actual=y_test.reshape((num_rows, 1))

tmp_npy = np.append (X_test, actual, 1)
helen_numpy = np.append (tmp_npy, pred, 1)
print ('helen_numpy shape ',helen_numpy.shape)

helen_pandas=pd.DataFrame(data=helen_numpy)

LOCALFILENAME='helen_score_file.txt'
score_dir='./logs'
score_dir='./helen/score'

# Uploading file as articraft
os.makedirs (score_dir,exist_ok=True)
score_file = os.path.join(score_dir, LOCALFILENAME) 
helen_pandas.to_csv(score_file, sep=',', encoding='utf-8', index=False)
print ('file name', score_file)

# upload scored data explicitly into artifacts 
run.upload_file(name = score_file, path_or_stream = score_file)

##########################################
# create output refernce for dataset in pipeline step
##########################################
mounted_output_path = os.environ['AZUREML_DATAREFERENCE_diabetes_temp_ds']
os.makedirs(mounted_output_path, exist_ok=True)
score_file = os.path.join(mounted_output_path, LOCALFILENAME) 
helen_pandas.to_csv(score_file, sep=',', encoding='utf-8', index=False)
print ('file name to somewhere', score_file)

##########################################
##########################################


##########################################
##########################################
# AML content - end
##########################################
##########################################

In [None]:
%%writefile ./helen/script/diabetes_train.py


# simple prep and train
from azureml.core import Dataset, Run
import os
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from matplotlib import pyplot as plot
from azureml.core import Workspace, Datastore


##########################################
##########################################
# AML content - start
##########################################
##########################################
print ('HELEN TRAIN STEP ')

output_dir='./helen/output'
os.makedirs ('./helen/output',exist_ok=True)

run = Run.get_context()

#####################################
# get the input dataset by name
#####################################
dataset = run.input_datasets['diabetes_data']
# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
x_array=df.to_numpy()


dataset = run.input_datasets['diabetes_labels']
# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
y_array=df.to_numpy()


dataset = run.input_datasets['diabetes_temp_ds']
# load dataset into pandas dataframe
df = dataset.to_pandas_dataframe()
xy_array = df.to_numpy()
#####################################
# get the input dataset by name
#####################################

run.log('data cnt',df.count())

##########################################
##########################################
# AML content - end
##########################################
##########################################



# my regular python code
y=y_array
X=x_array
columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data = {
   "train":{"X": X_train, "y": y_train},        
   "test":{"X": X_test, "y": y_test}
}
reg = Ridge(alpha = 0.03)
reg.fit(data['train']['X'], data['train']['y'])
preds = reg.predict(data['test']['X'])
print('Mean Squared Error is', mean_squared_error(preds, data['test']['y']))




##########################################
##########################################
# AML content - start
##########################################
##########################################
# Log mse to Azure ML
run.log('mse', mean_squared_error(preds, data['test']['y']))

# Save the model to the outputs directory for capture
model_file = 'diabetes_helen.pkl'
model_file_name=os.path.join(output_dir, model_file)
joblib.dump(value = reg, filename = model_file_name);
print(run.get_file_names())

# upload the model file explicitly into artifacts in Azure ML
run.upload_file(name = model_file_name, path_or_stream = model_file_name)

# register model in Azure ML
model = run.register_model(model_name='helen_test',model_path=model_file_name)
print(model.name, model.id, model.version, sep='\t')

# Log in Azure ML
for a in range (len(preds)):
    run.log_row("Error: Estimate  - Actual", x=a, y=abs (float (preds[a]) - float(y_test[a])))
    
# Logging histogram plot in Azue ML 
num_rows, num_cols = X_test.shape
pred = preds.reshape((num_rows, 1))
actual=y_test.reshape((num_rows, 1))
tmp_npy = np.append (X_test, actual, 1)
helen_numpy = np.append (tmp_npy, pred, 1)


f=helen_numpy
print (f.shape)
fnrow=f.shape[0]
fncol=f.shape[1]
print ( " rows ", fnrow, "columns ", fncol)

# Histograms to all columns
i=0
for i in range (fncol):
    title= str (i) + ' nr column  '
    plot.title(title)
    plot.hist (f[:,[i]],bins=30,color='blue',edgecolor='white')
    #CORRECTplot.show()
    run.log_image ('Helen plot_' + str (i),plot=plot)
    plot.clf()

##########################################
##########################################
# AML content - end
##########################################
##########################################

# Creating pipeline

## Creating - Dataset to be used between pipeline steps

In [None]:
# Intermediate data passed between pipeline steps
helen_datastore = Datastore.get(workspace=ws, datastore_name='helen_blobstore')

# PipelineData named 'diabetes_temp_ds' on the blob datastore, turned into a
# dataset and registered under the same name (a new version on every run of
# this cell)
diabetes_temp_ds = (
    PipelineData('diabetes_temp_ds', datastore=helen_datastore)
    .as_dataset()
    .register(name='diabetes_temp_ds', create_new_version=True)
)


## Creating - Python step

In [None]:
# Run configuration for the Python script step

from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Packages the step script needs: conda packages plus pip packages,
# without pinning the SDK version
step_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn','matplotlib'],
    pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]'],
    pin_sdk_version=False)

# Fresh run configuration, targeting the compute created earlier
aml_run_config = RunConfiguration()
aml_run_config.target = compute_target

# Run inside Docker
aml_run_config.environment.docker.enabled = True

# Dependencies are not user-managed: the environment is built from the
# conda specification assigned below
aml_run_config.environment.python.user_managed_dependencies = False
aml_run_config.environment.python.conda_dependencies = step_dependencies


In [None]:
# Prep step: runs diabetes_prep.py as a PythonScriptStep
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep

# Both registered datasets are passed as named inputs; the script looks them
# up under exactly these names
prep_inputs = [
    diabetes_data.as_named_input('diabetes_data'),
    diabetes_labels.as_named_input('diabetes_labels'),
]

helen_prep_step1 = PythonScriptStep(
    name='diabetes_prep',
    script_name="diabetes_prep.py",
    source_directory=script_folder,
    inputs=prep_inputs,
    # the intermediate dataset is this step's output
    outputs=[diabetes_temp_ds],
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=True,
)

## Creating pipeline - with one step

In [None]:
# Build and run a pipeline that contains only the prep step
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

pipeline = Pipeline(workspace=ws, steps=[helen_prep_step1])

# Submit the pipeline under the experiment 'helen_1steps_pipeline'
experiment = Experiment(ws, 'helen_1steps_pipeline')
pipeline_run = experiment.submit(pipeline)

# Bare expression kept from the original cell; it is not the last statement
# of the cell, so it has no visible effect here
pipeline_run
# Block until the run finishes, streaming its console logs
pipeline_run.wait_for_completion(show_output=True)

## Creating - estimator SKLearn step

In [None]:
# Training configuration: an SKLearn estimator that runs diabetes_train.py
from azureml.train.estimator import Estimator
from azureml.train.sklearn import SKLearn
from azureml.widgets import RunDetails

# Local folders with the data files and the step scripts
data_dir = './helen/data'
script_dir = './helen/script'

# Extra pip packages requested for the training run
train_pip_packages = ['azureml-sdk', 'azureml-dataprep[fuse,pandas]', 'matplotlib']

est = SKLearn(
    source_directory=script_dir,
    entry_script='diabetes_train.py',
    pip_packages=train_pip_packages,
    compute_target=compute_target,
)


In [None]:
# Training step: wraps the SKLearn estimator in an EstimatorStep

# Inputs: the intermediate 'diabetes_temp_ds' output parsed as delimited files,
# plus the two registered datasets under the names the train script expects
train_inputs = [
    diabetes_temp_ds.parse_delimited_files(),
    diabetes_data.as_named_input('diabetes_data'),
    diabetes_labels.as_named_input('diabetes_labels'),
]

helen_train_step = EstimatorStep(
    name='diabates_train',
    estimator=est,
    estimator_entry_script_arguments=[],
    inputs=train_inputs,
    compute_target=compute_target,
)

## Creating pipeline  - with two steps

In [None]:
# Build the two-step pipeline (prep, then train) and submit it
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

pipeline_steps = [helen_prep_step1, helen_train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

# Submit under the experiment 'diabetes_pipeline'
experiment = Experiment(ws, 'diabetes_pipeline')
pipeline_run = experiment.submit(pipeline)

# As the last expression of the cell this displays the run, including a link
# to its details in the Azure portal
pipeline_run

In [None]:
# Notebook widget showing the details of the submitted pipeline run
from azureml.widgets import RunDetails

run_widget = RunDetails(pipeline_run)
run_widget.show()

In [None]:
# Console logs: block until the submitted pipeline run completes,
# printing its output while waiting (show_output=True)
pipeline_run.wait_for_completion(show_output=True)