# Azure Machine Learning - End to End example

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from azureml.core import Workspace, Dataset
from azureml.core import Environment
from azureml.core import Datastore, Dataset, Workspace, Experiment, RunConfiguration

# Check versions
import azureml.core
import sklearn
import joblib
import pandas

# Report the versions of the SDK and of the libraries the training scripts rely on.
print("Azure SDK version:", azureml.core.VERSION)
for _message, _module in (
    ('scikit-learn version is {}.', sklearn),
    ('joblib version is {}.', joblib),
    ('pandas version is {}.', pandas),
):
    print(_message.format(_module.__version__))


##  Accessing workspace

In [None]:
# Connect to the Azure ML workspace: explicit identifiers first, local config file as fallback.
from azureml.core import Workspace

subscription_id = '8b2f4e94-e7b0-42e5-b775-dd2d5968c4e6'
resource_group  = 'HelenMachineLearning'
# The name used to be assigned twice in a row ('HelenMachineLearning' first); only this
# last value ever took effect, so the dead assignment was dropped.
workspace_name  = 'HelenDatabricksLearning'

try:
    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    # Uncomment to persist these details so Workspace.from_config() can find them:
    # ws.write_config()
except Exception as err:
    # `except Exception` instead of a bare `except:` so that kernel interrupts still
    # propagate, and the reason for the fallback is shown instead of being swallowed.
    print('Could not open the workspace by id ({}); falling back to the config file'.format(err))
    ws = Workspace.from_config()
    print('Workspace found')

print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')
print('my workspace: ' + ws.name)

## Accessing data in Azure ML workspace

In [None]:
# CREATING DATASTORE
# Registers an Azure blob container in the workspace under the name 'helen_blobstore'.

# `os` is needed for os.getenv below; the notebook's first `import os` only appears in a
# later cell, so running the notebook top to bottom raised NameError here.
import os

from azureml.core import Workspace, Experiment, Datastore, Dataset

blob_datastore_name = 'helen_blobstore'  # Name of the datastore to workspace
container_name = os.getenv("BLOB_CONTAINER", "helenml")  # Name of Azure blob container
account_name = os.getenv("BLOB_ACCOUNTNAME", "storagehelen")  # Storage account name
# NOTE(review): the fallback value looks like a placeholder - provide the real key through
# the BLOB_ACCOUNT_KEY environment variable rather than editing it into the notebook.
account_key = os.getenv("BLOB_ACCOUNT_KEY", "sg==")  # Storage account key

helen_datastore = Datastore.register_azure_blob_container(workspace=ws,
                                                         datastore_name=blob_datastore_name,
                                                         container_name=container_name,
                                                         account_name=account_name,
                                                         account_key=account_key)

In [None]:
# One-off upload of the local CSV file into the registered blob datastore.
helen_datastore = Datastore.get(workspace=ws, datastore_name='helen_blobstore')

helen_datastore.upload_files(
    files=['./helen/data/diabetes.csv'],
    target_path='/helen/data',
    overwrite=False,
    show_progress=True,
)

In [None]:
# Register the uploaded CSV as a tabular dataset named 'diabetes' (meant to be run once;
# create_new_version=True is passed to register()).
datastore = ws.get_default_datastore()
helen_datastore = Datastore.get(workspace=ws, datastore_name='helen_blobstore')

# --- diabetes data ---
diabetes = Dataset.Tabular.from_delimited_files(
    path=[(helen_datastore, '/helen/data/diabetes.csv')],
    separator=',',
)
diabetes = diabetes.register(
    workspace=ws,
    name='diabetes',
    description='diabetes data and labels',
    create_new_version=True,
)

In [None]:
# The default datastore, fetched through the workspace object ...
datastore = ws.get_default_datastore()
print("my default datastore: " + datastore.name, sep='\n')

# ... and again through the Datastore class.
datastore = Datastore.get_default(workspace=ws)
print('my default datastore: ' + datastore.name)

# The blob datastore registered earlier in this notebook.
helen_datastore = Datastore.get(workspace=ws, datastore_name='helen_blobstore')
print('my helen_datastore datastore: ' + helen_datastore.name)

# Every datastore attached to the workspace.
# Note: the loop rebinds `datastore`, so after this cell it refers to the last one listed.
datastores = ws.datastores
print('all attached datasores :')
for name, datastore in datastores.items():
    print('datastore name :', name, ' Def: ',
          datastore.datastore_type, datastore.account_name, datastore.container_name)

In [None]:
# Access the dataset that is already registered in the workspace.
# It is looked up by its registered name ('diabetes').
diabetes = Dataset.get_by_name(workspace=ws, name='diabetes')

# Materialise it as a pandas DataFrame; the last expression shows the first 10 rows as cell output.
d_data = diabetes.to_pandas_dataframe()
d_data.head(10)


# Starting working
## Context

In [None]:
# My environment: show which workspace the rest of the notebook works against.

# Alternative way to obtain `ws` from a saved config file:
# CORRECT ws = Workspace.from_config()
print( ws.name, ws.resource_group, ws.location, ws.subscription_id,  sep = '\n')
print('my workspace: '+ ws.name )

## Remote compute

In [None]:
# Attach an Azure ML Compute cluster of low-cost nodes, reusing it when it already exists.
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "automl-compute")
# Environment variables are always strings, so the node counts are converted explicitly;
# previously they were integers only when the variables were NOT set.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # The name is taken by another kind of compute; say so instead of staying silent.
        print('compute target ' + compute_name + ' exists but is not an AmlCompute cluster: '
              + type(compute_target).__name__)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

## Script

In [None]:
# Create a directory on my local computer for the training scripts written by the
# %%writefile cells below; the last expression lists its current contents as cell output.
script_dir = './helen/script'
os.makedirs(script_dir, exist_ok=True)
os.listdir(script_dir)

In [None]:
%%writefile ./helen/script/diabetes2021_test.py

# Smoke-test script: checks argument passing, dataset access (by id and by name),
# metric logging and image logging inside an Azure ML run. No model is trained here.
import argparse
import os  # required by os.makedirs at the end of the script (was missing -> NameError)

from azureml.core import Run, Dataset
#import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


run = Run.get_context()
ws = run.experiment.workspace

parser = argparse.ArgumentParser()

parser.add_argument('--kernel', type=str, default='linear',
                        help='Kernel type to be used in the algorithm')
parser.add_argument('--penalty', type=float, default=1.0,
                        help='Penalty parameter of the error term')
parser.add_argument('--ds', type=str, dest='dataset_id')


args = parser.parse_args()
# Built-in str/float replace the np.str/np.float aliases, which newer NumPy releases
# no longer provide; the logged values are the same.
run.log('Kernel type', str(args.kernel))
run.log('Penalty', float(args.penalty))

# Way 1: resolve the dataset from the id received through --ds
dataset = Dataset.get_by_id(ws, id=args.dataset_id)
data = dataset.to_pandas_dataframe()
print ('by id')
print (data.head(10))

# Way 2: resolve the dataset from the name it was given as a named input
#INCORRECT SYNTAX dataset_ = run.input_datasets[args.dataset_id]
dataset = run.input_datasets['diabetes']
data = dataset.to_pandas_dataframe()
print ('by name')
print (data.head(10))


###########################
# Dummy plot (tangent curve) used only to exercise metric and image logging
from matplotlib import pyplot as plt

x = np.arange(0, 3.47*2, 0.05)
y = np.tan(x)
plt.plot(x,y)
plt.xlabel("angle")
plt.ylabel("Tan value")
plt.title('Tan wave')
plt.show()

############################

# Placeholder metric: the mean of the plotted values, logged under the name 'Accuracy'
run.log('Accuracy', np.average(y))


run.log_image ('Helen plot',plot=plt)

# Make sure the 'outputs' folder exists (nothing is written into it by this script)
os.makedirs('outputs', exist_ok=True)

run.complete()

In [None]:
%%writefile ./helen/script/diabetes2021_prep.py

# Full example

# Fixed: this line used to be `import parser`, which left `argparse` undefined and made
# the ArgumentParser() call below fail with NameError.
import argparse
from azureml.core import Dataset, Run
import os
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
#from sklearn.externals import joblib
import joblib
from matplotlib import pyplot as plot
from azureml.core import Workspace, Datastore

##########################################
# AML content - start
##########################################

print ('HELEN PREP STEP ')
# Folder the trained model is written to later in this script
output_dir='./helen/output'
os.makedirs(output_dir, exist_ok=True)
run = Run.get_context()
ws = run.experiment.workspace

##########################################
# get arguments 2021
##########################################

parser = argparse.ArgumentParser()

# --kernel is only logged below; the visible part of the script never passes it to the model.
parser.add_argument('--kernel', type=str, default='linear',
                        help='Kernel type to be used in the algorithm')
# --ridge is used as the Ridge regularisation strength (alpha) further down.
parser.add_argument('--ridge', type=float, default=0.03,
                        help='Penalty parameter of the error term')
parser.add_argument('--ds', type=str, dest='dataset_id')


args = parser.parse_args()
# Built-in str/float replace the np.str/np.float aliases, which newer NumPy releases
# no longer provide; the logged values are the same.
run.log('Kernel type', str(args.kernel))
run.log('Ridge', float(args.ridge))


##########################################
# get the input dataset by name
##########################################


# Resolve the input dataset by the name it was given as a named input ...
dataset = run.input_datasets['diabetes']
# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()

# ... or, equivalently, by the dataset id received through --ds (this result is the one used).
dataset = Dataset.get_by_id(ws, id=args.dataset_id)
df = dataset.to_pandas_dataframe()


# Features: every column except the label column "Target"
dd_data = df.drop(columns=["Target"])
x_array = dd_data.to_numpy()
print ("correct x !!!! ", type (dd_data))
# NOTE(review): df.count() returns per-column counts (a Series), not a single number -
# confirm this is what should be logged here.
run.log('data x cnt',df.count())

# Label: the "Target" column, kept two-dimensional by the double brackets
dd_target = df[["Target"]]
y_array = dd_target.to_numpy()
# Fixed copy-paste slip: this used to report type(dd_data) instead of the label frame.
print ("correct y !!!! ", type (dd_target))
run.log('data y cnt',df.count())

##########################################
##########################################


run.log('data cnt',df.count())

##########################################
##########################################
# AML content - end
##########################################
##########################################


# My regular python code
y=y_array
X=x_array
columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data = {
   "train":{"X": X_train, "y": y_train},
   "test":{"X": X_test, "y": y_test}
}
# Regularisation strength comes from --ridge. The earlier Ridge(alpha = 0.03) instance was
# overwritten immediately and has been removed; float() replaces the np.float alias.
reg = Ridge(alpha = float(args.ridge))

reg.fit(data['train']['X'], data['train']['y'])
preds = reg.predict(data['test']['X'])
# Arguments are now in scikit-learn's (y_true, y_pred) order; the MSE value is unchanged.
print('Mean Squared Error is', mean_squared_error(data['test']['y'], preds))


##########################################
# Huper parameter tuning
##########################################
# model accuracy for X_test
# Mean squared error on the held-out split, computed once and logged under two names.
mse = mean_squared_error(data['test']['y'], preds)
# Logged as 'Accuracy' as well, although the value is an error measure (lower is better).
# float() replaces the np.float alias, which newer NumPy releases no longer provide.
run.log('Accuracy', float(mse))

##########################################
# AML content - start
##########################################

# Log mse in Azure ML logs ('mse' is the primary metric name used by the notebook's HyperDriveConfig)
run.log('mse', float(mse))

# Save the model into output_dir
model_file = 'diabetes_helen.pkl'
model_file_name=os.path.join(output_dir, model_file)
joblib.dump(value = reg, filename = model_file_name)
print(run.get_file_names())

# upload the model file explicitly into Azure ML artifacts
run.upload_file(name = model_file_name, path_or_stream = model_file_name)

# register model in Azure ML Registry
model = run.register_model(model_name='helen_test',model_path=model_file_name)
print(model.name, model.id, model.version, sep='\t')

# Per-row absolute error. Both arrays are flattened first so each item is a plain scalar,
# instead of calling float() on one-element arrays inside an index loop.
for a, (estimate, observed) in enumerate(zip(np.ravel(preds), np.ravel(y_test))):
    run.log_row("Error: Estimate  - Actual", x=a, y=abs (float (estimate) - float(observed)))


# Creating the output file: test features, then the actual value, then the prediction
num_rows, num_cols = X_test.shape
pred = preds.reshape((num_rows, 1))
actual=y_test.reshape((num_rows, 1))

tmp_npy = np.append (X_test, actual, 1)
helen_numpy = np.append (tmp_npy, pred, 1)
print ('helen_numpy shape ',helen_numpy.shape)

helen_pandas=pd.DataFrame(data=helen_numpy)

LOCALFILENAME='helen_score_file.txt'
# (a previous './logs' assignment was overwritten straight away and has been removed)
score_dir='./helen/score'

# Uploading file as artifact
os.makedirs (score_dir,exist_ok=True)
score_file = os.path.join(score_dir, LOCALFILENAME)
helen_pandas.to_csv(score_file, sep=',', encoding='utf-8', index=False)
print ('file name', score_file)

# upload scored data explicitly into artifacts
run.upload_file(name = score_file, path_or_stream = score_file)



##########################################
# Plots
##########################################

    
# Log one histogram per column of the scored matrix to Azure ML.
# The matrix is rebuilt here: test features, then the actual value, then the prediction.
num_rows, num_cols = X_test.shape
pred = preds.reshape((num_rows, 1))
actual = y_test.reshape((num_rows, 1))
tmp_npy = np.concatenate((X_test, actual), axis=1)
helen_numpy = np.concatenate((tmp_npy, pred), axis=1)

f = helen_numpy
print (f.shape)
fnrow, fncol = f.shape[0], f.shape[1]
print ( " rows ", fnrow, "columns ", fncol)

# One figure per column; the figure is cleared after logging so the plots do not stack.
for i in range(fncol):
    title = str(i) + ' nr column  '
    plot.title(title)
    plot.hist(f[:, [i]], bins=30, color='blue', edgecolor='white')
    # plot.show() is intentionally not called here
    run.log_image('Helen plot_' + str(i), plot=plot)
    plot.clf()

##########################################
##########################################
# AML content - end
##########################################
##########################################



##########################################
# create output reference for dataset in pipeline step
##########################################

#mounted_output_path = os.environ['AZUREML_DATAREFERENCE_diabetes_temp_ds']
#os.makedirs(mounted_output_path, exist_ok=True)
#score_file = os.path.join(mounted_output_path, LOCALFILENAME) 
#helen_pandas.to_csv(score_file, sep=',', encoding='utf-8', index=False)
#print ('file name to somewhere', score_file)



# Preparing for AML run

In [None]:
%%writefile sklearn_conda_dependencies.yml

dependencies:
- python=3.6.2
- scikit-learn
- matplotlib
# pandas is imported by the training script and used via Dataset.to_pandas_dataframe()
- pandas
- pip:
  - azureml-defaults

In [None]:
# Reuse the registered 'sklearn-env' environment, or build and register it from the
# conda specification file when it cannot be fetched.

from azureml.core import Environment

try:
    sklearn_env = Environment.get(workspace=ws, name="sklearn-env")
    print ('environment exists ')
except Exception as err:
    # `except Exception` instead of a bare `except:` so that kernel interrupts still
    # propagate, and the reason for rebuilding is shown instead of being swallowed.
    print('environment lookup failed ({}); creating it from the conda specification'.format(err))
    sklearn_env = Environment.from_conda_specification(name = 'sklearn-env', file_path = './sklearn_conda_dependencies.yml')
    # NOTE(review): `docker.enabled` may be deprecated in newer SDK releases - confirm
    # against the installed azureml-core version.
    sklearn_env.docker.enabled = True
    sklearn_env.python.user_managed_dependencies = False
    sklearn_env.register(workspace = ws)

In [None]:
# Display the environment object as cell output.
# To load an earlier saved environment instead, uncomment the next line:
#sklearn_env =Environment.get(workspace=ws,name="sklearn-env")
sklearn_env

In [None]:
# Access the dataset that is already registered in the workspace (looked up by name).
# `diabetes` is the object handed to the run configurations in the following cells.

diabetes = Dataset.get_by_name(workspace=ws, name='diabetes')

# Preview: the last expression shows the first 10 rows as cell output.
d_data = diabetes.to_pandas_dataframe()
d_data.head(10)



## ScriptRunConfig - configuring the execution for remote and local runs

In [None]:
# SIMPLE TEST - short configuration that runs the smoke-test script diabetes2021_test.py.
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails

script_file = 'diabetes2021_test.py'

experiment = Experiment(workspace=ws, name="diabetes2021_env")

est = ScriptRunConfig(
    source_directory=script_dir,
    script=script_file,
    arguments=[
        '--kernel', 'linear',
        '--penalty', 1.0,
        '--ds', diabetes.as_named_input('diabetes'),
    ],
)

# Remote cluster is assigned first ...
est.run_config.target = compute_target

# ... and then replaced: the last assignment wins, so the target ends up as 'local'.
est.run_config.target = 'local'
est.run_config.environment = sklearn_env



In [None]:
# COMPLETE TEST - full configuration that runs the training script diabetes2021_prep.py.
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails

script_file = 'diabetes2021_prep.py'
# To run the smoke-test script instead: script_file = 'diabetes2021_test.py'

experiment = Experiment(workspace=ws, name="diabetes2021_env")

est = ScriptRunConfig(
    source_directory=script_dir,
    script=script_file,
    arguments=[
        '--kernel', 'linear',
        '--ridge', 0.03,
        '--ds', diabetes.as_named_input('diabetes'),
    ],
    #outputs=[diabetes_scored]
)

# Remote cluster is assigned first ...
est.run_config.target = compute_target

# ... and then replaced: the last assignment wins, so the target ends up as 'local'.
# NOTE(review): the HyperDrive cell further down reuses `est`; confirm a 'local' target is
# acceptable there, or drop this override before starting the tuning run.
est.run_config.target = 'local'
est.run_config.environment = sklearn_env


## Single run - remote or local

In [None]:
# Single run: submit the configuration held in `est` to the experiment.
experiment_run = experiment.submit(est)

# Show the run-details widget for the submitted run.
RunDetails(experiment_run).show()


# Block until the run has finished, with show_output=True.
experiment_run.wait_for_completion(show_output=True)

## Hyperparameter tuning run - remote

In [None]:
from azureml.train.hyperdrive import (
    HyperDriveConfig,
    PrimaryMetricGoal,
    RandomParameterSampling,
    choice,
)

# Random sampling over the two script arguments.
param_sampling = RandomParameterSampling({
    "--kernel": choice('linear', 'rbf'),
    "--ridge": choice(0.01, 0.03, 0.05),
})

# Minimise the 'mse' metric logged by the training script: 12 runs in total, 4 at a time.
hyperdrive_config = HyperDriveConfig(
    run_config=est,
    hyperparameter_sampling=param_sampling,
    primary_metric_name='mse',
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=12,
    max_concurrent_runs=4,
)

# start the HyperDrive run
hyperdrive_run = experiment.submit(hyperdrive_config)

In [None]:
# Show the run-details widget for the HyperDrive run.
RunDetails(hyperdrive_run).show()

In [None]:
# Block until the HyperDrive run has finished, with show_output=True.
hyperdrive_run.wait_for_completion(show_output=True)

### Retrieving results of the hyperparameter run

In [None]:
# Pick the best child run according to the primary metric and print the
# arguments recorded in its run definition.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.get_details()['runDefinition']['arguments'])

In [None]:
# Print every child run, in the order returned by get_children_sorted_by_primary_metric().
a= hyperdrive_run.get_children_sorted_by_primary_metric()
for aa in a:
    print(aa)

In [None]:
# Print the names of the files stored with the best run.
for a in best_run.get_file_names():
    print(a)

In [None]:
# Register the best run's model file under the name 'diabates_hyper'.
# NOTE(review): 'diabates' looks like a typo for 'diabetes', but the scoring section looks
# the model up under this exact name, so both places would have to change together.
model = best_run.register_model(model_name='diabates_hyper', model_path='helen/output/diabetes_helen.pkl')

In [None]:
# Show the metrics logged by the best run. The call parentheses were missing, so the cell
# displayed the bound method object instead of the metrics.
best_run.get_metrics()



# Scoring

In [None]:

# `import parser` replaced by `import argparse`: the stdlib `parser` module was removed in
# Python 3.10 (the import fails there) and argparse is what the training script relies on.
import argparse
from azureml.core import Dataset, Run
import os
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
#from sklearn.externals import joblib
import joblib
from matplotlib import pyplot as plot
from azureml.core import Workspace, Datastore

In [None]:
# Fetching model from Azure ML ws and downloading it into the local 'helen' folder

import os
# `Model` was used below without being imported anywhere in the notebook (NameError).
from azureml.core.model import Model

# Must match the name used when the best run's model was registered
modelname = 'diabates_hyper'
model_file = "diabetes_helen.pkl"

model = Model(ws, modelname)

output_dir = 'helen'
os.makedirs(output_dir, exist_ok=True)

# To pin a specific registered version instead:
#model = Model(ws, modelname, version=4)
model.download(target_dir=output_dir, exist_ok=True)
print(model)


In [None]:
# Scoring - load the downloaded model file
# (author's note: there is an issue with sklearn versions here)

import joblib

model_file_name = os.path.join(output_dir, model_file)

# os.stat raises if the file is missing, so a failed download is noticed before loading
os.stat(model_file_name)

# ready for scoring
my_model = joblib.load(model_file_name)

In [None]:
# Score the whole registered dataset with the downloaded model and plot predicted vs actual.

# `import parser` replaced by `import argparse`: the stdlib `parser` module was removed in
# Python 3.10 (the import fails there); nothing in this cell uses either module.
import argparse
from azureml.core import Dataset, Run
import os
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

import joblib
from matplotlib import pyplot as plot
from azureml.core import Workspace, Datastore

# load the TabularDataset or any dataset to pandas DataFrame
df = diabetes.to_pandas_dataframe()


# Features: every column except the label column "Target"
dd_data = df.drop(columns=["Target"])
x_array = dd_data.to_numpy()
print ("correct x !!!! ", type (dd_data))


# Label: the "Target" column, kept two-dimensional by the double brackets
dd_target = df[["Target"]]
y_array = dd_target.to_numpy()
# Fixed copy-paste slip: this used to report type(dd_data) instead of the label frame.
print ("correct y !!!! ", type (dd_target))

# My regular python code
y = y_array
X = x_array
columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


data = {
   "test":{"X": X, "y": y}
}
preds = my_model.predict(data['test']['X'])
# Arguments are now in scikit-learn's (y_true, y_pred) order; the MSE value is unchanged.
mse = mean_squared_error(data['test']['y'], preds)
print ('mse = ', mse)



###########################
from matplotlib import pyplot as plt
import numpy as np

# Scatter of predictions (x axis) against actual values (y axis);
# the former no-op `y = y` assignment was dropped.
x = preds
#plt.plot(x,y)
plt.scatter(x, y)
plt.xlabel("predicted")
plt.ylabel("actual")
plt.title('Predicted vs Actual')
plt.show()


# Appendix - environments

In [None]:
# Helper for environments: print every environment in the workspace with its conda packages.
from azureml.core import Environment

envs = Environment.list(workspace=ws)

# The prefix filter is an empty string, so every name passes;
# put e.g. "sk" in its place to narrow the listing.
for env in envs:
    if not env.startswith(""):
        continue
    print("Name", env)
    print("packages", envs[env].python.conda_dependencies.serialize_to_string())
        
# Use curated environment from AML named "AzureML-Tutorial"

# Correct curated_environment = Environment.get(workspace=ws, name="AzureML-Tutorial")
# Correct Custom environment: Environment.get(workspace=ws,name="myenv",version="1")