# Support notebook for working with Azure ML

This notebook contains support code for working with Azure ML:
- working with the workspace
- training with data
- scoring
- run logs



# Working with workspace

In [None]:
# The main library
import azureml.core
from azureml.core import Dataset, Model , Workspace, Experiment
import os

#import azureml.contrib.dataset

azureml.core.VERSION

In [None]:
# Write the workspace connection details to a local config file so that
# later cells can load the workspace with Workspace.from_config().
from azureml.core import Workspace

subscription_id = 'df6'
resource_group  = 'Helen_MachineLearning'
workspace_name  = 'Helen_MachineLearning'

try:
    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)
    ws.write_config()
except Exception as err:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and show the underlying
    # error: the failure is not necessarily a missing workspace.
    print('Workspace not found')
    print(err)
else:
    print('Library configuration succeeded')

In [None]:
# Load the workspace from the locally stored config file and show it
ws = Workspace.from_config()
print(ws)

In [None]:
# Print all experiments returned for this workspace
list_experiments = Experiment.list(ws)
print(list_experiments)

In [None]:
# Print the id of every run that belongs to one experiment
experiment = Experiment(workspace=ws, name="therealbank_training")
print(experiment)

list_runs = experiment.get_runs()
for run in list_runs:
    print(run.id)


In [None]:
# Archive the experiment named "therealbank_training"
experiment = Experiment(workspace=ws, name="therealbank_training")
experiment.archive()

In [None]:
# Print the name of every dataset in the workspace.
# The earlier stand-alone Dataset.get_all(...) call was removed: its result
# was neither stored nor displayed, so it had no effect on this cell's output.
# NOTE(review): Dataset.list appears to be deprecated in favour of
# Dataset.get_all in newer SDK releases - confirm before upgrading.
list_datasets = Dataset.list(workspace=ws)
for ds in list_datasets:
    print(ds.name)

In [None]:
# Load the registered dataset 'therealbank' into pandas and show 10 rows
dataset = Dataset.get_by_name(workspace=ws, name='therealbank')
df = dataset.to_pandas_dataframe()
df.head(10)

In [None]:
# Print name and version of every model in the workspace
list_d = Model.list(workspace=ws)
for ds in list_d:
    print(ds.name, ds.version)
 

In [None]:
# Print, then delete, every model returned for the given model name
list_d = Model.list(workspace=ws, name='AutoMLaf4c0e6eb68')
for ds in list_d:
    print(ds.name, ds.version)
    ds.delete()

In [None]:
# Delete version 1 of the model 'helen_test' and display the model object
modelname = 'helen_test'
model = Model(ws, modelname, version=1)
model.delete()
model

In [None]:
# Attach to an existing AmlCompute cluster by name, or create it if the
# workspace has no compute target with that name.
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "automl-compute")
# Environment variables are always strings, so convert the node counts to int;
# otherwise an overridden value would be passed on as text.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # Previously this case was silent; report it so the mismatch is visible.
        print('compute target ' + compute_name + ' exists but is not an AmlCompute: '
              + type(compute_target).__name__)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

In [None]:
# Print the serialized status of the compute target selected above
print(compute_target.get_status().serialize())

In [None]:
# Show the workspace's default datastore and its storage details
datastore = ws.get_default_datastore()
print(datastore)
print(datastore.datastore_type, datastore.account_name, datastore.container_name)

In [None]:
# Print name, type, account and container for each datastore in ws.datastores
datastores = ws.datastores
for name, datastore in datastores.items():
    print(name, datastore.datastore_type, datastore.account_name, datastore.container_name)
   

In [None]:
# Register an Azure blob container as a datastore in the workspace.
from azureml.core import Workspace, Experiment, Datastore, Dataset

blob_datastore_name = 'helen_blobstore'  # Name of the datastore to workspace
container_name = os.getenv("BLOB_CONTAINER", "helenml")  # Name of Azure blob container
account_name = os.getenv("BLOB_ACCOUNTNAME", "storagehelen")  # Storage account name

# The storage account key is a secret, so it is read from the environment only
# and never given a hard-coded fallback inside the notebook.
account_key = os.getenv("BLOB_ACCOUNT_KEY")
if not account_key:
    raise ValueError("BLOB_ACCOUNT_KEY is not set; export the storage account key "
                     "before registering the datastore")

helen_datastore = Datastore.register_azure_blob_container(workspace=ws,
                                                          datastore_name=blob_datastore_name,
                                                          container_name=container_name,
                                                          account_name=account_name,
                                                          account_key=account_key)

In [None]:
# Upload the two local diabetes files to '/helen/data' on 'helen_blobstore',
# one upload_files call per file, overwriting any existing copy.
helen_datastore = Datastore.get(workspace=ws, datastore_name='helen_blobstore')

for local_file in ['./diabetes/data/diabetes_data.txt',
                   './diabetes/data/diabetes_labels.txt']:
    helen_datastore.upload_files(files=[local_file],
                                 target_path='/helen/data',
                                 overwrite=True,
                                 show_progress=True)

In [None]:
# Get a handle to the datastore registered as 'helen_blobstore'
helen_datastore = Datastore.get(workspace=ws, datastore_name='helen_blobstore')

In [None]:
# Print name, type, account and container for each datastore in ws.datastores
datastores = ws.datastores
for name, datastore in datastores.items():
    print(name, datastore.datastore_type, datastore.account_name, datastore.container_name)

In [None]:
# Download everything under the 'helen/data/' prefix into the current folder.
# The download creates folders directly under the target path, or reuses the
# ones which are already there.
# Use helen_datastore explicitly: the files were uploaded to it, whereas the
# bare name `datastore` is just whatever the last listing loop left behind.
helen_datastore.download(target_path='./',
                         prefix='helen/data/',
                         show_progress=True)

In [None]:
# Create and register the diabetes datasets (two tabular, one file dataset).
data_dir = './helen/data'

# makedirs also creates the missing parent './helen'; os.mkdir would raise
# FileNotFoundError on a fresh checkout.
os.makedirs(data_dir, exist_ok=True)

# NOTE(review): the upload cell wrote the files to 'helen_blobstore', while the
# datasets below are read from the workspace default datastore - confirm that
# both point at the same storage.
datastore = ws.get_default_datastore()

##########################
# diabetes data
##############
diabetes_data = Dataset.Tabular.from_delimited_files(
    path=[(datastore, '/helen/data/diabetes_data.txt')], separator=' ')

diabetes_data = diabetes_data.register(workspace=ws,
                                       name='diabetes_data',
                                       description='diabetes data',
                                       create_new_version=True)


##########################
# diabetes labels
##############
diabetes_labels = Dataset.Tabular.from_delimited_files(
    path=[(datastore, '/helen/data/diabetes_labels.txt')], separator=' ')

diabetes_labels = diabetes_labels.register(workspace=ws,
                                           name='diabetes_labels',
                                           description='diabetes labels',
                                           create_new_version=True)


##########################
# diabetes data as a file dataset
# (a FileDataset can also point to a folder and its subfolders recursively,
#  e.g. path=[(datastore, 'helen/data')])
##########################
diabetes_file = Dataset.File.from_files(path=[(datastore, '/helen/data/diabetes_data.txt')])

# create_new_version=True matches the two registrations above, so that
# re-running the cell treats all three datasets the same way.
diabetes_file = diabetes_file.register(workspace=ws,
                                       name='diabetes_data_file',
                                       description='diabetes data file',
                                       create_new_version=True)


In [None]:
# Look up the already registered datasets by name and preview the data
diabetes_data = Dataset.get_by_name(workspace=ws, name='diabetes_data')
diabetes_labels = Dataset.get_by_name(workspace=ws, name='diabetes_labels')

df = diabetes_data.to_pandas_dataframe()
df.head(10)

In [None]:
# Create the local directory for the training scripts (no error if present)
script_folder = './helen/script'
os.makedirs(script_folder, exist_ok=True)

In [None]:
# To work with datasets neeeded newer pandas versions
# needed at least version  0.24.1 and above. 

pip install --upgrade pandas

In [None]:
# for plots
pip install matplotlib

In [None]:
# Go from dataset to a pandas DataFrame and from there to a numpy array
import pandas as pd
import numpy as np

df = diabetes_data.to_pandas_dataframe()
df.count()
df_np = df.to_numpy()
# number of non-zero entries in the first column
print(np.count_nonzero(df_np[:, 0]))
# small stand-alone example of DataFrame.to_numpy()
pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()

In [None]:
# Make the local data directory.
# makedirs also creates the missing parent './helen' and does nothing when the
# directory already exists; os.mkdir raised FileNotFoundError without a parent.
data_dir = './helen/data'

os.makedirs(data_dir, exist_ok=True)

In [None]:
# List the data directory, then make sure the output directory exists
os.listdir(data_dir)
os.makedirs('./helen/output', exist_ok=True)

# Training with data

In [None]:
%%writefile ./helen/script/helen_train_simple2.py

# the version 2 training - simple read and train

from azureml.core import Dataset, Run
import os
import numpy as np


output_dir='./helen/output'
os.makedirs ('./helen/output',exist_ok=True)

run = Run.get_context()
# get the input dataset by name
dataset = run.input_datasets['diabetes_data']
# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
x_array=df.to_numpy()

print ('helen is printing X dataframe cnt',df.count())
print ('helen is printing X numpy cnt',np.count_nonzero(x_array [:,0]))

dataset = run.input_datasets['diabetes_labels']
# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
y_array=df.to_numpy()


print ('helen is printing y dataframe cnt',df.count())
print ('helen is printing y numpy cnt',np.count_nonzero(y_array [:,0]))


run.log('data cnt',df.count())


# load diabetes dataset, a well-known small dataset that comes with scikit-learn
# REading dataset from file 
# Below writing to file
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

#X, y = load_diabetes(return_X_y = True)
y=y_array
X=x_array
columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
data = {
   "train":{"X": X_train, "y": y_train},        
   "test":{"X": X_test, "y": y_test}
}
reg = Ridge(alpha = 0.03)
reg.fit(data['train']['X'], data['train']['y'])
preds = reg.predict(data['test']['X'])
print('Mean Squared Error is', mean_squared_error(preds, data['test']['y']))

 # Output the Mean Squared Error to the notebook and to the run
run.log('mse', mean_squared_error(preds, data['test']['y']))

 # Save the model to the outputs directory for capture
model_file = 'diabetes_helen.pkl'
model_file_name=os.path.join(output_dir, model_file)

joblib.dump(value = reg, filename = model_file_name);


print(run.get_file_names())

# upload the model file explicitly into artifacts 
run.upload_file(name = model_file_name, path_or_stream = model_file_name)

# register model
model = run.register_model(model_name='helen_test',model_path=model_file_name)
print(model.name, model.id, model.version, sep='\t')



for a in range (len(preds)):
    print (str (preds[a]) + '  actual:' + str (y_test[a]) + ' actual ',  X_test[a] )

In [None]:
# Working locally, Nov 2020:
# submit helen_train_simple2.py as an SKLearn estimator run on local compute.

import azureml.core
from azureml.core import Dataset, Model , Workspace, Experiment
import os
from azureml.train.estimator import Estimator
from azureml.train.sklearn import SKLearn
from azureml.widgets import RunDetails

# workspace comes from the local config file
ws = Workspace.from_config()

# datasets that are already registered in the workspace
diabetes_data = Dataset.get_by_name(workspace=ws, name='diabetes_data')
diabetes_labels = Dataset.get_by_name(workspace=ws, name='diabetes_labels')

data_dir = './helen/data'
script_dir = './helen/script'
experiment = Experiment(workspace=ws, name="local_python_run")

est = SKLearn(
    source_directory=script_dir,
    entry_script='helen_train_simple2.py',
    # the datasets are passed as named inputs; the script reads them back
    # through run.input_datasets under the same names
    inputs=[
        diabetes_data.as_named_input('diabetes_data'),
        diabetes_labels.as_named_input('diabetes_labels'),
    ],
    pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]'],
    # per the original author's notes, compute_target=compute_target
    # (the remote cluster) works here as well
    compute_target='local',
)

# Submit the estimator as an experiment run, show the run widget and
# block until the run has finished
experiment_run = experiment.submit(est)

RunDetails(experiment_run).show()

experiment_run.wait_for_completion(show_output=True)


In [None]:
%%writefile ./helen/script/helen_train_simple.py

# the version 1 training - simple read from disk of one file

from azureml.core import Dataset, Run
import os
import numpy as np


output_dir='./helen/output'

run = Run.get_context()
# get the input dataset by name
dataset = run.input_datasets['diabetes_data']

os.makedirs ('./helen/output',exist_ok=True)

    
# load the TabularDataset to pandas DataFrame
df = dataset.to_pandas_dataframe()
df_np=df.to_numpy()


print ('helen is printing dataframe cnt',df.count())
print ('helen is printing numpy cnt',np.count_nonzero(df_np [:,0]))

run.log('data cnt',df.count())

# Scoring 

In [None]:
# Display every model in the workspace
from azureml.core.model import Model
Model.list(ws)

In [None]:
# Find all models called "helen_test" and display their version numbers
from azureml.core.model import Model

models = Model.list(ws, name='helen_test')
for m in models:
    print(m.name, m.version)

In [None]:
# SCORING
# Download a registered model version so it can be used for local scoring
from azureml.core import Workspace
from azureml.core.model import Model
import os

ws = Workspace.from_config()

modelname = 'helen_test'
model_file = "diabetes_helen.pkl"

# local folder that receives the download
output_dir = './helen/download'
os.makedirs(output_dir, exist_ok=True)

model = Model(ws, modelname, version=4)
model.download(target_dir=output_dir, exist_ok=True)
print(model)

# verify the downloaded model file (os.stat raises if it is missing)
model_file_name = os.path.join(output_dir, model_file)
os.stat(model_file_name)




In [None]:
# SCORING

# Score with the model file that was downloaded earlier:
# load the model from disk, then score and predict on the sklearn diabetes data.
try:
    # Newer scikit-learn releases no longer ship sklearn.externals.joblib
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Pass the path itself so joblib opens and closes the file; the previous
# open(...) handle was never closed.
loaded_model = joblib.load(model_file_name)
print(model_file_name)

# loading data
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
data = {
    "train": {"X": X_train, "y": y_train},
    "test": {"X": X_test, "y": y_test},
}

# finding score (computed over the full data set, as before)
result = loaded_model.score(X, y)
print(result)

# predicting values
estimate = loaded_model.predict(X_test)

for predicted, actual, features in zip(estimate, y_test, X_test):
    print(str(predicted) + '  actual:' + str(actual) + ' input ', features)


# RUN logs

In [None]:
# Print the metrics and details of one known run of the current experiment
minimum_rmse_runid = None
minimum_rmse = None

for run in experiment.get_runs():
    if run.id != 'test-experiment_1582140794_f8006a7f':
        continue
    # get_metrics() returns a dict in which each logged metric is a key
    print(run.get_metrics())
    print(run.get_details())
   



In [None]:
# Run information
# Find the run with the lowest logged "mse" among the candidate runs and list
# its files. Only the one run id known to the author is considered here.
from azureml.core import Run

minimum_rmse_runid = None
minimum_rmse = None

for run in experiment.get_runs():
    # I use the run.id that I know
    if run.id != 'test-experiment_1582140794_f8006a7f':
        continue

    run_metrics = run.get_metrics()
    run_details = run.get_details()
    # each logged metric becomes a key in the returned dict; skip runs that
    # never logged "mse" instead of failing with KeyError
    if "mse" not in run_metrics:
        continue
    run_rmse = run_metrics["mse"]
    run_id = run_details["runId"]

    if minimum_rmse is None or run_rmse < minimum_rmse:
        minimum_rmse = run_rmse
        minimum_rmse_runid = run_id

if minimum_rmse_runid is None:
    # Nothing matched: concatenating None to a string would raise TypeError
    print("No matching run with an 'mse' metric was found")
else:
    print("Best run_id: " + minimum_rmse_runid)
    print("Best run_id rmse: " + str(minimum_rmse))

    # Best run's files
    best_run = Run(experiment=experiment, run_id=minimum_rmse_runid)
    print(best_run.get_file_names())


In [None]:
# Resolve the local path of the registered model 'helen_test'.
# Pass the workspace explicitly: without it only the local cache is searched,
# which fails in a notebook that has not downloaded the model yet.
model_path = Model.get_model_path(model_name='helen_test', _workspace=ws)
#print (model_path)
# the model file can then be deserialized back into a sklearn model:
#model = joblib.load(model_path)

In [None]:
# ORIGINAL SCRIPT
# Load the diabetes dataset that comes with scikit-learn, train a Ridge
# regression on it and write the model to 'model.pkl'.
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
try:
    # Newer scikit-learn releases no longer ship sklearn.externals.joblib
    import joblib
except ImportError:
    from sklearn.externals import joblib

# The former overrides `y=y_matrix` / `X=X_matrix` were removed: neither name
# is defined anywhere in this notebook, so the cell stopped with NameError.
X, y = load_diabetes(return_X_y=True)
columns = ['age', 'gender', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
data = {
    "train": {"X": X_train, "y": y_train},
    "test": {"X": X_test, "y": y_test},
}
reg = Ridge(alpha=0.03)
reg.fit(data['train']['X'], data['train']['y'])
preds = reg.predict(data['test']['X'])
print('Mean Squared Error is', mean_squared_error(data['test']['y'], preds))
joblib.dump(value=reg, filename='model.pkl')

# Print each prediction next to the actual value and the input row
# (the third field is the feature row, so it is labelled 'input').
for pred, actual, features in zip(preds, y_test, X_test):
    print(str(pred) + '  actual:' + str(actual) + ' input ', features)

## Environment

In [None]:
# List the curated environments (names starting with "AzureML") together with
# their conda dependencies.
# Environment is imported here because no earlier cell imports it.
from azureml.core import Environment

envs = Environment.list(workspace=ws)

for env in envs:
    if env.startswith("AzureML"):
        print("Name", env)
        print("packages", envs[env].python.conda_dependencies.serialize_to_string())

In [None]:
# Fetch existing out-of-the-box environments by name.
# curated_env ends up holding "AzureML-Tutorial", the second assignment.

from azureml.core import Environment

curated_env = Environment.get(workspace=ws, name="AzureML-Minimal")
curated_env = Environment.get(workspace=ws, name="AzureML-Tutorial")


# extra code

In [None]:
#
# To activate this environment, use:
# > source activate /azureml-envs/azureml_b2ad30260b00c8bf1a18b629f070b89f
#
# To deactivate an active environment, use:
# > source deactivate
#

In [None]:
# NOT YET TESTED; BUT SHOULD WORK
# Submit a training script through a generic Estimator that runs on the
# remote compute target inside a curated environment.

from azureml.train.estimator import Estimator

data_dir = './helen/data'
script_dir = './helen/script'

# NOTE(review): no cell in this notebook writes 'helen_train.py' (only
# helen_train_simple.py and helen_train_simple2.py) - confirm the script
# exists in script_dir before running.
est = Estimator(
    source_directory=script_dir,
    entry_script='helen_train.py',
    # the dataset is passed as an input named 'diabetes_data'
    inputs=[diabetes_data.as_named_input('diabetes_data')],
    compute_target=compute_target,
    environment_definition=curated_env,
)

# Submit the estimator as part of the experiment and wait for it to finish
experiment_run = experiment.submit(est)
experiment_run.wait_for_completion(show_output=True)