<img src='https://github.com/jtobelem-simplon/prepa-dp100/blob/master/images/top.png?raw=true'>

# Configuration (à lancer avant tous les notebooks)

In [1]:
# version de python
import platform
platform.python_version()

'3.8.5'

In [2]:
# la liste des packages installés
!conda list

# packages in environment at /home/lab/anaconda3/envs/azure:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                        main  
adal                      1.2.4                    pypi_0    pypi
applicationinsights       0.11.9                   pypi_0    pypi
async_generator           1.10                       py_0    conda-forge
attrs                     20.2.0             pyh9f0ad1d_0    conda-forge
azure-common              1.1.25                   pypi_0    pypi
azure-core                1.8.1                    pypi_0    pypi
azure-graphrbac           0.61.1                   pypi_0    pypi
azure-identity            1.2.0                    pypi_0    pypi
azure-mgmt-authorization  0.61.0                   pypi_0    pypi
azure-mgmt-containerregistry 2.8.0                    pypi_0    pypi
azure-mgmt-keyvault       2.2.0                    pypi_0    pypi
azure-mgmt-resource       10.2.0                   p

In [3]:
# version de la SDK azureml
import azureml.core
print("Ready to use Azure ML", azureml.core.VERSION)

Ready to use Azure ML 1.13.0


Si le notebook est executé en dehors d'Azure, il faut télécharger le fichier config.json depuis le portail https://portal.azure.com/, et le mettre dans le workspace qui contient le notebook.

Si le notebook est exécuté directement depuis le workspace Azure, le fichier de config devrait déjà être là.

In [4]:
# connexion au workspace
from azureml.core import Workspace

ws = Workspace.from_config()
print(ws.name, "loaded")

jt-dp100 loaded


# Enregistrement des données

In [5]:
from azureml.core import Dataset

default_ds = ws.get_default_datastore()

if 'titanic dataset' not in ws.datasets:
    default_ds.upload_files(files=['./data/titanic/train.csv'], # Upload the titanic csv files in /data
                        target_path='titanic-data/', # Put it in a folder path in the datastore
                        overwrite=True, # Replace existing files of the same name
                        show_progress=True)

    #Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'titanic-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws, 
                                name='titanic dataset',
                                description='titanic data',
                                tags = {'format':'CSV'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

Dataset already registered.


In [6]:
# Split the dataset into training and validation subsets
titanic_ds = ws.datasets.get("titanic dataset")
train_ds, test_ds = titanic_ds.random_split(percentage=0.7, seed=0)

In [7]:
titanic_ds.to_pandas_dataframe().head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Configure the compute target

In [8]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "aml-cluster")
compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4)

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('found compute target. just use it. ' + compute_name)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target. just use it. aml-cluster


# Configure Automated Machine Learning

In [9]:
from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(name='Automated ML Experiment on titanic dataset',
                             task='classification',
                             compute_target=compute_target,
                             enable_local_managed=True,
                             training_data = train_ds,
                             validation_data = test_ds,
                             label_column_name='Survived',
                             iterations=6,
                             primary_metric = 'AUC_weighted',
                             max_concurrent_iterations=4,
                             featurization='auto'
                             )

print("Ready for Auto ML run.")



Ready for Auto ML run.


# Run an Automated Machine Learning Experiment

In [10]:
from azureml.core.experiment import Experiment
from azureml.widgets import RunDetails

print('Submitting Auto ML experiment...')
automl_experiment = Experiment(ws, 'titanic_automl')
automl_run = automl_experiment.submit(automl_config)
RunDetails(automl_run).show()
automl_run.wait_for_completion(show_output=True)

Submitting Auto ML experiment...
Running on remote or ADB.


_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values are expected, let the run complete. Otherwise cancel the current run and use a script to customize the handling of missing feature values that may be more appropriate based on the data type and b

{'runId': 'AutoML_2e556d2d-d2e4-44e4-87cb-23bf69b8bf41',
 'target': 'aml-cluster',
 'status': 'Completed',
 'startTimeUtc': '2020-09-16T07:40:28.981703Z',
 'endTimeUtc': '2020-09-16T07:47:17.236768Z',
 'properties': {'num_iterations': '6',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'AUC_weighted',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'aml-cluster',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"494eae71-e785-4558-83f7-a9a44e4c2c0e\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"titanic-data/*.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"jt-dp100-ressources\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"9114a63e-9210-4e32-97ca-b7d9e8ac403d\\\\\\", \\\\\\"worksp

In [11]:
best_run, fitted_model = automl_run.get_output()
print(best_run)
print(fitted_model)
best_run_metrics = best_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print(metric_name, metric)



Run(Experiment: titanic_automl,
Id: AutoML_2e556d2d-d2e4-44e4-87cb-23bf69b8bf41_4,
Type: azureml.scriptrun,
Status: Completed)
None
weighted_accuracy 0.8252730109204368
norm_macro_recall 0.5630472333075227
precision_score_macro 0.8000380879832414
precision_score_micro 0.8045112781954887
average_precision_score_micro 0.8553915020217457
log_loss 0.467519297197934
AUC_weighted 0.8503186610280541
matthews_correlation 0.5812669185939333
recall_score_weighted 0.8045112781954887
f1_score_micro 0.8045112781954887
balanced_accuracy 0.7815236166537614
accuracy 0.8045112781954887
f1_score_weighted 0.8014088530148901
precision_score_weighted 0.8030879904006827
AUC_micro 0.8638136695121261
recall_score_macro 0.7815236166537614
recall_score_micro 0.8045112781954887
AUC_macro 0.8503186610280541
average_precision_score_macro 0.8420941709743515
f1_score_macro 0.7881127450980392
average_precision_score_weighted 0.8517689178870268
accuracy_table aml://artifactId/ExperimentRun/dcid.AutoML_2e556d2d-d2e4-44

In [12]:
for step in fitted_model.named_steps:
    print(step)

AttributeError: 'NoneType' object has no attribute 'named_steps'

In [None]:
from azureml.core import Model

# Register model
best_run.register_model(model_path='outputs/best.pkl', model_name='titanic_model_automl',
                        tags={'Training context':'Auto ML'},
                        properties={'AUC': best_run_metrics['AUC_weighted'], 'Accuracy': best_run_metrics['accuracy']})

# List registered models
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')

# Ne pas oublier à la fin de l'expérience!!
(si votre travail à utilisé une instance de calcul)

<img src='https://github.com/jtobelem-simplon/prepa-dp100/blob/master/images/down.png?raw=true'>



In [None]:
# stop toutes les instances de calcul
from azureml.core.compute import ComputeTarget, AmlCompute, ComputeInstance
from azureml.core.compute_target import ComputeTargetException

for compute in ComputeTarget.list(ws):
    if type(compute) is ComputeInstance and compute.get_status().state != 'Stopped':
        print('try to stop compute', compute.name)
        compute.stop(show_output=True)

In [None]:
# liste tous les compute pour vérifier qu'elles sont éteintes
for compute in ComputeTarget.list(ws):
    if type(compute) is ComputeInstance:
        print(compute.name, compute.get_status())

# Ressources

[api azure](https://docs.microsoft.com/en-us/python/api/azureml-core)

[parcours d'apprentissage microsoft](https://docs.microsoft.com/fr-fr/learn/paths/build-ai-solutions-with-azure-ml-service/)

[le repository microsoft](https://github.com/MicrosoftDocs/mslearn-aml-labs.git)