<img src='https://github.com/jtobelem-simplon/prepa-dp100/blob/master/images/top.png?raw=true'>

# Configuration (à lancer avant tous les notebooks)

In [None]:
# Show which Python interpreter version the kernel is running on
import platform

# Keep the value as the last expression so the notebook renders it as the cell output
kernel_python_version = platform.python_version()
kernel_python_version

In [None]:
# List the installed packages (shell escape to the conda CLI)
!conda list

In [None]:
# Check that the Azure ML SDK is importable and report its version
import azureml.core

sdk_version = azureml.core.VERSION
print("Ready to use Azure ML", sdk_version)

Si le notebook est exécuté en dehors d'Azure, il faut télécharger le fichier config.json depuis le portail https://portal.azure.com/ et le placer dans le dossier qui contient le notebook.

Si le notebook est exécuté directement depuis le workspace Azure, le fichier de config devrait déjà être là.

In [None]:
# Connect to the Azure ML workspace
from azureml.core import Workspace

# The connection details come from the workspace configuration file (config.json)
ws = Workspace.from_config()

workspace_name = ws.name
print(workspace_name, "loaded")

# Datastores

In Azure ML, datastores are references to storage locations, such as Azure Storage blob containers. Every workspace has a default datastore - usually the Azure storage blob container that was created with the workspace. If you need to work with data that is stored in different locations, you can add custom datastores to your workspace and set any of them to be the default.

[datastore dans la SDK](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.datastore.datastore?view=azure-ml-py)

## View Datastores

Run the following code to determine the datastores in your workspace:

In [None]:
# Fetch the workspace's default datastore and remember its name for the comparison below
default_ds = ws.get_default_datastore()
default_ds_name = default_ds.name

# Print every datastore name together with a flag telling whether it is the default one
for ds_name in ws.datastores:
    is_default = ds_name == default_ds_name
    print(ds_name, "- Default =", is_default)

## Upload Data to a Datastore

In [None]:
# Send the local Titanic training file to the default datastore
default_ds.upload_files(
    files=['data/titanic/train.csv'],  # local file(s) to upload
    target_path='titanic-data/',       # destination folder inside the datastore
    overwrite=True,                    # replace an existing file with the same name
    show_progress=True,
)

In [None]:
# Build a reference to the 'titanic-data' folder of the datastore ...
titanic_folder_ref = default_ds.path('titanic-data')

# ... and configure it in download mode, targeting 'titanic-data' on the compute
data_ref = titanic_folder_ref.as_download(path_on_compute='titanic-data')
print(data_ref)

Pour utiliser cette référence, il faut passer par un script...

## Création d'un script

In [None]:
import os
import shutil

# Folder that will hold the experiment files; experiment_folder is the same
# location expressed relative to the current directory
script_folder_name = 'script/3-titanic-files'
experiment_folder = './{}'.format(script_folder_name)

# Create the folder (no error if it already exists)
os.makedirs(script_folder_name, exist_ok=True)

# Place a copy of the training data in the experiment folder under the name titanic.csv
local_copy_path = os.path.join(script_folder_name, "titanic.csv")
shutil.copy('data/titanic/train.csv', local_copy_path)

In [None]:
%%writefile $script_folder_name/titanic-data.py

from azureml.core import Run
import pandas as pd

# Get the args
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder reference')
args = parser.parse_args()

# Get the experiment run context
run = Run.get_context()


# load the titanic data from the data reference
data_folder = args.data_folder
print("Loading data from", data_folder)
# Load all files and concatenate their contents as a single dataframe
all_files = os.listdir(data_folder)
titanic = pd.concat((pd.read_csv(os.path.join(data_folder,csv_file)) for csv_file in all_files))


# Count the rows and log the result
row_count = (len(titanic))
run.log('observations', row_count)
print('Analyzing {} rows of data'.format(row_count))

# Save a sample of the data and upload it to the experiment output
titanic.sample(100).to_csv('sample.csv', index=False, header=True)
run.upload_file(name = 'outputs/sample.csv', path_or_stream = 'sample.csv')

# Complete the run
run.complete()

In [None]:
from azureml.train.sklearn import SKLearn
from azureml.core import Experiment
from azureml.widgets import RunDetails

# Script arguments: the data reference is handed to the script as --data-folder
script_params = {'--data-folder': data_ref}

# Estimator that runs titanic-data.py from the experiment folder on the local compute
estimator = SKLearn(
    source_directory=experiment_folder,
    entry_script='titanic-data.py',
    script_params=script_params,
    compute_target='local',
)

# Experiment under which the run is recorded in the workspace
experiment_name = 'titanic-data'
experiment = Experiment(workspace=ws, name=experiment_name)

# Submit the estimator, display the run widget while it executes,
# then block until the run has finished
run = experiment.submit(config=estimator)
RunDetails(run).show()
run.wait_for_completion()

# Datasets

In [None]:
from azureml.core import Dataset

default_ds = ws.get_default_datastore()

# Nothing to do when the workspace already lists a dataset with this name
if 'titanic train dataset' in ws.datasets:
    print('Dataset already registered.')
else:
    # Upload the local training CSV into the 'titanic-data/' folder of the datastore,
    # replacing any existing file of the same name
    default_ds.upload_files(
        files=['./data/titanic/train.csv'],
        target_path='titanic-data/',
        overwrite=True,
        show_progress=True,
    )

    # Create a tabular dataset from the uploaded file (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(
        path=(default_ds, 'titanic-data/train.csv')
    )

    # Register the tabular dataset; a failure is printed rather than raised
    try:
        tab_data_set = tab_data_set.register(
            workspace=ws,
            name='titanic train dataset',
            description='titanic training data',
            tags={'format': 'CSV'},
            create_new_version=True,
        )
        print('Dataset registered.')
    except Exception as registration_error:
        print(registration_error)

In [None]:
from azureml.core import Dataset

default_ds = ws.get_default_datastore()

# Nothing to do when the workspace already lists a dataset with this name
if 'titanic test dataset' in ws.datasets:
    print('Dataset already registered.')
else:
    # Upload the local test CSV into the 'titanic-data/' folder of the datastore,
    # replacing any existing file of the same name
    default_ds.upload_files(
        files=['./data/titanic/test.csv'],
        target_path='titanic-data/',
        overwrite=True,
        show_progress=True,
    )

    # Create a tabular dataset from the uploaded file (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(
        path=(default_ds, 'titanic-data/test.csv')
    )

    # Register the tabular dataset; a failure is printed rather than raised
    try:
        tab_data_set = tab_data_set.register(
            workspace=ws,
            name='titanic test dataset',
            description='titanic testing data',
            tags={'format': 'CSV'},
            create_new_version=True,
        )
        print('Dataset registered.')
    except Exception as registration_error:
        print(registration_error)

In [None]:
# Look up the registered training dataset by name in the workspace
titanic_ds = ws.datasets.get("titanic train dataset")

# Convert it to a pandas dataframe and show its first rows as the cell output
titanic_df = titanic_ds.to_pandas_dataframe()
titanic_df.head()

In [None]:
# Reminder: do not forget this at the end of the experiment!!
# (if your work used a compute instance)
#
# This code cell only holds reminder text; every line is kept as a comment so the
# cell does not raise a SyntaxError when the notebook is run top to bottom.
# <img src='https://github.com/jtobelem-simplon/prepa-dp100/blob/master/images/down.png?raw=true'>



# Ne pas oublier à la fin de l'expérience!!
(si votre travail a utilisé une instance de calcul)

<img src='https://github.com/jtobelem-simplon/prepa-dp100/blob/master/images/down.png?raw=true'>



In [None]:
# Stop every compute instance of the workspace that is not already stopped
from azureml.core.compute import ComputeTarget, AmlCompute, ComputeInstance
from azureml.core.compute_target import ComputeTargetException

for compute in ComputeTarget.list(ws):
    # Only compute instances are considered; other compute target types are skipped
    if type(compute) is ComputeInstance and compute.get_status().state != 'Stopped':
        print('try to stop compute', compute.name)
        try:
            compute.stop(show_output=True)
        except ComputeTargetException as stop_error:
            # Report the failure and carry on: one instance that cannot be stopped
            # must not prevent the remaining instances from being stopped
            print('could not stop compute', compute.name, '-', stop_error)

In [None]:
# List every compute instance with its status to check that they are switched off
compute_instances = [
    target for target in ComputeTarget.list(ws) if type(target) is ComputeInstance
]

for instance in compute_instances:
    print(instance.name, instance.get_status())

# Ressources

[api azure](https://docs.microsoft.com/en-us/python/api/azureml-core)

[parcours d'apprentissage microsoft](https://docs.microsoft.com/fr-fr/learn/paths/build-ai-solutions-with-azure-ml-service/)

[le repository microsoft](https://github.com/MicrosoftDocs/mslearn-aml-labs.git)