# タクシー乗客数予測モデル構築（リモートトレーニング編）
#### 参考１：https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb

## Azure ML Service用の準備

In [27]:
import azureml
from azureml.core import Workspace, Run

# Print the version string reported by the installed azureml.core package.
print("Azure ML SDK Version: ", azureml.core.VERSION)

Azure ML SDK Version:  1.0.15


In [28]:
# Load the workspace configuration from the config.json file in the current folder.
ws = Workspace.from_config()
# Print the workspace name, resource group and location, separated by tabs.
print(ws.name, ws.resource_group, ws.location, sep = '\t')

Found the config file in: c:\usr\dev\notebook\config.json
hiouchiyamls	AzureMLserviceRG	westus2


In [29]:
from azureml.core import Experiment
import pandas as pd

# Experiment in the workspace that the runs from this notebook are submitted to.
experiment_name = 'ml-regression-remote-taxi'
exp = Experiment(workspace=ws, name=experiment_name)

# Local folder holding the scripts that are used for the remote training run.
project_folder = './sample_projects/ml-regression-remote-taxi'

# Summary of the current setup; the key order is the row order of the table.
output = {
    'SDK version': azureml.core.VERSION,
    'Subscription ID': ws.subscription_id,
    'Workspace Name': ws.name,
    'Resource Group': ws.resource_group,
    'Location': ws.location,
    'Project Directory': project_folder,
    'Experiment Name': exp.name,
}

# NOTE(review): -1 is the legacy "no width limit" value for this option; newer
# pandas releases expect None instead - confirm against the pandas version in use.
pd.set_option('display.max_colwidth', -1)

# Single-row frame (with an empty index label), transposed so that every
# setting is shown on its own row.
outputDf = pd.DataFrame(data=output, index=[''])
outputDf.T

Unnamed: 0,Unnamed: 1
SDK version,1.0.15
Subscription ID,2c30e7ba-539b-4017-a5bc-5028d8e170ec
Workspace Name,hiouchiyamls
Resource Group,AzureMLserviceRG
Location,westus2
Project Directory,./sample_projects/ml-regression-remote-taxi
Experiment Name,ml-regression-remote-taxi


## トレーニング用スクリプト用意

In [30]:
import os
# Create the project folder (and any missing parents); exist_ok=True makes
# re-running this cell a no-op when the folder is already there.
os.makedirs(project_folder, exist_ok=True)

In [31]:
%%writefile $project_folder/get_data.py

import numpy as np
from azure.storage.blob import BlockBlobService
import pandas as pd
import os.path

def get_data():
    """Return the training features and target read from ``./inputdata.csv``.

    If ``./inputdata.csv`` does not exist yet, it is first downloaded from
    Azure Blob Storage; when the file is already present it is reused as is.

    The storage settings can be overridden through the environment variables
    ``TAXI_STORAGE_ACCOUNT_NAME``, ``TAXI_STORAGE_ACCOUNT_KEY``,
    ``TAXI_STORAGE_CONTAINER`` and ``TAXI_STORAGE_BLOB``. When a variable is
    not set, the value that used to be hard-coded here is used, so existing
    runs keep working unchanged.

    Returns:
        dict: ``"X"`` is a DataFrame with every column of the CSV except the
        date parts, the target and the two trip averages; ``"y"`` is the
        array of values of the ``count`` column.
    """
    local_path = './inputdata.csv'

    if not os.path.exists(local_path):
        account_name = os.environ.get('TAXI_STORAGE_ACCOUNT_NAME', 'cognitiveservicerg978')
        # SECURITY: a storage account key is embedded in source as the legacy
        # default. It should be rotated and supplied only through
        # TAXI_STORAGE_ACCOUNT_KEY; the literal is kept solely so that existing
        # runs do not break.
        account_key = os.environ.get(
            'TAXI_STORAGE_ACCOUNT_KEY',
            'IQndkgEB3mLChKaaq3VW1B5Hp97tnH1EXg4XMJRujWOkVpJpsC52H6O29I7GT33K7LFIvU0V7iCFcD9XPZFlyQ==',
        )
        container_name = os.environ.get('TAXI_STORAGE_CONTAINER', 'taxi-demo')
        blob_name = os.environ.get(
            'TAXI_STORAGE_BLOB',
            'processed_data/part-00000-tid-2330783742724801539-e7e51907-ff34-4439-94e0-576f18c193ea-1202-c000.csv',
        )

        service = BlockBlobService(account_name=account_name, account_key=account_key)
        # Download under a temporary name and rename only on success, so an
        # interrupted download never leaves a truncated inputdata.csv that
        # later calls would silently reuse.
        partial_path = local_path + '.part'
        service.get_blob_to_path(container_name, blob_name, partial_path)
        os.replace(partial_path, local_path)

    df = pd.read_csv(local_path)
    # Columns that are not used as features (this includes the target itself).
    excluded_columns = [
        "pickup_year",
        "pickup_month",
        "pickup_day",
        "count",
        "avg_trip_distance",
        "avg_trip_time_in_secs",
    ]
    X_train = df.drop(columns=excluded_columns)
    y_train = df["count"].values

    return {"X": X_train, "y": y_train}

Overwriting ./sample_projects/ml-regression-remote-taxi/get_data.py


## トレーニング実施（AML Compute - Persistent compute target 編）

In [32]:
from azureml.core.compute import ComputeTarget, AmlCompute
# Display the VM sizes AmlCompute reports as supported for this workspace.
AmlCompute.supported_vmsizes(workspace = ws)

[{'name': 'Standard_F2s_v2', 'vCPUs': 2, 'memoryGB': 4.0},
 {'name': 'Standard_F4s_v2', 'vCPUs': 4, 'memoryGB': 8.0},
 {'name': 'Standard_F8s_v2', 'vCPUs': 8, 'memoryGB': 16.0},
 {'name': 'Standard_F16s_v2', 'vCPUs': 16, 'memoryGB': 32.0},
 {'name': 'Standard_F32s_v2', 'vCPUs': 32, 'memoryGB': 64.0},
 {'name': 'Standard_F64s_v2', 'vCPUs': 64, 'memoryGB': 128.0},
 {'name': 'Standard_F72s_v2', 'vCPUs': 72, 'memoryGB': 144.0},
 {'name': 'Standard_D1_v2', 'vCPUs': 1, 'memoryGB': 3.5},
 {'name': 'Standard_D2_v2', 'vCPUs': 2, 'memoryGB': 7.0},
 {'name': 'Standard_D3_v2', 'vCPUs': 4, 'memoryGB': 14.0},
 {'name': 'Standard_D4_v2', 'vCPUs': 8, 'memoryGB': 28.0},
 {'name': 'Standard_D11_v2', 'vCPUs': 2, 'memoryGB': 14.0},
 {'name': 'Standard_D12_v2', 'vCPUs': 4, 'memoryGB': 28.0},
 {'name': 'Standard_D13_v2', 'vCPUs': 8, 'memoryGB': 56.0},
 {'name': 'Standard_D14_v2', 'vCPUs': 16, 'memoryGB': 112.0},
 {'name': 'Standard_DS1_v2', 'vCPUs': 1, 'memoryGB': 3.5},
 {'name': 'Standard_DS2_v2', 'vCPUs':

### Persistent compute target作成

In [33]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster to reuse or, if it is missing, to create.
cpu_cluster_name = "cpuclusterdemo"

try:
    # Look the cluster up by name; ComputeTargetException is treated as
    # "no such cluster yet".
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # Provision a new cluster with exactly one node (min_nodes == max_nodes == 1).
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_DS15_v2',  # alternative noted by the author: 'Standard_NC6'
        min_nodes=1,
        max_nodes=1,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Wait for the cluster operation to complete, showing its progress output.
cpu_cluster.wait_for_completion(show_output=True)

Creating
Succeeded..............
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


In [34]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Packages for the remote environment: numpy and pandas from conda, the
# AutoML SDK and the Azure Storage client from pip.
conda_deps = CondaDependencies.create(
    conda_packages=['numpy', 'pandas'],
    pip_packages=['azureml-sdk[automl]', 'azure-storage'],
)

# Run configuration for a Python run that targets the AmlCompute cluster
# created in the previous step.
run_config = RunConfiguration(framework="python")
run_config.target = cpu_cluster.name

# Enable Docker and use the SDK's default CPU base image.
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE

# Attach the package list to the Python section of the environment.
run_config.environment.python.conda_dependencies = conda_deps

In [38]:
import logging

from azureml.train.automl import AutoMLConfig

# Settings that are forwarded to AutoMLConfig as keyword arguments.
automl_settings = dict(
    iteration_timeout_minutes=30,
    iterations=3,
    primary_metric='r2_score',
    preprocess=True,
    verbosity=logging.INFO,
    n_cross_validations=5,
)

# AML Compute: regression task configured with the run configuration defined
# earlier; the training data comes from get_data.py in the project folder.
automated_ml_config = AutoMLConfig(
    task='regression',
    debug_log='automated_ml_errors.log',
    path=project_folder,
    run_configuration=run_config,
    data_script=project_folder + "/get_data.py",
    **automl_settings
)

In [39]:
# Submit the AutoML configuration to the experiment. show_output is False, so
# progress is not printed here; the run object is displayed instead.
run_persistent = exp.submit(automated_ml_config, show_output = False)
run_persistent

Experiment,Id,Type,Status,Details Page,Docs Page
ml-regression-remote-taxi,AutoML_fa4815ca-5242-4236-85ea-fdd23f9db9b4,automl,Preparing,Link to Azure Portal,Link to Documentation


In [40]:
from azureml.widgets import RunDetails
# Show the run-details widget for the submitted run.
RunDetails(run_persistent).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [41]:
%%time
# Wait for the run to complete and show its output on stdout. The call also
# returns the run details, which the notebook displays as the cell result.
run_persistent.wait_for_completion(show_output=True)


*******************************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
TRAINFRAC: Fraction of the training data to train on.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
*******************************************************************************************************************

 ITERATION   PIPELINE                                       TRAINFRAC  DURATION      METRIC      BEST
         0   StandardScalerWrapper DecisionTree             1          0:03:13       0.2208    0.2208
         1   MaxAbsScaler ExtremeRandomTrees                1          0:31:01          nan    0.2208
ERROR:                                                 
         2    Ensemble                                      1          0:01:41       0.2208  

{'runId': 'AutoML_fa4815ca-5242-4236-85ea-fdd23f9db9b4',
 'target': 'cpuclusterdemo',
 'status': 'Completed',
 'startTimeUtc': '2019-02-16T09:50:50.286935Z',
 'endTimeUtc': '2019-02-16T10:26:59.846627Z',
 'properties': {'num_iterations': '3',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'r2_score',
  'train_split': '0',
  'MaxTimeSeconds': '1800',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'cpuclusterdemo',
  'DataPrepJsonString': None,
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'dependencies_versions': '{"azureml-widgets": "1.0.15", "azureml-train": "1.0.15", "azureml-train-restclients-hyperdrive": "1.0.15", "azureml-train-core": "1.0.15", "azureml-train-automl": "1.0.15", "azureml-telemetry": "1.0.15", "azureml-sdk": "1.0.15.1", "azureml-pipeline": "1.0.15", "azureml-pipeline-steps": "1.0.15", "azureml-pipeline-core": "1.0.15", "azureml-dataprep": "1.0.12", "azure

In [42]:
# Collect the metrics of every child run, keyed by the iteration number
# stored in the child run's properties.
children = list(run_persistent.get_children())
metricslist = {}
for run in children:
    properties = run.get_properties()
    # Only float-valued metrics are kept; values of any other type are skipped.
    metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

# One column per iteration, one row per metric, columns ordered by iteration
# number. The axis is passed by keyword: sort_index(1) relies on a positional
# argument that newer pandas releases reject, while axis=1 works on both old
# and new versions.
rundata = pd.DataFrame(metricslist).sort_index(axis=1)
rundata

Unnamed: 0,0,1,2
explained_variance,0.220774,,0.220774
mean_absolute_error,4.990241,,4.990241
median_absolute_error,3.262016,,3.262016
normalized_mean_absolute_error,0.015792,,0.015792
normalized_median_absolute_error,0.010323,,0.010323
normalized_root_mean_squared_error,0.027534,,0.027534
normalized_root_mean_squared_log_error,0.154563,,0.154563
r2_score,0.220771,,0.220771
root_mean_squared_error,8.700757,,8.700757
root_mean_squared_log_error,0.783464,,0.783464


In [43]:
# get_output() returns a pair: a run and the fitted model that belongs to it.
# Both are printed for inspection.
best_run, fitted_model = run_persistent.get_output()
print(best_run)
print(fitted_model)

Run(Experiment: ml-regression-remote-taxi,
Id: AutoML_fa4815ca-5242-4236-85ea-fdd23f9db9b4_2,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(logger=None, task=None)), ('prefittedsoftvotingregressor', PreFittedSoftVotingRegressor(estimators=[('DecisionTree', Pipeline(memory=None,
     steps=[('standardscalerwrapper', <automl.client.core.common.model_wrappers.StandardScalerWrapper object at 0x0000...tate=None,
           splitter='best'))]))],
               flatten_transform=None, weights=[1.0]))])


In [44]:
# Register the model from the AutoML run with a description and no tags.
description = 'Automated Machine Learning Model'
tags = None
model = run_persistent.register_model(description=description, tags=tags)
# The id printed here is the one to use when deploying the model as a web service in Azure.
print(run_persistent.model_id) # Use this id to deploy the model as a web service in Azure

Registering model AutoMLfa4815ca5best
AutoMLfa4815ca5best


In [58]:
import json
import numpy as np
# Sample input record, encoded as a JSON string and parsed into a dict.
input_data = "{\"pickup_hour\":1, \"pickup_latitude_group\":59, \"pickup_longitude_group\":76, \"pickup_dayofweek\":2, \"TMAX\":40, \"TMIN\":26, \"AWND\":6, \"PRCP\":0, \"SNOW\":0, \"SNWD\":0}"
inputdata = json.loads(input_data)

# Bind every field to a variable of the same name, in feature order.
(pickup_hour, pickup_latitude_group, pickup_longitude_group, pickup_dayofweek,
 TMAX, TMIN, AWND, PRCP, SNOW, SNWD) = (
    inputdata[name]
    for name in (
        'pickup_hour',
        'pickup_latitude_group',
        'pickup_longitude_group',
        'pickup_dayofweek',
        'TMAX',
        'TMIN',
        'AWND',
        'PRCP',
        'SNOW',
        'SNWD',
    )
)

# Single feature row; only pickup_hour is explicitly coerced to int, the other
# values are used exactly as parsed from the JSON.
feature_row = [
    int(pickup_hour),
    pickup_latitude_group,
    pickup_longitude_group,
    pickup_dayofweek,
    TMAX,
    TMIN,
    AWND,
    PRCP,
    SNOW,
    SNWD,
]

# 2-D array with one row, displayed as the cell result.
X = np.array([feature_row])
X

array([[ 1, 59, 76,  2, 40, 26,  6,  0,  0,  0]])

In [66]:
# Build a one-row DataFrame (index 0) from the parsed sample record. The
# columns are created in the order in which the names are listed here.
feature_names = [
    'pickup_hour',
    'pickup_latitude_group',
    'pickup_longitude_group',
    'pickup_dayofweek',
    'TMAX',
    'TMIN',
    'AWND',
    'PRCP',
    'SNOW',
    'SNWD',
]
df = pd.DataFrame({name: inputdata[name] for name in feature_names}, index=[0])
df

Unnamed: 0,pickup_hour,pickup_latitude_group,pickup_longitude_group,pickup_dayofweek,TMAX,TMIN,AWND,PRCP,SNOW,SNWD
0,1,59,76,2,40,26,6,0,0,0


In [64]:
import pandas as pd
# Read the comma-separated test data from ./testdata.csv; this rebinds df.
df = pd.read_csv('./testdata.csv', sep=',')
# Display at most the first three rows.
df.head(3)

Unnamed: 0,pickup_hour,pickup_latitude_group,pickup_longitude_group,pickup_dayofweek,TMAX,TMIN,AWND,PRCP,SNOW,SNWD
0,0,59,76,2,40,26,6.93,0,0,0


In [68]:
# Run the fitted model on the rows currently held in df and display the predictions.
fitted_model.predict(df)

array([1.22552934])