In [None]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
from azureml.core import Workspace, Datastore, Dataset, Experiment, Environment
from azureml.core.model import Model, InferenceConfig
from azureml.core.webservice import AksWebservice, Webservice
from azureml.core.compute import AksCompute
from dateutil.relativedelta import relativedelta
from datetime import datetime, timedelta

In [None]:
# Project folders, all relative to the notebook's working directory.
CODE_PATH = '../code'
DATA_PATH = '../sample_data'
DOWNLOAD_PATH = '../download'
AML_UTIL_PATH = '../ml_service'

# Make the project's own modules importable from the notebook.
for module_dir in (CODE_PATH, AML_UTIL_PATH):
    sys.path.append(os.path.join(os.getcwd(), module_dir))

import utils
import consts
import train
import aml_utils as amlutils

In [None]:
# This is only needed when we run in a Jupyter notebook and the external files are changed.
# Reloading re-executes each project module so edits made on disk are picked up
# without restarting the kernel.
import importlib
importlib.reload(utils)
importlib.reload(consts)
importlib.reload(train)
importlib.reload(amlutils)

In [None]:
# Connect to the workspace described by the config under ../.azureml and open
# the experiment that the runs in this notebook are submitted to.
ws = Workspace.from_config(path='../.azureml')
experiment = Experiment(workspace=ws, name='nyctaxi_automl')

# Compute target names are taken from the environment; a missing variable
# raises KeyError here rather than later in the notebook.
compute_target_name = os.environ['AML_COMPUTE']
inference_target_name = os.environ['AML_INFERENCE_COMPUTE']

## Read raw data and prepare for training

Read the raw data from the `sample_data` folder if it already exists there; otherwise download it from Azure Open Datasets.

In [None]:
from pathlib import Path

# Local folder holding the raw CSV; created together with any missing parents.
RAW_DATA_PATH = os.path.join(DATA_PATH, 'raw')
os.makedirs(RAW_DATA_PATH, exist_ok=True)

In [None]:
# Date window of the sample: 1 Jan 2015 through 31 Jan 2015.
start = datetime.strptime("1/1/2015","%m/%d/%Y")
end = datetime.strptime("1/31/2015","%m/%d/%Y")

# The cached CSV is named after the year and month of `start`.
DATA_FILE_NAME = '{}{:02d}_automl.csv'.format(start.year, start.month)
RAW_DATA_FILE = os.path.join(RAW_DATA_PATH, DATA_FILE_NAME)

if not os.path.isfile(RAW_DATA_FILE):
    print ("Downloading raw data from Azure Open dataset")
    from azureml.opendatasets import NycTlcGreen

    # get Jan data first, later we will also predict on Jul data to detect if there's any drift

    # Collect the per-month samples and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    monthly_samples = []

    for sample_month in range(1):
        temp_df_green = NycTlcGreen(start + relativedelta(months=sample_month), end + relativedelta(months=sample_month)) \
            .to_pandas_dataframe()
        # 1000 random rows per month; no seed is set, so the sample differs between downloads.
        monthly_samples.append(temp_df_green.sample(1000))

    dfraw = pd.concat(monthly_samples)

    # Cache the sample so later runs take the `else` branch below.
    # NOTE(review): this branch keeps the in-memory frame while the other one goes
    # through utils.read_raw_data - confirm both yield the same columns/dtypes.
    dfraw.to_csv(RAW_DATA_FILE)
else:
    print ("Reading raw data from existing file")
    dfraw = utils.read_raw_data(RAW_DATA_FILE)

dfraw.head(5)

In [None]:
# Turn the raw trip records into the training frame using the project helper,
# then preview the first rows.
dftrain = utils.process_raw_data(dfraw)
dftrain.head(5)

In [None]:
# Summary statistics of the label column.
dftrain['duration'].describe()

Create AML datastore

In [None]:
# Reuse the datastore named by AML_DATASTORE when the lookup succeeds; otherwise
# register the blob container described by the BLOB_* environment variables.
datastore_name = os.environ['AML_DATASTORE']
try:
    datastore = Datastore.get(ws, datastore_name=datastore_name)
except Exception:
    print('create datastore {}'.format(datastore_name))
    container_name = os.environ["BLOB_CONTAINER"]
    account_name = os.environ["BLOB_ACCOUNTNAME"]
    account_key = os.environ["BLOB_ACCOUNT_KEY"]

    datastore = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name=datastore_name,
        container_name=container_name,
        account_name=account_name,
        account_key=account_key,
    )
else:
    print('datastore {} exists'.format(datastore_name))

### Upload prepared data so that it can be accessed when training remotely

In [None]:
# Split into train/test parts, then glue each feature frame back to its label
# (inner join on the index) so each part can be written out as one table.
x_train, x_test, y_train, y_test = train.split_data(dftrain)
training, testing = (
    pd.concat([features, labels], axis=1, join='inner')
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

In [None]:
TRAIN_DATA_PATH = os.path.join(DATA_PATH, 'train')

# Start from an empty local training folder: create it when missing, otherwise
# delete whatever an earlier run left behind.
if not os.path.exists(TRAIN_DATA_PATH):
    print("Create folder for training data")
    Path(TRAIN_DATA_PATH).mkdir(parents=True, exist_ok=True)
else:
    import glob
    print("Remove existing processed training data")
    for stale_file in glob.glob(os.path.join(TRAIN_DATA_PATH, '*')):
        os.remove(stale_file)

# Write the training table locally, then push that file to the 'train' folder
# of the datastore so remote compute can read it.
utils.write_train_data(training, TRAIN_DATA_PATH, DATA_FILE_NAME)
local_train_file = os.path.join(TRAIN_DATA_PATH, DATA_FILE_NAME)
datastore.upload_files(files=[local_train_file], target_path='train', overwrite=True)

## Train locally or remotely

In [None]:
import logging
from azureml.train.automl import AutoMLConfig
from azureml.automl.core.featurization import FeaturizationConfig

# Set to True to train on the in-memory frame instead of the uploaded dataset
# and remote compute target.
local_training = False

# Declare passengerCount as a numeric column for AutoML featurization.
featurization_config = FeaturizationConfig()
featurization_config.add_column_purpose('passengerCount', 'Numeric')

# Settings shared by the local and the remote configuration.
automl_settings = {
     'task': 'regression',
     'iterations': 20,
     'iteration_timeout_minutes': 2,
     'experiment_timeout_minutes': 20,
     'whitelist_models': ['LightGBM'],
     'primary_metric': 'normalized_root_mean_squared_error',
     'n_cross_validations': 5,
     'label_column_name': 'duration',
     'verbosity': logging.INFO,
     'preprocess': False,
     'model_explainability': True,
     'featurization': featurization_config
}

if local_training:
    automl_config = AutoMLConfig(
        training_data=training,
        **automl_settings)
else:
    # Datastore-relative paths use '/' on every platform; os.path.join would
    # produce a backslash-separated path when the notebook runs on Windows.
    tabular_train_dataset = Dataset.Tabular.from_delimited_files(
        path=[(datastore, 'train/{}'.format(DATA_FILE_NAME))])
    compute_target = ws.compute_targets[compute_target_name]

    automl_config = AutoMLConfig(
        path=CODE_PATH,
        training_data=tabular_train_dataset,
        compute_target=compute_target,
        **automl_settings)

### Train without a pipeline

In [None]:
# Submit the AutoML configuration directly (no pipeline); show_output=True streams progress into the cell.
run = experiment.submit(automl_config, show_output=True)

### Train remotely with a Pipeline

In [None]:
from azureml.train.automl.runtime import AutoMLStep

# Step name; a later cell uses it to locate this step's run inside the pipeline run.
TRAIN_STEP_NAME = 'nyc_automl_regression'

# Wrap the AutoML configuration built above as a pipeline step.
trainWithAutoMLStep = AutoMLStep(
    name=TRAIN_STEP_NAME,
    automl_config=automl_config
)

In [None]:
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# The pipeline consists of the single AutoML training step.
pipeline_steps = [trainWithAutoMLStep]

pipeline = Pipeline(workspace = ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline to the experiment; later cells use the returned run
# to find the training step and to publish the pipeline.
pipeline_run = experiment.submit(pipeline, regenerate_outputs=False)
print("Pipeline submitted for execution.")

In [None]:
# Optional: re-attach to a previously submitted pipeline run (for example after
# a kernel restart) instead of using the run submitted above. The run id is read
# from the AML_PIPELINE_RUN_ID environment variable; when it is unset or empty,
# `pipeline_run` is left untouched. This replaces a hard-coded debug run id that
# silently discarded the freshly submitted run.
from azureml.pipeline.core import PipelineRun

existing_pipeline_run_id = os.environ.get('AML_PIPELINE_RUN_ID')
if existing_pipeline_run_id:
    pipeline_run = PipelineRun(experiment, existing_pipeline_run_id)

In [None]:
# Display the pipeline run object as the cell output.
pipeline_run

In [None]:
from azureml.train.automl.run import AutoMLRun

# Take the first step run registered under the training step's name
# (indexing with [0] raises IndexError if none is found) and re-open it
# as an AutoMLRun so the AutoML-specific API is available.
train_step_run = pipeline_run.find_step_run(TRAIN_STEP_NAME)[0]
automl_run = AutoMLRun(experiment=experiment, run_id=train_step_run.id)

### Get training results

In [None]:
# Fetch the run/model pair returned by the AutoML run.
# Note: this rebinds `run`, replacing any value set by an earlier cell.
run, model = automl_run.get_output()

In [None]:
# Predict on the held-out features and plot actual vs. predicted values by index.
y_predict = model.predict(x_test)
actual_vs_predicted = y_test.to_frame().assign(predicted=y_predict)
actual_vs_predicted.sort_index().plot(figsize=(20, 5), rot=45)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Score the test-set predictions: RMSE from scikit-learn's MSE, MAPE from the project helper.
y_actual = y_test.to_numpy().ravel().tolist()
rmse = sqrt(mean_squared_error(y_actual, y_predict))
mape = utils.MAPE(y_test, y_predict)

# Record both metrics on the run, then echo them in the cell output.
for metric_name, metric_value in (('rmse', rmse), ('mape', mape)):
    run.log(metric_name, metric_value)
print("rmse:{0}, mape:{1}".format(rmse, mape))

### Save the model

In [None]:
# Serialize the fitted model into the local 'outputs' folder.
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', consts.model_name_automl)
print('writing model file to {}'.format(model_file))
joblib.dump(value=model, filename=model_file)
# Only when training ran locally is the file uploaded to the run explicitly.
if local_training:
    run.upload_file(name=consts.model_name_automl, path_or_stream=model_file)

## Publish the pipeline

In [None]:
# Publish the pipeline behind `pipeline_run` under a fixed name and version.
published_pipeline = pipeline_run.publish_pipeline(
     name="NYCtaxi_pipeline",
     description="Pipeline to train NYC taxi data for duration prediction",
     version="1.0")

In [None]:
# also publish a versioned endpoint of the pipeline
from azureml.pipeline.core import PipelineEndpoint

pipeline_endpoint = PipelineEndpoint.publish(workspace=ws, 
                                             name="NYCtaxi_pipeline_endpoint_notebook",
                                             pipeline=published_pipeline, 
                                             description="Pipeline endpoint from Notebook")

## Explain the model

In [None]:
from azureml.explain.model._internal.explanation_client import ExplanationClient
import matplotlib.pyplot as plt

# Download the explanation stored with the run (raw=False) and read its
# feature-importance dictionary.
client = ExplanationClient.from_run(run)
engineered_explanations = client.download_model_explanation(raw=False)
global_importance = engineered_explanations.get_feature_importance_dict()

# Order features from lowest to highest importance and draw a horizontal bar chart.
l2h = {
    feature: importance
    for feature, importance in sorted(global_importance.items(), key=lambda item: item[1])
}
plt.figure(figsize=(20, 5))
plt.barh(range(len(l2h)), l2h.values(), tick_label=list(l2h))

## Explain the model for test data

In [None]:
from azureml.train.automl.runtime.automl_explain_utilities import automl_setup_model_explanations

# Build the explanation setup object from the fitted model and the train/test split;
# later cells read its estimator, transformed datasets, feature names and feature map.
automl_explainer_setup_obj = automl_setup_model_explanations(model, X=x_train, 
                                                             X_test=x_test, y=y_train, 
                                                             task='regression')

In [None]:
from azureml.explain.model.mimic.models.lightgbm_model import LGBMExplainableModel
from azureml.explain.model.mimic_wrapper import MimicWrapper

# Initialize the Mimic Explainer
# Explainer over the engineered features: initialized on the transformed training
# data from the setup object, together with its feature names and feature map.
engineered_explainer = MimicWrapper(ws, automl_explainer_setup_obj.automl_estimator, LGBMExplainableModel, 
                         init_dataset=automl_explainer_setup_obj.X_transform, run=run,
                         features=automl_explainer_setup_obj.engineered_feature_names, 
                         feature_maps=[automl_explainer_setup_obj.feature_map])
# Explainer over the raw features: initialized on the untransformed training data
# with the fitted model itself.
raw_explainer = MimicWrapper(ws, model, LGBMExplainableModel, 
                         init_dataset=x_train, 
                         run=run,
                         features=automl_explainer_setup_obj.raw_feature_names)

In [None]:
# Explain the raw-feature test data. The explainer built for raw features is
# `raw_explainer`; the name `explainer` is never defined in this notebook and
# raised NameError on a fresh kernel.
raw_explanations = raw_explainer.explain(['local', 'global'], eval_dataset=x_test)
raw_testdata_importance = raw_explanations.get_feature_importance_dict()

# Order features from lowest to highest importance and draw a horizontal bar chart.
l2h=dict(sorted(raw_testdata_importance.items(), key=lambda x: x[1]))
plt.figure(figsize=(20, 5))
plt.barh(range(len(l2h)), l2h.values(), tick_label=list(l2h.keys()))

In [None]:
# Explain the transformed test data with the engineered-feature explainer.
engineered_explanations = engineered_explainer.explain(
    ['local', 'global'],
    eval_dataset=automl_explainer_setup_obj.X_test_transform,
)
testdata_importance = engineered_explanations.get_feature_importance_dict()

# Order features from lowest to highest importance and draw a horizontal bar chart.
l2h = {
    feature: importance
    for feature, importance in sorted(testdata_importance.items(), key=lambda item: item[1])
}
plt.figure(figsize=(20, 5))
plt.barh(range(len(l2h)), l2h.values(), tick_label=list(l2h))

## Explain the model during inference

In [None]:
from azureml.interpret.scoring.scoring_explainer import TreeScoringExplainer, save, load

# Initialize the ScoringExplainer
# Alternative kept for reference (it refers to `explainer`, a name this notebook never defines):
#scoring_explainer = TreeScoringExplainer(explainer.explainer, feature_maps=[automl_explainer_setup_obj.feature_map])
# The scoring explainer wraps the explainer held by the raw-feature MimicWrapper.
scoring_explainer = TreeScoringExplainer(raw_explainer.explainer)

# Pickle scoring explainer locally
save(scoring_explainer, directory='outputs', exist_ok=True)

# Register scoring explainer
# The local pickle is uploaded to the run under a different name
# ('raw_scoring_explainer.pkl'), and that uploaded path is what gets registered.
run.upload_file('raw_scoring_explainer.pkl', 'outputs/scoring_explainer.pkl')
scoring_explainer_model = run.register_model(model_name='scoring_explainer', model_path='raw_scoring_explainer.pkl')

In [None]:
from azureml.core import Run

# Reuse the run handle from earlier cells when this kernel still has one;
# otherwise open a run by id (replace the placeholder with a real run id first).
if 'automl_run' in globals():
    print("use existing run")
else:
    print("get a run")
    automl_run = Run(experiment, run_id = '{specify_target_run_id}')

In [None]:
# Register the model from the AutoML run under the project's model name,
# tagged with how it was trained.
# NOTE(review): if `automl_run` came from the plain `Run(...)` fallback in the
# previous cell, this call may need an explicit model_path - confirm.
registered_model = automl_run.register_model(
    model_name=consts.model_name_automl,
    tags={"trainedIn":"pipeline", "trainedBy":"automl"},
    description='AutoML model for predicting taxi trip duration')

### Download the model, make some predictions.

In [None]:
# Reuse the in-memory model when this kernel still has one; otherwise download
# the registered model and load it from disk.
try:
    model
    print("use existing model")
except NameError:
    print("get the model")
    registered_model = Model(ws, consts.model_name_automl)
    registered_model.download(target_dir=DOWNLOAD_PATH)
    # joblib.load unpickles the file: only load models that come from a trusted workspace.
    downloaded_model = joblib.load(os.path.join(DOWNLOAD_PATH, consts.model_name_automl))
    # Bind `model` as well: the cells below call model.predict(...), and without
    # this assignment the fallback path ended in a NameError there.
    model = downloaded_model

In [None]:
# input is an array of datapoints, each has an array of features
input_sample = pd.DataFrame(data=[
    {'vendorID': 1, 'passengerCount': 1, 'tripDistance': 1.00, 'pickupLongitude': -73.957909, 'pickupLatitude': 40.670761, 
     'dropoffLongitude': -73.952194, 'dropoffLatitude': 40.662312, 'totalAmount': 8.15, 'month_num': 1, 
     'day_of_month': 17, 'day_of_week': 5, 'hour_of_day': 1}])
# output is an array of predictions
output_sample = model.predict(input_sample)
output_sample

## Deploy the model as a web service

In [None]:
# Automl can be deployed without code in the portal
# the env file and scoring file are auto-generated, and can be found in the "Outputs" of the "Run"
inference_env = Environment.from_conda_specification(
    name = consts.inference_environment_name,
    file_path = os.path.join(CODE_PATH, 'inference_automl_env.yml'))
inference_config = InferenceConfig(source_directory = CODE_PATH,
                                   entry_script = 'score_automl.py',
                                   environment = inference_env)

In [None]:
from azureml.exceptions import WebserviceException

# AKS cluster to deploy to and the resources requested for the service.
aks_target = AksCompute(ws, inference_target_name)
deployment_config = AksWebservice.deploy_configuration(
    cpu_cores = 1, memory_gb = 2, collect_model_data=False, enable_app_insights=False)

# Only the lookup is guarded: a failed lookup means the service has to be deployed.
# Errors raised by update()/deploy() themselves are no longer swallowed by a bare
# `except:` (which also caught KeyboardInterrupt and turned a failed update into
# a second, misleading deploy attempt).
try:
    service = Webservice(ws, consts.service_name_automl)
except WebserviceException:
    print('deploy a new service {}'.format(consts.service_name_automl))
    service = Model.deploy(ws, consts.service_name_automl, [scoring_explainer_model, registered_model], inference_config, deployment_config, aks_target)
    # Uncomment to block until the deployment finishes and inspect its state/logs:
    #service.wait_for_deployment(show_output = True)
    #print(service.state)
    #print(service.get_logs())
else:
    print("Service {} exists, update it".format(consts.service_name_automl))
    service.update(models=[scoring_explainer_model, registered_model], inference_config=inference_config)

#print(service.scoring_uri)

#print(service.scoring_uri)

### Test against the deployed service

In [None]:
import requests
import json

headers = {'Content-Type': 'application/json'}

# Attach whichever credential the service is configured for (key or token).
if service.auth_enabled:
    headers['Authorization'] = 'Bearer '+service.get_keys()[0]
elif service.token_auth_enabled:
    headers['Authorization'] = 'Bearer '+service.get_token()[0]

# Show the headers without the credential: notebook outputs are saved with the
# file and shared, so the bearer key/token must not be printed.
print({name: ('<redacted>' if name == 'Authorization' else value)
       for name, value in headers.items()})

# Request body: a single trip under the "data" key, serialized with json.dumps
# instead of a hand-written, backslash-continued JSON string.
test_sample = json.dumps({"data": [
    {"vendorID": 1, "passengerCount": 2, "tripDistance": 1.00, "pickupLongitude": -73.957909, "pickupLatitude": 40.670761,
     "dropoffLongitude": -73.952194, "dropoffLatitude": 40.662312, "totalAmount": 8.15, "month_num": 1,
     "day_of_month": 17, "day_of_week": 5, "hour_of_day": 1}]})

response = requests.post(service.scoring_uri, data=test_sample, headers=headers)
print(response.status_code)
print(response.json())