In [None]:
import azureml.core
import string
import os
import datetime

# Print the installed Azure ML SDK version so it is recorded in the notebook output.
print("SDK version:", azureml.core.VERSION)

# Required data folder structure
    proj_root_anyname
      data
        anyname1
          Annotations
          JPEGImages
          pascal_label_map.pbtxt
        anyname2
          Annotations
          JPEGImages
          pascal_label_map.pbtxt
        anynameN
        ...
      models
        faster_rcnn_anyname
          -- must have model.ckpt.data*
          pipeline.config
      tfrecords -- output folder
        pascal_label_map.pbtxt
        pipeline.config -- with path, num_classes, num_steps correctly set
        test.record
        train.record
        val.record


# Training parameters

In [None]:
# Project folder on the datastore; data/, models/ and tfrecords/ are resolved beneath it.
proj_root = "pets"

# Sub-folder of models/ that holds the base model to start training from.
base_model = "faster_rcnn_resnet101_coco_2018_01_28"

# Forwarded to the training script as --force_regenerate_tfrecords.
force_regenerate_tfrecords = False

# Forwarded to the training script as --num_steps.
training_steps = 10

# Selects the GPU compute target and enables GPU support in docker.
support_gpu = True

# Pet data doesn't have the class in the annotation xml.
classname_in_filename = True

# Environment parameters

In [None]:
# Azure ML workspace coordinates (None when the variable is not set).
subscription_id = os.environ.get("SUBSCRIPTION_ID")
resource_group = os.environ.get("RESOURCE_GROUP")
workspace_name = os.environ.get("WORKSPACE_NAME")

# Datastore names; a missing PROJ_DATASTORE means "use the workspace default datastore".
proj_datastore = os.environ.get("PROJ_DATASTORE")
logs_datastore = os.environ.get("LOGS_DATASTORE", "logsds")

# Names of the CPU and GPU compute targets in the workspace.
compute_cpu = os.environ.get("AML_COMPUTE_CPU", "amlcpu")
compute_gpu = os.environ.get("AML_COMPUTE_GPU", "amlnv6")

# Container registry credentials and the short name of the training image.
docker_registry_address = os.environ.get("ACR_ID")
docker_registry_username = os.environ.get("ACR_USERNAME")
docker_registry_password = os.environ.get("ACR_PASSWORD")
training_docker_image_short_name = os.environ.get("TRAINING_DOCKER_SHORT_NAME")

# Constants

In [None]:
# Sub-folders expected under the project root on the datastore.
DATA_SUBDIR = "data"
TFRECORDS_SUBDIR = "tfrecords"
MODELS_SUBDIR = "models"

# Local folder and entry script submitted as the training job.
SCRIPT_FOLDER = "./scripts"
SCRIPT_FILE = "train.py"

# Set up Azure ML environment

In [None]:
from azureml.core import Workspace, Experiment, Run, Datastore
from azureml.core.runconfig import ContainerRegistry

# Connect to the Azure ML workspace identified by the environment parameters.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [None]:
# one time set up for a datastore that contains the project root
#ds = Datastore.register_azure_blob_container(
#        workspace=ws,
#        datastore_name=proj_datastore,
#        container_name='container_name_for_proj_root',
#        account_name='storage_account',
#        account_key='account_key',
#        create_if_not_exists=True)
# optionally set it to default datastore for the AML workspace
#ws.set_default_datastore(proj_datastore)

# one time set up for a datastore that will contain Tensorflow logs for Tensorboard
#dslogs = Datastore.register_azure_blob_container(
#        workspace=ws,
#        datastore_name=logs_datastore,
#        container_name='container_name_for_logs',
#        account_name='same_account_as_the_default_datastore_for_azureml',
#        account_key='account_key',
#        create_if_not_exists=True)

In [None]:
# Resolve the project datastore: use the workspace default when PROJ_DATASTORE
# was not configured, otherwise look the named datastore up in the workspace.
if proj_datastore is None:
    ds = ws.get_default_datastore()
else:
    ds = Datastore.get(ws, datastore_name=proj_datastore)
# Datastore that receives the Tensorflow logs.
dslogs = Datastore.get(ws, datastore_name=logs_datastore)
print(ds.container_name, dslogs.container_name)

# Pick the GPU or CPU compute target depending on support_gpu.
compute_name = compute_gpu if support_gpu else compute_cpu
compute_target = ws.compute_targets[compute_name]

# Model/experiment name is proj_root with every non-alphanumeric character removed
# (an already-alphanumeric proj_root is returned unchanged by the same expression).
model_name = ''.join(ch for ch in proj_root if ch.isalnum())
experiment_name = model_name
exp = Experiment(workspace=ws, name=experiment_name)

print("datastore:{}, compute:{}".format(ds.container_name, type(compute_target)))
print("proj_root:{}, model_name:{}".format(proj_root, model_name))

# Fail fast with a clear message: both values come from os.getenv and are None
# when unset, which would otherwise surface as an opaque TypeError on the
# string concatenation below.
missing_env_vars = [
    env_name
    for env_name, env_value in (
        ("ACR_ID", docker_registry_address),
        ("TRAINING_DOCKER_SHORT_NAME", training_docker_image_short_name),
    )
    if env_value is None
]
if missing_env_vars:
    raise ValueError(
        "Missing required environment variable(s): {}".format(", ".join(missing_env_vars))
    )

# Credentials for the private registry that hosts the training image.
image_registry_details = ContainerRegistry()
image_registry_details.address = docker_registry_address
image_registry_details.username = docker_registry_username
image_registry_details.password = docker_registry_password
# Full image name = <registry address>/<short name>.
training_docker_image = docker_registry_address + '/' + training_docker_image_short_name

# Notebook specific settings

In [None]:
from azureml.widgets import RunDetails

# Local root folder under which Tensorboard logs are exported. Override it with the
# TENSORBOARD_LOCAL_DIR_PREFIX environment variable; when unset, the previous
# hard-coded location is used so existing setups keep working.
tensorboard_local_dir_prefix = os.getenv("TENSORBOARD_LOCAL_DIR_PREFIX", default = '/mnt/pliu')

# Train with a Script

In [None]:
from azureml.core.runconfig import RunConfiguration, DataReferenceConfiguration
from azureml.core import ScriptRunConfig

# Data reference for the project folder on the project datastore.
# path_on_compute is intentionally not set: it doesn't work with mount.
dr = DataReferenceConfiguration(
    datastore_name=ds.name,
    path_on_datastore=proj_root,
    overwrite=True,
)
# Data reference for the same project folder on the logs datastore.
drlogs = DataReferenceConfiguration(
    datastore_name=dslogs.name,
    path_on_datastore=proj_root,
    overwrite=True,
)

run_cfg = RunConfiguration()

# Docker settings: run inside the custom training image pulled from the private registry.
docker_section = run_cfg.environment.docker
docker_section.enabled = True
docker_section.gpu_support = support_gpu
docker_section.base_image = training_docker_image  # full image name
docker_section.base_image_registry = image_registry_details
# Extra arguments for `docker run` could be supplied through
# run_cfg.environment.docker.arguments (a list) if ever needed.

# Tell the system what to mount; a later ds.as_mount() then refers to this path,
# not to the datastore root.
run_cfg.data_references = {
    ds.name: dr,
    dslogs.name: drlogs,
}

# The image already contains the Python dependencies, so they are user managed.
run_cfg.environment.python.user_managed_dependencies = True
# run_cfg.auto_prepare_environment = False

run_cfg.target = compute_target

### run training script

In [None]:
# Timestamp (minute resolution) used as the name of this run's log sub-folder.
currentDT = datetime.datetime.now()
currentDTstr = currentDT.strftime("%Y%m%d_%H%M")
print('logs will be in {}'.format(currentDTstr))

# as_mount() corresponds to run_cfg.data_references, so the mounted path starts
# at proj_root rather than at the datastore root.
base_mount = ds.as_mount()
data_dir = os.path.join(str(base_mount), DATA_SUBDIR)
tfrecords_dir = os.path.join(str(base_mount), TFRECORDS_SUBDIR)
base_model_dir = os.path.join(str(base_mount), MODELS_SUBDIR, base_model)
logs_mount = dslogs.as_mount()
logs_dir = os.path.join(str(logs_mount), currentDTstr)

# Command-line arguments for the training script, as a flat [flag, value, ...] list.
script_params = [
    '--data_dir', data_dir,
    '--base_model_dir', base_model_dir, 
    '--tfrecords_dir', tfrecords_dir,
    '--force_regenerate_tfrecords', force_regenerate_tfrecords,
    '--num_steps', training_steps,
    '--log_dir', logs_dir,
    '--classname_in_filename', classname_in_filename
]

# Script run configuration: entry script in SCRIPT_FOLDER, executed with run_cfg.
src = ScriptRunConfig(source_directory = SCRIPT_FOLDER,
                      script = SCRIPT_FILE, 
                      run_config = run_cfg,
                      arguments=script_params)

run = exp.submit(src)
# get_details must be called; formatting the bound method only prints its repr.
print('run details {}'.format(run.get_details()))
RunDetails(run).show()

### export logs locally for Tensorboard
-  The method below works if the logs are not large.
-  If the logs are large, AML will time out; in that case export the logs to another data source and simply download them from Azure Blob Storage.

In [None]:
from azureml.tensorboard import Tensorboard

# Local folder (one per experiment) that receives the exported logs.
local_tensorboard_logdir = os.path.join(
    tensorboard_local_dir_prefix,
    experiment_name,
)
os.makedirs(local_tensorboard_logdir, exist_ok=True)

# To inspect a previous run instead of the one submitted in this session:
#run_id = 'previous_run_id'
#exp = Experiment(workspace=ws, name=experiment_name)
#run = Run(experiment = exp, run_id = run_id)
tb = Tensorboard(
    [run],
    local_root=local_tensorboard_logdir,
)
# If successful, start() returns a string with the URI of the instance.
# start() may fail; in that case run tensorboard manually with
# --logdir=local_tensorboard_logdir.
tb.start()
#tb.stop()

# Or train with an Estimator

In [None]:
from azureml.train.estimator import Estimator

# Timestamp (minute resolution) used as the name of this run's log sub-folder.
currentDT = datetime.datetime.now()
currentDTstr = currentDT.strftime("%Y%m%d_%H%M")
print('logs will be in {}'.format(currentDTstr))

# Unlike the ScriptRunConfig variant, the mount path is rooted at proj_root explicitly.
base_mount = ds.path(proj_root).as_mount()
project_mount_path = str(base_mount)
data_dir = os.path.join(project_mount_path, DATA_SUBDIR)
tfrecords_dir = os.path.join(project_mount_path, TFRECORDS_SUBDIR)
base_model_dir = os.path.join(project_mount_path, MODELS_SUBDIR, base_model)

logs_mount = dslogs.path(proj_root).as_mount()
logs_dir = os.path.join(str(logs_mount), currentDTstr)

# The Estimator takes script parameters as a dict, not the flat list used by ScriptRunConfig.
script_params = {
    '--data_dir': data_dir,
    '--base_model_dir': base_model_dir,
    '--tfrecords_dir': tfrecords_dir,
    '--force_regenerate_tfrecords': force_regenerate_tfrecords,
    '--num_steps': training_steps,
    '--log_dir': logs_dir,
    '--classname_in_filename': classname_in_filename,
}

est = Estimator(
    source_directory=SCRIPT_FOLDER,
    entry_script=SCRIPT_FILE,
    script_params=script_params,
    compute_target=compute_target,
    use_docker=True,
    use_gpu=support_gpu,
    user_managed=True,
    image_registry_details=image_registry_details,
    # Note: the short image name is used here, unlike the ScriptRunConfig variant.
    custom_docker_image=training_docker_image_short_name,
    # Tell the system to mount; if the script params contain ds.mount(),
    # it will mount without this.
    inputs=[base_mount, logs_mount],
)

run = exp.submit(est)
RunDetails(run).show()

# Or train with a Pipeline

In [None]:
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Timestamp (minute resolution) used as the name of this run's log sub-folder.
currentDT = datetime.datetime.now()
currentDTstr = currentDT.strftime("%Y%m%d_%H%M")
print('logs will be in {}'.format(currentDTstr))

# Even though run_cfg is passed to the pipeline step, its data references are
# ignored, so this behaves like the Estimator variant:
#  - the base path starts with proj_root
#  - inputs/outputs must be specified for anything to get mounted
base_mount = ds.path(proj_root).as_mount()
project_mount_path = str(base_mount)
data_dir = os.path.join(project_mount_path, DATA_SUBDIR)
tfrecords_dir = os.path.join(project_mount_path, TFRECORDS_SUBDIR)
base_model_dir = os.path.join(project_mount_path, MODELS_SUBDIR, base_model)

logs_mount = dslogs.path(proj_root).as_mount()
logs_dir = os.path.join(str(logs_mount), currentDTstr)

# Command-line arguments for the training script, as a flat [flag, value, ...] list.
script_params = [
    '--data_dir', data_dir,
    '--base_model_dir', base_model_dir,
    '--tfrecords_dir', tfrecords_dir,
    '--force_regenerate_tfrecords', force_regenerate_tfrecords,
    '--num_steps', training_steps,
    '--log_dir', logs_dir,
    '--classname_in_filename', classname_in_filename,
]

trainStep = PythonScriptStep(
    name="train_step",
    source_directory=SCRIPT_FOLDER,
    script_name=SCRIPT_FILE,
    arguments=script_params,
    inputs=[base_mount, logs_mount],
    # outputs=[output_tfrecords] would be for intermediate data that following
    # steps can access, but not blob.
    compute_target=compute_target,
    runconfig=run_cfg,
    # If true, previous results are reused when settings/inputs are the same.
    allow_reuse=False,
    # Version tag to denote a change in functionality of this step.
    version='0.1',
    # params: dict of name/value pairs, exposed as env variables "AML_PARAMETER_".
)

steps = [trainStep]

pipeline = Pipeline(workspace=ws, steps=steps)
pipeline.validate()

pipeline_experiment = Experiment(ws, experiment_name)
run = pipeline_experiment.submit(pipeline)
RunDetails(run).show()

# Register the model

In [None]:
#run_id = 'existing_run_id_rather_than_the_one_just_trained'
#exp = Experiment(workspace=ws, name=experiment_name)
#run = Run(exp, run_id)
# Register the frozen inference graph from the outputs of `run` under model_name.
model = run.register_model(
    model_name=model_name,
    model_path='outputs/model/frozen_inference_graph.pb',
)
print('registered model {}, version: {}'.format(model.name, model.version))