In [43]:
!pip install -U azureml-sdk

Requirement already up-to-date: azureml-sdk in /home/nbuser/anaconda3_501/lib/python3.6/site-packages (1.0.57)




In [44]:
import azureml.core
# Report which azureml SDK version the kernel imported.
print("SDK version: {}".format(azureml.core.VERSION))

SDK version: 1.0.57


In [45]:
from azureml.core import Workspace
# Attach to an existing workspace, addressed by its explicit identifiers.
ws = Workspace(
    workspace_name="Dataset-Test-WS",
    resource_group="DatasetTestRG",
    subscription_id="ee85ed72-2b26-48f6-a0e8-cb5bcf98fbd9",
)
# Echo the workspace details, one value per line.
print(
    ws.name,
    ws.resource_group,
    ws.location,
    ws.subscription_id,
    sep="\n",
)

Dataset-Test-WS
DatasetTestRG
eastus
ee85ed72-2b26-48f6-a0e8-cb5bcf98fbd9


In [46]:
from azureml.core.compute import AmlCompute
# Name of the compute target that every pipeline step below is scheduled on.
compute_target_name = "nlp-test"
# Look the compute target up in the workspace by that name.
compute_target = AmlCompute(workspace=ws, name=compute_target_name)

In [47]:
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration
from azureml.core.environment import DockerSection
# Shared run configuration reused by every PythonScriptStep in this notebook.
runconfig = RunConfiguration(framework="python")

# Docker settings: enabled, GPU support on, SDK default GPU base image.
runconfig.environment.docker = DockerSection()
runconfig.environment.docker.base_image = azureml.core.runconfig.DEFAULT_GPU_IMAGE
runconfig.environment.docker.gpu_support = True
runconfig.environment.docker.enabled = True

# Python packages are taken from the conda specification file next to the notebook.
runconfig.environment.python.conda_dependencies = CondaDependencies(
    conda_dependencies_file_path="dependencies.yml"
)

In [48]:
from azureml.data.data_reference import DataReference
from azureml.core import Datastore
# Datastore handle used for all data references and pipeline outputs below.
ds = Datastore.get(workspace=ws, datastore_name="workspaceblobstore")

In [49]:
# data module
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# train_input_data = DataReference(
#     datastore=ds,
#     data_reference_name="train_input_data",
#     path_on_datastore="train/cleaned_train_sampled.csv")
# Training CSV on the datastore (currently the small debug sample).
train_input_data = DataReference(
    path_on_datastore="debug/train_16.csv",
    data_reference_name="train_input_data",
    datastore=ds,
)
# Pipeline output handed to the data module as --output_dir.
train_data = PipelineData(name="train_data", datastore=ds)

# Step that runs data_module.py on the training input.
load_train_data_step = PythonScriptStep(
    source_directory=".",
    script_name="run_modules/data_module.py",
    name="load_data_step",
    runconfig=runconfig,
    compute_target=compute_target,
    inputs=[train_input_data],
    outputs=[train_data],
    arguments=[
        "--input_data", train_input_data,
        "--output_dir", train_data,
    ],
)

In [50]:
# data module - load test dataset
# test_input_data = DataReference(
#     datastore=ds,
#     data_reference_name="test_input_data",
#     path_on_datastore="test/cleaned_test_data_labels.csv")
# Test CSV on the datastore (currently the small debug sample).
test_input_data = DataReference(
    path_on_datastore="debug/test_32.csv",
    data_reference_name="test_input_data",
    datastore=ds,
)
# Pipeline output handed to the data module as --output_dir.
test_data = PipelineData(name="test_data", datastore=ds)

# Step that runs the same data_module.py script on the test input.
load_test_data_step = PythonScriptStep(
    source_directory=".",
    script_name="run_modules/data_module.py",
    name="load_test_data_step",
    runconfig=runconfig,
    compute_target=compute_target,
    inputs=[test_input_data],
    outputs=[test_data],
    arguments=[
        "--input_data", test_input_data,
        "--output_dir", test_data,
    ],
)

In [51]:
#split data module
# dataset1 = PipelineData("output_data1",datastore=ds)
# dataset2 = PipelineData("output_data2",datastore=ds)
# #parameters
# row_split_rate = 0.7
# random = True
# random_seed = 200
# split_data_step = PythonScriptStep(
#     name="split_data_step",
#     script_name="run_modules/split_data_module.py", 
#     arguments=["--input_dir", train_data, 
#                "--output_dir1", dataset1,
#                "--output_dir2", dataset2,
#                "--row_split_rate", row_split_rate,
#                "--random", random,
#                "--random_seed", random_seed],
#     inputs=[train_data],
#     outputs=[dataset1, dataset2],
#     compute_target=compute_target,
#     runconfig = runconfig,
#     source_directory='.'
# )


In [52]:
# The split step is disabled, so the loaded train/test outputs are used
# directly under the names the downstream steps expect.
dataset1, dataset2 = train_data, test_data

In [53]:
# bert module
# Options forwarded to bert_url_module.py.
uncased = True
language = "En"

# Pipeline output passed to the script as --out_model_dir.
chosed_model = PipelineData(name="chosed_bert", datastore=ds)

build_bert_step = PythonScriptStep(
    source_directory=".",
    script_name="run_modules/bert_url_module.py",
    name="build_bert_step",
    runconfig=runconfig,
    compute_target=compute_target,
    outputs=[chosed_model],
    arguments=[
        "--out_model_dir", chosed_model,
        "--language", language,
        "--uncased", uncased,
    ],
)


In [54]:
# custom added layer module
# Alternative layer kept for reference:
#category = 'lstm_multi_label_classifier'
layer_name = "multi_label_classifier"
label_num = 6

# Pipeline output passed to the script as --output_dir.
added_layer = PipelineData(name="added_layer", datastore=ds)

custom_layer_step = PythonScriptStep(
    source_directory=".",
    script_name="run_modules/add_layer_module.py",
    name="custom_layer_step",
    runconfig=runconfig,
    compute_target=compute_target,
    outputs=[added_layer],
    arguments=[
        "--label_num", label_num,
        "--layer_name", layer_name,
        "--output_dir", added_layer,
    ],
)


In [55]:
# train module

# Settings forwarded to train_module.py.
label_column_names = "toxic severe_toxic obscene threat insult identity_hate"
train_column_names = "comment_text"
num_train_epochs = 3.0
learning_rate = 5e-5
num_gpu_cores = 4
use_gpu = True
train_batch_size = 2
# The device cannot support training, so this is generally set to False.
is_training_bert = False

# Pipeline output passed to the script as --output_dir.
trained_model_dir = PipelineData(name="trained_model_dir", datastore=ds)

# Consumes the BERT output, the added-layer output and the training data.
train_step = PythonScriptStep(
    source_directory=".",
    script_name="run_modules/train_module.py",
    name="train_step",
    runconfig=runconfig,
    compute_target=compute_target,
    inputs=[chosed_model, added_layer, dataset1],
    outputs=[trained_model_dir],
    arguments=[
        "--bert_dir", chosed_model,
        "--added_layer_config", added_layer,
        "--train_data", dataset1,
        "--output_dir", trained_model_dir,
        "--is_training_bert", is_training_bert,
        "--train_batch_size", train_batch_size,
        "--use_gpu", use_gpu,
        "--num_gpu_cores", num_gpu_cores,
        "--learning_rate", learning_rate,
        "--num_train_epochs", num_train_epochs,
        "--train_column_names", train_column_names,
        "--label_column_names", label_column_names,
    ],
)



In [56]:
# score/predict module

# Settings forwarded to predict_module.py.
predict_column_names = "comment_text"
num_gpu_cores = 4
use_gpu = True
predict_batch_size = 2

# Pipeline output passed to the script as --output_dir.
predict_result_dir = PipelineData(name="predict_dir", datastore=ds)

# Consumes the BERT output, the added-layer output, the test data and the
# output of the training step.
predict_step = PythonScriptStep(
    source_directory=".",
    script_name="run_modules/predict_module.py",
    name="predict_step",
    runconfig=runconfig,
    compute_target=compute_target,
    inputs=[chosed_model, added_layer, dataset2, trained_model_dir],
    outputs=[predict_result_dir],
    arguments=[
        "--bert_dir", chosed_model,
        "--added_layer_config", added_layer,
        "--test_data", dataset2,
        "--trained_model_dir", trained_model_dir,
        "--output_dir", predict_result_dir,
        "--predict_batch_size", predict_batch_size,
        "--use_gpu", use_gpu,
        "--num_gpu_cores", num_gpu_cores,
        "--predict_column_names", predict_column_names,
    ],
)

In [57]:
# evaluate module

# Settings forwarded to evaluate_module.py.
probability_columns = "probability_1 probability_2 probability_3 probability_4 probability_5 probability_6"
label_columns = "toxic severe_toxic obscene threat insult identity_hate"
roc = True
pr = True

# Pipeline output passed to the script as --output_dir.
evaluate_result_dir = PipelineData(name="evaluate_results", datastore=ds)

# Consumes the output of the predict step.
evaluate_step = PythonScriptStep(
    source_directory=".",
    script_name="run_modules/evaluate_module.py",
    name="evaluate_step",
    runconfig=runconfig,
    compute_target=compute_target,
    inputs=[predict_result_dir],
    outputs=[evaluate_result_dir],
    arguments=[
        "--input_data", predict_result_dir,
        "--pr", pr,
        "--roc", roc,
        "--output_dir", evaluate_result_dir,
        "--label_columns", label_columns,
        "--probability_columns", probability_columns,
    ],
)

In [58]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
# Only the final step is listed here; the submission log further down shows
# the upstream steps being created as well.
pipeline_toxic = Pipeline(steps=[evaluate_step], workspace=ws)
#pipeline_split_data = Pipeline(workspace=ws, steps=[split_data_step])


In [59]:
# Submit the pipeline under the 'toxic_comment' experiment and keep the run handle.
pipeline_run = (
    Experiment(workspace=ws, name="toxic_comment")
    .submit(pipeline_toxic)
)
#pipeline_run = Experiment(ws, 'toxic_comment').submit(pipeline_split_data)

Created step evaluate_step [43fde2f1][9d9a0040-37ce-4322-a0fe-28459ba3df3f], (This step will run and generate new outputs)
Created step predict_step [4efde956][6f3a0b0e-d684-4f20-9d3f-a4ef7231777d], (This step will run and generate new outputs)
Created step build_bert_step [eec64796][08b290bb-46ee-48c5-8cc0-71fb35e07e7a], (This step will run and generate new outputs)
Created step custom_layer_step [563c1795][050aa33d-276a-46e8-bd7a-9296c35e0d0c], (This step will run and generate new outputs)
Created step load_test_data_step [d603d49e][4544d11b-74d9-46d2-bdae-ebe57a636ab5], (This step will run and generate new outputs)
Created step train_step [2e098d75][25e855d9-f092-4867-b8da-03585bf8d2d7], (This step will run and generate new outputs)
Created step load_data_step [143d58cd][27a7ff85-da93-4618-99bc-eb975d6a7dbe], (This step will run and generate new outputs)
Using data reference test_input_data for StepId [0f0ac046][a191684b-1c2b-4289-9f01-1b69a7d71127], (Consumers of this data are elig

In [21]:
from azureml.core import Experiment
from azureml.pipeline.core import PipelineRun
# Re-attach to a previously submitted run by its id and cancel it.
# NOTE(review): the run id is hard-coded, so this cell targets that one
# specific run rather than the run submitted above.
pipeline_run = PipelineRun(
    Experiment(ws, "toxic_comment"),
    "0bf16d6b-1fc4-44a9-ab0f-f065256e01a8",
)
pipeline_run.cancel()
