## AzureML

In [1]:
from azureml.core import Workspace, Experiment, Run

Failure while loading azureml_run_type_providers. Failed to load entrypoint azureml.scriptrun = azureml.core.script_run:ScriptRun._from_run_dto with exception (ruamel.yaml 0.17.16 (x:\anaconda\lib\site-packages), Requirement.parse('ruamel.yaml<0.17.5,>=0.15.35')).


In [2]:
sub_id = "6560575d-fa06-4e7d-95fb-f962e74efd7a"
resource_group = "UW-Embeddings"
ws_name = "TxtsumDev"

ws = Workspace.get(
    name=ws_name,
    subscription_id=sub_id,
    resource_group=resource_group
)

experiment_name = "hf-pytorch-demo"
run_id = "bart-samsum-pytorch"

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [3]:
experiments = Experiment.list(ws)
print(experiments)

experiment = Experiment(workspace=ws, name=experiment_name)

[Experiment(Name: hf-deepspeed-demo,
Workspace: TxtsumDev), Experiment(Name: hf-pytorch-demo,
Workspace: TxtsumDev), Experiment(Name: hf-sweep-demo,
Workspace: TxtsumDev), Experiment(Name: hf-test,
Workspace: TxtsumDev)]


In [4]:
runs = Run.list(experiment)
for run in runs:
    print(run.id)
run = Run(experiment, run_id)
print(run)

08108a85-7197-4388-b05f-923c0b509e10
1178575243
17
08c1ae08-4f66-419c-99e1-8f9f98d42e3c
greenai
bart-samsum-pytorch
Run(Experiment: hf-pytorch-demo,
Id: bart-samsum-pytorch,
Type: azureml.scriptrun,
Status: Completed)


In [5]:
metrics = run.get_metrics()
print(metrics.keys())

for k in metrics.keys():
    if "rouge" in k:
        print(f"{k}: {metrics.get(k)[-1]}")

dict_keys(['loss', 'learning_rate', 'epoch', 'eval_loss', 'eval_rouge1', 'eval_rouge2', 'eval_rougeL', 'eval_rougeLsum', 'eval_gen_len', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'train_runtime', 'train_samples_per_second', 'train_steps_per_second', 'total_flos', 'train_loss'])
eval_rouge1: 54.6725
eval_rouge2: 29.7084
eval_rougeL: 45.0512
eval_rougeLsum: 50.5033


In [None]:
run_details = run.get_details()
run_details

In [7]:
command = run_details.get("runDefinition").get("command")
command_args = command.rstrip("\n").replace(" ", ": ").split(": --")

# temp args to ignore
ignore_args = (
    "python",
    "deepspeed",
    "model_name_or_path",
    "config_name",
    "dataset_name",
    "dataset_path",
    "evaluation_strategy",
    "logging_strategy",
    "do_train",
    "do_eval",
    "do_predict",
    "predict_with_generate",
    "overwrite_output_dir",
    "output_dir",
    "logging_dir",
    "ddp_find_unused_parameters"
)

hyperparams = "\n".join(f"{arg}" for arg in command_args if not any(i in arg for i in ignore_args))

print(hyperparams)

max_source_length: 512
max_target_length: 90
fp16: True
seed: 1
per_device_train_batch_size: 16
per_device_eval_batch_size: 16
learning_rate: 5e-5
weight_decay: 0.1


In [8]:
# check formatting
hyperparam_md = f"""## Hyperparameters
```yaml
{hyperparams}
```
"""

print(hyperparam_md)

## Hyperparameters
```yaml
max_source_length: 512
max_target_length: 90
fp16: True
seed: 1
per_device_train_batch_size: 16
per_device_eval_batch_size: 16
learning_rate: 5e-5
weight_decay: 0.1
```



In [9]:
compute_details = {
    "size": ws.compute_targets.get(run_details.get("target")).vm_size,
    "node_count": run_details.get("runDefinition").get("nodeCount")
}

In [10]:
# region = westus2
# gpu device, dedicated, low priority ($/hr)
sku_mapping = {
    "STANDARD_ND96ASR_V4": ("8 x NVIDIA A100 40GB (NVLink 3.0)", 27.20, 5.44),
    "STANDARD_ND40RS_V2": ("8 x NVIDIA V100 32GB (NVLink)", 22.03, 4.41),
    "STANDARD_NC24S_V3": ("4 x NVIDIA V100 16GB", 12.24, 2.45),
    "STANDARD_NC6": ("1 x NVIDIA K80 12GB", 0.90, 0.18)
}

compute_table = f"""| Region | US West 2 |
| AzureML Compute SKU | {compute_details["size"]} |
| Compute SKU GPU Device | {sku_mapping.get(compute_details["size"])[0]} |
| Compute Node Count | {compute_details["node_count"]} |
"""
print(compute_table)


| AzureML Compute SKU | STANDARD_ND40RS_V2 |
| Compute SKU GPU Device | 8 X V100 32GB |
| Compute Node Count | 1 |



## Azure Monitor

In [11]:
#%pip install azure-identity
#%pip install azure-mgmt-monitor
import datetime
from azure.mgmt.monitor import MonitorManagementClient
from azure.identity import AzureCliCredential#, DefaultAzureCredential

In [12]:
ws_resource_id = (
    f"subscriptions/{sub_id}/"
    f"resourceGroups/{resource_group}/"
    f"providers/Microsoft.MachineLearningServices/workspaces/{ws_name}"
)

monitor_client = MonitorManagementClient(AzureCliCredential(), sub_id)

In [14]:
#today = datetime.datetime.now().date()
#start_date = today - datetime.timedelta(days=7)

start_datetime = datetime.datetime.strptime(run_details.get("startTimeUtc"), "%Y-%m-%dT%H:%M:%S.%fZ")
end_datetime = datetime.datetime.strptime(run_details.get("endTimeUtc"), "%Y-%m-%dT%H:%M:%S.%fZ")

metrics_data = monitor_client.metrics.list(
    ws_resource_id,
    # add 1 min buffer to end time, td: hr/d depending on interval
    timespan=f"{start_datetime}/{end_datetime + datetime.timedelta(minutes=1)}",
    interval="PT1M",
    metricnames="GpuEnergyJoules",
    aggregation="Total",
    filter=f"RunID eq '{run_id}'"
)

for item in metrics_data.value:
    print(f"| Timestamp (RFC 3339) | {item.name.localized_value} |")
    for timeserie in item.timeseries:
        for data in timeserie.data:
            print(f"| {data.time_stamp} | {data.total} |")

| Timestamp (RFC 3339) | GpuEnergyJoules |
| 2021-08-29 01:46:00+00:00 | 0.0 |
| 2021-08-29 01:47:00+00:00 | 0.0 |
| 2021-08-29 01:48:00+00:00 | 0.0 |
| 2021-08-29 01:49:00+00:00 | 20673.0 |
| 2021-08-29 01:50:00+00:00 | 58252.0 |
| 2021-08-29 01:51:00+00:00 | 96313.0 |
| 2021-08-29 01:52:00+00:00 | 72549.0 |
| 2021-08-29 01:53:00+00:00 | 78087.0 |
| 2021-08-29 01:54:00+00:00 | 108031.0 |


In [15]:
run_duration = end_datetime - start_datetime

ts = run_duration.seconds

m = ts // 60
s = ts % 60
print(f"{m}m{s}s")

8m28s


In [16]:
# cost = sku_price * nodes * run_duration
compute_cost = []
compute_cost.append(sku_mapping.get(compute_details["size"])[1] * compute_details["node_count"] * ts / 3600)
compute_cost.append(sku_mapping.get(compute_details["size"])[2] * compute_details["node_count"] * ts / 3600)
"${:,.2f}".format(compute_cost[0])

'$3.11'

In [17]:
print(f"""| Run Duration | {m}m {s}s |
| Compute Cost (LowPriority/Dedicated) | ${"{:,.2f}".format(compute_cost[0])} / ${"{:,.2f}".format(compute_cost[1])} USD |
""")

| Region | US West 2 |
| Run Duration | 8m28s |
| Compute Cost (LowPriority/Dedicated) | $3.11 / $0.62 USD |



In [18]:
monitor_metrics = {
    "CpuUtilizationPercentage": "Average",
    "GpuUtilizationPercentage": "Average",
    "GpuMemoryUtilizationMegabytes": "Average",
    "GpuEnergyJoules": "Total"
}

monitor_results = []
for k, v in monitor_metrics.items():
    results = monitor_client.metrics.list(
        ws_resource_id,
        # add 1 min buffer to end time, td: hr/d depending on interval
        timespan=f"{start_datetime}/{end_datetime + datetime.timedelta(minutes=1)}",
        interval="P1D",
        metricnames=k,
        aggregation=v,
        filter=f"RunID eq '{run_id}'"
    )

    for item in results.value:
        print(f"| {item.name.localized_value} ({v}) |")
        for timeserie in item.timeseries:
            for data in timeserie.data:
                monitor_results.append(getattr(data, v.lower()))
                print(f"| {getattr(data, v.lower())} |")

| CpuUtilizationPercentage (Average) |
| 40.5 |
| GpuUtilizationPercentage (Average) |
| 59.9 |
| GpuMemoryUtilizationMegabytes (Average) |
| 20681.5 |
| GpuEnergyJoules (Total) |
| 325874.0 |


In [19]:
monitor_table = f"""| Average CPU Utilization | {"{:,.1f}".format(monitor_results[0])}% |
| Average GPU Utilization | {"{:,.1f}".format(monitor_results[1])}% |
| Average GPU Memory Usage | {"{:,.2f}".format(monitor_results[2]/1000)} GB |
| Total GPU Energy Usage | {"{:,.2f}".format(monitor_results[3]/1000)} kJ |
"""

print(monitor_table)

| Average CPU Utilization | 40.5% |
| Average GPU Utilization | 59.9% |
| Average GPU Memory Usage | 20.68 GB |
| Total GPU Energy Usage | 325.87 kJ |



## MLflow

In [None]:
import mlflow
from mlflow.tracking import MlflowClient

In [None]:
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
client = MlflowClient()

In [None]:
experiments = client.list_experiments()
#experiment = client.get_experiment_by_name("hf-pytorch-demo")
print(experiments)

In [None]:
runs = client.list_run_infos(experiments[1].experiment_id)
run = client.get_run(runs[0].run_id)
print(runs)
print(run)

In [None]:
def print_metric_info(history):
    for m in history:
        print(f"name: {m.key}")
        print(f"value: {m.value}")
        print(f"step: {m.step}")
        print(f"timestamp (unix ms): {m.timestamp}")
        print("--")

metrics = ["eval_rouge1", "eval_rouge2", "eval_rougeL"]
for metric in metrics:
    run_metrics = client.get_metric_history(run.info.run_id, metric)
    print_metric_info(run_metrics)