# LLM Perf Benchmark

---

## Get the ml_client

In [None]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential


# Workspace client shared by the cells below (job submission and result
# collection). The credential is created first, then handed to from_config.
credential = DefaultAzureCredential()
ws_ml_client = MLClient.from_config(credential)

## Load pipeline job and submit

In [None]:
from azure.ai.ml import load_job


job_paths = [
    # path to yaml pipeline job
]

# Submit every pipeline definition listed above and keep the returned job
# objects in `job_list`; the result-collection cell iterates over that list.
job_list = []
for yaml_path in job_paths:
    submitted_job = ws_ml_client.create_or_update(load_job(yaml_path))
    job_list.append(submitted_job)
    # Echo the job name so it can be copied for later lookups.
    print(submitted_job.name)

## Collect results from completed job

### Results from output file

In [None]:
import os
import shutil
import pandas as pd
import json

from scripts.aml_run_utils import _download_outputs

# modify the following variables
models = "34b"
exp_name = ""
endpoint_sku = "Standard_ND40rs_v2"
pipeline_sku = "serverless"
model_params = {
    "top_p": 0.5,
    "temperature": 0.0001,
    "do_sample": True,
    "return_full_text": False,
    "max_new_tokens": 50
}

endpoint_nodes = 1
endpoint_concurrency_per_instance = 1
n_samples = 100
temp_dir = "out/temp"
result_path = f"out/result/{models}.csv"

os.makedirs(temp_dir, exist_ok=True)
os.makedirs(os.path.dirname(result_path), exist_ok=True)

# One record per job; the DataFrame is built once after the loop.
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.)
records = []
for job in job_list:
    job_name = job.name
    download_path = os.path.join(temp_dir, job_name)
    os.makedirs(download_path, exist_ok=True)
    output_name = job.outputs.perf_data.port_name
    output_dir = f"{download_path}/named-outputs/{output_name}"

    try:
        _download_outputs(ws_ml_client, job_name, download_path, output_name, all=True)

        # Sort so the choice is deterministic if several .jsonl files exist,
        # and fail with a descriptive error instead of a bare IndexError when
        # none was downloaded.
        jsonl_files = sorted(
            file for file in os.listdir(output_dir) if file.endswith(".jsonl")
        )
        if not jsonl_files:
            raise FileNotFoundError(
                f"No .jsonl output found in {output_dir} for job {job_name}"
            )
        output_path = os.path.join(output_dir, jsonl_files[0])

        temp_df = pd.read_json(output_path, lines=True)
    finally:
        # Always remove the per-job download, even when the job failed to
        # download or parse, so out/temp does not accumulate stale data.
        shutil.rmtree(download_path)

    latency_avg = temp_df["latency"].mean()
    latency_std = temp_df["latency"].std()

    metrics_json = {}
    metrics_json["model"] = job.display_name.split("__")[0]
    metrics_json["latency_ms_avg"] = latency_avg
    metrics_json["latency_ms_std"] = latency_std
    # NOTE: recorded from the `n_samples` setting above, not counted from the file.
    metrics_json["n_samples"] = n_samples
    # Key uses an underscore to match the "endpoint_sku" column written by the
    # mlflow-metrics cell and read by the cleaning cell (was "endpoint-sku").
    metrics_json["endpoint_sku"] = endpoint_sku
    metrics_json["endpoint_nodes"] = endpoint_nodes
    metrics_json["endpoint_concurrency_per_instance"] = endpoint_concurrency_per_instance
    metrics_json["model_params"] = json.dumps(model_params)
    metrics_json["pipeline_sku"] = pipeline_sku
    metrics_json["run_id"] = job_name
    metrics_json["studio_url"] = job.studio_url

    records.append(metrics_json)

df = pd.DataFrame(records)
df.to_csv(result_path, index=False)

### Results from mlflow logged metrics

In [None]:
import pandas as pd
import json
import os

from scripts.aml_run_utils import get_mlflow_logged_metrics


# modify the following variables
exp_name = "llm_perf"
model_family = "orca"
model_params = {
    "top_p": 0.5,
    "temperature": 0.001,
    "do_sample": True,
    "return_full_text": False,
    "max_new_tokens": 50
}
endpoint_sku = "Standard_ND96amsr_A100_v4"
job_names = [
    # job names
]

result_path = f"out/result/{model_family}.csv"
pipeline_sku = "serverless"
endpoint_nodes = 1
endpoint_concurrency_per_instance = 1

os.makedirs(os.path.dirname(result_path), exist_ok=True)

# One record per job; the DataFrame is built once after the loop.
# (DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.)
records = []
for job_name in job_names:
    job = ws_ml_client.jobs.get(job_name)

    metrics_json = {}
    # Model name is the part of the display name before the first "__".
    metrics_json["model"] = job.display_name.split("__")[0]
    # Logged metrics are merged first; the fixed settings assigned below
    # overwrite any logged metric that happens to share a key.
    metrics_json.update(get_mlflow_logged_metrics(ws_ml_client, job.name, exp_name))
    metrics_json["endpoint_sku"] = endpoint_sku
    metrics_json["endpoint_nodes"] = endpoint_nodes
    metrics_json["endpoint_concurrency_per_instance"] = endpoint_concurrency_per_instance
    metrics_json["model_params"] = json.dumps(model_params)
    metrics_json["pipeline_sku"] = pipeline_sku
    metrics_json["run_id"] = job_name
    metrics_json["studio_url"] = job.studio_url

    records.append(metrics_json)

df = pd.DataFrame(records)
df.to_csv(result_path, index=False)

### Clean the results df

In [None]:
# modify the following variables
result_path = f"out/result/{model_family}_cleaned.csv"

# Columns removed from the raw results before reporting.
dropped_columns = [
    "num_valid_entries",
    "ratio_valid_entries",
    "run_id",
    "model_params",
    "avg_rps",
    "std_rps",
]
# Columns kept in the cleaned CSV, in output order.
report_columns = [
    "model",
    "avg_generated_tps",
    "std_generated_tps",
    "avg_latency_ms",
    "std_latency_ms",
    "latency_p50",
    "latency_p90",
    "latency_p95",
    "latency_p99",
    "endpoint_sku",
    "endpoint_nodes",
    "endpoint_concurrency_per_instance",
    "pipeline_sku",
    "studio_url",
]

# Drop, select/reorder, round to two decimals, then sort rows by model name.
cleaned_df = (
    df.drop(columns=dropped_columns)[report_columns]
    .round(2)
    .sort_values(by=["model"])
)
cleaned_df.to_csv(result_path, index=False)

## Plot results

### Plot "Latency vs Models"

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
import pandas as pd


def _model_family(model_name):
    """Return the part of the model name before the first "-"."""
    return model_name.split("-")[0]


# Bar colors per model family; any family not listed here is drawn in orange.
family_colors = {"llama": "tab:green", "codellama": "tab:cyan"}

df = pd.read_csv(result_path)
# create new column "family" for llama, codellama, falcon, etc
df["family"] = df["model"].apply(_model_family)
df["avg_latency_sec"] = df["avg_latency_ms"].div(1000)
df = df.sort_values(by=["avg_latency_sec"])

# plot
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
ax = sns.barplot(
    data=df,
    x="model",
    y="avg_latency_sec",
    palette="Blues_d",
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment="right")
ax.set_xlabel("Model", fontsize=16)
ax.set_ylabel("Avg Latency (sec)", fontsize=16)
ax.set_title("Avg Latency (sec) vs Model", fontsize=20)

# put values on top of the bars
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f"{bar_height:.2f}",
        (bar_center, bar_height),
        ha="center",
        va="center",
        xytext=(0, 10),
        textcoords="offset points",
    )

# use different color for different family in bar chart with legend
# (the i-th bar is matched to the i-th row of the sorted frame)
families = df["family"].tolist()
for i, bar in enumerate(ax.patches):
    bar.set_color(family_colors.get(families[i], "tab:orange"))

# add legend for used color
legend_elements = [
    Patch(facecolor="tab:green", label="Llama"),
    # Patch(facecolor="tab:cyan", label="Codellama"),
    Patch(facecolor="tab:orange", label="Orca"),
]
ax.legend(handles=legend_elements, loc="upper left")

### Plot "Generated TPS vs Models"

In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
import pandas as pd


def _model_family(model_name):
    """Return the part of the model name before the first "-"."""
    return model_name.split("-")[0]


# Bar colors per model family; any family not listed here is drawn in orange.
family_colors = {"llama": "tab:green", "codellama": "tab:cyan"}

df = pd.read_csv(result_path)
df["family"] = df["model"].apply(_model_family)
df = df.sort_values(by=["avg_generated_tps"])

# plot
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
ax = sns.barplot(
    data=df,
    x="model",
    y="avg_generated_tps",
    palette="Blues_d",
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment="right")
ax.set_xlabel("Model", fontsize=16)
ax.set_ylabel("avg_generated_tps", fontsize=16)
ax.set_title("Avg Generated TPS vs Model", fontsize=20)

# put number on top of the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f"{bar_height:.2f}",
        (bar_center, bar_height),
        ha="center",
        va="center",
        xytext=(0, 10),
        textcoords="offset points",
    )

# use different color for different family in bar chart
# (the i-th bar is matched to the i-th row of the sorted frame)
families = df["family"].tolist()
for i, bar in enumerate(ax.patches):
    bar.set_color(family_colors.get(families[i], "tab:orange"))

# add legend for used color
legend_elements = [
    Patch(facecolor="tab:green", label="Llama"),
    # Patch(facecolor="tab:cyan", label="Codellama"),
    Patch(facecolor="tab:orange", label="Orca"),
]
ax.legend(handles=legend_elements, loc="upper left")