In [7]:
import json
import sys
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
from utils import job_name_map

""" ######################## gpu hours ###################### """
def compute_gpu_hrs_model(logfile, interval=60, time_limit=-1):
    """Average per-job GPU hours of each model in one log, split by cluster.

    Each non-blank line of ``logfile`` is parsed as a JSON object holding a
    ``timestamp`` and a ``submitted_jobs`` list. For every job that has no
    ``completion_time`` yet and currently holds an allocation, the number of
    allocated GPUs is added to the tally of the job's model and cluster.
    Each log line is weighted as ``interval`` seconds of usage.

    Args:
        logfile: Path of a log file with one JSON object per line.
        interval: Seconds of wall time that each log line stands for.
        time_limit: Stop at the first line whose timestamp exceeds
            ``time_limit * 3600`` (timestamps are therefore treated as
            seconds, the limit as hours). A negative value (the default) or
            ``None`` disables the limit.

    Returns:
        ``{model: {cluster: gpu_hours}}`` where ``gpu_hours`` is the model's
        accumulated GPU hours on that cluster divided by the number of
        distinct jobs of that model that were seen holding an allocation.

    Raises:
        AssertionError: If a job's allocation spans more or fewer than one
            cluster.
    """
    gpu_usage = dict()
    model_counts = dict()
    jobnames = set()

    # The original compared against ``time_limit * 3600`` unconditionally, so
    # the default of -1 stopped at the very first line and returned {}.
    limit_secs = None
    if time_limit is not None and time_limit >= 0:
        limit_secs = time_limit * 3600

    with open(logfile, 'r') as f:
        # Stream the file instead of materialising every line up front.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            logline = json.loads(line)
            if limit_secs is not None and logline['timestamp'] > limit_secs:
                break
            for job in logline['submitted_jobs']:
                # job already finished or no allocation
                if job['completion_time'] is not None or job['allocation'] is None:
                    continue

                real_name = job['real_job_name']
                if real_name is None:
                    jobname_ = job['name']
                    model_name = jobname_.split('-')[0]
                else:
                    jobname_ = real_name
                    # First two dash-separated tokens, e.g. "gpt-15b-3" ->
                    # "gpt-15b"; a name without a dash is used as-is rather
                    # than raising IndexError.
                    model_name = '-'.join(real_name.split('-')[:2])

                if model_name not in gpu_usage:
                    gpu_usage[model_name] = dict()
                    model_counts[model_name] = 0

                # Count each distinct job once so the totals can be averaged.
                if jobname_ not in jobnames:
                    model_counts[model_name] += 1
                    jobnames.add(jobname_)

                assert len(job['allocation']) == 1
                cluster, alloc = next(iter(job['allocation'].items()))

                gpu_usage[model_name][cluster] = gpu_usage[model_name].get(
                    cluster, 0) + len(alloc)

    gpu_hrs = dict()
    for model, gpu_units in gpu_usage.items():
        gpu_hrs[model] = {
            cluster: (sum_gpus * interval / 3600) / model_counts[model]
            for cluster, sum_gpus in gpu_units.items()
        }
    return gpu_hrs


def compute_gpu_hrs_model_dir(dir, time_limit, interval=60):
    """Average the per-model GPU hours over every workload log in a directory.

    Runs ``compute_gpu_hrs_model`` on each file matching ``workload*.log``
    directly under ``dir``, sums the results per model and cluster, and
    divides every sum by the number of log files found (also for models that
    are missing from some of the logs).

    Args:
        dir: Directory that contains the ``workload*.log`` files.
        time_limit: Forwarded to ``compute_gpu_hrs_model``.
        interval: Forwarded to ``compute_gpu_hrs_model``.

    Returns:
        ``{model: {cluster: mean_gpu_hours}}``; empty when no log matches.
    """
    print(f"Processing {dir}")
    logfiles = glob.glob(dir + "/workload*.log")

    totals = dict()
    for path in logfiles:
        per_model = compute_gpu_hrs_model(path, interval, time_limit)
        for model_name, by_cluster in per_model.items():
            bucket = totals.setdefault(model_name, dict())
            for cluster_name, hrs in by_cluster.items():
                bucket[cluster_name] = bucket.get(cluster_name, 0) + hrs

    num_logs = len(logfiles)
    # With no logs, ``totals`` is empty and no division takes place.
    return {
        model_name: {cluster_name: total / num_logs
                     for cluster_name, total in by_cluster.items()}
        for model_name, by_cluster in totals.items()
    }


# Earlier experiment locations, kept for reference:
# haware_gpu_hrs = compute_gpu_hrs_model_dir("/logs/haware_mip/philly_b", 48)
# hunaware_gpu_hrs = compute_gpu_hrs_model_dir("/logs/unaware_weighted_mip/philly_b", 48)
# pollux_gpu_hrs = compute_gpu_hrs_model_dir("/logs/unaware_pollux/philly_b_run2", 48)
# gavel_gpu_hrs = compute_gpu_hrs_model_dir("/logs/gavel/philly_b", interval=180, time_limit=48)

# Both schedulers are cut off at 48 hours; the apollo logs use the default
# interval while the gavel logs are read with interval=180.
haware_gpu_hrs = compute_gpu_hrs_model_dir(
    "/workspace/simulator.v3/results/apollo/5_azure_2_dgx/workloads-test",
    time_limit=48,
)
gavel_gpu_hrs = compute_gpu_hrs_model_dir(
    "/workspace/simulator.v3/results/gavel/5_azure_2_dgx/workloads-test",
    time_limit=48,
    interval=180,
)
print(haware_gpu_hrs, gavel_gpu_hrs, sep="\n")


Processing /workspace/simulator.v3/results/apollo/5_azure_2_dgx/workloads-test
Processing /workspace/simulator.v3/results/gavel/5_azure_2_dgx/workloads-test
{'cifar10': {'azure': 0.09166666666666666, 'dgx-ext': 0.4}, 'gpt-15b': {'dgx-ext': 51.599999999999994, 'azure': 56.83333333333333}, 'gpt-6.7b': {'dgx-ext': 15.133333333333333}}
{'cifar10': {'dgx-ext': 0.8}, 'gpt-15b': {'azure': 274.4}, 'gpt-6.7b': {'azure': 0.4, 'dgx-ext': 15.2}}


In [1]:
import json
import sys
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
from utils import job_name_map
""" ######################## run time details ###################### """

def compute_runtime(logfile, interval=60, time_limit=-1):
    """Average per-job time each model spends in each state, for one log.

    Each non-blank line of ``logfile`` is parsed as a JSON object holding a
    ``timestamp`` and a ``submitted_jobs`` list. For every job without a
    ``completion_time``, one log line contributes:

    * ``interval`` to ``'suspend'`` when the job has no allocation;
    * otherwise ``job['overhead']`` to ``'restart'`` (only when non-zero) and
      ``interval - job['overhead']`` to the name of the cluster that the job
      is allocated on.

    Args:
        logfile: Path of a log file with one JSON object per line.
        interval: Amount of time that each log line stands for; results are
            in the same unit (``job['overhead']`` is assumed to use it too).
        time_limit: Stop at the first line whose timestamp exceeds
            ``time_limit * 3600``. A negative value (the default) or ``None``
            disables the limit.

    Returns:
        ``{model: {status: time}}`` where ``time`` is the accumulated time in
        that status divided by the number of distinct unfinished jobs of the
        model that were seen in the log.

    Raises:
        AssertionError: If a job's allocation spans more or fewer than one
            cluster.
    """
    runtime = dict()
    model_counts = dict()
    jobnames = set()

    # The original compared against ``time_limit * 3600`` unconditionally, so
    # the default of -1 stopped at the very first line and returned {}.
    limit_secs = None
    if time_limit is not None and time_limit >= 0:
        limit_secs = time_limit * 3600

    with open(logfile, 'r') as f:
        # Stream the file instead of materialising every line up front.
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            logline = json.loads(line)
            if limit_secs is not None and logline['timestamp'] > limit_secs:
                break
            for job in logline['submitted_jobs']:
                # job already finished
                if job['completion_time'] is not None:
                    continue

                real_name = job['real_job_name']
                if real_name is None:
                    jobname_ = job['name']
                    model_name = jobname_.split('-')[0]
                else:
                    jobname_ = real_name
                    # First two dash-separated tokens, e.g. "gpt-15b-3" ->
                    # "gpt-15b"; a name without a dash is used as-is rather
                    # than raising IndexError.
                    model_name = '-'.join(real_name.split('-')[:2])

                if model_name not in runtime:
                    runtime[model_name] = dict()
                    model_counts[model_name] = 0
                per_status = runtime[model_name]

                # Count each distinct job once so the totals can be averaged.
                if jobname_ not in jobnames:
                    model_counts[model_name] += 1
                    jobnames.add(jobname_)

                if job['allocation'] is None:  # no allocation
                    per_status['suspend'] = per_status.get('suspend', 0) + interval
                    continue

                overhead = job['overhead']
                if overhead != 0:
                    per_status['restart'] = per_status.get('restart', 0) + overhead

                assert len(job['allocation']) == 1
                cluster = next(iter(job['allocation']))
                # Only the part of the interval not lost to overhead is
                # credited to the cluster.
                per_status[cluster] = per_status.get(cluster, 0) + interval - overhead

    res = {model: {status: time / model_counts[model] for status,
                   time in status_time.items()} for model, status_time in runtime.items()}
    return res


def compute_runtime_dir(dir, time_limit, interval=60):
    """Average the per-model runtime breakdown over all workload logs in a directory.

    Runs ``compute_runtime`` on each file matching ``workload*.log`` directly
    under ``dir``, sums the results per model and status, and divides every
    sum by the number of log files found. The matched file list, each file
    name and each per-file result are printed along the way.

    Args:
        dir: Directory that contains the ``workload*.log`` files.
        time_limit: Forwarded to ``compute_runtime``.
        interval: Forwarded to ``compute_runtime``.

    Returns:
        ``{model: {status: mean_time}}``; empty when no log matches.
    """
    print(f"Processing {dir}")
    logfiles = glob.glob(dir + "/workload*.log")
    print(logfiles)

    totals = dict()
    for path in logfiles:
        print(path)
        per_model = compute_runtime(path, interval, time_limit)
        print(per_model)
        for model_name, by_status in per_model.items():
            bucket = totals.setdefault(model_name, dict())
            for status_name, spent in by_status.items():
                bucket[status_name] = bucket.get(status_name, 0) + spent

    num_logs = len(logfiles)
    # With no logs, ``totals`` is empty and no division takes place.
    return {
        model_name: {status_name: total / num_logs
                     for status_name, total in by_status.items()}
        for model_name, by_status in totals.items()
    }


def compute_normed(input_hrs, norm_hrs):
    """Divide every value of a two-level mapping by a per-model denominator.

    Args:
        input_hrs: ``{model: {key: value}}`` mapping to normalise.
        norm_hrs: ``{model: denominator}``; must hold an entry for every
            model of ``input_hrs`` that has at least one value.

    Returns:
        A new ``{model: {key: value / norm_hrs[model]}}`` mapping; the inputs
        are not modified.
    """
    return {
        model: {key: value / norm_hrs[model] for key, value in per_key.items()}
        for model, per_key in input_hrs.items()
    }


def get_points(nhrs):
    """Turn a ``{model: {cluster: value}}`` mapping into one array per cluster.

    Reads the module-level ``cluster_ordering`` and ``model_ordering`` lists.

    Args:
        nhrs: Mapping with an entry for every model in ``model_ordering``.

    Returns:
        A list with one numpy array per entry of ``cluster_ordering``; each
        array holds that cluster's value for every model in
        ``model_ordering`` order, with 0 where a model has no such cluster.
    """
    return [
        np.asarray([nhrs[model].get(cluster, 0) for model in model_ordering])
        for cluster in cluster_ordering
    ]


def plot_points(points, offset, display_str, cluster_names, cluster_colors):
    """Draw one group of stacked bars on the current pyplot axes.

    Reads the module-level ``Xs``, ``barwidth`` and ``cluster_ordering`` and
    reads/writes the module-level ``label_added`` flag, which ensures legend
    labels are attached only by the first call.

    Args:
        points: One array per entry of ``cluster_ordering`` (as produced by
            ``get_points``); the arrays are stacked bottom-up in that order.
        offset: Horizontal shift applied to ``Xs`` for this group of bars.
        display_str: Short tag written near the bottom of every bar.
        cluster_names: Maps a cluster/status key to its legend label.
        cluster_colors: Maps a cluster/status key to its bar colour.
    """
    global label_added

    # Accumulate the stack height in floating point. ``zeros_like`` without a
    # dtype inherits an integer dtype when the first array consists solely of
    # the integer default 0, and the in-place ``+=`` of float data below then
    # fails with a casting error.
    cur_sum = np.zeros_like(points[0], dtype=float)
    for i, cur_points in enumerate(points):
        cluster = cluster_ordering[i]
        label = cluster_names[cluster] if not label_added else None
        plt.bar(Xs + offset, cur_points, label=label, color=cluster_colors[cluster],
                bottom=cur_sum, width=barwidth)
        cur_sum += cur_points

    # Tag each bar just inside its left edge, slightly above the x axis.
    for x in Xs + offset - barwidth / 2 + 0.02:
        plt.gca().text(x, 0.05, display_str, fontsize=12, fontweight='bold', color='w')
    label_added = True

# Driver for the runtime-breakdown figure: load both schedulers' logs,
# normalise against apollo, and draw one stacked bar per scheduler and model.
cluster_spec = '5_azure_2_dgx'
workloads = 'no_ncf_workloads'
# NOTE(review): apollo is cut off at time_limit=48 while gavel uses
# time_limit=240 (and interval=180) — confirm the different limits are intended.
haware_runtime = compute_runtime_dir(
    f"/workspace/simulator.v3/results/apollo/{cluster_spec}/{workloads}", 48)
gavel_runtime = compute_runtime_dir(
    f"/workspace/simulator.v3/results/gavel/{cluster_spec}/{workloads}", interval=180, time_limit=240)
print("### apollo runtime")
print(haware_runtime)
print('')
print("### gavel runtime")
print(gavel_runtime)


# Normalise: the denominator for each model is the sum of apollo's time over
# all statuses, so apollo's stacked bar totals 1.0 and gavel's is relative to it.
norm_runtime = dict()
for model, allocs in haware_runtime.items():
    norm_runtime[model] = sum(allocs.values())
# Despite the heading, this prints the per-model denominators.
print('\n### normalized runtime')
print(norm_runtime)

# compute_normed looks up norm_runtime[model] for each gavel model, so a model
# that appears only in the gavel results raises KeyError here.
haware_runtime_norm = compute_normed(haware_runtime, norm_runtime)
gavel_runtime_norm = compute_normed(gavel_runtime, norm_runtime)
print('\n### normalized gavel runtime')
print(gavel_runtime_norm)


# NOTE(review): points_ordering is not used anywhere else in this cell.
points_ordering = [haware_runtime_norm]
# Xs, model_ordering and cluster_ordering are module-level names that
# get_points / plot_points read directly; they must be set before those calls.
Xs = np.arange(len(haware_runtime_norm))
model_ordering = list(haware_runtime_norm.keys())
cluster_colors = {'restart': 'g', 'suspend': 'r', 'azure': 'b', 'dgx-ext': 'y'}
cluster_names = {'restart': 'restart', 'suspend': 'suspend', 'azure': 'v100', 'dgx-ext': 'a100'}
# Stacking order, bottom to top.
cluster_ordering = ['dgx-ext', 'azure', 'suspend', 'restart']

haware_points = get_points(haware_runtime_norm)
gavel_points = get_points(gavel_runtime_norm)

plt.clf()
# label_added and barwidth are further globals consumed by plot_points;
# resetting label_added lets the first plot_points call attach legend labels.
label_added = False
barwidth = 0.2

# One bar group per scheduler: 'A' (apollo) at the tick, 'G' (gavel) shifted right.
offsets = [0, 0.25]
plot_pts = [haware_points, gavel_points]
display_strs = ['A', 'G']
for plot_pt, offset, display_str in zip(plot_pts, offsets, display_strs):
  plot_points(plot_pt, offset, display_str, cluster_names, cluster_colors)

# Reference line at 1.0, i.e. apollo's normalised total.
plt.axhline(y=1.0, color='k', linewidth=0.3, linestyle='--')
# Vertical separators halfway between neighbouring model ticks.
for i in range(len(Xs) - 1):
  plt.axvline(x=Xs[i] + 0.5, color='k', linewidth=1)
plt.xticks(ticks=Xs, labels=[x for x in model_ordering], rotation=0, fontsize=12)
plt.legend(fontsize=12)
plt.yticks(fontsize=12)
plt.gcf().set_size_inches(6, 2.25)
plt.gcf().set_tight_layout(tight=1)
plt.gcf().set_dpi(300)
# NOTE(review): the plotted values are ratios normalised to apollo, not hours —
# confirm whether the 'JCT (hrs)' label is still the intended one.
plt.ylabel('JCT (hrs)', fontsize=12)
plt.show()
# plt.savefig('/tmp/norm_gpu_hrs.pdf')


Processing /workspace/simulator.v3/results/apollo/5_azure_2_dgx/no_ncf_workloads
['/workspace/simulator.v3/results/apollo/5_azure_2_dgx/no_ncf_workloads/workload8.log', '/workspace/simulator.v3/results/apollo/5_azure_2_dgx/no_ncf_workloads/workload3.log', '/workspace/simulator.v3/results/apollo/5_azure_2_dgx/no_ncf_workloads/workload7.log', '/workspace/simulator.v3/results/apollo/5_azure_2_dgx/no_ncf_workloads/workload1.log', '/workspace/simulator.v3/results/apollo/5_azure_2_dgx/no_ncf_workloads/workload2.log', '/workspace/simulator.v3/results/apollo/5_azure_2_dgx/no_ncf_workloads/workload4.log', '/workspace/simulator.v3/results/apollo/5_azure_2_dgx/no_ncf_workloads/workload6.log', '/workspace/simulator.v3/results/apollo/5_azure_2_dgx/no_ncf_workloads/workload5.log']
/workspace/simulator.v3/results/apollo/5_azure_2_dgx/no_ncf_workloads/workload8.log
{'cifar10': {'restart': 119.25, 'azure': 238.75, 'dgx-ext': 262.5}, 'yolov3': {'restart': 493.125, 'azure': 7571.25, 'dgx-ext': 830.625}, 

NameError: name 'gavel_runtime' is not defined