In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Per-metric containers for the three input tables, keyed by metric name.
info_job_level_dfs = {}
burstiness_gpu_level_dfs = {}
temporal_gpu_level_dfs = {}

# Load every "gpu_utilization" table into its container so all three inputs
# are reachable the same way (previously the burstiness dict stayed empty).
info_job_level_dfs["gpu_utilization"] = pd.read_csv("job_level_gpu_utilization.csv")
burstiness_gpu_level_dfs["gpu_utilization"] = pd.read_csv("burstiness_gpu_level_gpu_utilization.csv")
temporal_gpu_level_dfs["gpu_utilization"] = pd.read_csv("temporal_gpu_level_gpu_utilization.csv")

# Alias kept because later cells refer to the burstiness table by this name.
gputil_burstiness = burstiness_gpu_level_dfs["gpu_utilization"]

In [3]:
gputil_all = info_job_level_dfs["gpu_utilization"]

# Split the job-level table into three bands by mean utilization:
#   low: <= 30, medium: strictly between 30 and 70, high: 70..100 inclusive.
job_mean_util = gputil_all['mean_utilization']
gputil_30 = gputil_all.loc[job_mean_util <= 30]
gputil_30_70 = gputil_all.loc[job_mean_util.gt(30) & job_mean_util.lt(70)]
gputil_70 = gputil_all.loc[job_mean_util.between(70, 100)]  # both bounds inclusive

In [4]:
# One row per job: the maximum temporal_imbalance over that job's rows.
gputil_temporal_grouped = (
    temporal_gpu_level_dfs["gpu_utilization"]
    .groupby("jobid")["temporal_imbalance"]
    .max()
    .reset_index()
)

# Attach the per-job imbalance to each utilization band. Left joins keep every
# job of the band; jobs without a temporal row get NaN.
per_job_imbalance = gputil_temporal_grouped[['jobid', 'temporal_imbalance']]
gputil_temporal_30 = gputil_30.merge(per_job_imbalance, on='jobid', how='left')
gputil_temporal_30_70 = gputil_30_70.merge(per_job_imbalance, on='jobid', how='left')
gputil_temporal_70 = gputil_70.merge(per_job_imbalance, on='jobid', how='left')

In [5]:
# One row per job: the mean burstiness_score over that job's rows.
gputil_burstiness_grouped = (
    gputil_burstiness
    .groupby("jobid")["burstiness_score"]
    .mean()
    .reset_index()
)

In [6]:
# Job-level table enriched with per-job burstiness and temporal imbalance.
# NOTE(review): dropna() without `subset` removes a row if ANY column is NaN,
# not only the two merged metrics; confirm that is the intended filter.
gputil_temporal_burstiness = (
    gputil_all
    .merge(gputil_burstiness_grouped, on="jobid", how="left")
    .merge(gputil_temporal_grouped, on="jobid", how="left")
    .dropna()
)

In [7]:
# Partition jobs into four quadrants: temporal imbalance split at 0.5,
# burstiness score split at 0 (the boundary value belongs to the "low" side).
job_imbalance = gputil_temporal_burstiness["temporal_imbalance"]
job_burstiness = gputil_temporal_burstiness["burstiness_score"]

is_low_temporal, is_high_temporal = job_imbalance <= 0.5, job_imbalance > 0.5
is_low_burst, is_high_burst = job_burstiness <= 0, job_burstiness > 0

lowt_lowb = gputil_temporal_burstiness[is_low_temporal & is_low_burst]
lowt_highb = gputil_temporal_burstiness[is_low_temporal & is_high_burst]
hight_lowb = gputil_temporal_burstiness[is_high_temporal & is_low_burst]
hight_highb = gputil_temporal_burstiness[is_high_temporal & is_high_burst]

In [8]:
def calculate_burstiness_temporal_statistics(df):
    """Summarize job count, GPU hours and the mean-utilization mix of ``df``.

    Args:
        df: DataFrame with the columns ``jobid``, ``gpu_hours`` and
            ``mean_utilization``.

    Returns:
        A two-column DataFrame (``Statistic``, ``Value``) with five rows:
        the number of unique job ids, the summed GPU hours, and the percentage
        of unique job ids whose mean utilization is low (<= 30), medium
        (strictly between 30 and 70) or high (>= 70).

        Each percentage is "unique job ids in the band / unique job ids
        overall", so a job id that has rows in more than one band is counted
        in each of them. When ``df`` contains no jobs the percentages are NaN
        instead of raising ``ZeroDivisionError``.
    """
    mean_util = df["mean_utilization"]
    total_job_count = df["jobid"].nunique()
    total_gpu_hours = df["gpu_hours"].sum()

    def share_of_jobs(band_mask):
        # Percentage of unique job ids selected by band_mask.
        if total_job_count == 0:
            # No jobs at all: the share is undefined, not an error.
            return float("nan")
        return df.loc[band_mask, "jobid"].nunique() / total_job_count * 100

    low_mean_percentage = share_of_jobs(mean_util <= 30)
    medium_mean_percentage = share_of_jobs((mean_util > 30) & (mean_util < 70))
    high_mean_percentage = share_of_jobs(mean_util >= 70)

    return pd.DataFrame({
        "Statistic": ["Total Job Count", "Total GPU Hours", "Low Mean Utilization (%)", "Medium Mean Utilization (%)", "High Mean Utilization (%)"],
        "Value": [total_job_count, total_gpu_hours, low_mean_percentage, medium_mean_percentage, high_mean_percentage]
    })

In [9]:
# Render floats with two decimal places in displayed DataFrames
# (global pandas display option; it stays in effect for later cells too).
pd.set_option('display.float_format', '{:.2f}'.format)

# Summary for the jobs in `lowt_lowb`
# (temporal_imbalance <= 0.5 and burstiness_score <= 0).
lowt_lowb_stats_df = calculate_burstiness_temporal_statistics(lowt_lowb)
# Bare last expression so the notebook renders the table as the cell output.
lowt_lowb_stats_df

Unnamed: 0,Statistic,Value
0,Total Job Count,104730.0
1,Total GPU Hours,1462583.76
2,Low Mean Utilization (%),32.34
3,Medium Mean Utilization (%),42.2
4,High Mean Utilization (%),25.47


In [10]:
# Render floats with two decimal places in displayed DataFrames.
# NOTE(review): an earlier cell already sets this global option; repeating it
# here is harmless but redundant.
pd.set_option('display.float_format', '{:.2f}'.format)

# Summary for the jobs in `lowt_highb`
# (temporal_imbalance <= 0.5 and burstiness_score > 0).
lowt_highb_stats_df = calculate_burstiness_temporal_statistics(lowt_highb)
# Bare last expression so the notebook renders the table as the cell output.
lowt_highb_stats_df

Unnamed: 0,Statistic,Value
0,Total Job Count,55639.0
1,Total GPU Hours,2865362.35
2,Low Mean Utilization (%),15.95
3,Medium Mean Utilization (%),41.47
4,High Mean Utilization (%),42.59


In [11]:
# Render floats with two decimal places in displayed DataFrames.
# NOTE(review): an earlier cell already sets this global option; repeating it
# here is harmless but redundant.
pd.set_option('display.float_format', '{:.2f}'.format)

# Summary for the jobs in `hight_lowb`
# (temporal_imbalance > 0.5 and burstiness_score <= 0).
hight_lowb_stats_df = calculate_burstiness_temporal_statistics(hight_lowb)
# Bare last expression so the notebook renders the table as the cell output.
hight_lowb_stats_df

Unnamed: 0,Statistic,Value
0,Total Job Count,145971.0
1,Total GPU Hours,2657522.6
2,Low Mean Utilization (%),54.15
3,Medium Mean Utilization (%),45.28
4,High Mean Utilization (%),0.56


In [12]:
# Render floats with two decimal places in displayed DataFrames.
# NOTE(review): an earlier cell already sets this global option; repeating it
# here is harmless but redundant.
pd.set_option('display.float_format', '{:.2f}'.format)

# Summary for the jobs in `hight_highb`
# (temporal_imbalance > 0.5 and burstiness_score > 0).
hight_highb_stats_df = calculate_burstiness_temporal_statistics(hight_highb)
# Bare last expression so the notebook renders the table as the cell output.
hight_highb_stats_df

Unnamed: 0,Statistic,Value
0,Total Job Count,38603.0
1,Total GPU Hours,3534739.83
2,Low Mean Utilization (%),74.55
3,Medium Mean Utilization (%),24.15
4,High Mean Utilization (%),1.3
