In [5]:
import plotly.graph_objects as go
import json
import pandas as pd

In [6]:
def get_flops_per_joule(files):
    """Compute forward/backward FLOPs-per-joule and error bars for each GPU.

    For every ``label -> path`` entry in ``files`` the first non-blank line of
    the file is parsed as a JSON object; any further lines are ignored.

    Args:
        files: Mapping of GPU label to the path of a JSONL results file. The
            parsed record must provide ``forward_flops_sum``,
            ``forward_energy_sum``, ``forward_energy_sem`` and the matching
            ``backward_*`` keys.

    Returns:
        A 5-tuple of parallel lists, ordered like ``files``:
        ``(gpus, flops_per_joule_forward, flops_per_joule_backward,
        forward_error, backward_error)``. Values are in FLOPs per joule
        (not scaled to GFLOPs).

    Raises:
        json.JSONDecodeError: If a file has no non-blank line (the message
            names the file) or its first record is not valid JSON.
        KeyError: If a required key is missing from a record.
        ZeroDivisionError: If an energy sum is zero.
    """
    gpus = []
    flops_per_joule_forward = []
    flops_per_joule_backward = []
    forward_error = []
    backward_error = []

    for gpu, fname in files.items():
        # JSON text is UTF-8; do not depend on the platform's locale default.
        with open(fname, "r", encoding="utf-8") as f:
            # Tolerate leading blank lines; only the first record is used.
            line = next((ln for ln in f if ln.strip()), None)
        if line is None:
            # Same exception type json.loads("") would raise, but with the
            # offending path in the message.
            raise json.JSONDecodeError(f"no JSON record found in {fname}", "", 0)
        data = json.loads(line)

        gpus.append(gpu)

        # FLOPs per joule for each pass.
        fwd_flops_per_joule = data['forward_flops_sum'] / data['forward_energy_sum']
        bwd_flops_per_joule = data['backward_flops_sum'] / data['backward_energy_sum']
        flops_per_joule_forward.append(fwd_flops_per_joule)
        flops_per_joule_backward.append(bwd_flops_per_joule)

        # Error propagation for a ratio A/B: rel_err = sqrt((sA/A)^2 + (sB/B)^2).
        # The FLOP count is treated as exact, so only the energy term remains.
        # NOTE(review): this divides `*_energy_sem` by `*_energy_sum`, i.e. it
        # assumes both are on the same scale. If the SEM is per-step while the
        # sum spans many steps, the relative error is understated — confirm
        # against whatever produces these files.
        fwd_rel_error = data['forward_energy_sem'] / data['forward_energy_sum']
        bwd_rel_error = data['backward_energy_sem'] / data['backward_energy_sum']

        forward_error.append(fwd_flops_per_joule * fwd_rel_error)
        backward_error.append(bwd_flops_per_joule * bwd_rel_error)

    return gpus, flops_per_joule_forward, flops_per_joule_backward, forward_error, backward_error

In [7]:
def get_time_data(files):
    """Collect forward/backward CPU and GPU timings (mean and SEM) per GPU.

    For every ``label -> path`` entry in ``files`` the first non-blank line of
    the file is parsed as a JSON object; any further lines are ignored.

    Args:
        files: Mapping of GPU label to the path of a JSONL results file. The
            parsed record must provide ``<pass>_<device>_time_mean`` and
            ``<pass>_<device>_time_sem`` for pass in {forward, backward} and
            device in {cpu, gpu}.

    Returns:
        A 9-tuple of parallel lists, ordered like ``files``:
        ``(gpus, fwd_cpu_time, bwd_cpu_time, fwd_gpu_time, bwd_gpu_time,
        fwd_cpu_time_err, bwd_cpu_time_err, fwd_gpu_time_err,
        bwd_gpu_time_err)``. Every timing is the stored value divided by 1000.

    Raises:
        json.JSONDecodeError: If a file has no non-blank line (the message
            names the file) or its first record is not valid JSON.
        KeyError: If a required key is missing from a record.
    """
    # Order here fixes the order of the returned lists.
    metrics = (
        "forward_cpu_time",
        "backward_cpu_time",
        "forward_gpu_time",
        "backward_gpu_time",
    )
    gpus = []
    means = {metric: [] for metric in metrics}
    sems = {metric: [] for metric in metrics}

    for gpu, fname in files.items():
        # JSON text is UTF-8; do not depend on the platform's locale default.
        with open(fname, "r", encoding="utf-8") as f:
            # Tolerate leading blank lines; only the first record is used.
            line = next((ln for ln in f if ln.strip()), None)
        if line is None:
            # Same exception type json.loads("") would raise, but with the
            # offending path in the message.
            raise json.JSONDecodeError(f"no JSON record found in {fname}", "", 0)
        data = json.loads(line)

        gpus.append(gpu)
        for metric in metrics:
            # Stored values are taken to be microseconds (per the original
            # notebook comment); dividing by 1000 yields milliseconds.
            means[metric].append(data[f"{metric}_mean"] / 1000)
            sems[metric].append(data[f"{metric}_sem"] / 1000)

    return (gpus,
            *(means[metric] for metric in metrics),
            *(sems[metric] for metric in metrics))

# Efficiency (GFLOPs/Joule)

In [8]:
# FP32 efficiency chart: forward vs. backward GFLOPs per joule for each GPU.
# NOTE(review): the "L40S" label points at an "l40" results file — confirm
# which card the measurements actually come from.
files_fp32 = dict(
    V100="results/baseline_v100_fp32_llama3.2_1b_energy_results.jsonl",
    A100="results/baseline_a100-80g_fp32_llama3.2_1b_energy_results.jsonl",
    A40="results/baseline_a40_fp32_llama3.2_1b_energy_results.jsonl",
    L40S="results/baseline_l40_fp32_llama3.2_1b_energy_results.jsonl",
)

gpus, fp32_forward, fp32_backward, fp32_fwd_err, fp32_bwd_err = get_flops_per_joule(files_fp32)

# Rescale every series (values and error bars) from FLOPs/J to GFLOPs/J.
fp32_forward, fp32_backward, fp32_fwd_err, fp32_bwd_err = (
    [value / 1e9 for value in series]
    for series in (fp32_forward, fp32_backward, fp32_fwd_err, fp32_bwd_err)
)

# One bar trace per pass, each carrying its own error bars.
fig = go.Figure(data=[
    go.Bar(
        name=label,
        x=gpus,
        y=values,
        error_y=dict(type='data', array=errors, visible=True),
    )
    for label, values, errors in (
        ('Forward FP32', fp32_forward, fp32_fwd_err),
        ('Backward FP32', fp32_backward, fp32_bwd_err),
    )
])
fig.update_layout(
    barmode='group',
    xaxis_title="GPU",
    yaxis_title="GFLOPs per joule",
    title="GPU Efficiency: GFLOPs per joule (Llama3.2 1B, FP32, 10 warmup steps, 100 steps)",
)
fig.show()

In [9]:
# FP16 efficiency chart: forward vs. backward GFLOPs per joule for each GPU.
# NOTE(review): the "L40S" label points at an "l40" results file — confirm
# which card the measurements actually come from.
files_fp16 = dict(
    V100="results/baseline_v100_fp16_llama3.2_1b_energy_results.jsonl",
    A100="results/baseline_a100-80g_fp16_llama3.2_1b_energy_results.jsonl",
    A40="results/baseline_a40_fp16_llama3.2_1b_energy_results.jsonl",
    L40S="results/baseline_l40_fp16_llama3.2_1b_energy_results.jsonl",
)

gpus, fp16_forward, fp16_backward, fp16_fwd_err, fp16_bwd_err = get_flops_per_joule(files_fp16)

# Rescale every series (values and error bars) from FLOPs/J to GFLOPs/J.
fp16_forward, fp16_backward, fp16_fwd_err, fp16_bwd_err = (
    [value / 1e9 for value in series]
    for series in (fp16_forward, fp16_backward, fp16_fwd_err, fp16_bwd_err)
)

# One bar trace per pass, each carrying its own error bars.
fig2 = go.Figure(data=[
    go.Bar(
        name=label,
        x=gpus,
        y=values,
        error_y=dict(type='data', array=errors, visible=True),
    )
    for label, values, errors in (
        ('Forward FP16', fp16_forward, fp16_fwd_err),
        ('Backward FP16', fp16_backward, fp16_bwd_err),
    )
])
fig2.update_layout(
    barmode='group',
    xaxis_title="GPU",
    yaxis_title="GFLOPs per joule",
    title="GPU Efficiency: GFLOPs per joule (Llama3.2 1B, FP16, 10 warmup steps, 100 steps)",
)
fig2.show()

# Time (ms)

In [10]:
# FP32 GPU-time chart.
# Only the GPU timings are plotted; the CPU timings are unpacked but unused
# here. Rationale kept from the original analysis:
# - GPU time reflects the work executed on the GPU itself
# - CPU time also contains host-side overhead (kernel launches, memory
#   transfers, PyTorch overhead)
# - the question being asked is how efficiently the GPU is used
(gpus,
 fp32_fwd_cpu, fp32_bwd_cpu, fp32_fwd_gpu, fp32_bwd_gpu,
 fp32_fwd_cpu_err, fp32_bwd_cpu_err, fp32_fwd_gpu_err, fp32_bwd_gpu_err) = get_time_data(files_fp32)

# One bar trace per pass, each carrying its own error bars.
fig4 = go.Figure(data=[
    go.Bar(
        name=label,
        x=gpus,
        y=values,
        error_y=dict(type='data', array=errors, visible=True),
    )
    for label, values, errors in (
        ('Forward GPU FP32', fp32_fwd_gpu, fp32_fwd_gpu_err),
        ('Backward GPU FP32', fp32_bwd_gpu, fp32_bwd_gpu_err),
    )
])
fig4.update_layout(
    barmode='group',
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    title="GPU Time: Forward vs Backward (Llama3.2 1B, FP32, 10 warmup steps, 100 steps)",
)
fig4.show()

In [11]:
# FP16 GPU-time chart.
# As with FP32, only the GPU timings are plotted; the CPU timings are
# unpacked but not used in this cell.
(gpus_fp16,
 fp16_fwd_cpu, fp16_bwd_cpu, fp16_fwd_gpu, fp16_bwd_gpu,
 fp16_fwd_cpu_err, fp16_bwd_cpu_err, fp16_fwd_gpu_err, fp16_bwd_gpu_err) = get_time_data(files_fp16)

# One bar trace per pass, each carrying its own error bars.
fig6 = go.Figure(data=[
    go.Bar(
        name=label,
        x=gpus_fp16,
        y=values,
        error_y=dict(type='data', array=errors, visible=True),
    )
    for label, values, errors in (
        ('Forward GPU FP16', fp16_fwd_gpu, fp16_fwd_gpu_err),
        ('Backward GPU FP16', fp16_bwd_gpu, fp16_bwd_gpu_err),
    )
])
fig6.update_layout(
    barmode='group',
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    title="GPU Time: Forward vs Backward (Llama3.2 1B, FP16, 10 warmup steps, 100 steps)",
)
fig6.show()

In [12]:
# FP32 and FP16 GPU times side by side.
# Each row below describes one trace: legend label, x labels, bar heights,
# error bars, bar colour. FP32 traces use `gpus`, FP16 traces use `gpus_fp16`.
fig7 = go.Figure(data=[
    go.Bar(
        name=label,
        x=labels,
        y=values,
        error_y=dict(type='data', array=errors, visible=True),
        marker_color=color,
    )
    for label, labels, values, errors, color in (
        ('Forward FP32', gpus, fp32_fwd_gpu, fp32_fwd_gpu_err, 'lightblue'),
        ('Forward FP16', gpus_fp16, fp16_fwd_gpu, fp16_fwd_gpu_err, 'darkblue'),
        ('Backward FP32', gpus, fp32_bwd_gpu, fp32_bwd_gpu_err, 'lightcoral'),
        ('Backward FP16', gpus_fp16, fp16_bwd_gpu, fp16_bwd_gpu_err, 'darkred'),
    )
])
fig7.update_layout(
    barmode='group',
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    title="GPU Time Comparison: FP32 vs FP16 (Llama3.2 1B, 10 warmup steps, 100 steps)",
)
fig7.show()

# Tables: Efficiency (GFLOPs/Joule)

In [13]:
# Re-read both result sets; values here are raw FLOPs/J (unscaled), so the
# *_err names below hold unscaled errors after this cell runs.
gpus_fp32, fp32_fwd, fp32_bwd, fp32_fwd_err, fp32_bwd_err = get_flops_per_joule(files_fp32)
gpus_fp16, fp16_fwd, fp16_bwd, fp16_fwd_err, fp16_bwd_err = get_flops_per_joule(files_fp16)
# Both precisions must cover the same GPUs in the same order, since the
# columns below are lined up positionally.
assert gpus_fp32 == gpus_fp16, "GPU lists do not match!"

# Efficiency values, one row per GPU.
df = pd.DataFrame(
    {
        "FP32 Forward": fp32_fwd,
        "FP32 Backward": fp32_bwd,
        "FP16 Forward": fp16_fwd,
        "FP16 Backward": fp16_bwd,
    },
    index=pd.Index(gpus_fp32, name="GPU"),
)
df_g = df.div(1e9)  # FLOPs/J -> GFLOPs/J

# Matching error bars, same layout as `df`.
df_err = pd.DataFrame(
    {
        "FP32 Forward": fp32_fwd_err,
        "FP32 Backward": fp32_bwd_err,
        "FP16 Forward": fp16_fwd_err,
        "FP16 Backward": fp16_bwd_err,
    },
    index=pd.Index(gpus_fp32, name="GPU"),
)
df_err_g = df_err.div(1e9)  # FLOPs/J -> GFLOPs/J

print("GPU Efficiency (GFLOPs/Joule)")
print(df_g)
print()
print("Error (GFLOPs/Joule)")
print(df_err_g)

# Interleave value/error columns: "<Precision>_<Pass>_Value" followed by
# "<Precision>_<Pass>_Error" for each column of `df_g`.
df_combined = pd.DataFrame({
    f"{column.replace(' ', '_')}_{suffix}": frame[column]
    for column in df_g.columns
    for suffix, frame in (("Value", df_g), ("Error", df_err_g))
})

df_combined.to_csv("results/gpu_efficiency_combined.csv")
print("\nSaved combined efficiency dataframe to CSV:")
print("- results/gpu_efficiency_combined.csv")

GPU Efficiency (GFLOPs/Joule)
      FP32 Forward  FP32 Backward  FP16 Forward  FP16 Backward
GPU                                                           
V100      0.724375       0.405051      0.797050       0.253625
A100      0.849340       0.533988      2.623557       0.874169
A40       1.316697       0.573644      1.957099       0.839090
L40S      2.251809       0.712677      3.944471       1.612235

Error (GFLOPs/Joule)
      FP32 Forward  FP32 Backward  FP16 Forward  FP16 Backward
GPU                                                           
V100      0.000195       0.000019      0.000092       0.000006
A100      0.000283       0.000083      0.062540       0.000175
A40       0.000294       0.000017      0.003725       0.000941
L40S      0.000852       0.000034      0.017024       0.000151

Saved combined efficiency dataframe to CSV:
- results/gpu_efficiency_combined.csv


# Tables: Time (ms)

In [14]:
# GPU timings, one row per GPU. Relies on `gpus_fp32` and the *_gpu /
# *_gpu_err lists produced by earlier cells.
df_time = pd.DataFrame(
    {
        "FP32 Forward": fp32_fwd_gpu,
        "FP32 Backward": fp32_bwd_gpu,
        "FP16 Forward": fp16_fwd_gpu,
        "FP16 Backward": fp16_bwd_gpu,
    },
    index=pd.Index(gpus_fp32, name="GPU"),
)

# Matching error bars, same layout as `df_time`.
df_time_err = pd.DataFrame(
    {
        "FP32 Forward": fp32_fwd_gpu_err,
        "FP32 Backward": fp32_bwd_gpu_err,
        "FP16 Forward": fp16_fwd_gpu_err,
        "FP16 Backward": fp16_bwd_gpu_err,
    },
    index=pd.Index(gpus_fp32, name="GPU"),
)

print("GPU Time (ms)")
print(df_time)
print()
print("Error (ms)")
print(df_time_err)

# Interleave value/error columns: "<Precision>_<Pass>_Value" followed by
# "<Precision>_<Pass>_Error" for each column of `df_time`.
df_time_combined = pd.DataFrame({
    f"{column.replace(' ', '_')}_{suffix}": frame[column]
    for column in df_time.columns
    for suffix, frame in (("Value", df_time), ("Error", df_time_err))
})

df_time_combined.to_csv("results/gpu_time_combined.csv")
print("\nSaved combined time dataframe to CSV:")
print("- results/gpu_time_combined.csv")


GPU Time (ms)
      FP32 Forward  FP32 Backward  FP16 Forward  FP16 Backward
GPU                                                           
V100     32.591892      82.431381     47.901308     117.290469
A100     24.115921      75.489363      7.223464      25.309792
A40      21.537208      70.884696     11.856707      32.324321
L40S     14.728896      45.832904      9.440878      24.812749

Error (ms)
      FP32 Forward  FP32 Backward  FP16 Forward  FP16 Backward
GPU                                                           
V100      0.069230       0.107052      0.180499       0.195196
A100      0.064898       0.395355      0.040599       0.015897
A40       0.116379       0.143728      0.001304       0.096747
L40S      0.051209       0.031648      0.001132       0.014546

Saved combined time dataframe to CSV:
- results/gpu_time_combined.csv
