In [73]:
import plotly.graph_objects as go
import json
import pandas as pd
import glob
import os

In [74]:
def load_optimized_results(results_dir="results_optimized"):
    """Load optimized results and organize by GPU and precision"""
    results = {}

    # Find all result files
    pattern = os.path.join(results_dir, "*_result.jsonl")
    files = glob.glob(pattern)

    for file_path in files:
        filename = os.path.basename(file_path)
        # Parse filename: gpu_precision_model_config_result.jsonl
        parts = filename.replace("_result.jsonl", "").split("_")
        gpu = parts[0]
        precision = parts[1]
        config = parts[-1]  # small, mid, large

        with open(file_path, "r") as f:
            line = f.readline()
            data = json.loads(line)

        if gpu not in results:
            results[gpu] = {}
        if precision not in results[gpu]:
            results[gpu][precision] = {}

        results[gpu][precision][config] = data

    return results


In [75]:
def get_optimized_flops_per_joule(results, precision, config="small"):
    """Extract FLOPS per joule data for a specific precision and config"""
    gpus = []
    flops_per_joule_forward = []
    flops_per_joule_backward = []
    forward_error = []
    backward_error = []

    for gpu, gpu_data in results.items():
        if precision in gpu_data and config in gpu_data[precision]:
            data = gpu_data[precision][config]
            gpus.append(gpu.upper())

            # Use pre-calculated GFLOPs per joule and convert to TFLOPs
            fwd_tflops_per_joule = data['gflops_per_joule_forward'] / 1000
            bwd_tflops_per_joule = data.get('gflops_per_joule_backward', 0) / 1000

            flops_per_joule_forward.append(fwd_tflops_per_joule)
            flops_per_joule_backward.append(bwd_tflops_per_joule)

            # Calculate error bars using SEM for energy (propagate error for FLOPS/energy ratio)
            fwd_rel_error = data['forward_energy_sem'] / data['forward_energy_sum']
            bwd_rel_error = data.get('backward_energy_sem', 0) / data.get('backward_energy_sum', 1)

            forward_error.append(fwd_tflops_per_joule * fwd_rel_error)
            backward_error.append(bwd_tflops_per_joule * bwd_rel_error)

    return gpus, flops_per_joule_forward, flops_per_joule_backward, forward_error, backward_error


In [76]:
def get_optimized_time_data(results, precision, config="small"):
    """Extract time data for a specific precision and config"""
    gpus = []
    fwd_cpu_time = []
    bwd_cpu_time = []
    fwd_gpu_time = []
    bwd_gpu_time = []
    fwd_cpu_time_err = []
    bwd_cpu_time_err = []
    fwd_gpu_time_err = []
    bwd_gpu_time_err = []

    for gpu, gpu_data in results.items():
        if precision in gpu_data and config in gpu_data[precision]:
            data = gpu_data[precision][config]
            gpus.append(gpu.upper())

            # Times in microseconds, convert to milliseconds
            fwd_cpu_time.append(data['forward_cpu_time_mean'] / 1000)
            bwd_cpu_time.append(data.get('backward_cpu_time_mean', 0) / 1000)
            fwd_gpu_time.append(data['forward_gpu_time_mean'] / 1000)
            bwd_gpu_time.append(data.get('backward_gpu_time_mean', 0) / 1000)

            # Error bars (SEM) also convert to milliseconds
            fwd_cpu_time_err.append(data['forward_cpu_time_sem'] / 1000)
            bwd_cpu_time_err.append(data.get('backward_cpu_time_sem', 0) / 1000)
            fwd_gpu_time_err.append(data['forward_gpu_time_sem'] / 1000)
            bwd_gpu_time_err.append(data.get('backward_gpu_time_sem', 0) / 1000)

    return (gpus, fwd_cpu_time, bwd_cpu_time, fwd_gpu_time, bwd_gpu_time,
            fwd_cpu_time_err, bwd_cpu_time_err, fwd_gpu_time_err, bwd_gpu_time_err)


In [77]:
# Load optimized results
results = load_optimized_results()
print("Available GPUs:", list(results.keys()))
print("Available precisions for A100:", list(results['a100'].keys()) if 'a100' in results else "N/A")
print("Available configs for A100 FP16:", list(results['a100']['fp16'].keys()) if 'a100' in results and 'fp16' in results['a100'] else "N/A")


Available GPUs: ['l40', 'h100', 'a40', 'a100', 'v100']
Available precisions for A100: ['fp16', 'fp32', 'bf16']
Available configs for A100 FP16: ['mid', 'large', 'small']


# Efficiency (TFLOPs/Joule) - Optimized Results


In [78]:
# FP32 plot - Small config
print("\n" + "="*80)
print("SMALL CONFIG EFFICIENCY PLOTS")
print("="*80)


gpus_fp32, fp32_forward, fp32_backward, fp32_fwd_err, fp32_bwd_err = get_optimized_flops_per_joule(results, "fp32", "small")

fig = go.Figure(data=[
    go.Bar(name='Forward FP32', x=gpus_fp32, y=fp32_forward,
           error_y=dict(type='data', array=fp32_fwd_err, visible=True)),
    go.Bar(name='Backward FP32', x=gpus_fp32, y=fp32_backward,
           error_y=dict(type='data', array=fp32_bwd_err, visible=True))
])
fig.update_layout(
    title="GPU Efficiency: TFLOPs per joule (Llama3.2 1B, FP32, Small Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="TFLOPs per joule",
    barmode='group'
)
fig.show()

# FP16 plot - Small config
gpus_fp16, fp16_forward, fp16_backward, fp16_fwd_err, fp16_bwd_err = get_optimized_flops_per_joule(results, "fp16", "small")

fig2 = go.Figure(data=[
    go.Bar(name='Forward FP16', x=gpus_fp16, y=fp16_forward,
           error_y=dict(type='data', array=fp16_fwd_err, visible=True)),
    go.Bar(name='Backward FP16', x=gpus_fp16, y=fp16_backward,
           error_y=dict(type='data', array=fp16_bwd_err, visible=True))
])
fig2.update_layout(
    title="GPU Efficiency: TFLOPs per joule (Llama3.2 1B, FP16, Small Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="TFLOPs per joule",
    barmode='group'
)
fig2.show()

# BF16 plot - Small config
gpus_bf16, bf16_forward, bf16_backward, bf16_fwd_err, bf16_bwd_err = get_optimized_flops_per_joule(results, "bf16", "small")

fig3 = go.Figure(data=[
    go.Bar(name='Forward BF16', x=gpus_bf16, y=bf16_forward,
           error_y=dict(type='data', array=bf16_fwd_err, visible=True)),
    go.Bar(name='Backward BF16', x=gpus_bf16, y=bf16_backward,
           error_y=dict(type='data', array=bf16_bwd_err, visible=True))
])
fig3.update_layout(
    title="GPU Efficiency: TFLOPs per joule (Llama3.2 1B, BF16, Small Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="TFLOPs per joule",
    barmode='group'
)
fig3.show()



SMALL CONFIG EFFICIENCY PLOTS


In [79]:
# MID CONFIG EFFICIENCY PLOTS
print("\n" + "="*80)
print("MID CONFIG EFFICIENCY PLOTS")
print("="*80)

# FP32 plot - Mid config
gpus_fp32_mid, fp32_forward_mid, fp32_backward_mid, fp32_fwd_err_mid, fp32_bwd_err_mid = get_optimized_flops_per_joule(results, "fp32", "mid")

fig_mid_fp32 = go.Figure(data=[
    go.Bar(name='Forward FP32', x=gpus_fp32_mid, y=fp32_forward_mid,
           error_y=dict(type='data', array=fp32_fwd_err_mid, visible=True)),
    go.Bar(name='Backward FP32', x=gpus_fp32_mid, y=fp32_backward_mid,
           error_y=dict(type='data', array=fp32_bwd_err_mid, visible=True))
])
fig_mid_fp32.update_layout(
    title="GPU Efficiency: TFLOPs per joule (Llama3.2 1B, FP32, Mid Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="TFLOPs per joule",
    barmode='group'
)
fig_mid_fp32.show()

# FP16 plot - Mid config
gpus_fp16_mid, fp16_forward_mid, fp16_backward_mid, fp16_fwd_err_mid, fp16_bwd_err_mid = get_optimized_flops_per_joule(results, "fp16", "mid")

fig_mid_fp16 = go.Figure(data=[
    go.Bar(name='Forward FP16', x=gpus_fp16_mid, y=fp16_forward_mid,
           error_y=dict(type='data', array=fp16_fwd_err_mid, visible=True)),
    go.Bar(name='Backward FP16', x=gpus_fp16_mid, y=fp16_backward_mid,
           error_y=dict(type='data', array=fp16_bwd_err_mid, visible=True))
])
fig_mid_fp16.update_layout(
    title="GPU Efficiency: TFLOPs per joule (Llama3.2 1B, FP16, Mid Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="TFLOPs per joule",
    barmode='group'
)
fig_mid_fp16.show()

# BF16 plot - Mid config
gpus_bf16_mid, bf16_forward_mid, bf16_backward_mid, bf16_fwd_err_mid, bf16_bwd_err_mid = get_optimized_flops_per_joule(results, "bf16", "mid")

fig_mid_bf16 = go.Figure(data=[
    go.Bar(name='Forward BF16', x=gpus_bf16_mid, y=bf16_forward_mid,
           error_y=dict(type='data', array=bf16_fwd_err_mid, visible=True)),
    go.Bar(name='Backward BF16', x=gpus_bf16_mid, y=bf16_backward_mid,
           error_y=dict(type='data', array=bf16_bwd_err_mid, visible=True))
])
fig_mid_bf16.update_layout(
    title="GPU Efficiency: TFLOPs per joule (Llama3.2 1B, BF16, Mid Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="TFLOPs per joule",
    barmode='group'
)
fig_mid_bf16.show()



MID CONFIG EFFICIENCY PLOTS


In [80]:
# LARGE CONFIG EFFICIENCY PLOTS
print("\n" + "="*80)
print("LARGE CONFIG EFFICIENCY PLOTS")
print("="*80)

# FP32 plot - Large config
gpus_fp32_large, fp32_forward_large, fp32_backward_large, fp32_fwd_err_large, fp32_bwd_err_large = get_optimized_flops_per_joule(results, "fp32", "large")

fig_large_fp32 = go.Figure(data=[
    go.Bar(name='Forward FP32', x=gpus_fp32_large, y=fp32_forward_large,
           error_y=dict(type='data', array=fp32_fwd_err_large, visible=True)),
    go.Bar(name='Backward FP32', x=gpus_fp32_large, y=fp32_backward_large,
           error_y=dict(type='data', array=fp32_bwd_err_large, visible=True))
])
fig_large_fp32.update_layout(
    title="GPU Efficiency: TFLOPs per joule (Llama3.2 1B, FP32, Large Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="TFLOPs per joule",
    barmode='group'
)
fig_large_fp32.show()

# FP16 plot - Large config
gpus_fp16_large, fp16_forward_large, fp16_backward_large, fp16_fwd_err_large, fp16_bwd_err_large = get_optimized_flops_per_joule(results, "fp16", "large")

fig_large_fp16 = go.Figure(data=[
    go.Bar(name='Forward FP16', x=gpus_fp16_large, y=fp16_forward_large,
           error_y=dict(type='data', array=fp16_fwd_err_large, visible=True)),
    go.Bar(name='Backward FP16', x=gpus_fp16_large, y=fp16_backward_large,
           error_y=dict(type='data', array=fp16_bwd_err_large, visible=True))
])
fig_large_fp16.update_layout(
    title="GPU Efficiency: TFLOPs per joule (Llama3.2 1B, FP16, Large Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="TFLOPs per joule",
    barmode='group'
)
fig_large_fp16.show()

# BF16 plot - Large config
gpus_bf16_large, bf16_forward_large, bf16_backward_large, bf16_fwd_err_large, bf16_bwd_err_large = get_optimized_flops_per_joule(results, "bf16", "large")

fig_large_bf16 = go.Figure(data=[
    go.Bar(name='Forward BF16', x=gpus_bf16_large, y=bf16_forward_large,
           error_y=dict(type='data', array=bf16_fwd_err_large, visible=True)),
    go.Bar(name='Backward BF16', x=gpus_bf16_large, y=bf16_backward_large,
           error_y=dict(type='data', array=bf16_bwd_err_large, visible=True))
])
fig_large_bf16.update_layout(
    title="GPU Efficiency: TFLOPs per joule (Llama3.2 1B, BF16, Large Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="TFLOPs per joule",
    barmode='group'
)
fig_large_bf16.show()


LARGE CONFIG EFFICIENCY PLOTS


# Time (ms) - Optimized Results


In [81]:
# GPU Time Analysis for GPU Efficiency - FP32 Small Config
gpus_fp32_time, fp32_fwd_cpu, fp32_bwd_cpu, fp32_fwd_gpu, fp32_bwd_gpu, fp32_fwd_cpu_err, fp32_bwd_cpu_err, fp32_fwd_gpu_err, fp32_bwd_gpu_err = get_optimized_time_data(results, "fp32", "small")

fig4 = go.Figure(data=[
    go.Bar(name='Forward GPU FP32', x=gpus_fp32_time, y=fp32_fwd_gpu,
           error_y=dict(type='data', array=fp32_fwd_gpu_err, visible=True)),
    go.Bar(name='Backward GPU FP32', x=gpus_fp32_time, y=fp32_bwd_gpu,
           error_y=dict(type='data', array=fp32_bwd_gpu_err, visible=True))
])
fig4.update_layout(
    title="GPU Time: Forward vs Backward (Llama3.2 1B, FP32, Small Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    barmode='group'
)
fig4.show()

# FP16 GPU Time plots - Small Config
gpus_fp16_time, fp16_fwd_cpu, fp16_bwd_cpu, fp16_fwd_gpu, fp16_bwd_gpu, fp16_fwd_cpu_err, fp16_bwd_cpu_err, fp16_fwd_gpu_err, fp16_bwd_gpu_err = get_optimized_time_data(results, "fp16", "small")

fig6 = go.Figure(data=[
    go.Bar(name='Forward GPU FP16', x=gpus_fp16_time, y=fp16_fwd_gpu,
           error_y=dict(type='data', array=fp16_fwd_gpu_err, visible=True)),
    go.Bar(name='Backward GPU FP16', x=gpus_fp16_time, y=fp16_bwd_gpu,
           error_y=dict(type='data', array=fp16_bwd_gpu_err, visible=True))
])
fig6.update_layout(
    title="GPU Time: Forward vs Backward (Llama3.2 1B, FP16, Small Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    barmode='group'
)
fig6.show()

# BF16 GPU Time plots - Small Config
gpus_bf16_time, bf16_fwd_cpu, bf16_bwd_cpu, bf16_fwd_gpu, bf16_bwd_gpu, bf16_fwd_cpu_err, bf16_bwd_cpu_err, bf16_fwd_gpu_err, bf16_bwd_gpu_err = get_optimized_time_data(results, "bf16", "small")

fig7 = go.Figure(data=[
    go.Bar(name='Forward GPU BF16', x=gpus_bf16_time, y=bf16_fwd_gpu,
           error_y=dict(type='data', array=bf16_fwd_gpu_err, visible=True)),
    go.Bar(name='Backward GPU BF16', x=gpus_bf16_time, y=bf16_bwd_gpu,
           error_y=dict(type='data', array=bf16_bwd_gpu_err, visible=True))
])
fig7.update_layout(
    title="GPU Time: Forward vs Backward (Llama3.2 1B, BF16, Small Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    barmode='group'
)
fig7.show()


In [82]:
# MID CONFIG TIME PLOTS
print("\n" + "="*80)
print("MID CONFIG TIME PLOTS")
print("="*80)

# GPU Time Analysis for GPU Efficiency - FP32 Mid Config
gpus_fp32_time_mid, fp32_fwd_cpu_mid, fp32_bwd_cpu_mid, fp32_fwd_gpu_mid, fp32_bwd_gpu_mid, fp32_fwd_cpu_err_mid, fp32_bwd_cpu_err_mid, fp32_fwd_gpu_err_mid, fp32_bwd_gpu_err_mid = get_optimized_time_data(results, "fp32", "mid")

fig_mid_fp32_time = go.Figure(data=[
    go.Bar(name='Forward GPU FP32', x=gpus_fp32_time_mid, y=fp32_fwd_gpu_mid,
           error_y=dict(type='data', array=fp32_fwd_gpu_err_mid, visible=True)),
    go.Bar(name='Backward GPU FP32', x=gpus_fp32_time_mid, y=fp32_bwd_gpu_mid,
           error_y=dict(type='data', array=fp32_bwd_gpu_err_mid, visible=True))
])
fig_mid_fp32_time.update_layout(
    title="GPU Time: Forward vs Backward (Llama3.2 1B, FP32, Mid Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    barmode='group'
)
fig_mid_fp32_time.show()

# FP16 GPU Time plots - Mid Config
gpus_fp16_time_mid, fp16_fwd_cpu_mid, fp16_bwd_cpu_mid, fp16_fwd_gpu_mid, fp16_bwd_gpu_mid, fp16_fwd_cpu_err_mid, fp16_bwd_cpu_err_mid, fp16_fwd_gpu_err_mid, fp16_bwd_gpu_err_mid = get_optimized_time_data(results, "fp16", "mid")

fig_mid_fp16_time = go.Figure(data=[
    go.Bar(name='Forward GPU FP16', x=gpus_fp16_time_mid, y=fp16_fwd_gpu_mid,
           error_y=dict(type='data', array=fp16_fwd_gpu_err_mid, visible=True)),
    go.Bar(name='Backward GPU FP16', x=gpus_fp16_time_mid, y=fp16_bwd_gpu_mid,
           error_y=dict(type='data', array=fp16_bwd_gpu_err_mid, visible=True))
])
fig_mid_fp16_time.update_layout(
    title="GPU Time: Forward vs Backward (Llama3.2 1B, FP16, Mid Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    barmode='group'
)
fig_mid_fp16_time.show()

# BF16 GPU Time plots - Mid Config
gpus_bf16_time_mid, bf16_fwd_cpu_mid, bf16_bwd_cpu_mid, bf16_fwd_gpu_mid, bf16_bwd_gpu_mid, bf16_fwd_cpu_err_mid, bf16_bwd_cpu_err_mid, bf16_fwd_gpu_err_mid, bf16_bwd_gpu_err_mid = get_optimized_time_data(results, "bf16", "mid")

fig_mid_bf16_time = go.Figure(data=[
    go.Bar(name='Forward GPU BF16', x=gpus_bf16_time_mid, y=bf16_fwd_gpu_mid,
           error_y=dict(type='data', array=bf16_fwd_gpu_err_mid, visible=True)),
    go.Bar(name='Backward GPU BF16', x=gpus_bf16_time_mid, y=bf16_bwd_gpu_mid,
           error_y=dict(type='data', array=bf16_bwd_gpu_err_mid, visible=True))
])
fig_mid_bf16_time.update_layout(
    title="GPU Time: Forward vs Backward (Llama3.2 1B, BF16, Mid Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    barmode='group'
)
fig_mid_bf16_time.show()



MID CONFIG TIME PLOTS


In [83]:
# LARGE CONFIG TIME PLOTS
print("\n" + "="*80)
print("LARGE CONFIG TIME PLOTS")
print("="*80)

# GPU Time Analysis for GPU Efficiency - FP32 Large Config
gpus_fp32_time_large, fp32_fwd_cpu_large, fp32_bwd_cpu_large, fp32_fwd_gpu_large, fp32_bwd_gpu_large, fp32_fwd_cpu_err_large, fp32_bwd_cpu_err_large, fp32_fwd_gpu_err_large, fp32_bwd_gpu_err_large = get_optimized_time_data(results, "fp32", "large")

fig_large_fp32_time = go.Figure(data=[
    go.Bar(name='Forward GPU FP32', x=gpus_fp32_time_large, y=fp32_fwd_gpu_large,
           error_y=dict(type='data', array=fp32_fwd_gpu_err_large, visible=True)),
    go.Bar(name='Backward GPU FP32', x=gpus_fp32_time_large, y=fp32_bwd_gpu_large,
           error_y=dict(type='data', array=fp32_bwd_gpu_err_large, visible=True))
])
fig_large_fp32_time.update_layout(
    title="GPU Time: Forward vs Backward (Llama3.2 1B, FP32, Large Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    barmode='group'
)
fig_large_fp32_time.show()

# FP16 GPU Time plots - Large Config
gpus_fp16_time_large, fp16_fwd_cpu_large, fp16_bwd_cpu_large, fp16_fwd_gpu_large, fp16_bwd_gpu_large, fp16_fwd_cpu_err_large, fp16_bwd_cpu_err_large, fp16_fwd_gpu_err_large, fp16_bwd_gpu_err_large = get_optimized_time_data(results, "fp16", "large")

fig_large_fp16_time = go.Figure(data=[
    go.Bar(name='Forward GPU FP16', x=gpus_fp16_time_large, y=fp16_fwd_gpu_large,
           error_y=dict(type='data', array=fp16_fwd_gpu_err_large, visible=True)),
    go.Bar(name='Backward GPU FP16', x=gpus_fp16_time_large, y=fp16_bwd_gpu_large,
           error_y=dict(type='data', array=fp16_bwd_gpu_err_large, visible=True))
])
fig_large_fp16_time.update_layout(
    title="GPU Time: Forward vs Backward (Llama3.2 1B, FP16, Large Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    barmode='group'
)
fig_large_fp16_time.show()

# BF16 GPU Time plots - Large Config
gpus_bf16_time_large, bf16_fwd_cpu_large, bf16_bwd_cpu_large, bf16_fwd_gpu_large, bf16_bwd_gpu_large, bf16_fwd_cpu_err_large, bf16_bwd_cpu_err_large, bf16_fwd_gpu_err_large, bf16_bwd_gpu_err_large = get_optimized_time_data(results, "bf16", "large")

fig_large_bf16_time = go.Figure(data=[
    go.Bar(name='Forward GPU BF16', x=gpus_bf16_time_large, y=bf16_fwd_gpu_large,
           error_y=dict(type='data', array=bf16_fwd_gpu_err_large, visible=True)),
    go.Bar(name='Backward GPU BF16', x=gpus_bf16_time_large, y=bf16_bwd_gpu_large,
           error_y=dict(type='data', array=bf16_bwd_gpu_err_large, visible=True))
])
fig_large_bf16_time.update_layout(
    title="GPU Time: Forward vs Backward (Llama3.2 1B, BF16, Large Config, Optimized)",
    xaxis_title="GPU",
    yaxis_title="GPU Time (ms)",
    barmode='group'
)
fig_large_bf16_time.show()


LARGE CONFIG TIME PLOTS


# Config Comparison (Small vs Mid vs Large)


In [84]:
# Compare different configs for A100 FP16
configs = ['small', 'mid', 'large']
a100_fp16_data = {}

for config in configs:
    if 'a100' in results and 'fp16' in results['a100'] and config in results['a100']['fp16']:
        data = results['a100']['fp16'][config]
        a100_fp16_data[config] = {
            'forward_tflops_per_joule': data['gflops_per_joule_forward'] / 1000,
            'backward_tflops_per_joule': data.get('gflops_per_joule_backward', 0) / 1000,
            'forward_gpu_time': data['forward_gpu_time_mean'] / 1000,
            'backward_gpu_time': data.get('backward_gpu_time_mean', 0) / 1000,
            'input_length': data['input_length']
        }

if a100_fp16_data:
    config_names = list(a100_fp16_data.keys())
    forward_tflops = [a100_fp16_data[config]['forward_tflops_per_joule'] for config in config_names]
    backward_tflops = [a100_fp16_data[config]['backward_tflops_per_joule'] for config in config_names]

    fig8 = go.Figure(data=[
        go.Bar(name='Forward', x=config_names, y=forward_tflops),
        go.Bar(name='Backward', x=config_names, y=backward_tflops)
    ])
    fig8.update_layout(
        title="A100 FP16 Efficiency by Config (Optimized)",
        xaxis_title="Config",
        yaxis_title="TFLOPs per joule",
        barmode='group'
    )
    fig8.show()


# Tables: Efficiency (TFLOPs/Joule) - Optimized Results


In [85]:
# Create efficiency comparison tables for all configs
configs = ['small', 'mid', 'large']
all_gpus = set()
for gpu in results.keys():
    all_gpus.add(gpu.upper())
all_gpus = sorted(list(all_gpus))

# Initialize dataframes for each config
efficiency_data = {}
time_data = {}

for config in configs:
    efficiency_data[config] = {}
    time_data[config] = {}

    for precision in ['fp32', 'fp16', 'bf16']:
        gpus, forward, backward, fwd_err, bwd_err = get_optimized_flops_per_joule(results, precision, config)
        gpus_time, fwd_cpu, bwd_cpu, fwd_gpu, bwd_gpu, fwd_cpu_err, bwd_cpu_err, fwd_gpu_err, bwd_gpu_err = get_optimized_time_data(results, precision, config)

        efficiency_data[config][precision] = {
            'gpus': gpus,
            'forward': forward,
            'backward': backward,
            'forward_err': fwd_err,
            'backward_err': bwd_err
        }

        time_data[config][precision] = {
            'gpus': gpus_time,
            'forward_gpu': fwd_gpu,
            'backward_gpu': bwd_gpu,
            'forward_gpu_err': fwd_gpu_err,
            'backward_gpu_err': bwd_gpu_err
        }

# Create efficiency dataframes for each config
for config in configs:
    print(f"\n{'='*60}")
    print(f"EFFICIENCY TABLE - {config.upper()} CONFIG")
    print(f"{'='*60}")

    efficiency_df_data = {'GPU': all_gpus}
    efficiency_err_df_data = {'GPU': all_gpus}

    for precision in ['fp32', 'fp16', 'bf16']:
        if precision in efficiency_data[config]:
            data = efficiency_data[config][precision]
            # Create mapping from GPU to values
            gpu_to_forward = dict(zip(data['gpus'], data['forward']))
            gpu_to_backward = dict(zip(data['gpus'], data['backward']))
            gpu_to_forward_err = dict(zip(data['gpus'], data['forward_err']))
            gpu_to_backward_err = dict(zip(data['gpus'], data['backward_err']))

            efficiency_df_data[f'{precision.upper()}_Forward'] = [gpu_to_forward.get(gpu, None) for gpu in all_gpus]
            efficiency_df_data[f'{precision.upper()}_Backward'] = [gpu_to_backward.get(gpu, None) for gpu in all_gpus]
            efficiency_err_df_data[f'{precision.upper()}_Forward'] = [gpu_to_forward_err.get(gpu, None) for gpu in all_gpus]
            efficiency_err_df_data[f'{precision.upper()}_Backward'] = [gpu_to_backward_err.get(gpu, None) for gpu in all_gpus]

    df_efficiency = pd.DataFrame(efficiency_df_data).set_index("GPU")
    df_efficiency_err = pd.DataFrame(efficiency_err_df_data).set_index("GPU")

    print(f"GPU Efficiency (TFLOPs/Joule) - {config.upper()} Config")
    print(df_efficiency)
    print(f"\nError (TFLOPs/Joule) - {config.upper()} Config")
    print(df_efficiency_err)



EFFICIENCY TABLE - SMALL CONFIG
GPU Efficiency (TFLOPs/Joule) - SMALL Config
      FP32_Forward  FP32_Backward  FP16_Forward  FP16_Backward  BF16_Forward  \
GPU                                                                            
A100      1.048364       0.038145      0.889142       0.605453      0.937989   
A40       1.530084       0.048042      1.434109       0.239654      1.509613   
H100      0.819582       0.062503      0.861138       0.935207      0.869813   
L40       1.432302       0.080693      1.614308       0.541658      1.614987   
V100      1.006462       0.034653      0.911072       0.251433      0.816114   

      BF16_Backward  
GPU                  
A100       0.402717  
A40        0.200275  
H100       0.641832  
L40        0.440734  
V100       0.028080  

Error (TFLOPs/Joule) - SMALL Config
      FP32_Forward  FP32_Backward  FP16_Forward  FP16_Backward  BF16_Forward  \
GPU                                                                            
A100      

In [86]:
# Create time dataframes for each config
for config in configs:
    print(f"\n{'='*60}")
    print(f"TIME TABLE - {config.upper()} CONFIG")
    print(f"{'='*60}")

    time_df_data = {'GPU': all_gpus}
    time_err_df_data = {'GPU': all_gpus}

    for precision in ['fp32', 'fp16', 'bf16']:
        if precision in time_data[config]:
            data = time_data[config][precision]
            # Create mapping from GPU to values
            gpu_to_forward = dict(zip(data['gpus'], data['forward_gpu']))
            gpu_to_backward = dict(zip(data['gpus'], data['backward_gpu']))
            gpu_to_forward_err = dict(zip(data['gpus'], data['forward_gpu_err']))
            gpu_to_backward_err = dict(zip(data['gpus'], data['backward_gpu_err']))

            time_df_data[f'{precision.upper()}_Forward'] = [gpu_to_forward.get(gpu, None) for gpu in all_gpus]
            time_df_data[f'{precision.upper()}_Backward'] = [gpu_to_backward.get(gpu, None) for gpu in all_gpus]
            time_err_df_data[f'{precision.upper()}_Forward'] = [gpu_to_forward_err.get(gpu, None) for gpu in all_gpus]
            time_err_df_data[f'{precision.upper()}_Backward'] = [gpu_to_backward_err.get(gpu, None) for gpu in all_gpus]

    df_time = pd.DataFrame(time_df_data).set_index("GPU")
    df_time_err = pd.DataFrame(time_err_df_data).set_index("GPU")

    print(f"GPU Time (ms) - {config.upper()} Config")
    print(df_time)
    print(f"\nError (ms) - {config.upper()} Config")
    print(df_time_err)



TIME TABLE - SMALL CONFIG
GPU Time (ms) - SMALL Config
      FP32_Forward  FP32_Backward  FP16_Forward  FP16_Backward  BF16_Forward  \
GPU                                                                            
A100     26.930957     730.823220     26.936527      80.194909     26.377835   
A40      22.504585     705.357739     22.761384     154.010789     22.180322   
H100     12.452648     254.011743     12.111224      33.269915     11.947869   
L40      15.674593     387.727695     15.730137      68.908316     15.565481   
V100     35.779011     986.596072     37.336473     152.672810     52.094278   

      BF16_Backward  
GPU                  
A100      79.443841  
A40      164.036780  
H100      34.182445  
L40       71.462827  
V100    1244.036788  

Error (ms) - SMALL Config
      FP32_Forward  FP32_Backward  FP16_Forward  FP16_Backward  BF16_Forward  \
GPU                                                                            
A100      0.212105       1.095158      0.0

In [87]:
# Save combined dataframes for each config
for config in configs:
    print(f"\n{'='*60}")
    print(f"SAVING CSV FILES - {config.upper()} CONFIG")
    print(f"{'='*60}")

    # Recreate dataframes for this config
    efficiency_df_data = {'GPU': all_gpus}
    time_df_data = {'GPU': all_gpus}

    for precision in ['fp32', 'fp16', 'bf16']:
        if precision in efficiency_data[config]:
            data = efficiency_data[config][precision]
            gpu_to_forward = dict(zip(data['gpus'], data['forward']))
            gpu_to_backward = dict(zip(data['gpus'], data['backward']))
            gpu_to_forward_err = dict(zip(data['gpus'], data['forward_err']))
            gpu_to_backward_err = dict(zip(data['gpus'], data['backward_err']))

            efficiency_df_data[f'{precision.upper()}_Forward'] = [gpu_to_forward.get(gpu, None) for gpu in all_gpus]
            efficiency_df_data[f'{precision.upper()}_Backward'] = [gpu_to_backward.get(gpu, None) for gpu in all_gpus]
            efficiency_df_data[f'{precision.upper()}_Forward_Error'] = [gpu_to_forward_err.get(gpu, None) for gpu in all_gpus]
            efficiency_df_data[f'{precision.upper()}_Backward_Error'] = [gpu_to_backward_err.get(gpu, None) for gpu in all_gpus]

    for precision in ['fp32', 'fp16', 'bf16']:
        if precision in time_data[config]:
            data = time_data[config][precision]
            gpu_to_forward = dict(zip(data['gpus'], data['forward_gpu']))
            gpu_to_backward = dict(zip(data['gpus'], data['backward_gpu']))
            gpu_to_forward_err = dict(zip(data['gpus'], data['forward_gpu_err']))
            gpu_to_backward_err = dict(zip(data['gpus'], data['backward_gpu_err']))

            time_df_data[f'{precision.upper()}_Forward'] = [gpu_to_forward.get(gpu, None) for gpu in all_gpus]
            time_df_data[f'{precision.upper()}_Backward'] = [gpu_to_backward.get(gpu, None) for gpu in all_gpus]
            time_df_data[f'{precision.upper()}_Forward_Error'] = [gpu_to_forward_err.get(gpu, None) for gpu in all_gpus]
            time_df_data[f'{precision.upper()}_Backward_Error'] = [gpu_to_backward_err.get(gpu, None) for gpu in all_gpus]

    df_efficiency_combined = pd.DataFrame(efficiency_df_data).set_index("GPU")
    df_time_combined = pd.DataFrame(time_df_data).set_index("GPU")

    # Save to CSV files with config-specific names
    efficiency_filename = f"results_optimized/gpu_efficiency_optimized_{config}.csv"
    time_filename = f"results_optimized/gpu_time_optimized_{config}.csv"

    df_efficiency_combined.to_csv(efficiency_filename)
    df_time_combined.to_csv(time_filename)

    print(f"Saved {config.upper()} config dataframes to CSV:")
    print(f"- {efficiency_filename}")
    print(f"- {time_filename}")

print(f"\n{'='*60}")
print("SUMMARY: All config tables generated!")
print(f"{'='*60}")



SAVING CSV FILES - SMALL CONFIG
Saved SMALL config dataframes to CSV:
- results_optimized/gpu_efficiency_optimized_small.csv
- results_optimized/gpu_time_optimized_small.csv

SAVING CSV FILES - MID CONFIG
Saved MID config dataframes to CSV:
- results_optimized/gpu_efficiency_optimized_mid.csv
- results_optimized/gpu_time_optimized_mid.csv

SAVING CSV FILES - LARGE CONFIG
Saved LARGE config dataframes to CSV:
- results_optimized/gpu_efficiency_optimized_large.csv
- results_optimized/gpu_time_optimized_large.csv

SUMMARY: All config tables generated!


# Cross-Config Analysis


In [88]:
# Analyze efficiency across different configs for each GPU/precision combination
config_analysis = []

for gpu in results.keys():
    for precision in results[gpu].keys():
        for config in results[gpu][precision].keys():
            data = results[gpu][precision][config]

            # Handle missing backward data (forward-only runs)
            backward_tflops = data.get('gflops_per_joule_backward', 0) / 1000
            backward_gpu_time = data.get('backward_gpu_time_mean', 0) / 1000
            backward_energy = data.get('backward_energy_sum', 0)

            config_analysis.append({
                'GPU': gpu.upper(),
                'Precision': precision.upper(),
                'Config': config,
                'Input_Length': data['input_length'],
                'Batch_Size': data['batch_size'],
                'Forward_TFLOPs_per_Joule': data['gflops_per_joule_forward'] / 1000,
                'Backward_TFLOPs_per_Joule': backward_tflops,
                'Forward_GPU_Time_ms': data['forward_gpu_time_mean'] / 1000,
                'Backward_GPU_Time_ms': backward_gpu_time,
                'Forward_Energy_J': data['forward_energy_sum'],
                'Backward_Energy_J': backward_energy
            })

df_config_analysis = pd.DataFrame(config_analysis)
df_config_analysis.to_csv("results_optimized/config_analysis_optimized.csv", index=False)

print("Cross-config analysis saved to: results_optimized/config_analysis_optimized.csv")
print("\nSample of cross-config analysis:")
print(df_config_analysis.head(10))


Cross-config analysis saved to: results_optimized/config_analysis_optimized.csv

Sample of cross-config analysis:
    GPU Precision Config  Input_Length  Batch_Size  Forward_TFLOPs_per_Joule  \
0   L40      BF16  large          1024           8                  5.248393   
1   L40      BF16    mid           512           8                  3.044461   
2   L40      BF16  small           256           8                  1.614987   
3   L40      FP16  small           256           8                  1.614308   
4   L40      FP16  large          1024           8                  6.239047   
5   L40      FP16    mid           512           8                  3.218234   
6   L40      FP32  small           256           8                  1.432302   
7   L40      FP32  large          1024           8                  5.683011   
8   L40      FP32    mid           512           8                  2.770130   
9  H100      BF16    mid           512           8                  1.645426   

   Ba