In [None]:
import torch
import os
import pandas as pd
from flopmetrics.profiler import TorchProfiler
from flopmetrics.ncu import NCUProfiler
from flopmetrics.network import ToyNetwork, run_toy_network_forward_ncu, run_toy_network_forward_backward_ncu, construct_toy_network_and_input_for_ncu

# Make sure the relative "results" directory exists up front; the sweep cell
# further down writes its comparison CSV under "results/".
os.makedirs("results", exist_ok=True)

python-dotenv could not parse statement starting at line 1


# Analytical vs PyTorch vs NCU Comparison
Single fixed configuration: n = 10 layers, d = 1024 (input/output dimension), m = 128 (sequence length).

In [3]:
def toy_network_forward_flops(dim, n_layers, n_tokens):
    """Return the FLOPs TorchProfiler attributes to one ToyNetwork forward pass.

    The network and a random CUDA input of shape ``(dim, n_tokens)`` are
    created inside the profiler session, but only the forward call itself
    runs under the ``"forward"`` record context.

    Args:
        dim: Forwarded to ``ToyNetwork(dim=...)`` and used as the first
            dimension of the random input.
        n_layers: Forwarded to ``ToyNetwork(n_layers=...)``.
        n_tokens: Second dimension of the random input.

    Returns:
        The ``"flops"`` entry of the ``"forward"`` row from
        ``get_flops_by_step()``.
    """
    with TorchProfiler() as profiler:
        model = ToyNetwork(n_layers=n_layers, dim=dim)
        inputs = torch.randn(dim, n_tokens, device="cuda")
        with profiler.record_context("forward"):
            model(inputs)

    flops_by_step = profiler.get_flops_by_step()
    return flops_by_step.loc["forward", "flops"]


def toy_network_backward_flops(dim, n_layers, n_tokens):
    """Return the FLOPs TorchProfiler attributes to one ToyNetwork backward pass.

    The forward pass runs inside the profiler session but outside any record
    context; reducing the output with ``sum()`` and calling ``backward()`` on
    it both happen under the ``"backward"`` record context.

    Args:
        dim: Forwarded to ``ToyNetwork(dim=...)`` and used as the first
            dimension of the random input.
        n_layers: Forwarded to ``ToyNetwork(n_layers=...)``.
        n_tokens: Second dimension of the random input.

    Returns:
        The ``"flops"`` entry of the ``"backward"`` row from
        ``get_flops_by_step()``.
    """
    with TorchProfiler() as profiler:
        model = ToyNetwork(n_layers=n_layers, dim=dim)
        inputs = torch.randn(dim, n_tokens, device="cuda")
        outputs = model(inputs)
        with profiler.record_context("backward"):
            # Keep the reduction inside the context so the recorded step
            # covers the same operations as before.
            scalar = outputs.sum()
            scalar.backward()

    flops_by_step = profiler.get_flops_by_step()
    return flops_by_step.loc["backward", "flops"]


def toy_network_forward_flops_ncu(dim, n_layers, n_tokens):
    """Measure toy-network forward FLOPs with the NCU profiler.

    Profiles ``run_toy_network_forward_ncu`` and, in a second profiler,
    ``construct_toy_network_and_input_for_ncu`` with the same arguments, then
    subtracts the second total from the first (presumably because the forward
    run also performs that setup work -- confirm in ``flopmetrics.network``).
    The raw results of both profiling runs are written to CSV files.

    Args:
        dim: Passed as ``"dim"`` to both profiled functions.
        n_layers: Passed as ``"n_layers"`` to both profiled functions.
        n_tokens: Passed as ``"n_tokens"`` to both profiled functions.

    Returns:
        Total FLOPs of the forward run minus total FLOPs of the setup run.
    """
    results_dir = "experiments/toy_network/results"
    # Create the output directory before profiling: the notebook only creates
    # "results", so without this a missing directory would make to_csv raise
    # after both expensive profiling runs have already completed.
    os.makedirs(results_dir, exist_ok=True)

    profile_args = {
        "dim": dim,
        "n_layers": n_layers,
        "n_tokens": n_tokens,
    }

    ncu = NCUProfiler()
    # Each call gets its own dict, as in the original code.
    ncu.profile_function(run_toy_network_forward_ncu, dict(profile_args))
    flops = ncu.get_total_flops()

    ncu2 = NCUProfiler()
    ncu2.profile_function(construct_toy_network_and_input_for_ncu, dict(profile_args))
    setup_flops = ncu2.get_total_flops()
    flops -= setup_flops

    ncu.result.to_csv(f"{results_dir}/toy_network_forward_flops_ncu.csv")
    ncu2.result.to_csv(f"{results_dir}/toy_network_setup_flops_ncu.csv")

    return flops


def toy_network_forward_backward_flops_ncu(dim, n_layers, n_tokens):
    """Measure toy-network forward+backward FLOPs with the NCU profiler.

    Profiles ``run_toy_network_forward_backward_ncu`` and, in a second
    profiler, ``construct_toy_network_and_input_for_ncu`` with the same
    arguments, then subtracts the second total from the first (presumably
    because the profiled run also performs that setup work -- confirm in
    ``flopmetrics.network``).  Only the forward+backward profiling result is
    written to CSV.

    Args:
        dim: Passed as ``"dim"`` to both profiled functions.
        n_layers: Passed as ``"n_layers"`` to both profiled functions.
        n_tokens: Passed as ``"n_tokens"`` to both profiled functions.

    Returns:
        Total FLOPs of the forward+backward run minus total FLOPs of the
        setup run.
    """
    results_dir = "experiments/toy_network/results"
    # Create the output directory before profiling: the notebook only creates
    # "results", so without this a missing directory would make to_csv raise
    # after the expensive profiling run has already completed.
    os.makedirs(results_dir, exist_ok=True)

    profile_args = {
        "dim": dim,
        "n_layers": n_layers,
        "n_tokens": n_tokens,
    }

    ncu = NCUProfiler()
    # Each call gets its own dict, as in the original code.
    ncu.profile_function(run_toy_network_forward_backward_ncu, dict(profile_args))
    flops = ncu.get_total_flops()

    ncu.result.to_csv(f"{results_dir}/toy_network_forward_backward_flops_ncu.csv")

    ncu2 = NCUProfiler()
    ncu2.profile_function(construct_toy_network_and_input_for_ncu, dict(profile_args))
    setup_flops = ncu2.get_total_flops()
    flops -= setup_flops

    return flops


def toy_network_params(dim, n_layers):
    """Count the trainable parameters of a freshly constructed ToyNetwork.

    Args:
        dim: Forwarded to ``ToyNetwork(dim=...)``.
        n_layers: Forwarded to ``ToyNetwork(n_layers=...)``.

    Returns:
        Sum of ``numel()`` over all parameters with ``requires_grad`` set.
    """
    model = ToyNetwork(n_layers=n_layers, dim=dim)
    n_trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            n_trainable += param.numel()
    return n_trainable

In [None]:
# Baseline toy-network configuration used by the single-config comparison.
N, D, M = (
    10,    # number of layers
    1024,  # size of input/output dimension
    128,   # sequence length
)

In [8]:
# Baseline comparison for the single (N, D, M) configuration: FLOPs measured
# with the torch profiler and with NCU, next to the closed-form estimates.
baseline_num_params = toy_network_params(D, N)
print(baseline_num_params)

# Measured with TorchProfiler.
baseline_forward_flops_exp = toy_network_forward_flops(D, N, M)
baseline_backward_flops_exp = toy_network_backward_flops(D, N, M)

# Measured with NCU. There is no backward-only NCU run, so the backward
# figure is the forward+backward measurement minus the forward measurement.
baseline_forward_flops_ncu = toy_network_forward_flops_ncu(D, N, M)
baseline_backward_flops_ncu = toy_network_forward_backward_flops_ncu(D, N, M) - baseline_forward_flops_ncu

# Leading-order estimates: 2*D*D*M per layer forward; 4*D*D*M per layer
# backward except for one layer that contributes only 2*D*D*M (presumably the
# first layer, which needs no input gradient -- confirm against ToyNetwork).
baseline_forward_flops_theory = N * (2 * D * D * M)
baseline_backward_flops_theory = (N - 1) * (4 * D * D * M) + 2 * D * D * M

# "Exact" estimates add the lower-order D*M terms per layer (presumably the
# element-wise operations -- confirm against ToyNetwork).
baseline_forward_flops_theory_exact = baseline_forward_flops_theory + N * (D * M)
baseline_backward_flops_theory_exact = baseline_backward_flops_theory + (N - 1) * (2 * D * M) + D * M

baseline_forward_flops_theory_exact_gpu = baseline_forward_flops_theory_exact # TODO: add actual equation
baseline_backward_flops_theory_exact_gpu = baseline_backward_flops_theory_exact # TODO: add actual equation

baseline_df = pd.DataFrame(
    {
        "forward_flops": [baseline_forward_flops_exp, baseline_forward_flops_ncu, baseline_forward_flops_theory, baseline_forward_flops_theory_exact, baseline_forward_flops_theory_exact_gpu],
        "backward_flops": [baseline_backward_flops_exp, baseline_backward_flops_ncu, baseline_backward_flops_theory, baseline_backward_flops_theory_exact, baseline_backward_flops_theory_exact_gpu],
    },
    index=["torch profiler (experimental)", "ncu profiler (experimental)", "theoretical (analyzed)", "theoretical exact (analyzed)", "theoretical exact gpu implementation (analyzed)"],
)
# Scope the thousands-separated, zero-decimal float format to this table only.
# Setting it globally with pd.set_option would persist for the rest of the
# kernel session and render every later float (e.g. sub-1% error
# percentages) as "0".
with pd.option_context('display.float_format', '{:,.0f}'.format):
    print(baseline_df)

10485760
                                                 forward_flops  backward_flops
torch profiler (experimental)                    2,684,354,560   5,100,273,664
ncu profiler (experimental)                      2,727,606,309   5,134,953,113
theoretical (analyzed)                           2,684,354,560   5,100,273,664
theoretical exact (analyzed)                     2,685,665,280   5,102,764,032
theoretical exact gpu implementation (analyzed)  2,685,665,280   5,102,764,032


# Analytical vs PyTorch Comparison
Fixed n = 32 layers and m = 128 tokens; d is swept from 128 to 2048 in steps of 128.

In [11]:
# Sweep configuration: fixed layer count and sequence length, varying D.
# NOTE: this cell rebinds the notebook-level N and M (and D, via the loop
# variable), so re-run the configuration cell before re-running any cell that
# expects the earlier single-config values.
N = 32
M = 128
D_values = list(range(128, 2049, 128))

# Single source of truth for the output location (used for writing and for
# the confirmation message below).
comparison_csv_path = "results/analytical_vs_pytorch_comparison.csv"

results = []

print("Running PyTorch vs Analytical comparison...")
print(f"Parameters: N={N}, M={M}, D_values={D_values}")

for D in D_values:
    print(f"Processing D={D}...")

    # FLOPs measured with the torch profiler.
    pytorch_forward_flops = toy_network_forward_flops(D, N, M)
    pytorch_backward_flops = toy_network_backward_flops(D, N, M)

    # Closed-form estimates including the lower-order D*M terms.
    analytical_forward_flops = N * (2 * D * D * M) + N * (D * M)
    analytical_backward_flops = (N - 1) * (4 * D * D * M) + 2 * D * D * M + (N - 1) * (2 * D * M) + D * M

    # Relative error is taken with respect to the analytical value.
    results.append({
        'D': D,
        'pytorch_forward_flops': pytorch_forward_flops,
        'pytorch_backward_flops': pytorch_backward_flops,
        'analytical_forward_flops': analytical_forward_flops,
        'analytical_backward_flops': analytical_backward_flops,
        'forward_error_pct': abs(pytorch_forward_flops - analytical_forward_flops) / analytical_forward_flops * 100,
        'backward_error_pct': abs(pytorch_backward_flops - analytical_backward_flops) / analytical_backward_flops * 100
    })

comparison_df = pd.DataFrame(results)
comparison_df.to_csv(comparison_csv_path, index=False)

print("\nComparison completed!")
print(f"Results saved to: {comparison_csv_path}")
print("\nSummary:")
# Pin two-decimal rendering locally so the percentages stay readable no matter
# what global display.float_format is active in the kernel (a zero-decimal
# format would print every sub-1% error as "0").
with pd.option_context('display.float_format', '{:.2f}'.format):
    print(comparison_df[['D', 'forward_error_pct', 'backward_error_pct']].round(2))

Running PyTorch vs Analytical comparison...
Parameters: N=32, M=128, D_values=[128, 256, 384, 512, 640, 768, 896, 1024, 1152, 1280, 1408, 1536, 1664, 1792, 1920, 2048]
Processing D=128...
Processing D=256...
Processing D=384...
Processing D=512...
Processing D=640...
Processing D=768...
Processing D=896...
Processing D=1024...
Processing D=1152...
Processing D=1280...
Processing D=1408...
Processing D=1536...
Processing D=1664...
Processing D=1792...
Processing D=1920...
Processing D=2048...

Comparison completed!
Results saved to: results/analytical_vs_pytorch_comparison.csv

Summary:
       D  forward_error_pct  backward_error_pct
0    128               0.39                0.39
1    256               0.19                0.19
2    384               0.13                0.13
3    512               0.10                0.10
4    640               0.08                0.08
5    768               0.07                0.07
6    896               0.06                0.06
7   1024               