In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.ticker import LogLocator

In [None]:
# GPT-2 benchmark table. Each list is aligned by position with
# "Sequence Length"; the remaining columns are the timings that the plotting
# cell draws on its "Median Execution Time (ms)" axis.
gpt2 = dict(
    [
        ("Sequence Length", [13, 64, 200, 371, 732]),
        # Reference implementations.
        ("PyTorch", [3.75, 2.8, 3.68, 4.0, 8.62]),
        ("C++", [4865.0, 23818.0, 74133.0, 132801.0, 269317.0]),
        # CUDA implementations, in version order.
        ("CUDA v1 (Naive)", [10.85, 15.4, 33.98, 64.97, 117.63]),
        ("CUDA v2-1 (GEMM)", [9.07, 11.63, 27.83, 52.51, 95.24]),
        ("CUDA v2-2 (GEMM/TF32)", [12.63, 14.39, 23.91, 41.45, 67.67]),
        ("CUDA v2-3 (Softmax)", [12.57, 13.1, 19.09, 29.84, 43.94]),
        ("CUDA v2-4 (LayerNorm)", [11.15, 11.71, 18.44, 28.36, 42.9]),
        ("CUDA v2-5 (Transpose)", [11.94, 11.61, 18.67, 29.27, 43.48]),
        ("CUDA v3 (Flash Attention)", [8.16, 8.27, 15.9, 25.05, 38.92]),
        ("CUDA v4 (Memory Caching)", [7.76, 7.68, 9.0, 13.64, 23.59]),
    ]
)

# One row per sequence length, one column per implementation.
df = pd.DataFrame(data=gpt2)

In [None]:
# Line styling per series, listed in draw order (which is also the legend
# order). Each entry is (column, colour, linestyle, marker); the column name
# doubles as the legend label. The 8-digit "#ff9c00xx" colours include an alpha
# byte, so the CUDA v2-x variants go from translucent to fully opaque orange.
series_styles = [
    ("C++", "C2", (0, (1, 5)), "^"),
    ("PyTorch", "C0", "--", "s"),
    ("CUDA v1 (Naive)", "#ffed00", "-", "o"),
    ("CUDA v2-1 (GEMM)", "#ff9c0030", "-", "o"),
    ("CUDA v2-2 (GEMM/TF32)", "#ff9c0040", "-", "o"),
    ("CUDA v2-3 (Softmax)", "#ff9c0060", "-", "o"),
    ("CUDA v2-4 (LayerNorm)", "#ff9c0080", "-", "o"),
    ("CUDA v2-5 (Transpose)", "#ff9c00", "-", "o"),
    ("CUDA v3 (Flash Attention)", "#ff6300", "-", "o"),
    ("CUDA v4 (Memory Caching)", "#ff0000", "-", "o"),
]

# Series whose data points get a value label, paired with the cast applied to
# each value before it is formatted (C++ is shown as a whole number).
labelled_series = [
    ("C++", int),
    ("PyTorch", float),
    ("CUDA v4 (Memory Caching)", float),
]

fig, ax = plt.subplots(figsize=(12, 7), dpi=300)
seq_lengths = df["Sequence Length"]

# Draw one line per implementation.
for column, colour, dashes, marker in series_styles:
    ax.plot(
        seq_lengths,
        df[column],
        label=column,
        color=colour,
        linestyle=dashes,
        marker=marker,
        markersize=6,
    )

# Write the value 5 points above each marker of the labelled series.
for column, cast in labelled_series:
    for x_val, y_val in zip(seq_lengths, df[column]):
        ax.annotate(
            f"{cast(y_val)}",
            (x_val, y_val),
            textcoords="offset points",
            xytext=(0, 5),
            ha="center",
            fontsize=8,
            color="black",
        )

# Log-log axes with a fixed x-range and a faint grid on major and minor ticks.
ax.set_xlim(9, 1200)
ax.set_xscale("log")
ax.set_yscale("log")
ax.xaxis.set_major_locator(LogLocator(base=10.0, numticks=10))
ax.grid(True, which="both", ls="-", alpha=0.2)

# Title, axis labels and legend.
ax.set_title("[GPT2] Performance Comparison: PyTorch vs C++ vs CUDA")
ax.set_xlabel("Sequence Length")
ax.set_ylabel("Median Execution Time (ms)")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# LLaMA-3-8B benchmark table. Each list is aligned by position with
# "Sequence Length"; the remaining columns are the timings that the plotting
# cell draws on its "Median Execution Time (ms)" axis.
llama3_8b = dict(
    [
        ("Sequence Length", [14, 63, 192, 352, 713]),
        # Reference implementations.
        ("PyTorch", [56.39, 62.63, 86.02, 148.3, 251.48]),
        ("C++", [403337, 1_801_180.0, 5_547_350.0, 10_833_000.0, 20_974_100.0]),
        # CUDA implementations, in version order.
        ("CUDA v1 (Naive)", [226, 548, 1902, 3162, 6686.5]),
        ("CUDA v2-1 (GEMM)", [296, 529, 1738, 2774, 5935.5]),
        ("CUDA v2-2 (GEMM/TF32)", [193, 245, 493.5, 844.5, 2000]),
        ("CUDA v2-3 (Softmax)", [192, 235, 460, 675, 1781]),
        ("CUDA v2-4 (RMSNorm)", [175, 223, 451.5, 666.5, 1669]),
        ("CUDA v2-5 (Transpose)", [176.5, 223, 451, 666, 1705]),
        ("CUDA v3 (Flash Attention)", [147.5, 192.5, 419.5, 634.5, 1679.5]),
        ("CUDA v4 (Memory Caching)", [140, 172, 366, 571.5, 1191.5]),
    ]
)

# One row per sequence length, one column per implementation.
# NOTE: this rebinds `df`, so the plotting cell that follows reads this table.
df = pd.DataFrame(data=llama3_8b)

In [None]:
# Line styling per series, listed in draw order (which is also the legend
# order). Each entry is (column, colour, linestyle, marker); the column name
# doubles as the legend label. The 8-digit "#ff9c00xx" colours include an alpha
# byte, so the CUDA v2-x variants go from translucent to fully opaque orange.
series_styles = [
    ("C++", "C2", (0, (1, 5)), "^"),
    ("PyTorch", "C0", "--", "s"),
    ("CUDA v1 (Naive)", "#ffed00", "-", "o"),
    ("CUDA v2-1 (GEMM)", "#ff9c0030", "-", "o"),
    ("CUDA v2-2 (GEMM/TF32)", "#ff9c0040", "-", "o"),
    ("CUDA v2-3 (Softmax)", "#ff9c0060", "-", "o"),
    ("CUDA v2-4 (RMSNorm)", "#ff9c0080", "-", "o"),
    ("CUDA v2-5 (Transpose)", "#ff9c00", "-", "o"),
    ("CUDA v3 (Flash Attention)", "#ff6300", "-", "o"),
    ("CUDA v4 (Memory Caching)", "#ff0000", "-", "o"),
]

# Series whose data points get a value label, paired with the cast applied to
# each value before it is formatted (C++ is shown as a whole number).
labelled_series = [
    ("C++", int),
    ("PyTorch", float),
    ("CUDA v4 (Memory Caching)", float),
]

fig, ax = plt.subplots(figsize=(12, 7), dpi=300)
seq_lengths = df["Sequence Length"]

# Draw one line per implementation.
for column, colour, dashes, marker in series_styles:
    ax.plot(
        seq_lengths,
        df[column],
        label=column,
        color=colour,
        linestyle=dashes,
        marker=marker,
        markersize=6,
    )

# Write the value 5 points above each marker of the labelled series.
for column, cast in labelled_series:
    for x_val, y_val in zip(seq_lengths, df[column]):
        ax.annotate(
            f"{cast(y_val)}",
            (x_val, y_val),
            textcoords="offset points",
            xytext=(0, 5),
            ha="center",
            fontsize=8,
            color="black",
        )

# Log-log axes with a fixed x-range and a faint grid on major and minor ticks.
ax.set_xlim(9, 1200)
ax.set_xscale("log")
ax.set_yscale("log")
ax.xaxis.set_major_locator(LogLocator(base=10.0, numticks=10))
ax.grid(True, which="both", ls="-", alpha=0.2)

# Title, axis labels and legend.
ax.set_title("[LLaMA-3-8B] Performance Comparison: PyTorch vs C++ vs CUDA")
ax.set_xlabel("Sequence Length")
ax.set_ylabel("Median Execution Time (ms)")
ax.legend()
fig.tight_layout()
plt.show()
