## Setup environment

In [None]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import seaborn.objects as so

In [None]:
# Allow up to 100 characters per cell before pandas truncates displayed values
pd.options.display.max_colwidth = 100

In [None]:
# Set seaborn's global plotting defaults (scaling context, grid style, colour
# palette and font) for the figures created in this notebook
sns.set_theme(
  context="paper",
  style="whitegrid",
  palette="colorblind",
  font="Source Sans Pro",
)

In [None]:
# IPython magic: have the inline backend emit figures in "retina" format
%config InlineBackend.figure_format = "retina"

## Load data

In [None]:
def read_run(file, variant):
  """Load one tab-separated coverage table and tidy it up.

  The column labels are stripped of surrounding whitespace, the three
  trailing summary rows are discarded, the rows are sorted by `Name` (with a
  fresh 0-based index), and every name that occurs more than once is removed
  entirely (e.g. macro-generated code that reuses a variable name).
  All columns of the input are kept.

  Parameters:
    file: path or file-like object accepted by `pd.read_table`.
    variant: label of the run; accepted for call-site readability but not
      used by this function.

  Returns:
    The cleaned `pd.DataFrame`.
  """
  table = pd.read_table(file)
  table.columns = table.columns.str.strip()
  return (
    table
    # The last three rows are summaries, not variables
    .iloc[:-3]
    .sort_values("Name", ignore_index=True)
    # `keep=False` drops *all* occurrences of an ambiguous name
    .drop_duplicates("Name", keep=False)
  )

# Load one coverage table per compiler version / optimisation level.
# The second argument is a human-readable label only; `read_run` does not use
# it. "KE" in the labels presumably abbreviates "knowledge extension" (see the
# experiment headings further down) — the `-efb` files hold those runs.
o0_15_efb_df = read_run("O0-15-mem2reg/git-rsb-efb.tsv", "Clang 15, O0 + mem2reg")
o1_12_df = read_run("O1-12/git-rsb.tsv", "Clang 12, O1")
o1_13_df = read_run("O1-13/git-rsb.tsv", "Clang 13, O1")
o1_14_df = read_run("O1-14/git-rsb.tsv", "Clang 14, O1")
o1_15_df = read_run("O1-15/git-rsb.tsv", "Clang 15, O1")
o1_15_efb_df = read_run("O1-15/git-rsb-efb.tsv", "Clang 15, O1 + KE")
o2_15_df = read_run("O2-15/git-rsb.tsv", "Clang 15, O2")
o2_15_efb_df = read_run("O2-15/git-rsb-efb.tsv", "Clang 15, O2 + KE")
o3_15_df = read_run("O3-15/git-rsb.tsv", "Clang 15, O3")
o3_15_efb_df = read_run("O3-15/git-rsb-efb.tsv", "Clang 15, O3 + KE")

# Names present in every single run; all data frames are restricted to this
# set further down so that the runs are comparable name by name
common_names = set.intersection(*(
  set(run_df["Name"])
  for run_df in (
    o0_15_efb_df,
    o1_12_df,
    o1_13_df,
    o1_14_df,
    o1_15_df,
    o1_15_efb_df,
    o2_15_df,
    o2_15_efb_df,
    o3_15_df,
    o3_15_efb_df,
  )
))
print(f"Common names: {len(common_names)}")
def common_only(df, variant):
  """Return the rows of `df` whose `Name` is in the global `common_names`.

  Also prints how many rows were removed, using `variant` as the label of the
  run in the message.
  """
  shared = df[df["Name"].isin(common_names)]
  diff = len(df) - len(shared)
  print(f"Dropped {diff} unique names from {variant}")
  return shared
# Reduce every run to the names shared by all runs (prints the number of rows
# dropped per run)
o0_15_efb_df = common_only(o0_15_efb_df, "Clang 15, O0 + mem2reg")
o1_12_df = common_only(o1_12_df, "Clang 12, O1")
o1_13_df = common_only(o1_13_df, "Clang 13, O1")
o1_14_df = common_only(o1_14_df, "Clang 14, O1")
o1_15_df = common_only(o1_15_df, "Clang 15, O1")
o1_15_efb_df = common_only(o1_15_efb_df, "Clang 15, O1 + KE")
o2_15_df = common_only(o2_15_df, "Clang 15, O2")
o2_15_efb_df = common_only(o2_15_efb_df, "Clang 15, O2 + KE")
o3_15_df = common_only(o3_15_df, "Clang 15, O3")
o3_15_efb_df = common_only(o3_15_efb_df, "Clang 15, O3 + KE")

# Order is important here: some data transformations rely on `first` to access
# the baseline, `diff` to access KE vs. not, etc.
# Re-check all transformations when changing the order.
# The result has a two-level index: `Variant` (the key given below) and `Row`
# (the index each run's frame had before concatenation).
compilations_df = pd.concat([
  o0_15_efb_df,
  o1_12_df,
  o1_13_df,
  o1_14_df,
  o1_15_df,
  o1_15_efb_df,
  o2_15_df,
  o2_15_efb_df,
  o3_15_df,
  o3_15_efb_df,
], keys=[
  "Clang 15, O0 + mem2reg",
  "Clang 12, O1",
  "Clang 13, O1",
  "Clang 14, O1",
  "Clang 15, O1",
  "Clang 15, O1 + KE",
  "Clang 15, O2",
  "Clang 15, O2 + KE",
  "Clang 15, O3",
  "Clang 15, O3 + KE",
], names=[
  "Variant",
  "Row",
])

In [None]:
def normalise(df):
  """Add derived coverage columns to `df` in place (returns nothing).

  Rows are grouped by `Name`; the row order inside each group matters,
  because the first row of a group is treated as the baseline and `diff` /
  `shift` compare each row with the one before it in the same group.

  Columns added:
    Max Scope (L):    largest `Scope (L)` of the name across all rows
    CL / MSL:         `Cov (L)` divided by `Max Scope (L)`
    Baseline Cov (L): first `Cov (L)` of the name
    CL / BCL:         `Cov (L)` divided by `Baseline Cov (L)`
    Cov Diff (L):     change of `Cov (L)` versus the previous row of the name
    CL / BCL Diff:    change of `CL / BCL` versus the previous row of the name
    CL / BCL Prev:    `CL / BCL` of the previous row of the name
  """
  def per_name(column):
    # Group anew on every use so that columns added along the way are visible
    return df.groupby("Name")[column]

  # Line tables may differ between runs, so scope sizes can differ too; use
  # the largest one seen for a name as the common denominator
  df["Max Scope (L)"] = per_name("Scope (L)").transform("max")
  df["CL / MSL"] = df["Cov (L)"].div(df["Max Scope (L)"])

  # Ratio against the baseline, i.e. the first row of each name
  df["Baseline Cov (L)"] = per_name("Cov (L)").transform("first")
  with np.errstate(all="ignore"):
    df["CL / BCL"] = df["Cov (L)"].div(df["Baseline Cov (L)"])

  # Row-to-row changes, used by the plots that show differences between runs
  df["Cov Diff (L)"] = per_name("Cov (L)").diff()
  df["CL / BCL Diff"] = per_name("CL / BCL").diff()
  df["CL / BCL Prev"] = per_name("CL / BCL").shift()

# Adds the derived columns to `compilations_df` in place
normalise(compilations_df)

# compilations_df.to_csv("normalised.tsv", sep="\t")

## Playground

In [None]:
# df = compilations_df.copy()
# variants = df.index.get_level_values("Variant")

# df[df["Name"].str.contains("write_graph_chunk_data, data")]
# df[variants.str.fullmatch("Clang 15, O1")].nlargest(50, "CL / BCL")
# o1_df = df[variants.str.fullmatch("Clang 15, O1")]
# o1_df[o1_df["CL / BCL"] > 1].shape[0] / o1_df.shape[0]

# o1_df[o1_df["CB / SB"] == 1][o1_df["CL / BCL"] < 1]

## Coverage by compiler version

In [None]:
df = compilations_df.copy()
# Rank the rows of each variant by descending `CL / BCL` (0 = highest), so
# each variant's curve is sorted independently of the others
df["Order"] = df.sort_values(by="CL / BCL", ascending=False).groupby("Variant").cumcount()
variants = df.index.get_level_values("Variant")
# Keep variants whose label matches the regex `O[01]` (O0 or O1) and does not
# contain "KE"
df = df[variants.str.contains("O[01]") & ~(variants.str.contains("KE"))]
g = sns.relplot(
  df,
  x="Order",
  y="CL / BCL",
  hue="Variant",
  kind="line",
)
# Place the legend inside the plot area, without a title
sns.move_legend(
  g,
  "center left",
  bbox_to_anchor=(0.125, 0.60),
  frameon=True,
  shadow=True,
  title=None,
)
g.set(
  title="Variable value source line coverage (Git)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, df["Order"].max()),
  ylabel="Covered source lines normalised to defined region",
  ybound=(0, 1.002),
)

## Experiment: Absolute coverage, sorted independently, source lines

In [None]:
df = compilations_df.copy()
# Order each variant by coverage independently
df["Order"] = df.sort_values(by="Cov (L)", ascending=False).groupby("Variant").cumcount()
variants = df.index.get_level_values("Variant")
# Keep the Clang 15 variants at O0 / O1 (regex `O[01]`) whose label does not
# contain "KE"
df = df[variants.str.contains("Clang 15") & variants.str.contains("O[01]") & ~(variants.str.contains("KE"))]
g = sns.relplot(
  df,
  x="Order",
  y="Cov (L)",
  hue="Variant",
  kind="line",
)
# Place the legend inside the plot area, without a title
sns.move_legend(
  g,
  "upper right",
  bbox_to_anchor=(0.70, 0.70),
  frameon=True,
  shadow=True,
  title=None,
)
g.set(
  title="Variable value source line coverage (Git)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, df["Order"].max()),
  ylabel="Covered source lines",
  ybound=(0, None),
)

## Experiment: Absolute coverage, sorted consistently, instruction bytes

In [None]:
df = compilations_df.copy()
# Order each variant by coverage independently
df["Order"] = df.sort_values(by="Cov (B)", ascending=False).groupby("Variant").cumcount()
# Copy baseline order to other variants
# (`first` takes, per name, the rank from the first row of that name, i.e.
# from the variant that comes first in `compilations_df`)
df["Order"] = df.groupby("Name")["Order"].transform("first")
variants = df.index.get_level_values("Variant")
# Keep the Clang 15 variants at O0 / O1 (regex `O[01]`) whose label does not
# contain "KE"
df = df[variants.str.contains("Clang 15") & variants.str.contains("O[01]") & ~(variants.str.contains("KE"))]
g = sns.relplot(
  df,
  x="Order",
  y="Cov (B)",
  hue="Variant",
  kind="line",
)
ax = g.facet_axis(0, 0)
# Setting `zorder` > 2 ensures the sorted line is drawn on top
# (assumes `ax.lines[0]` is the line of the variant that defined the order —
# TODO confirm)
ax.lines[0].zorder = 3
sns.move_legend(
  g,
  "upper right",
  bbox_to_anchor=(0.70, 0.70),
  frameon=True,
  shadow=True,
  title=None,
)
g.set(
  title="Variable value instruction bytes coverage (Git)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, df["Order"].max()),
  ylabel="Covered instruction bytes",
  ybound=(0, None),
)

## Experiment: Absolute coverage, sorted consistently, source lines

In [None]:
df = compilations_df.copy()
# Order each variant by coverage independently
df["Order"] = df.sort_values(by="Cov (L)", ascending=False).groupby("Variant").cumcount()
# Copy baseline order to other variants
# (`first` takes, per name, the rank from the first row of that name, i.e.
# from the variant that comes first in `compilations_df`)
df["Order"] = df.groupby("Name")["Order"].transform("first")
variants = df.index.get_level_values("Variant")
# Keep the Clang 15 variants at O0 / O1 (regex `O[01]`) whose label does not
# contain "KE"
df = df[variants.str.contains("Clang 15") & variants.str.contains("O[01]") & ~(variants.str.contains("KE"))]
g = sns.relplot(
  df,
  x="Order",
  y="Cov (L)",
  hue="Variant",
  kind="line",
)
ax = g.facet_axis(0, 0)
# Setting `zorder` > 2 ensures the sorted line is drawn on top
# (assumes `ax.lines[0]` is the line of the variant that defined the order —
# TODO confirm)
ax.lines[0].zorder = 3
sns.move_legend(
  g,
  "upper right",
  bbox_to_anchor=(0.70, 0.70),
  frameon=True,
  shadow=True,
  title=None,
)
g.set(
  title="Variable value source line coverage (Git)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, df["Order"].max()),
  ylabel="Covered source lines",
  ybound=(0, None),
)

## Coverage by optimisation level

In [None]:
df = compilations_df.copy()
# Rank the rows of each variant by descending `CL / BCL` (0 = highest), so
# each variant's curve is sorted independently of the others
df["Order"] = df.sort_values(by="CL / BCL", ascending=False).groupby("Variant").cumcount()
variants = df.index.get_level_values("Variant")
# Keep all Clang 15 variants whose label does not contain "KE"
df = df[variants.str.contains("Clang 15") & ~(variants.str.contains("KE"))]
g = sns.relplot(
  df,
  x="Order",
  y="CL / BCL",
  hue="Variant",
  kind="line",
)
# Place the legend inside the plot area, without a title
sns.move_legend(
  g,
  "center left",
  bbox_to_anchor=(0.125, 0.65),
  frameon=True,
  shadow=True,
  title=None,
)
g.set(
  title="Variable value source line coverage (Git)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, df["Order"].max()),
  ylabel="Covered source lines normalised to defined region",
  ybound=(0, 1.002),
)

## Experiment: Coverage with knowledge extension, sorted independently

In [None]:
df = compilations_df.copy()
# Rank the rows of each variant by descending `CL / BCL` (0 = highest), so
# each variant's curve is sorted independently of the others
df["Order"] = df.sort_values(by="CL / BCL", ascending=False).groupby("Variant").cumcount()
variants = df.index.get_level_values("Variant")
# Keep every Clang 15 variant, with and without "KE"
df = df[variants.str.contains("Clang 15")]
g = sns.relplot(
  df,
  x="Order",
  y="CL / BCL",
  hue="Variant",
  kind="line",
)
# Place the legend inside the plot area, without a title
sns.move_legend(
  g,
  "center left",
  bbox_to_anchor=(0.125, 0.65),
  frameon=True,
  shadow=True,
  title=None,
)
g.set(
  title="Variable value source line coverage (Git)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, df["Order"].max()),
  ylabel="Covered source lines normalised to defined region",
  ybound=(0, 1.002),
)

## Experiment: Coverage difference with knowledge extension, absolute, O1

In [None]:
df = compilations_df.copy()
variants = df.index.get_level_values("Variant")
# Keep only the variant(s) whose label contains both "Clang 15, O1" and "KE".
# `Cov Diff (L)` was computed per name against the preceding row in
# `compilations_df`, so it depends on the concatenation order of the runs.
df = df[variants.str.contains("Clang 15, O1") & variants.str.contains("KE")]
g = sns.relplot(
  df,
  x="Row",
  y="Cov Diff (L)",
  hue="Variant",
  kind="scatter",
)
sns.move_legend(
  g,
  "upper right",
  bbox_to_anchor=(0.77, 0.95),
  frameon=True,
  shadow=True,
  title=None,
)
# NOTE(review): the lower y bound of 0 leaves any negative differences outside
# the visible range — confirm this is intended
g.set(
  title="Variable value source line coverage (Git)",
  xlabel="Variable index",
  xbound=(0, None),
  ylabel="Coverage difference with knowledge extension (source lines)",
  ybound=(0, df["Cov Diff (L)"].max()),
)

## Experiment: Coverage difference with knowledge extension, ratios, O1

In [None]:
df = compilations_df.copy()
variants = df.index.get_level_values("Variant")
# Keep only the variant(s) whose label contains both "Clang 15, O1" and "KE"
df = df[variants.str.contains("Clang 15, O1") & variants.str.contains("KE")]

# Colour scale for the hue variable: values are mapped from [0, 1] (clipped)
# onto a red → yellow → green colour map
norm = mpl.colors.Normalize(0, 1, clip=True)
palette = sns.color_palette("blend:red,yellow,green", as_cmap=True)

# Each point is coloured by `CL / BCL Prev`, the ratio of the preceding row of
# the same name in `compilations_df`
g = sns.relplot(
  df,
  x="Row",
  y="CL / BCL Diff",
  hue="CL / BCL Prev",
  hue_norm=norm,
  palette=palette,
  kind="scatter",
  legend=None,
)
# NOTE(review): the y range 0..1 leaves any negative differences outside the
# visible range — confirm this is intended
g.set(
  title="Variable value source line coverage (Git)",
  xlabel="Variable index",
  xbound=(0, None),
  ylabel="Coverage ratio difference with knowledge extension",
  ybound=(0, 1),
)

# The legend is disabled above; add a colour bar built from the same norm and
# colour map instead
ax = g.facet_axis(0, 0)
plt.colorbar(
  mappable=mpl.cm.ScalarMappable(norm=norm, cmap=palette),
  ax=ax,
  label="Coverage ratio without knowledge extension",
)

## Experiment: Coverage difference with knowledge extension, O1, distribution

In [None]:
df = compilations_df.copy()
variants = df.index.get_level_values("Variant")
# Keep only the variant(s) whose label contains both "Clang 15, O1" and "KE"
df = df[variants.str.contains("Clang 15, O1") & variants.str.contains("KE")]
# Histogram of the per-variable coverage difference, one bar per integer value
g = sns.displot(
  df,
  x="Cov Diff (L)",
  hue="Variant",
  multiple="dodge",
  discrete=True,
)
sns.move_legend(
  g,
  "upper right",
  bbox_to_anchor=(0.75, 0.95),
  frameon=True,
  shadow=True,
  title=None,
)
# NOTE(review): the x range starts at 0 and the y range is fixed to 0..500, so
# negative differences and taller bars fall outside the visible range
g.set(
  title="Variable value source line coverage (Git)",
  xlabel="Coverage difference with knowledge extension (source lines)",
  xbound=(0, df["Cov Diff (L)"].max()),
  ylabel="Variables",
  ybound=(0, 500),
)

## Experiment: Coverage with knowledge extension, O1, sorted consistently

In [None]:
variants = compilations_df.index.get_level_values("Variant")
# Keep the Clang 15 variants at O1 (with and without "KE"). Copy *after*
# filtering: the `Order` column is assigned on the filtered frame below, and
# writing to an un-copied selection can trigger pandas' SettingWithCopyWarning.
df = compilations_df[variants.str.contains("Clang 15") & variants.str.contains("O1")].copy()

# Create figure with multiple axes
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
[ax1, ax2] = axs
fig.suptitle("Variable value source line coverage with knowledge extension (Git)")

# Left plot: rank each variant independently, then give every name the rank it
# has in its first row (the variant that comes first in `compilations_df`)
df["Order"] = df.sort_values(by="CL / BCL", ascending=False).groupby("Variant").cumcount()
df["Order"] = df.groupby("Name")["Order"].transform("first")
sns.lineplot(
  df,
  x="Order",
  y="CL / BCL",
  hue="Variant",
  ax=ax1,
)
# Rebuild the legend without a title
ax1.legend(title=None)
ax1.set_xlabel("Variable index (sorted by coverage)")
ax1.set_xbound(0, df["Order"].max())
ax1.set_ylabel("Covered source lines normalised to defined region")
ax1.set_ybound(0, 1.002)
# Setting `zorder` > 2 ensures the sorted line is drawn on top
# (assumes `lines[0]` belongs to the variant that defined the order — TODO
# confirm)
ax1.lines[0].zorder = 3

# Right plot: same, but every name gets the rank it has in its last row
df["Order"] = df.sort_values(by="CL / BCL", ascending=False).groupby("Variant").cumcount()
df["Order"] = df.groupby("Name")["Order"].transform("last")
sns.lineplot(
  df,
  x="Order",
  y="CL / BCL",
  hue="Variant",
  ax=ax2,
)
# Rebuild the legend without a title
ax2.legend(title=None)
ax2.set_xlabel("Variable index (sorted by coverage)")
ax2.set_xbound(0, df["Order"].max())
ax2.set_ylabel("Covered source lines normalised to defined region")
ax2.set_ybound(0, 1.002)
# Setting `zorder` > 2 ensures the sorted line is drawn on top
# (assumes `lines[1]` belongs to the variant that defined the order — TODO
# confirm)
ax2.lines[1].zorder = 3

## Coverage ratio comparison

In [None]:
# Select the baseline variant and copy the selection, so that the helper
# columns added below are written to an independent frame (assigning to an
# un-copied selection can trigger pandas' SettingWithCopyWarning)
df = compilations_df.loc["Clang 15, O0 + mem2reg"].copy()
# Revive `Variant` column to assist `melt` below
df["Variant"] = "Clang 15, O0 + mem2reg"
df["Defined source lines (our approach)"] = df["CL / SL"]
# The scope-based metric is drawn as a constant reference line at 1.0
df["Scope source lines (other tools)"] = 1.0
# Long format: one row per (name, metric)
df = df.melt(
  id_vars=["Name", "Variant"],
  value_vars=["Defined source lines (our approach)", "Scope source lines (other tools)"],
  var_name="Cov Type",
  value_name="Cov Value",
)
# Rank each metric's values independently, highest first
df["Order"] = df.sort_values(by="Cov Value", ascending=False).groupby("Cov Type").cumcount()
g = sns.relplot(
  df,
  x="Order",
  y="Cov Value",
  hue="Cov Type",
  kind="line",
)
sns.move_legend(
  g,
  "center left",
  bbox_to_anchor=(0.125, 0.75),
  frameon=True,
  shadow=True,
  title=None,
)
ax = g.facet_axis(0, 0)
# Shade the area between the two curves.
# NOTE(review): assumes the lines are drawn in order of appearance of
# `Cov Type` in the melted frame, i.e. "Defined ..." first and "Scope ..."
# second — the shaded region is the same either way.
x = ax.lines[0].get_xdata()
defined_y = ax.lines[0].get_ydata()
scope_y = ax.lines[1].get_ydata()
ax.fill_between(x, defined_y, scope_y, color="gray", alpha=0.5)
ax.annotate(
  "Unachievable\ncoverage",
  xy=(0.85, 0.9),
  xycoords="axes fraction",
  horizontalalignment="center",
  bbox=dict(boxstyle="round", facecolor="m"),
)
g.set(
  title="Coverage metric comparison (Git, Clang 15)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, df["Order"].max()),
  ylabel="Coverage of scope source lines",
  ybound=(0, 1.002),
)

## Experiment: Both metrics, ratios, O0, sorted independently

In [None]:
# Select the baseline variant and copy the selection, so that the helper
# columns added below are written to an independent frame (assigning to an
# un-copied selection can trigger pandas' SettingWithCopyWarning)
df = compilations_df.loc["Clang 15, O0 + mem2reg"].copy()
# Revive `Variant` column to assist `melt` below
df["Variant"] = "Clang 15, O0 + mem2reg"
df["Covered / defined source lines (our approach)"] = df["CL / BCL"]
df["Covered / scope instr. bytes (other tools)"] = df["CB / SB"]
# Long format: one row per (name, metric)
df = df.melt(
  id_vars=["Name", "Variant"],
  value_vars=[
    "Covered / defined source lines (our approach)",
    "Covered / scope instr. bytes (other tools)",
  ],
  var_name="Cov Type",
  value_name="Cov Value",
)
# Rank each metric's values independently, highest first
df["Order"] = df.sort_values(by="Cov Value", ascending=False).groupby("Cov Type").cumcount()
g = sns.relplot(
  df,
  x="Order",
  y="Cov Value",
  hue="Cov Type",
  kind="line",
)
sns.move_legend(
  g,
  "center left",
  bbox_to_anchor=(0.125, 0.5),
  frameon=True,
  shadow=True,
  title=None,
)
g.set(
  title="Coverage metric comparison (Git, Clang 15, O0)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, df["Order"].max()),
  ylabel="Coverage ratio (multiple metrics)",
  ybound=(0, 1.002),
)

## Experiment: Both metrics, ratios, O1, sorted independently

In [None]:
# Select the O1 variant and copy the selection, so that the helper columns
# added below are written to an independent frame (assigning to an un-copied
# selection can trigger pandas' SettingWithCopyWarning)
df = compilations_df.loc["Clang 15, O1"].copy()
# Revive `Variant` column to assist `melt` below
df["Variant"] = "Clang 15, O1"
df["Covered / defined source lines (our approach)"] = df["CL / BCL"]
df["Covered / scope instr. bytes (other tools)"] = df["CB / SB"]
# Long format: one row per (name, metric)
df = df.melt(
  id_vars=["Name", "Variant"],
  value_vars=[
    "Covered / defined source lines (our approach)",
    "Covered / scope instr. bytes (other tools)",
  ],
  var_name="Cov Type",
  value_name="Cov Value",
)
# Rank each metric's values independently, highest first
df["Order"] = df.sort_values(by="Cov Value", ascending=False).groupby("Cov Type").cumcount()
g = sns.relplot(
  df,
  x="Order",
  y="Cov Value",
  hue="Cov Type",
  kind="line",
)
sns.move_legend(
  g,
  "center left",
  bbox_to_anchor=(0.125, 0.2),
  frameon=True,
  shadow=True,
  title=None,
)
g.set(
  title="Coverage metric comparison (Git, Clang 15, O1)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, df["Order"].max()),
  ylabel="Coverage ratio (multiple metrics)",
  ybound=(0, 1.002),
)

## Experiment: Both metrics, absolute, O1, sorted independently

In [None]:
# Select the O1 variant and copy the selection, so that the helper columns
# added below are written to an independent frame (assigning to an un-copied
# selection can trigger pandas' SettingWithCopyWarning)
df = compilations_df.loc["Clang 15, O1"].copy()
# Revive `Variant` column to assist `melt` below
df["Variant"] = "Clang 15, O1"
df["Covered source lines (our approach)"] = df["Cov (L)"]
df["Covered instr. bytes (other tools)"] = df["Cov (B)"]
# Long format: one row per (name, metric)
df = df.melt(
  id_vars=["Name", "Variant"],
  value_vars=[
    "Covered source lines (our approach)",
    "Covered instr. bytes (other tools)",
  ],
  var_name="Cov Type",
  value_name="Cov Value",
)
# Rank each metric's values independently, highest first
df["Order"] = df.sort_values(by="Cov Value", ascending=False).groupby("Cov Type").cumcount()
# The two metrics are in different units, so each gets its own y axis on a
# shared x axis
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
sns.lineplot(
  df[df["Cov Type"] == "Covered source lines (our approach)"],
  x="Order",
  y="Cov Value",
  color=sns.color_palette()[0],
  ax=ax1,
  label="Covered source lines (our approach)",
  legend=None,
)
sns.lineplot(
  df[df["Cov Type"] == "Covered instr. bytes (other tools)"],
  x="Order",
  y="Cov Value",
  color=sns.color_palette()[1],
  ax=ax2,
  label="Covered instr. bytes (other tools)",
  legend=None,
)
# Per-axes legends are disabled above; draw one figure-level legend that
# covers both axes
fig.legend(
  loc="upper right",
  bbox_to_anchor=(0.875, 0.85),
  frameon=True,
  shadow=True,
  title=None,
)
ax1.set_title("Coverage metric comparison (Git, Clang 15, O1)")
ax1.set_xlabel("Variable index (sorted by coverage)")
ax1.set_ylabel("Covered source lines")
ax1.set_xbound(0, df["Order"].max())
ax1.set_ybound(0, df[df["Cov Type"] == "Covered source lines (our approach)"]["Cov Value"].max())
ax2.set_ylabel("Covered instr. bytes")
ax2.set_xbound(0, df["Order"].max())
ax2.set_ybound(0, df[df["Cov Type"] == "Covered instr. bytes (other tools)"]["Cov Value"].max())
# Only the first axes draws grid lines
ax2.grid(False)

## Experiment: Both metrics, ratios, O0, sorted consistently

In [None]:
# Select the baseline variant and copy the selection, so that the helper
# columns added below are written to an independent frame (assigning to an
# un-copied selection can trigger pandas' SettingWithCopyWarning)
df = compilations_df.loc["Clang 15, O0 + mem2reg"].copy()
# Revive `Variant` column to assist `melt` below
df["Variant"] = "Clang 15, O0 + mem2reg"
df["Covered / defined source lines (our approach)"] = df["CL / BCL"]
df["Covered / scope instr. bytes (other tools)"] = df["CB / SB"]
# Long format: one row per (name, metric)
df = df.melt(
  id_vars=["Name", "Variant"],
  value_vars=[
    "Covered / defined source lines (our approach)",
    "Covered / scope instr. bytes (other tools)",
  ],
  var_name="Cov Type",
  value_name="Cov Value",
)
# Rank each metric's values independently, highest first
df["Order"] = df.sort_values(by="Cov Value", ascending=False).groupby("Cov Type").cumcount()
# Use the same x position for both metrics of a name: the rank of the name's
# first row in the melted frame, i.e. of the first metric in `value_vars`
df["Order"] = df.groupby("Name")["Order"].transform("first")
g = sns.relplot(
  df,
  x="Order",
  y="Cov Value",
  hue="Cov Type",
  kind="line",
)
ax = g.facet_axis(0, 0)
# Setting `zorder` > 2 ensures the sorted line is drawn on top
# (assumes `lines[0]` belongs to the metric that defined the order — TODO
# confirm)
ax.lines[0].zorder = 3
sns.move_legend(
  g,
  "center left",
  bbox_to_anchor=(0.125, 0.5),
  frameon=True,
  shadow=True,
  title=None,
)
g.set(
  title="Coverage metric comparison (Git, Clang 15, O0)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, df["Order"].max()),
  ylabel="Coverage ratio (multiple metrics)",
  ybound=(0, 1.002),
)

## Experiment: Both metrics, ratios, O1, sorted consistently

In [None]:
# Select the O1 variant and copy the selection, so that the helper columns
# added below are written to an independent frame (assigning to an un-copied
# selection can trigger pandas' SettingWithCopyWarning)
df = compilations_df.loc["Clang 15, O1"].copy()
# Revive `Variant` column to assist `melt` below
df["Variant"] = "Clang 15, O1"
df["Covered / defined source lines (our approach)"] = df["CL / BCL"]
df["Covered / scope instr. bytes (other tools)"] = df["CB / SB"]
# Long format: one row per (name, metric)
df = df.melt(
  id_vars=["Name", "Variant"],
  value_vars=[
    "Covered / defined source lines (our approach)",
    "Covered / scope instr. bytes (other tools)",
  ],
  var_name="Cov Type",
  value_name="Cov Value",
)

# Create figure with multiple axes
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
[ax1, ax2] = axs
fig.suptitle("Coverage metric comparison (Git, Clang 15, O1)")

# Left plot: rank each metric independently, then place both metrics of a name
# at the rank of the name's first row (the first metric in `value_vars`)
df["Order"] = df.sort_values(by="Cov Value", ascending=False).groupby("Cov Type").cumcount()
df["Order"] = df.groupby("Name")["Order"].transform("first")
sns.lineplot(
  df,
  x="Order",
  y="Cov Value",
  hue="Cov Type",
  ax=ax1,
)
# Rebuild the legend without a title
ax1.legend(title=None)
ax1.set_xlabel("Variable index (sorted by coverage)")
ax1.set_xbound(0, df["Order"].max())
ax1.set_ylabel("Coverage ratio (multiple metrics)")
ax1.set_ybound(0, 1.002)
# Setting `zorder` > 2 ensures the sorted line is drawn on top
# (assumes `lines[0]` belongs to the metric that defined the order — TODO
# confirm)
ax1.lines[0].zorder = 3

# Right plot: same, but using the rank of the name's last row (the last
# metric in `value_vars`)
df["Order"] = df.sort_values(by="Cov Value", ascending=False).groupby("Cov Type").cumcount()
df["Order"] = df.groupby("Name")["Order"].transform("last")
sns.lineplot(
  df,
  x="Order",
  y="Cov Value",
  hue="Cov Type",
  ax=ax2,
)
# Rebuild the legend without a title
ax2.legend(title=None)
ax2.set_xlabel("Variable index (sorted by coverage)")
ax2.set_xbound(0, df["Order"].max())
ax2.set_ylabel("Coverage ratio (multiple metrics)")
ax2.set_ybound(0, 1.002)
# Setting `zorder` > 2 ensures the sorted line is drawn on top
# (assumes `lines[1]` belongs to the metric that defined the order — TODO
# confirm)
ax2.lines[1].zorder = 3

## Experiment: Both metrics, ratios, O1, distribution

In [None]:
# Select the O1 variant and copy the selection, so that the helper columns
# added below are written to an independent frame (assigning to an un-copied
# selection can trigger pandas' SettingWithCopyWarning)
df = compilations_df.loc["Clang 15, O1"].copy()
# Revive `Variant` column to assist `melt` below
df["Variant"] = "Clang 15, O1"
df["Covered / defined source lines (our approach)"] = df["CL / BCL"]
df["Covered / scope instr. bytes (other tools)"] = df["CB / SB"]
# Long format: one row per (name, metric)
df = df.melt(
  id_vars=["Name", "Variant"],
  value_vars=[
    "Covered / defined source lines (our approach)",
    "Covered / scope instr. bytes (other tools)",
  ],
  var_name="Cov Type",
  value_name="Cov Value",
)
# Histogram of both metrics, with the bars of the two metrics side by side
g = sns.displot(
  df,
  x="Cov Value",
  hue="Cov Type",
  multiple="dodge",
)
sns.move_legend(
  g,
  "center left",
  bbox_to_anchor=(0.1, 0.9),
  frameon=True,
  shadow=True,
  title=None,
)
g.set(
  title="Coverage metric comparison (Git, Clang 15, O1)",
  xlabel="Coverage ratio (multiple metrics)",
  xbound=(0, 1.02),
  ylabel="Variables",
  ybound=(0, None),
)

## Arith. mean coverage

In [None]:
# Mean `CL / BCL` per variant, highest mean first
df = compilations_df.copy()
df = df.groupby("Variant")["CL / BCL"].mean().reset_index()
df = df.sort_values(by="CL / BCL", ascending=False).reset_index()
# Keep variants whose label matches the regex `O[01]` (O0 or O1) and does not
# contain "KE"
df = df[df["Variant"].str.contains("O[01]") & ~df["Variant"].str.contains("KE")]
g = sns.catplot(
  df,
  x="Variant",
  y="CL / BCL",
  kind="bar",
  height=3.0,
  aspect=4 / 3,
)
g.set(
  title="Variable value source line coverage (Git)",
  xlabel="Compiler version and optimisation level",
  # Push every second tick label down by one line so neighbours do not overlap
  xticklabels=[
    ("\n" if position % 2 else "") + label
    for position, label in enumerate(df["Variant"])
  ],
  ylabel="Arith. mean coverage norm. to defined region",
)