In [None]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import seaborn.objects as so

In [None]:
# Show up to 100 characters per cell when pandas renders a column
pd.set_option("display.max_colwidth", 100)

In [None]:
# Global seaborn look applied to every figure created after this cell
theme_settings = {
  "context": "paper",
  "style": "whitegrid",
  "palette": "colorblind",
  "font": "Source Sans Pro",
}
sns.set_theme(**theme_settings)

In [None]:
# Select the "retina" figure format for figures rendered by the inline backend
%config InlineBackend.figure_format = "retina"

In [None]:
def read_run(file, variant):
  """Load one tab-separated coverage report as unique per-name line coverage rows.

  Parameters:
    file: path or file-like object handed straight to `pd.read_table`.
    variant: label of the run; accepted for call-site readability but not used
      by the function body.

  Returns:
    A data frame with the columns `Name`, `Cov (L)`, `Scope (L)` and `CL / SL`,
    sorted by `Name`. Names that occur more than once are removed entirely.
  """
  line_cov_columns = ["Name", "Cov (L)", "Scope (L)", "CL / SL"]

  report = pd.read_table(file)
  # Header cells may be padded with whitespace; strip it so the labels match
  report.columns = report.columns.str.strip()
  # The trailing three rows are summary rows rather than per-name entries
  per_name = report.iloc[:-3]
  # Keep only the name and line coverage columns, ordered by name
  ordered = per_name[line_cov_columns].sort_values("Name", ignore_index=True)
  # `keep=False` throws away every occurrence of a repeated name (e.g. from
  # macro-generated code with multiple uses of the same variable name)
  return ordered.drop_duplicates("Name", keep=False)

# One frame per compiler version / optimisation level; the O0 + mem2reg run is
# loaded first
o0_14_efb_df = read_run(
  file="O0-14-mem2reg/git.o-rsb-efb.tsv",
  variant="Clang 14, O0 + mem2reg",
)
o1_12_df = read_run(file="O1-12/git.o-rsb.tsv", variant="Clang 12, O1")
o1_13_df = read_run(file="O1-13/git.o-rsb.tsv", variant="Clang 13, O1")
o1_14_df = read_run(file="O1-14/git.o-rsb.tsv", variant="Clang 14, O1")
o1_14_efb_df = read_run(file="O1-14/git.o-rsb-efb.tsv", variant="Clang 14, O1 + KE")
o2_14_df = read_run(file="O2-14/git.o-rsb.tsv", variant="Clang 14, O2")
o3_14_df = read_run(file="O3-14/git.o-rsb.tsv", variant="Clang 14, O3")

# Names that occur in every single run; all frames are later restricted to
# this set so that they describe the same variables
common_names = set(o0_14_efb_df["Name"]).intersection(
  o1_12_df["Name"],
  o1_13_df["Name"],
  o1_14_df["Name"],
  o1_14_efb_df["Name"],
  o2_14_df["Name"],
  o3_14_df["Name"],
)
print(f"Common names: {len(common_names)}")
def common_only(df, variant):
  """Return the rows of `df` whose `Name` is in the module-level `common_names`.

  Prints how many rows were left out, labelled with `variant`. `df` itself is
  not modified.
  """
  is_common = df["Name"].isin(common_names)
  kept = df[is_common]
  diff = len(df) - len(kept)
  print(f"Dropped {diff} unique names from {variant}")
  return kept
# Rebind every run to its common-names-only subset (prints one line per run)
o0_14_efb_df = common_only(df=o0_14_efb_df, variant="Clang 14, O0 + mem2reg")
o1_12_df = common_only(df=o1_12_df, variant="Clang 12, O1")
o1_13_df = common_only(df=o1_13_df, variant="Clang 13, O1")
o1_14_df = common_only(df=o1_14_df, variant="Clang 14, O1")
o1_14_efb_df = common_only(df=o1_14_efb_df, variant="Clang 14, O1 + KE")
o2_14_df = common_only(df=o2_14_df, variant="Clang 14, O2")
o3_14_df = common_only(df=o3_14_df, variant="Clang 14, O3")

# Stack all runs into one frame indexed by (Variant, Row). The mapping keeps
# each label next to its frame; its insertion order is the row order of the
# result, with the O0 + mem2reg run first.
distribution_df = pd.concat(
  {
    "Clang 14, O0 + mem2reg": o0_14_efb_df,
    "Clang 12, O1": o1_12_df,
    "Clang 13, O1": o1_13_df,
    "Clang 14, O1": o1_14_df,
    "Clang 14, O1 + KE": o1_14_efb_df,
    "Clang 14, O2": o2_14_df,
    "Clang 14, O3": o3_14_df,
  },
  names=["Variant", "Row"],
)

In [None]:
def normalise(df):
  """Add normalised line coverage columns to `df` in place; returns nothing.

  Rows are grouped by `Name` and four columns are appended, in this order:
    `Max Scope (L)`: largest `Scope (L)` among the rows sharing the name.
    `CL / MSL`: `Cov (L)` divided by `Max Scope (L)`.
    `Baseline Cov (L)`: `Cov (L)` of the first row of the name, in frame order.
    `CL / BCL`: `Cov (L)` divided by `Baseline Cov (L)`.

  Because the baseline is simply the first row per name, the baseline run
  (Clang 14, O0 + mem2reg) has to come first in `df`.
  """
  per_name = df.groupby("Name")
  # Line table may differ between runs, giving different scope line counts, so
  # the ratio is recomputed against the largest scope seen in any run
  widest_scope = per_name["Scope (L)"].transform("max")
  baseline_cov = per_name["Cov (L)"].transform("first")

  df["Max Scope (L)"] = widest_scope
  df["CL / MSL"] = df["Cov (L)"] / widest_scope
  df["Baseline Cov (L)"] = baseline_cov
  # Floating point warnings raised by this division are silenced
  with np.errstate(all="ignore"):
    df["CL / BCL"] = df["Cov (L)"] / baseline_cov

# `normalise` appends its ratio columns to `distribution_df` in place and
# returns nothing, so there is no result to rebind here
normalise(distribution_df)

# Ad-hoc inspection snippets, left disabled:
# distribution_df[distribution_df["Name"].str.contains("write_graph_chunk_data, data")]
# distribution_df.sort_values("CL / BCL", ascending=False)

# distribution_df

In [None]:
# Coverage distribution for the O0/O1 runs of every compiler version, leaving
# out the "KE" variant
df = distribution_df.copy()
# Position of each row within its variant when sorted by coverage, best first
by_coverage = df.sort_values(by="CL / BCL", ascending=False)
df["Order"] = by_coverage.groupby("Variant").cumcount()
variants = df.index.get_level_values("Variant")
is_o0_or_o1 = variants.str.contains("O[01]")
is_ke = variants.str.contains("KE")
df = df[is_o0_or_o1 & ~is_ke]
g = sns.relplot(data=df, x="Order", y="CL / BCL", hue="Variant", kind="line")
legend_options = dict(
  bbox_to_anchor=(0.125, 0.60),
  frameon=True,
  shadow=True,
  title=None,
)
sns.move_legend(g, "center left", **legend_options)
g.set(
  title="Variable value source line coverage (git.o)",
  xlabel="Variable index (sorted by coverage)",
  ylabel="Covered source lines normalised to defined region",
  xbound=(0, None),
  ybound=(0, None),
)

In [None]:
# Coverage distribution for the Clang 14 runs at O0, O1 and O2, leaving out the
# "KE" variant
df = distribution_df.copy()
# Position of each row within its variant when sorted by coverage, best first
by_coverage = df.sort_values(by="CL / BCL", ascending=False)
df["Order"] = by_coverage.groupby("Variant").cumcount()
variants = df.index.get_level_values("Variant")
is_clang_14 = variants.str.contains("Clang 14")
# O2 and O3 appear to be identical for this data set, omit O3
is_up_to_o2 = variants.str.contains("O[012]")
is_ke = variants.str.contains("KE")
df = df[is_clang_14 & is_up_to_o2 & ~is_ke]
g = sns.relplot(data=df, x="Order", y="CL / BCL", hue="Variant", kind="line")
legend_options = dict(
  bbox_to_anchor=(0.125, 0.65),
  frameon=True,
  shadow=True,
  title=None,
)
sns.move_legend(g, "center left", **legend_options)
g.set(
  title="Variable value source line coverage (git.o)",
  xlabel="Variable index (sorted by coverage)",
  ylabel="Covered source lines normalised to defined region",
  xbound=(0, None),
  ybound=(0, None),
)

In [None]:
# Coverage distribution for the Clang 14 runs at O0 and O1, this time including
# the "KE" variant
df = distribution_df.copy()
# Position of each row within its variant when sorted by coverage, best first
by_coverage = df.sort_values(by="CL / BCL", ascending=False)
df["Order"] = by_coverage.groupby("Variant").cumcount()
variants = df.index.get_level_values("Variant")
is_clang_14 = variants.str.contains("Clang 14")
is_o0_or_o1 = variants.str.contains("O[01]")
df = df[is_clang_14 & is_o0_or_o1]
g = sns.relplot(data=df, x="Order", y="CL / BCL", hue="Variant", kind="line")
legend_options = dict(
  bbox_to_anchor=(0.125, 0.65),
  frameon=True,
  shadow=True,
  title=None,
)
sns.move_legend(g, "center left", **legend_options)
g.set(
  title="Variable value source line coverage (git.o)",
  xlabel="Variable index (sorted by coverage)",
  ylabel="Covered source lines normalised to defined region",
  xbound=(0, None),
  ybound=(0, None),
)

In [None]:
# Compare two coverage metrics on the baseline run: coverage relative to the
# defined region (`CL / SL`) against a constant 1.0 standing for full scope
# coverage, and shade the gap between the two curves.
baseline_variant = "Clang 14, O0 + mem2reg"
defined_label = "Defined source lines (our approach)"
scope_label = "Scope source lines (other tools)"

# Copy *after* selecting the run: the column assignments below then act on an
# independent frame instead of on a selection of `distribution_df`, and only
# one run is copied rather than the whole table
df = distribution_df.loc[baseline_variant].copy()
# Revive `Variant` column to assist `melt` below
df["Variant"] = baseline_variant
df[defined_label] = df["CL / SL"]
df[scope_label] = 1.0
df = df.melt(
  id_vars=["Name", "Variant"],
  value_vars=[defined_label, scope_label],
  var_name="Cov Type",
  value_name="Cov Value",
)
# Position of each row within its metric when sorted by value, best first
df["Order"] = df.sort_values(by="Cov Value", ascending=False).groupby("Cov Type").cumcount()
g = sns.relplot(
  df,
  x="Order",
  y="Cov Value",
  hue="Cov Type",
  kind="line",
)
sns.move_legend(
  g,
  "center left",
  bbox_to_anchor=(0.125, 0.75),
  frameon=True,
  shadow=True,
  title=None,
)
ax = g.facet_axis(0, 0)
# Take both curves from the data, keyed by their label, instead of reading them
# back from `ax.lines`: this does not depend on the order in which the lines
# were drawn, and both curves share one x axis even if values are missing
curves = df.pivot(index="Order", columns="Cov Type", values="Cov Value").sort_index()
x = curves.index
scope_y = curves[scope_label]
defined_y = curves[defined_label]
ax.fill_between(x, scope_y, defined_y, color="gray", alpha=0.5)
ax.annotate(
  "Unachievable\ncoverage",
  xy=(0.8, 0.85),
  xycoords="axes fraction",
  horizontalalignment="center",
  bbox=dict(boxstyle="round", facecolor="m"),
)
g.set(
  title="Coverage metric comparison (git.o, Clang 14)",
  xlabel="Variable index (sorted by coverage)",
  xbound=(0, None),
  ylabel="Coverage of scope source lines",
  ybound=(0, None),
)

In [None]:
# Bar chart of the mean normalised coverage per variant (O0/O1 runs without
# the "KE" variant), highest mean first
df = distribution_df.copy()
df = (
  df.groupby("Variant")["CL / BCL"]
  .mean()
  .reset_index()
  .sort_values(by="CL / BCL", ascending=False)
  .reset_index()
)
is_o0_or_o1 = df["Variant"].str.contains("O[01]")
is_ke = df["Variant"].str.contains("KE")
df = df[is_o0_or_o1 & ~is_ke]
g = sns.catplot(
  data=df,
  x="Variant",
  y="CL / BCL",
  kind="bar",
  height=3.0,
  aspect=4 / 3,
)
# Push every second tick label down one line so neighbouring labels do not
# collide
staggered_labels = [
  ("\n" if position % 2 else "") + label
  for position, label in enumerate(df["Variant"])
]
g.set(
  title="Variable value source line coverage (git.o)",
  xlabel="Compiler version and optimisation level",
  xticklabels=staggered_labels,
  ylabel="Arith. mean coverage norm. to defined region",
)