## Setup environment

In [None]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
import seaborn.objects as so

In [None]:
# Let pandas show up to 100 characters per column cell when displaying frames.
pd.set_option("display.max_colwidth", 100)

In [None]:
# Global plot styling for the notebook: "paper" context, "whitegrid" style,
# the "colorblind" palette, and Source Sans Pro as the font.
# NOTE(review): presumably the font has to be installed locally — confirm.
sns.set_theme(
  context="paper",
  style="whitegrid",
  palette="colorblind",
  font="Source Sans Pro",
)

In [None]:
# Request the "retina" figure format from the inline plotting backend.
%config InlineBackend.figure_format = "retina"

## Load data

In [None]:
def read_run(file, variant):
  """Load one run's tab-separated results into a data frame.

  `file` is appended to "../", so it is resolved relative to the parent
  directory. `variant` is accepted but not used in the body.

  Returns the table with whitespace-stripped column names, ordered by "Name",
  with every name that occurs more than once removed entirely.
  """
  table = pd.read_table("../" + file).rename(columns=str.strip)
  # Sorting by name aids matching across datasets; the index is renumbered to
  # follow the sorted order.
  by_name = table.sort_values("Name", ignore_index=True)
  # keep=False discards all copies of a repeated name (e.g. from macro-generated
  # code with multiple uses of the same variable name), not just the extras.
  return by_name.drop_duplicates("Name", keep=False)

# Single ordered table of runs: variant label -> results file. Every step
# below (loading, filtering, concatenation keys) is driven from this table so
# the labels cannot drift apart.
# Order is important here: some data transformations rely on `first` to access
# the baseline, `diff` to access KE vs. not, etc.
# Re-check all transformations when changing the order.
run_files = {
  "Clang 15, O0 + mem2reg": "O0-15-mem2reg/git-efb.tsv",
  "Clang 12, O1": "O1-12/git.tsv",
  "Clang 13, O1": "O1-13/git.tsv",
  "Clang 14, O1": "O1-14/git.tsv",
  "Clang 15, O1": "O1-15/git.tsv",
  "Clang 15, O1 + KE": "O1-15/git-efb.tsv",
  "Clang 15, O2": "O2-15/git.tsv",
  "Clang 15, O2 + KE": "O2-15/git-efb.tsv",
  "Clang 15, O3": "O3-15/git.tsv",
  "Clang 15, O3 + KE": "O3-15/git-efb.tsv",
}
run_dfs = {
  variant: read_run(file, variant) for variant, file in run_files.items()
}

# Restrict all data frames to common names they all share
common_names = set.intersection(*(set(df["Name"]) for df in run_dfs.values()))
print(f"Common names: {len(common_names)}")

def common_only(df, variant):
  """Return the rows of `df` whose "Name" is in `common_names`.

  Prints how many rows were dropped, labelled with `variant`.
  """
  kept = df[df["Name"].isin(common_names)]
  print(f"Dropped {len(df) - len(kept)} unique names from {variant}")
  return kept

run_dfs = {
  variant: common_only(df, variant) for variant, df in run_dfs.items()
}

# Keep the per-run names bound for any code that refers to them directly.
# The unpacking order must match `run_files`.
(
  o0_15_efb_df,
  o1_12_df,
  o1_13_df,
  o1_14_df,
  o1_15_df,
  o1_15_efb_df,
  o2_15_df,
  o2_15_efb_df,
  o3_15_df,
  o3_15_efb_df,
) = run_dfs.values()

# One frame for all runs, indexed by (Variant, Row) in `run_files` order.
compilations_df = pd.concat(
  list(run_dfs.values()),
  keys=list(run_dfs),
  names=["Variant", "Row"],
)

In [None]:
def normalise(df):
  """Add derived coverage columns to `df` in place; returns nothing.

  `df` must already contain "Name", "Cov (B)", "Scope (B)", "Cov (L)",
  "Scope (L)", "Adj Cov (L)", "Src Scope (L)" and "Flt Cov (L)". Per-name
  values ("max", "first", "diff", "shift") are computed over the rows that
  share a "Name", in the row order of `df`.

  NOTE(review): pandas' group-by `first` skips missing values, so a missing
  baseline entry would pick up a later row's value — confirm inputs have none.
  """
  def per_name(column, how):
    # Group-wise transform of one column, aligned back to df's rows.
    return df.groupby("Name")[column].transform(how)

  # Compute various coverage ratios: new column -> (numerator, denominator)
  ratios = {
    "CB / SB": ("Cov (B)", "Scope (B)"),
    "CL / SL": ("Cov (L)", "Scope (L)"),
    "ACL / SL": ("Adj Cov (L)", "Scope (L)"),
    "CL / SSL": ("Cov (L)", "Src Scope (L)"),
    "FCL / SSL": ("Flt Cov (L)", "Src Scope (L)"),
  }
  for ratio, (numerator, denominator) in ratios.items():
    df[ratio] = df[numerator] / df[denominator]

  # Line table may differ between runs, giving different scope line counts
  # Use the largest scope line count from any run to recompute ratio
  df["Max Scope (L)"] = per_name("Scope (L)", "max")
  df["CL / MSL"] = df["Cov (L)"] / df["Max Scope (L)"]

  # Normalise values to baseline (Clang 15, O0 + mem2reg)
  df["Baseline Cov (L)"] = per_name("Adj Cov (L)", "first")
  with np.errstate(all="ignore"):
    df["ACL / BCL"] = df["Adj Cov (L)"] / df["Baseline Cov (L)"]

  # Add coverage difference for plots showing change across runs
  df["Adj Cov Diff (L)"] = per_name("Adj Cov (L)", "diff")
  df["ACL / BCL Diff"] = per_name("ACL / BCL", "diff")
  df["ACL / BCL Prev"] = per_name("ACL / BCL", "shift")

# normalise() returns nothing; it adds the derived columns to compilations_df
# in place.
normalise(compilations_df)

# compilations_df.to_csv("normalised.tsv", sep="\t")

## Playground

In [None]:
# df = compilations_df.copy()
# variants = df.index.get_level_values("Variant")

# df[df["Baseline Cov (L)"] > df["Scope (L)"]]

# df[df["Name"].str.contains("write_graph_chunk_data, data")]
# df[variants.str.fullmatch("Clang 15, O1")].nlargest(50, "CL / BCL")

# o1_df[o1_df["CL / BCL"] > 1].shape[0] / o1_df.shape[0]

# o1_df[o1_df["CB / SB"] == 1][o1_df["CL / BCL"] < 1]

# o1_df["Ratio Diff"] = o1_df["CL / MSL"] - o1_df["ACL / BCL"]
# o1_df.nlargest(50, "Ratio Diff")

# o1_df = df[variants.str.fullmatch("Clang 15, O1")]

## Comparing raw lines to filtered lines
# 6463 filtered away
# Working on xsnprintf, ap, decl wrapper.c:681, unit wrapper.c
# Now treating call arguments as may be defined
# 4763 filtered away
# Missing computation region also clears line sets
# 4782 filtered away
# o1_df[o1_df["Cov (L)"] > 0][o1_df["Flt Cov (L)"] == 0]

## Checking source-based scope lines
# 4627 with no scope lines
# Working on xdl_trim_ends, lim, decl xprepare.c:427, unit xdiff/xprepare.c
# Added multiple comma-separated assignments
# by looking up multiple levels for the nearest Stmt ancestor
# 900 with no scope lines
# Working on wt_status_check_rebase, st, decl wt-status.c:1682, unit wt-status.c
# Fixed call argument iteration
# 666 with no scope lines
# Working on xdl_num_out, buf, decl xutils.c:323, unit xdiff/xutils.c
# Added defined regions for pointer assignments
# 628 with no scope lines
# Working on kwsincr, dirs, decl kwset.c:138, unit kwset.c
# Added descent into tree on left-hand side of assignments
# 620 with no scope lines
# Working on tmp_objdir_create, installed_handlers, decl tmp-objdir.c:115, unit
# tmp-objdir.c
# Added static local computation and definition regions
# 615 with no scope lines
# Working on add_cmdname, flex_array_len_, decl help.c:152, unit help.c
# Fixed next line adjustment for one-line regions
# Ignored multi-file regions (e.g. variable declared via #include mid-function)
# 10 with no scope lines
# Working on cmd_grep, dummy, decl grep.c:882, unit builtin/grep.c
# Added all right-hand side assignment and initialiser variables
# 7 with no scope lines
# Working on dereference, unused, decl fast-import.c:3046, unit
# builtin/fast-import.c
# Included current line when referencing existing variables
# 4 with no scope lines
# Remaining issues seem to be "always inline"-related

# o1_df[o1_df["Src Scope (L)"] == 0]