In [1]:
%load_ext rpy2.ipython

In [2]:
%%R
library(RColorBrewer)
#cat("p-value colors:", brewer.pal(n=4,"YlOrBr"), "\n")
cat("p-value colors:", brewer.pal(n=3,"Oranges"), "\n")
cat("effsize colors:", brewer.pal(n=9,"RdBu"), "\n")

p-value colors: #FEE6CE #FDAE6B #E6550D 
effsize colors: #B2182B #D6604D #F4A582 #FDDBC7 #F7F7F7 #D1E5F0 #92C5DE #4393C3 #2166AC 


In [3]:
import itertools as it
import textwrap
from bisect import bisect_left
from typing import List

import numpy as np
from pandas import Categorical
import pandas as pd
import scipy.stats as stats

In [4]:
icse22_coverage = pd.read_csv("results/eval-icse22/coverage/coverage-data.csv")
icse22_coverage['unique_valid_paths_rate'] = icse22_coverage['unique_valid_paths'] / icse22_coverage['total_inputs']
icse22_coverage['fuzzer'] = icse22_coverage['fuzzer'].replace({'bediv-structure': 'bediv-struct', 'rl': 'rlcheck'})


In [5]:
def generate_final_coverage_score_table(
    score_df: pd.DataFrame, time: int, agg: str, metric: str, caption: str, label: str
):
  scores = score_df.query(f"time == {time}")

  avg_scores = scores.groupby(['fuzzer', 'subject']).agg(avg_score=(metric, agg)).round()

  # Determine decimal places for formatting
  digits = []
  for subject in ['ant', 'closure', 'maven', 'nashorn', 'rhino', 'tomcat']:
    fuzzer_scores = avg_scores['avg_score'].xs(subject, level='subject')
    digits += [max(map(lambda num : float(len(str(num).split(".")[0])), fuzzer_scores))]

  result = f"""
  \\begin{{table}}
    \\centering
    \\caption{{
      Mean behavioral diversity of order $0$ \\todo[inline]{{add order}} (\\ie, code coverage) after campaign timeout ($t=1\\si{{\\hour}}$), rounded to the nearest integer.
      Values highlighted in \\colorbox{{lightgray}}{{gray}} indicate the highest mean or means (in the case of a tie) for each subject.
      Superscript symbols additionally indicate statistically significant differences to \\textsc{{BeDiv-Simple}} ($*$), \\textsc{{BeDiv-Struct}} ($\\dagger$), or both ($\\ddagger$).
    }}
    \\label{{{label}}}
    \\begin{{adjustbox}}{{center}}
    \\begin{{tabular}}[]{{l
      S[table-format={digits[0]}]
      S[table-format={digits[1]}]
      S[table-format={digits[2]}]
      S[table-format={digits[3]}]
      S[table-format={digits[4]}]
      S[table-format={digits[5]}]
    }}
    \\toprule
    & \multicolumn{{6}}{{c}}{{\\textbf{{Subject}}}} \\\\
    \\cmidrule(lr){{2-7}}
    {{\\textbf{{Fuzzer}}}} & {{Ant}} & {{Closure}} & {{Maven}} & {{Nashorn}} & {{Rhino}} & {{Tomcat}}\\\\
    \\midrule\n"""

  avg_scores = scores.groupby(['fuzzer', 'subject']).agg(avg_score=(metric, agg)).round()

  fuzzers = ['BeDiv-Simple', 'BeDiv-Struct', 'QuickCheck', 'RLCheck', 'Zest']
  subjects = ['ant', 'closure', 'maven', 'nashorn', 'rhino', 'tomcat']

  # Determine decimal places
  num_digits = []
  for subject in subjects:
    fuzzer_scores = avg_scores['avg_score'].xs(subject, level='subject')
    num_digits += [max(map(lambda num : len(str(num).split(".")[0]), fuzzer_scores))]

  # Generate rows
  for fuzzer in fuzzers:
    row = f"    \\textsc{{{fuzzer}}} "
    fuzzer = fuzzer.lower() # Lowercase for querying

    for subject in subjects:
      max_value = avg_scores.xs(subject, level='subject').max().item()

      fuzzer_value = avg_scores.query(
          f"fuzzer == '{fuzzer}' and subject == '{subject}'"
        )['avg_score'].item()    
      if fuzzer_value == max_value:
        row += f"& \\maxcell {int(fuzzer_value)}"
      else:
        row += f"& {int(fuzzer_value)}"

      # Add statistically significance indicators
      fuzzer_trials = scores.query(f"fuzzer == '{fuzzer}' and subject == '{subject}'")[metric].to_numpy()

      bediv_simple_trials = scores.query(f"fuzzer == 'bediv-simple' and subject == '{subject}'")[metric].to_numpy()

      bediv_struct_trials = scores.query(f"fuzzer == 'bediv-struct' and subject == '{subject}'")[metric].to_numpy()

      p_simple = stats.mannwhitneyu(fuzzer_trials, bediv_simple_trials).pvalue
      p_struct = stats.mannwhitneyu(fuzzer_trials, bediv_struct_trials).pvalue

      if p_simple <= 0.01 and p_struct <= 0.01:
        row += "\\sboth "
      elif p_simple <= 0.01:
        row += "\\ssimple "
      elif p_struct <= 0.01:
        row += "\\sstruct "
      else:
        row += " "
      
    row += "\\\\\n"
    result += row

  result += """    \\bottomrule
    \\end{tabular}
    \\end{adjustbox}
  \\end{table}
  """
  print(textwrap.dedent(result))


In [8]:
generate_final_coverage_score_table(
  score_df=icse22_coverage,
  time=60,
  agg='mean',
  metric='b2',
  caption=r"This is the caption.",
  label='table:bedivfuzz:evaluation:rq2:b2:1h'
)


\begin{table}
  \centering
  \caption{
    Mean behavioral diversity of order $0$ \todo[inline]{add order} (\ie, code coverage) after campaign timeout ($t=1\si{\hour}$), rounded to the nearest integer.
    Values highlighted in \colorbox{lightgray}{gray} indicate the highest mean or means (in the case of a tie) for each subject.
    Superscript symbols additionally indicate statistically significant differences to \textsc{BeDiv-Simple} ($*$), \textsc{BeDiv-Struct} ($\dagger$), or both ($\ddagger$).
  }
  \label{table:bedivfuzz:evaluation:rq2:b2:1h}
  \begin{tabular}[]{l
    S[table-format=4.0]
    S[table-format=5.0]
    S[table-format=4.0]
    S[table-format=3.0]
    S[table-format=4.0]
    S[table-format=4.0]
  }
  \toprule
  & \multicolumn{6}{c}{\textbf{Subject}} \\
  \cmidrule(lr){2-7}
  {\textbf{Fuzzer}} & {Ant} & {Closure} & {Maven} & {Nashorn} & {Rhino} & {Tomcat}\\
  \midrule
  \textsc{BeDiv-Simple} & 2959 & 9999\sstruct & 1818\sstruct & \maxcell 245 & \maxcell 3800 & 3083\sst

In [9]:
# From: https://gist.github.com/vahidetemadi/2db6135c100ac2232afe5a8c99753c3c

def VD_A(treatment: List[float], control: List[float]):
    """
    Computes Vargha and Delaney A index
    A. Vargha and H. D. Delaney.
    A critique and improvement of the CL common language
    effect size statistics of McGraw and Wong.
    Journal of Educational and Behavioral Statistics, 25(2):101-132, 2000
    The formula to compute A has been transformed to minimize accuracy errors
    See: http://mtorchiano.wordpress.com/2014/05/19/effect-size-of-r-precision/
    :param treatment: a numeric list
    :param control: another numeric list
    :returns the value estimate and the magnitude
    """
    m = len(treatment)
    n = len(control)

    if m != n:
        raise ValueError("Data d and f must have the same length")

    r = stats.rankdata(treatment + control)
    r1 = sum(r[0:m])

    # Compute the measure
    # A = (r1/m - (m+1)/2)/n # formula (14) in Vargha and Delaney, 2000
    A = (2 * r1 - m * (m + 1)) / (2 * n * m)  # equivalent formula to avoid accuracy errors

    levels = [0.147, 0.33, 0.474]  # effect sizes from Hess and Kromrey, 2004
    magnitude = ["negligible", "small", "medium", "large"]
    scaled_A = (A - 0.5) * 2

    magnitude = magnitude[bisect_left(levels, abs(scaled_A))]
    estimate = A

    return estimate, magnitude


def VD_A_DF(data, val_col: str = None, group_col: str = None, sort=True):
    """
    :param data: pandas DataFrame object
        An array, any object exposing the array interface or a pandas DataFrame.
        Array must be two-dimensional. Second dimension may vary,
        i.e. groups may have different lengths.
    :param val_col: str, optional
        Must be specified if `a` is a pandas DataFrame object.
        Name of the column that contains values.
    :param group_col: str, optional
        Must be specified if `a` is a pandas DataFrame object.
        Name of the column that contains group names.
    :param sort : bool, optional
        Specifies whether to sort DataFrame by group_col or not. Recommended
        unless you sort your data manually.
    :return: stats : pandas DataFrame of effect sizes
    Stats summary ::
    'A' : Name of first measurement
    'B' : Name of second measurement
    'estimate' : effect sizes
    'magnitude' : magnitude
    """

    x = data.copy()
    if sort:
        x[group_col] = Categorical(x[group_col], categories=x[group_col].unique(), ordered=True)
        x.sort_values(by=[group_col, val_col], ascending=True, inplace=True)

    groups = x[group_col].unique()

    # Pairwise combinations
    g1, g2 = np.array(list(it.combinations(np.arange(groups.size), 2))).T

    # Compute effect size for each combination
    ef = np.array([VD_A(list(x[val_col][x[group_col] == groups[i]].values),
                        list(x[val_col][x[group_col] == groups[j]].values)) for i, j in zip(g1, g2)])

    return pd.DataFrame({
        'A': np.unique(data[group_col])[g1],
        'B': np.unique(data[group_col])[g2],
        'estimate': ef[:, 0],
        'magnitude': ef[:, 1]
    })

In [10]:
def generate_subject_p_a12_table(
    score_df: pd.DataFrame,
    time: int,
    metric: str,
    subject: str,
    caption: str,
    label: str
):
  scores = score_df.query  (f"subject == '{subject}' and time == {time}")
  a12 = VD_A_DF(scores, val_col=metric, group_col='fuzzer', sort=True)
  a12 = a12.astype({'estimate': 'float64'}).round(3)

  result = f"""
  \\newcolumntype{{R}}{{>{{\\raggedleft \\arraybackslash}}m{{2.5cm}}}}
  \\begin{{table}}
    \\centering
    \\caption{{
      {caption}
    }}
    \\label{{{label}}}
    \\begin{{tabular}}{{RRRRR}}
    \\toprule\n"""

  fuzzers = ['BeDiv-Simple', 'BeDiv-Struct', 'QuickCheck', 'RLCheck', 'Zest']
  for row in range(len(fuzzers)):
    result += '  '
    for col in range(len(fuzzers)):
      if col > 0:
        result += ' & '

      fuzzer_row = fuzzers[row].lower()
      fuzzer_col = fuzzers[col].lower()
      # Diagonal entry: fuzzer name
      if row == col: 
        #result += f"\\multicolumn{{1}}{{c}}{{\\textsc{{{fuzzers[row]}}}}}"
        result += f"  \\textsc{{{fuzzers[row]}}}"

      # Upper diagonal: A12 measure
      elif row < col:
        cell = a12.query(f"A=='{fuzzer_row}' and B=='{fuzzer_col}'")
        estimate = cell['estimate'].item()
        magnitude = cell['magnitude'].item()

        match magnitude:
          case 'negligible':
            result += r"\cellcolor{a12n} "
          case 'small':
            if estimate < 0.5:
              result += r"\cellcolor{a12sl} "
            else:
              result += r"\cellcolor{a12sh} "
          case 'medium':
            if estimate < 0.5:
              result += r"\cellcolor{a12ml} "
            else:
              result += r"\cellcolor{a12mh} "
          case 'large':
            if estimate < 0.5:
              result += r"\cellcolor{a12ll} "
            else:
              result += r"\cellcolor{a12lh} "
        
        result += f"{estimate}"
        
      # Lower diagonal: p-value
      elif row > col: 
        row_trials = scores.query(f"fuzzer == '{fuzzer_row}'")[metric].to_numpy()
        col_trials = scores.query(f"fuzzer == '{fuzzer_col}'")[metric].to_numpy()
        p = stats.mannwhitneyu(row_trials, col_trials).pvalue

        if p <= 0.001:
          result += r"\cellcolor{p001} "
        elif p <= 0.01:
          result += r"\cellcolor{p01} "
        elif p <= 0.05:
          result += r"\cellcolor{p05} "
        else:
          result += r"\cellcolor{lightgray} "

        p = np.format_float_scientific(p, precision=2)
        result += f"{p}"

      if col == len(fuzzers) - 1:
        result += ' \\\\'
    
    result += '\n  '

  result += """  \\bottomrule
    \\end{tabular}
  \\end{table}
  """
  print(textwrap.dedent(result))

In [31]:
generate_subject_p_a12_table(
    score_df=icse22_coverage,
    time=60,
    metric='b0',
    subject='ant',
    caption='This is the caption',
    label='table:test'
)


\newcolumntype{R}{>{\raggedleft \arraybackslash}m{2.5cm}}
\begin{table}
  \centering
  \caption{
    This is the caption
  }
  \label{table:test}
  \begin{tabular}{RRRRR}
  \toprule
  \textsc{BeDiv-Simple} & \cellcolor{a12n} 0.525 & \cellcolor{a12sl} 0.373 & \cellcolor{a12lh} 1.0 & \cellcolor{a12lh} 0.981 \\
  \cellcolor{lightgray} 7.45e-01 &   \textsc{BeDiv-Struct} & \cellcolor{a12ml} 0.309 & \cellcolor{a12lh} 1.0 & \cellcolor{a12lh} 0.997 \\
  \cellcolor{lightgray} 9.33e-02 & \cellcolor{p05} 1.12e-02 &   \textsc{QuickCheck} & \cellcolor{a12lh} 1.0 & \cellcolor{a12lh} 1.0 \\
  \cellcolor{p001} 2.46e-11 & \cellcolor{p001} 2.45e-11 & \cellcolor{p001} 2.47e-11 &   \textsc{RLCheck} & \cellcolor{a12ll} 0.0 \\
  \cellcolor{p001} 1.54e-10 & \cellcolor{p001} 3.91e-11 & \cellcolor{p001} 2.93e-11 & \cellcolor{p001} 2.40e-11 &   \textsc{Zest} \\
  \bottomrule
  \end{tabular}
\end{table}



In [11]:
def generate_coverage_p_a12_table(
    score_df: pd.DataFrame,
    time: int,
    metric: str,
    caption: str,
    label: str
):
  result = f"""
  \\begin{{table}}
    \\centering
    \\caption{{
      {caption}
    }}
    \\label{{{label}}}
    \\begin{{tabular}}{{crcrrcrr}}
    \\toprule
    & & \\phantom{{m}} & \\multicolumn{{2}}{{c}}{{$p$}} & \\phantom{{m}} & \\multicolumn{{2}}{{c}}{{$\\hat{{A}}_{{12}}$}} \\\\
    \\cmidrule(lr){{4-5}} \\cmidrule(lr){{7-8}}  
    Subject & Baseline && \\textsc{{BDF}}\\ssimple & \\textsc{{BDF}}\\sstruct && \\textsc{{BDF}}\\ssimple & \\textsc{{BDF}}\\sstruct\\\\\n"""

  baseline_nicenames = {
    'quickcheck': 'QuickCheck',
    'rlcheck': 'RLCheck',
    'zest': 'Zest'
  }

  for subject in ['Ant', 'Closure', 'Maven', 'Nashorn', 'Rhino', 'Tomcat']:
    data = score_df.query(f"subject == '{subject.lower()}' and time == {time}")
    a12 = VD_A_DF(data, val_col=metric, group_col='fuzzer', sort=True)
    a12 = a12.astype({'estimate': 'float64'}).round(3)

    result += '    \\midrule\n'
    result += f"    \\multirow{{3}}{{*}}{{{subject}}}\n"

    for baseline in baseline_nicenames.keys():
      result += f"      & \\textsc{{{baseline_nicenames[baseline]}}} & "

      # p-values
      for tech in ['bediv-simple', 'bediv-struct']:
        tech_trials = data.query(f"fuzzer == '{tech}'")[metric].to_numpy()
        base_trials = data.query(f"fuzzer == '{baseline}'")[metric].to_numpy()
        p = stats.mannwhitneyu(tech_trials, base_trials).pvalue

        if p <= 0.001:
          result += r"& \cellcolor{p001} "
        elif p <= 0.01:
          result += r"& \cellcolor{p01} "
        elif p <= 0.05:
          result += r"& \cellcolor{p05} "
        else:
          result += r"& \cellcolor{lightgray} "

        p = np.format_float_scientific(p, precision=2)
        result += f"{p} "

      result += '&'

      # a12 effsizes
      for tech in ['bediv-simple', 'bediv-struct']:
        cell = a12.query(f"A=='{tech}' and B=='{baseline.lower()}'")
        estimate = cell['estimate'].item()
        magnitude = cell['magnitude'].item()

        match magnitude:
          case 'negligible':
            result += r"& \cellcolor{a12n} "
          case 'small':
            if estimate < 0.5:
              result += r"& \cellcolor{a12sl} "
            else:
              result += r"& \cellcolor{a12sh} "
          case 'medium':
            if estimate < 0.5:
              result += r"& \cellcolor{a12ml} "
            else:
              result += r"& \cellcolor{a12mh} "
          case 'large':
            if estimate < 0.5:
              result += r"& \cellcolor{a12ll} "
            else:
              result += r"& \cellcolor{a12lh} "
      
        result += f"{estimate} "

      result += '\\\\\n'

  result += """    \\bottomrule
    \\end{tabular}
  \\end{table}
  """
  
  print(textwrap.dedent(result))

In [15]:
generate_coverage_p_a12_table(
    score_df=icse22_coverage,
    time=60,
    metric='b2',
    caption='This is the caption.',
    label='This is the label'
)


\begin{table}
  \centering
  \caption{
    This is the caption.
  }
  \label{This is the label}
  \begin{tabular}{crcrrcrr}
  \toprule
  & & \phantom{m} & \multicolumn{2}{c}{$p$} & \phantom{m} & \multicolumn{2}{c}{$\hat{A}_{12}$} \\
  \cmidrule(lr){4-5} \cmidrule(lr){7-8}  
  Subject & Baseline && \textsc{BDF}\ssimple & \textsc{BDF}\sstruct && \textsc{BDF}\ssimple & \textsc{BDF}\sstruct\\
  \midrule
  \multirow{3}{*}{Ant}
    & \textsc{QuickCheck} & & \cellcolor{p001} 3.02e-11 & \cellcolor{p001} 3.02e-11 && \cellcolor{a12lh} 1.0 & \cellcolor{a12lh} 1.0 \\
    & \textsc{RLCheck} & & \cellcolor{p001} 3.01e-11 & \cellcolor{p001} 3.01e-11 && \cellcolor{a12lh} 1.0 & \cellcolor{a12lh} 1.0 \\
    & \textsc{Zest} & & \cellcolor{lightgray} 8.88e-01 & \cellcolor{lightgray} 4.55e-01 && \cellcolor{a12n} 0.511 & \cellcolor{a12n} 0.557 \\
  \midrule
  \multirow{3}{*}{Closure}
    & \textsc{QuickCheck} & & \cellcolor{p001} 3.02e-11 & \cellcolor{p001} 3.02e-11 && \cellcolor{a12lh} 1.0 & \cellcolor{a1

In [None]:
def generate_p_a12_table_alt(
    score_df: pd.DataFrame,
    time: int,
    metric: str,
    caption: str,
    label: str
):
  result = f"""
  \\begin{{table}}
    \\centering
    \\caption{{
      {caption}
    }}
    \\label{{{label}}}
    \\begin{{tabular}}{{crcrrcrr}}
    \\toprule
    & & \\phantom{{m}} & \\multicolumn{{2}}{{c}}{{\\textsc{{BeDiv-Simple}}}} & \\phantom{{m}} & \\multicolumn{{2}}{{c}}{{\\textsc{{BeDiv-Struct}}}} \\\\
    \\cmidrule(lr){{4-5}} \\cmidrule(lr){{7-8}}  
    Subject & Baseline && $p$ & $\\hat{{A}}_{{12}}$ && $p$ & $\\hat{{A}}_{{12}}$\\\\
  """
  for subject in ['Ant', 'Closure', 'Maven', 'Nashorn', 'Rhino', 'Tomcat']:
    data = score_df.query(f"subject == '{subject.lower()}' and time == {time}")
    a12 = VD_A_DF(data, val_col=metric, group_col='fuzzer', sort=True)
    a12 = a12.astype({'estimate': 'float64'}).round(3)

    result += '  \\midrule\n'
    result += f"  \\multirow{{3}}{{*}}{{{subject}}}\n"

    for baseline in ['QuickCheck', 'RLCheck', 'Zest']:
      result += f"    & \\textsc{{{baseline}}} "

      for tech in ['bediv-simple', 'bediv-struct']:
        result += '&'

        tech_trials = data.query(f"fuzzer == '{tech}'")[metric].to_numpy()
        base_trials = data.query(f"fuzzer == '{baseline.lower()}'")[metric].to_numpy()
        p = stats.mannwhitneyu(tech_trials, base_trials).pvalue

      # p-values
        if p <= 0.001:
          result += r"& \cellcolor{p001} "
        elif p <= 0.01:
          result += r"& \cellcolor{p01} "
        elif p <= 0.05:
          result += r"& \cellcolor{p05} "
        else:
          result += r"& \cellcolor{lightgray} "

        p = np.format_float_scientific(p, precision=2)
        result += f"{p} "

        # a12 effsizes
        cell = a12.query(f"A=='{tech}' and B=='{baseline.lower()}'")
        estimate = cell['estimate'].item()
        magnitude = cell['magnitude'].item()

        match magnitude:
          case 'negligible':
            result += r"& \cellcolor{a12n} "
          case 'small':
            if estimate < 0.5:
              result += r"& \cellcolor{a12sl} "
            else:
              result += r"& \cellcolor{a12sh} "
          case 'medium':
            if estimate < 0.5:
              result += r"& \cellcolor{a12ml} "
            else:
              result += r"& \cellcolor{a12mh} "
          case 'large':
            if estimate < 0.5:
              result += r"& \cellcolor{a12ll} "
            else:
              result += r"& \cellcolor{a12lh} "
        
        result += f"{estimate} "
      result += '\\\\\n'

  result += """  \\bottomrule
    \\end{tabular}
  \\end{table}
  """
  print(result)

generate_p_a12_table_alt(
    score_df=icse22_coverage,
    time=5,
    metric='b0',
    caption='This is the caption.',
    label='This is the label'
)

In [282]:
%%R
if (FALSE) {
  library(dplyr)
  icse22_crashes <- read.csv(
    "results/eval-icse22/crashes/crash-data.csv"
  )
  df <- icse22_crashes |>
    arrange(tte) |>
    distinct(fuzzer, subject, trial, exception_class, .keep_all = TRUE) |>
    select(fuzzer, subject, trial, exception_class, tte)

  df$subject_prefix <- toupper(substr(df$subject, 1, 1))
  df <- df |>
    group_by(subject) |>
    mutate(exception_id = as.numeric(factor(exception_class))) |>
    ungroup()

  df$crash_id <- paste0(df$subject_prefix, df$exception_id)

  df |>
    select(subject, fuzzer, trial, exception_class, tte, crash_id) |>
    arrange(crash_id) |>
    filter(crash_id != 'T1') |>
    write.csv(
      "results/eval-icse22/crashes/crash-data-deduplicated.csv", row.names = FALSE
    )
}

In [40]:
icse22_crashes = pd.read_csv("results/eval-icse22/crashes/crash-data-deduplicated.csv")
icse22_crashes['fuzzer'] = icse22_crashes['fuzzer']\
  .replace({
    'bediv-structure': 'bediv-struct', 
    'rl': 'rlcheck'
  })


In [54]:
def generate_crash_table(
      crash_df: pd.DataFrame, 
      crash_subjects: list[str],
      caption: str, 
      label: str
  ):
  result = f"""
  \\begin{{table}}
    \\centering
    \\caption{{
      {caption}
    }}
    \\label{{{label}}}
    \\begin{{adjustbox}}{{center}}
    \\begin{{tabular}}{{ccrrrrrrrrrrr}}
    \\toprule
    & & 
      \\multicolumn{{2}}{{c}}{{\\textsc{{BDF}}\\ssimple}} & 
      \\multicolumn{{2}}{{c}}{{\\textsc{{BDF}}\\sstruct}} & 
      \\multicolumn{{2}}{{c}}{{\\textsc{{QC}}}} & 
      \\multicolumn{{2}}{{c}}{{\\textsc{{RL}}}} & 
      \\multicolumn{{2}}{{c}}{{\\textsc{{Zest}}}} \\\\
      \\cmidrule(lr){{3-4}} 
      \\cmidrule(lr){{5-6}} 
      \\cmidrule(lr){{7-8}}  
      \\cmidrule(lr){{9-10}}
      \\cmidrule(lr){{11-12}}
    Subject & Crash-ID & 
      $\\mathit{{TTE}}$ & $\\mathit{{Rel.}}$ & 
      $\\mathit{{TTE}}$ & $\\mathit{{Rel.}}$ & 
      $\\mathit{{TTE}}$ & $\\mathit{{Rel.}}$ & 
      $\\mathit{{TTE}}$ & $\\mathit{{Rel.}}$ & 
      $\\mathit{{TTE}}$ & $\\mathit{{Rel.}}$ &\\\\
  """

  # Aggregate crash results
  crash_data = crash_df.groupby(
    ['subject', 'fuzzer', 'exception_class', 'crash_id',]
  ).agg(
    avg_tte = ('tte', lambda x: np.round(np.mean(x / 60))), # s to min
    reliability = ('trial', lambda x: np.round(len(x) / 30, 2))
  )

  assert (
     len(crash_subjects) == len(crash_data.index.get_level_values('subject').unique())
  )

  assert (all(
     [s.lower() in crash_data.index.get_level_values('subject') for s in crash_subjects]
  ))

  for subject in crash_subjects:
    result += '  \\midrule\n'
    subject_crashes = crash_data.xs(subject.lower(), level = 'subject')

    # Determine number of unique crash_ids for subject multirow
    crash_ids = subject_crashes.index.get_level_values('crash_id').unique()
    result += f"    \\multirow{{{len(crash_ids)}}}{{*}}{{{subject}}}"

    for crash_id in sorted(crash_ids):
      min_tte = min(subject_crashes.xs(crash_id, level='crash_id')['avg_tte'])
      max_reliability = max(subject_crashes.xs(crash_id, level='crash_id')['reliability'])

      result += f"\n      & {crash_id} "
      for fuzzer in ['bediv-simple', 'bediv-struct', 'quickcheck', 'rlcheck', 'zest']:
        crash_info = subject_crashes.query(
            f"fuzzer == '{fuzzer}' and crash_id == '{crash_id}'"
        )

        if crash_info.empty:
            result += "& $-$ & $-$ "
            continue
        
        tte = crash_info['avg_tte'].item()
        tte_color = '\\cellcolor{lightgray} ' if np.isclose(tte, min_tte) else ''
        if tte > 0:
          result += f"& {tte_color} {int(tte)} & "
        else:
          result += f"& {tte_color}$<$1 & "

        reliability = crash_info['reliability'].item()
        if np.isclose(reliability, max_reliability):
          result += f"\\textbf{{\\SI{{{int(reliability * 100)}}}{{\percent}}}} "
        else:
          result += f"\\SI{{{int(reliability * 100)}}}{{\percent}} "
    
      result += '\\\\'
    result += '\n  '
  result += r"""  \bottomrule
    \end{tabular}
    \end{adjustbox}
  \end{table}
  """
  print(textwrap.dedent(result))

In [55]:
generate_crash_table(
    icse22_crashes,
    crash_subjects=['Closure', 'Nashorn', 'Rhino'],
    caption="The caption.", 
    label="the:label",
)


\begin{table}
  \centering
  \caption{
    The caption.
  }
  \label{the:label}
  \begin{adjustbox}{center}
  \begin{tabular}{ccrrrrrrrrrrr}
  \toprule
  & & 
    \multicolumn{2}{c}{\textsc{BDF}\ssimple} & 
    \multicolumn{2}{c}{\textsc{BDF}\sstruct} & 
    \multicolumn{2}{c}{\textsc{QC}} & 
    \multicolumn{2}{c}{\textsc{RL}} & 
    \multicolumn{2}{c}{\textsc{Zest}} \\
    \cmidrule(lr){3-4} 
    \cmidrule(lr){5-6} 
    \cmidrule(lr){7-8}  
    \cmidrule(lr){9-10}
    \cmidrule(lr){11-12}
  Subject & Crash-ID & 
    $\mathit{TTE}$ & $\mathit{Rel.}$ & 
    $\mathit{TTE}$ & $\mathit{Rel.}$ & 
    $\mathit{TTE}$ & $\mathit{Rel.}$ & 
    $\mathit{TTE}$ & $\mathit{Rel.}$ & 
    $\mathit{TTE}$ & $\mathit{Rel.}$ &\\
  \midrule
  \multirow{3}{*}{Closure}
    & C1 &  425 & \textbf{\SI{100}{\percent}} &  347 & \textbf{\SI{100}{\percent}} & \cellcolor{lightgray}  4 & \textbf{\SI{100}{\percent}} &  7 & \textbf{\SI{100}{\percent}} &  149 & \textbf{\SI{100}{\percent}} \\
    & C2 &  121 & \textbf

In [58]:
def fill_crash_results(
    crash_df: pd.DataFrame,
    subject: str,
    fuzzers: list[str],
    num_trials: int,
    campaign_timeout_s: int
  ):
  assert (crash_df['subject'] == subject).all()

  trials = range(1, num_trials + 1)
  crash_ids = crash_df['crash_id'].unique()

  full_index = pd.MultiIndex.from_product([fuzzers, trials, crash_ids], names=['fuzzer', 'trial', 'crash_id'])

  crash_id_to_exception_class = {}
  for crash_id in crash_ids:
    exception_class = crash_df.query(f"crash_id == '{crash_id}'")['exception_class'].unique()
    assert(len(exception_class) == 1)
    crash_id_to_exception_class[crash_id] = exception_class[0]

  result_df = crash_df.set_index(['fuzzer', 'trial', 'crash_id']).reindex(full_index).reset_index()

  result_df['subject'] = subject
  result_df['exception_class'] = result_df.apply(
    lambda row: crash_id_to_exception_class[row['crash_id']] if pd.isna(row['exception_class']) else row['exception_class'], axis=1
  )

  result_df['tte'] = result_df['tte'].fillna(campaign_timeout_s)

  return result_df


In [75]:
def generate_crash_p_a12_table(
    crash_df: pd.DataFrame,
    caption: str,
    label: str
):
  result = f"""
  \\begin{{table}}
    \\centering
    \\caption{{
      {caption}
    }}
    \\label{{{label}}}
    \\begin{{adjustbox}}{{center}}
    \\begin{{tabular}}{{crcrrcrr}}
    \\toprule
    & & 
      \\phantom{{m}} & 
      \\multicolumn{{2}}{{c}}{{$p$}} & 
      \\phantom{{m}} & 
      \\multicolumn{{2}}{{c}}{{$\\hat{{A}}_{{12}}$}} \\\\
    \\cmidrule(lr){{4-5}} \\cmidrule(lr){{7-8}}
    Crash-ID & Baseline 
    & & 
      \\textsc{{BDF}}\\ssimple & \\textsc{{BDF}}\\sstruct & &
      \\textsc{{BDF}}\\ssimple & \\textsc{{BDF}}\\sstruct \\\\
"""

  baseline_nicenames = {
    'quickcheck': 'QuickCheck',
    'rlcheck': 'RLCheck',
    'zest': 'Zest'
  }

  for subject in ['Closure', 'Nashorn', 'Rhino']:
    subject_crashes = crash_df.query(f"subject == '{subject.lower()}'")

    # Determine number of unique crash_ids for subject multirow
    crash_ids = subject_crashes['crash_id'].unique()
    #result += f"  \\multirow{{{len(crash_ids)}}}{{*}}{{{subject}}}"

    for crash_id in sorted(crash_ids):
      result += '    \\midrule\n'
      #result += f"\n      & {crash_id} "
      result += f"    \\multirow{{{len(baseline_nicenames)}}}{{*}}{{{crash_id}}}"

      trial_tte = subject_crashes.query(f"crash_id == '{crash_id}'")

      # Fill missing trials with max TTE (=campaign timeout)
      trial_tte_filled = fill_crash_results(
        trial_tte,
        subject=subject.lower(),
        fuzzers=['bediv-simple', 'bediv-struct', 'quickcheck', 'rlcheck', 'zest'],
        num_trials=30,
        campaign_timeout_s=24 * 60 * 60
      )
      a12 = VD_A_DF(trial_tte_filled, val_col='tte', group_col='fuzzer', sort=True)
      a12 = a12.astype({'estimate': 'float64'}).round(3)

      for baseline in baseline_nicenames.keys():
        result += f"\n      & {baseline_nicenames[baseline]} & "

        # Mann-Whitney U test on TTE
        for tech in ['bediv-simple', 'bediv-struct']:
          tech_tte = trial_tte_filled.query(f"fuzzer == '{tech}'")['tte'].to_numpy()

          baseline_tte = trial_tte_filled.query(f"fuzzer == '{baseline}'")['tte'].to_numpy()

          pvalue = stats.mannwhitneyu(tech_tte, baseline_tte).pvalue
          if pvalue <= 0.001:
            result += r"& \cellcolor{p001} "
          elif pvalue <= 0.01:
            result += r"& \cellcolor{p01} "
          elif pvalue <= 0.05:
            result += r"& \cellcolor{p05} "
          else:
            result += r"& \cellcolor{lightgray} "

          result += f"{np.format_float_scientific(pvalue, precision=2)} "
        
        result += ' & '

        # A12 effect size on TTE
        for tech in ['bediv-simple', 'bediv-struct']:
          cell = a12.query(f"A=='{tech}' and B=='{baseline}'")
          estimate = cell['estimate'].item()
          magnitude = cell['magnitude'].item()

          # Note that equalities are flipped because lower TTE is better
          match magnitude:
            case 'negligible':
              result += r"& \cellcolor{a12n} "
            case 'small':
              if estimate > 0.5:
                result += r"& \cellcolor{a12sl} "
              else:
                result += r"& \cellcolor{a12sh} "
            case 'medium':
              if estimate > 0.5:
                result += r"& \cellcolor{a12ml} "
              else:
                result += r"& \cellcolor{a12mh} "
            case 'large':
              if estimate > 0.5:
                result += r"& \cellcolor{a12ll} "
              else:
                result += r"& \cellcolor{a12lh} "

          result += f"{estimate} "
        result += '\\\\'
      result += '\n'
  result += """    \\bottomrule
    \\end{tabular}
    \\end{adjustbox}
  \\end{table}
  """
  print(textwrap.dedent(result))

In [76]:
generate_crash_p_a12_table(
    crash_df=icse22_crashes,
    caption="This is the caption",
    label="label"
)


\begin{table}
  \centering
  \caption{
    This is the caption
  }
  \label{label}
  \begin{adjustbox}{center}
  \begin{tabular}{crcrrcrr}
  \toprule
  & & 
    \phantom{m} & 
    \multicolumn{2}{c}{$p$} & 
    \phantom{m} & 
    \multicolumn{2}{c}{$\hat{A}_{12}$} \\
  \cmidrule(lr){4-5} \cmidrule(lr){7-8}
  Crash-ID & Baseline 
  & & 
    \textsc{BDF}\ssimple & \textsc{BDF}\sstruct & &
    \textsc{BDF}\ssimple & \textsc{BDF}\sstruct \\
  \midrule
  \multirow{3}{*}{C1}
    & QuickCheck & & \cellcolor{p001} 1.78e-10 & \cellcolor{p001} 3.02e-11  & & \cellcolor{a12ll} 0.98 & \cellcolor{a12ll} 1.0 \\
    & RLCheck & & \cellcolor{p001} 5.56e-10 & \cellcolor{p001} 3.01e-11  & & \cellcolor{a12ll} 0.967 & \cellcolor{a12ll} 1.0 \\
    & Zest & & \cellcolor{p001} 4.64e-05 & \cellcolor{p01} 1.24e-03  & & \cellcolor{a12ll} 0.807 & \cellcolor{a12ll} 0.743 \\
  \midrule
  \multirow{3}{*}{C2}
    & QuickCheck & & \cellcolor{p001} 2.97e-11 & \cellcolor{p001} 1.92e-09  & & \cellcolor{a12ll} 1.0 & \cel

In [87]:
def generate_crash_reliability_p_oddsratio_table(
    crash_df: pd.DataFrame,
    caption: str,
    label: str
):
  result = f"""
  \\begin{{table}}
    \\centering
    \\caption{{
      {caption}
    }}
    \\label{{{label}}}
    \\begin{{adjustbox}}{{center}}
    \\begin{{tabular}}{{crcrrcrr}}
    \\toprule
    & & 
      \\phantom{{m}} & 
      \\multicolumn{{2}}{{c}}{{$p$}} & 
      \\phantom{{m}} & 
      \\multicolumn{{2}}{{c}}{{$\\psi$}} \\\\
    \\cmidrule(lr){{4-5}} \\cmidrule(lr){{7-8}}
    Crash-ID & Baseline 
    & & 
      \\textsc{{BDF}}\\ssimple & \\textsc{{BDF}}\\sstruct & &
      \\textsc{{BDF}}\\ssimple & \\textsc{{BDF}}\\sstruct\\\\
"""

  baseline_nicenames = {
    'quickcheck': 'QuickCheck',
    'rlcheck': 'RLCheck',
    'zest': 'Zest'
  }

  for subject in ['Closure', 'Nashorn', 'Rhino']:
    subject_crashes = crash_df.query(f"subject == '{subject.lower()}'")

    # Determine number of unique crash_ids for subject multirow
    crash_ids = subject_crashes['crash_id'].unique()
    #result += f"  \\multirow{{{len(crash_ids)}}}{{*}}{{{subject}}}"

    for crash_id in sorted(crash_ids):
      result += '    \\midrule\n'
      #result += f"\n      & {crash_id} "
      result += f"    \\multirow{{{len(baseline_nicenames)}}}{{*}}{{{crash_id}}}"

      trial_tte = subject_crashes.query(f"crash_id == '{crash_id}'")

      for baseline in baseline_nicenames.keys():
        result += f"\n      & {baseline_nicenames[baseline]} & "

        # Fisher's exact test on reliability
        for tech in ['bediv-simple', 'bediv-struct']:
          tech_trials = len(trial_tte.query(f"fuzzer == '{tech}'")['trial'].unique())
          baseline_trials = len(trial_tte.query(f"fuzzer == '{baseline}'")['trial'].unique())

          pvalue = stats.fisher_exact(
            table=[
              [tech_trials, baseline_trials],
              [30 - tech_trials, 30 - baseline_trials],
            ]).pvalue
          
          if pvalue <= 0.001:
            result += r"& \cellcolor{p001} "
          elif pvalue <= 0.01:
            result += r"& \cellcolor{p01} "
          elif pvalue <= 0.05:
            result += r"& \cellcolor{p05} "
          else:
            result += r"& \cellcolor{lightgray} "
                    
          result += f"{np.format_float_scientific(pvalue, precision=2)}"
        
        result += ' & '

        # Odds ratio on reliability
        for tech in ['bediv-simple', 'bediv-struct']:
          tech_trials = len(trial_tte.query(f"fuzzer == '{tech}'")['trial'].unique())
          baseline_trials = len(trial_tte.query(f"fuzzer == '{baseline}'")['trial'].unique())

          odds_tech = (tech_trials + 0.5) / (30 - tech_trials + 0.5)
          odds_baseline = (baseline_trials + 0.5) / (30 - baseline_trials + 0.5)

          odds_ratio = odds_tech / odds_baseline

          if np.isclose(odds_ratio, 1.0): 
            result += r"& \cellcolor{a12n} "
          elif odds_ratio > 1.0:
            result += r"& \cellcolor{a12mh} "
          else:
            result += r"& \cellcolor{a12ml} "

          result += f"{np.round(odds_ratio, decimals=2)} "
        result += '\\\\'
      result += '\n'
  result += """    \\bottomrule
    \\end{tabular}
    \\end{adjustbox}
  \\end{table}
  """
  print(textwrap.dedent(result))

In [88]:
generate_crash_reliability_p_oddsratio_table(
    crash_df=icse22_crashes,
    caption="This is the caption",
    label="label"
)


\begin{table}
  \centering
  \caption{
    This is the caption
  }
  \label{label}
  \begin{adjustbox}{center}
  \begin{tabular}{crcrrcrr}
  \toprule
  & & 
    \phantom{m} & 
    \multicolumn{2}{c}{$p$} & 
    \phantom{m} & 
    \multicolumn{2}{c}{$\psi$} \\
  \cmidrule(lr){4-5} \cmidrule(lr){7-8}
  Crash-ID & Baseline 
  & & 
    \textsc{BDF}\ssimple & \textsc{BDF}\sstruct & &
    \textsc{BDF}\ssimple & \textsc{BDF}\sstruct\\
  \midrule
  \multirow{3}{*}{C1}
    & QuickCheck & & \cellcolor{lightgray} 1.e+00& \cellcolor{lightgray} 1.e+00 & & \cellcolor{a12n} 1.0 & \cellcolor{a12n} 1.0 \\
    & RLCheck & & \cellcolor{lightgray} 1.e+00& \cellcolor{lightgray} 1.e+00 & & \cellcolor{a12n} 1.0 & \cellcolor{a12n} 1.0 \\
    & Zest & & \cellcolor{lightgray} 1.e+00& \cellcolor{lightgray} 1.e+00 & & \cellcolor{a12n} 1.0 & \cellcolor{a12n} 1.0 \\
  \midrule
  \multirow{3}{*}{C2}
    & QuickCheck & & \cellcolor{lightgray} 1.e+00& \cellcolor{lightgray} 1.e+00 & & \cellcolor{a12n} 1.0 & \cellcolor