In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd 
import os 
import numpy as np 
import sys  
sys.path.append("../")
from utils import analysis_utils, git_utils

In [3]:
# Relative path of the directory holding the processed evaluation CSVs.
outputdir = "../output/evaluation/processed"

# Projects to analyse; the loops below iterate in this order, so it is also
# the row order of the printed tables.
projects = [
    'Lang',
    'Math',
    'Time',
    'Closure',
    'Cli',
    'Compress',
    'Codec',
    'Collections',
    'Csv',
    'JacksonCore',
    'JacksonXml',
    'JxPath',
    'Jsoup',
]

# RQ1. Prevalence

### Prevalence of live mutants

In [4]:
def get_summarised_statistic_simple(df:pd.DataFrame, project:str):
    """
    Print one LaTeX table row summarising mutant outcomes for ``project``.

    Rows whose ``all`` count is zero are dropped first. The printed cells are,
    in order: ``project``, the number of remaining rows, then four
    (average, aggregate) pairs:

    * ``all``      -- mean of ``all`` per row, and the sum of ``all``;
    * ``survived`` -- mean of the per-row percentage ``100 * survived / all``,
      and the pooled percentage ``100 * sum(survived) / sum(all)``;
    * ``killed``   -- same two figures for ``killed``;
    * the remaining outcomes (``no_covg``, ``timed_out``, ``non_viable``,
      ``memory_error``, ``run_error``) pooled into a single pair.

    Every figure is rounded to two decimals and the row ends with a LaTeX
    line break.

    Args:
        df: frame with the numeric columns ``all``, ``survived``, ``killed``,
            ``no_covg``, ``timed_out``, ``non_viable``, ``memory_error`` and
            ``run_error``.
        project: label printed in the first cell.

    Returns:
        None. The row is written to stdout.
    """
    decimal_point = 2
    outcome_cols = ['survived', 'killed']
    other_cols = ['no_covg', 'timed_out', 'non_viable', 'memory_error', 'run_error']

    def compute_perc(a, b):
        # Pooled percentage; a zero denominator yields 0 instead of nan/inf.
        return np.round(100*a/b, decimals=decimal_point) if b != 0 else 0

    def mean_perc(frame, col):
        # Mean of the per-row percentages of `col` relative to `all`.
        if len(frame) == 0:
            # Nothing to average: report 0, mirroring compute_perc's guard.
            return 0
        ratios = frame[col].div(frame['all']).to_numpy()
        # Build a new array rather than masking in place: the array obtained
        # from a Series may be read-only (pandas Copy-on-Write), in which case
        # in-place assignment raises. np.isinf covers both +inf and -inf.
        ratios = np.where(np.isinf(ratios), 0., ratios)
        return np.round(np.mean(100*ratios), decimals=decimal_point)

    df = df[~(df['all'] == 0)]
    n_rows = len(df)
    n_all = df['all'].sum()

    # (average, aggregate) pairs in print order.
    avg_all = np.round(df['all'].mean(), decimals=decimal_point) if n_rows else 0
    cells = [(avg_all, n_all)]
    for col in outcome_cols:
        cells.append((mean_perc(df, col), compute_perc(df[col].sum(), n_all)))

    # Remaining outcomes are pooled into one pair. Each per-column mean is
    # rounded before being summed, which is how the existing table values
    # were produced; keep it so re-runs reproduce the same numbers.
    other_mean = np.round(sum(mean_perc(df, col) for col in other_cols), decimals=decimal_point)
    other_total = sum(df[col].sum() for col in other_cols)
    cells.append((other_mean, compute_perc(other_total, n_all)))

    line = "".join(f" & {mean_stat} & {sum_stat}" for mean_stat, sum_stat in cells)
    line += " \\\\"
    print (project + f" & {n_rows}" + line)
 

In [5]:
# Print one summary row per project from its `init_pit_mut_status` CSV, then a
# pooled 'Total' row computed over all loaded frames.
dfs = []
for i, project in enumerate(projects):
    status_csv = os.path.join(outputdir, f"{project}.init_pit_mut_status.csv")
    df = pd.read_csv(status_csv)
    dfs.append(df)
    get_summarised_statistic_simple(df, project)

pooled_df = pd.concat(dfs)
get_summarised_statistic_simple(pooled_df, 'Total')

Lang & 49 & 346.39 & 16973 & 8.18 & 7.12 & 56.4 & 56.04 & 35.42 & 36.84 \\
Math & 104 & 274.5 & 28548 & 11.74 & 10.05 & 65.93 & 61.31 & 22.33 & 28.63 \\
Time & 20 & 154.3 & 3086 & 9.62 & 8.85 & 83.7 & 84.54 & 6.67 & 6.61 \\
Closure & 139 & 244.38 & 33969 & 8.95 & 10.07 & 72.14 & 74.26 & 18.91 & 15.66 \\
Cli & 30 & 92.0 & 2760 & 8.94 & 9.93 & 86.16 & 85.25 & 4.91 & 4.82 \\
Compress & 45 & 186.02 & 8371 & 15.08 & 16.9 & 71.98 & 67.69 & 12.94 & 15.41 \\
Codec & 18 & 197.83 & 3561 & 11.64 & 10.11 & 80.74 & 83.46 & 7.62 & 6.43 \\
Collections & 2 & 218.0 & 436 & 11.63 & 14.91 & 28.1 & 19.72 & 60.26 & 65.37 \\
Csv & 14 & 102.36 & 1433 & 8.63 & 12.35 & 84.65 & 80.88 & 6.72 & 6.77 \\
JacksonCore & 22 & 888.0 & 19536 & 16.72 & 14.78 & 58.47 & 61.61 & 24.8 & 23.61 \\
JacksonXml & 3 & 99.33 & 298 & 7.89 & 8.39 & 56.16 & 55.37 & 35.95 & 36.24 \\
JxPath & 9 & 189.44 & 1705 & 10.33 & 9.21 & 72.64 & 72.43 & 17.02 & 18.36 \\
Jsoup & 92 & 115.57 & 10632 & 14.74 & 15.17 & 74.98 & 73.78 & 10.28 & 11.05 \\

### Prevalence of latent mutants

In [6]:
# For every project, build a frame with one row per `bid` holding the number of
# mutants per status label, and store it under the project name.
n_dfs_thr = {}
for project in projects:
    mut_propa_stat_debt_file = os.path.join(
        outputdir, f"{project}.indv_mut_propagation_status_and_debt.csv"
    )
    df = pd.read_csv(mut_propa_stat_debt_file)

    # Drop 'reveal' rows whose debt_time exceeds 365 and 'surv' rows whose
    # debt_time is at most 365.
    # NOTE(review): these two labels differ from the 'latent' / 'non-latent' /
    # 'discard' labels counted below -- confirm which vocabulary the CSV's
    # `status` column uses; if it is the latter, this filter removes nothing.
    late_reveal = (df.status == 'reveal') & (df.debt_time > 365)
    early_surv = (df.status == 'surv') & (df.debt_time <= 365)
    df = df.loc[~(late_reveal | early_surv)]

    rows = []
    for bid, adf in df.groupby('bid'):
        status_counts = [
            (adf.status == label).sum()
            for label in ('non-latent', 'latent', 'discard')
        ]
        rows.append([bid, *status_counts, len(adf)])

    n_df = pd.DataFrame(
        rows, columns=['bid', 'n_non_latent', 'n_latent', 'n_discard', 'n_all']
    )
    n_df['project'] = [project] * len(n_df)
    n_dfs_thr[project] = n_df

In [7]:
def get_n_changes(df:pd.DataFrame, k = 'bid'):
    """
    Count the rows of ``df`` and those with at least one survived mutant.

    Args:
        df: frame with an identifier column named ``k`` and a numeric
            ``survived`` column.
        k: name of the identifier column kept next to ``survived``.

    Returns:
        A pair ``(ret, with_survived)`` where ``ret`` is
        ``[number of rows in df, number of rows with survived > 0,
        sum of survived over those rows]`` and ``with_survived`` holds the
        ``[k, 'survived']`` columns of the rows with ``survived > 0``.
    """
    # Return an explicit copy: callers add columns to this frame, and doing so
    # on a filtered slice can raise SettingWithCopyWarning.
    with_survived = df.loc[df.survived > 0, [k, 'survived']].copy()
    ret = [df.shape[0], with_survived.shape[0], with_survived.survived.sum()]
    return ret, with_survived

decimal_point = 2
dfs = []
n_chg_pproj = {}
val_inval_muts_dfs = []
merged_dfs = []
# Per project: record [n rows, n rows with survived > 0, total survived] and
# keep the frames so the same counts can be computed over all projects.
for project in projects:
    df = pd.read_csv(os.path.join(outputdir, f"{project}.init_pit_mut_status.csv"))
    ret, merged_df = get_n_changes(df)
    # `assign` builds a new frame instead of writing into the filtered frame
    # returned by get_n_changes, which can raise SettingWithCopyWarning.
    merged_df = merged_df.assign(project=[project] * len(merged_df))
    merged_dfs.append(merged_df)
    n_chg_pproj[project] = ret
    # `bid` values are only distinguished per project here, so prefix them
    # with the project name to get a key usable on the concatenated frame.
    df['p_b_id'] = [f"{project}_{bid}" for bid in df.bid.values]
    dfs.append(df)
n_chg_pproj['Total'], _ = get_n_changes(pd.concat(dfs), k = 'p_b_id')


In [8]:
def status_distr(project, n_df, n_chg):
    """
    Print one LaTeX table row with the latent / non-latent / discard breakdown.

    Args:
        project: label printed in the first cell.
        n_df: frame with the columns ``project``, ``bid``, ``n_non_latent``,
            ``n_latent``, ``n_discard`` and ``n_all``.
        n_chg: three-element sequence printed verbatim: ``n_chg[0]`` in its own
            cell, ``n_chg[1]`` and ``n_chg[2]`` as the denominators after "/".

    Returns:
        None. The row is written to stdout.

    For each of latent, non-latent and discard (in that order) the row shows
    ``mean count (mean of the per-row percentage of n_all)`` followed by
    ``total count (percentage of the summed n_all)``, all rounded to one
    decimal.
    """
    total_n = n_df.n_all.sum()
    # Distinct (project, bid) pairs that have at least one counted mutant.
    n_bid_w_mut = n_df.loc[n_df.n_all > 0][['project', 'bid']].drop_duplicates().shape[0]

    line = (
        f"{project} & {n_chg[0]}"
        + " & \\textbf{" + f"{n_bid_w_mut}" + "}" + f"/{n_chg[1]}"
        + "& \\textbf{" + f"{total_n}" + "}" + f"/{n_chg[2]}"
    )

    # The first status cell is joined without a leading space, the others with
    # one, so the row matches the previously generated table text exactly.
    for sep, col in (("& ", 'n_latent'), (" & ", 'n_non_latent'), (" & ", 'n_discard')):
        total = n_df[col].sum()
        perc_total = np.round(100*total/total_n, decimals=1)
        avg = np.round(n_df[col].mean(), decimals=1)
        perc_avg = np.round(100*(n_df[col]/n_df.n_all).mean(), decimals=1)
        # "\\%" is an explicitly escaped backslash: a bare "\%" is an invalid
        # escape sequence in a non-raw string literal.
        line += f"{sep}{avg} ({perc_avg}\\%) & {total} ({perc_total}\\%)"

    print (line + "\\\\")

decimal = 1
n_dfs = {}

# One row per project, in the order given by `projects`.
for project in projects:
    project_counts = n_dfs_thr[project]
    status_distr(project, project_counts, n_chg_pproj[project])

# Pooled row over the per-project frames.
all_project_counts = pd.concat(list(n_dfs_thr.values()))
status_distr('Total', all_project_counts, n_chg_pproj['Total'])

Lang & 49 & \textbf{30}/33& \textbf{1025}/1209& 2.6 (8.5\%) & 78 (7.6\%) & 25.7 (76.0\%) & 771 (75.2\%) & 5.9 (15.4\%) & 176 (17.2\%)\\
Math & 104 & \textbf{89}/92& \textbf{2099}/2870& 1.8 (6.2\%) & 163 (7.8\%) & 16.4 (70.7\%) & 1461 (69.6\%) & 5.3 (23.0\%) & 475 (22.6\%)\\
Time & 20 & \textbf{19}/19& \textbf{185}/273& 0.5 (4.7\%) & 9 (4.9\%) & 9.2 (94.7\%) & 175 (94.6\%) & 0.1 (0.7\%) & 1 (0.5\%)\\
Closure & 139 & \textbf{118}/118& \textbf{2866}/3421& 0.5 (1.2\%) & 60 (2.1\%) & 19.2 (79.6\%) & 2265 (79.0\%) & 4.6 (19.2\%) & 541 (18.9\%)\\
Cli & 30 & \textbf{22}/25& \textbf{237}/274& 0.7 (4.9\%) & 15 (6.3\%) & 8.4 (88.5\%) & 184 (77.6\%) & 1.7 (6.5\%) & 38 (16.0\%)\\
Compress & 45 & \textbf{43}/44& \textbf{1201}/1415& 3.2 (14.3\%) & 138 (11.5\%) & 12.7 (51.3\%) & 544 (45.3\%) & 12.1 (34.5\%) & 519 (43.2\%)\\
Codec & 18 & \textbf{16}/18& \textbf{282}/360& 1.5 (5.1\%) & 24 (8.5\%) & 9.2 (53.2\%) & 148 (52.5\%) & 6.9 (41.8\%) & 110 (39.0\%)\\
Collections & 2 & \textbf{2}/2& \textbf{4}/65&