In [1]:

import os
import pandas as pd
from pingouin import wilcoxon
from tqdm import tqdm
# Initialize tqdm for pandas
tqdm.pandas()

In [2]:
TASK = "assert_generation"
NAME_MAPPING = {
    "codet5p-220m": "CodeT5+ 220M",
    "codet5p-770m": "CodeT5+ 770M"
}
RENAME_TUNING_METHOD_DICT = {
    "full-finetuning": "Full Fine-Tuning",
    "no-gnn": "Linear Adapter",
    "concatpervector": "GVE + ABF",
    "lora": "LoRA",
    "prompt-tuning": "Prompt-Tuning",
    "prefix-tuning": "Prefix-Tuning",
    "no-finetuning": "No Fine-Tuning",
    "concatpervector_linear": "Linear",
    "concatpervector_no_gve": "No GVE",
    "concatpervector_no_abf": "No ABF"
}

SEEDS = ("seed_18_1", "seed_99_1")
DATASET_BASEPATH = "/data/datasets/fix/"

# Read Results

In [3]:
def read_csv(root_path: str)->list:
    output_paths = []
    for filename in os.listdir(root_path):
        filepath = os.path.join(root_path, filename)
        output_paths.append(filepath)
    return output_paths

output_paths = read_csv(TASK)
output_paths[:5]

['assert_generation/codet5p-770m_concatpervector_no_gve.csv',
 'assert_generation/codet5p-220m_concatpervector_no_gve.csv',
 'assert_generation/codet5p-220m_concatpervector.csv',
 'assert_generation/codet5p-770m_concatpervector_no_abf.csv',
 'assert_generation/codet5p-770m_concatpervector.csv']

In [8]:
def create_df(paths: list)->pd.DataFrame:
    temp_list = []
    for path in paths:
        if "ipynb_checkpoints" not in path:
            filename = os.path.basename(path)
            model = filename.split("_")
            model = model[0]
            if model in NAME_MAPPING.keys():
                
                df = pd.read_csv(path)

                task = path.split("/")[0]
                df["task"] = task

                # filter seed and similar ids
                temp_task = task if task != "code_repair" else "code_repair_long"
                ids_path = os.path.join(DATASET_BASEPATH, f"{temp_task}/included_ids.csv")
                included_ids = pd.read_csv(ids_path)
                mask_ids = df["idx.1"].isin(included_ids["idx"])
                mask_seed = df["seed"].isin(SEEDS)
                df = df[(mask_seed) & (mask_ids)].copy()
                df["model"] = model

                tuning_method = "_".join(os.path.splitext(filename)[0].split("_")[1:])
                df["tuning_method"] = tuning_method

                if task != "summarization":
                    # df.drop(columns=["codebleu_stat"], inplace=True)
                    df = df[["model", "tuning_method", "task", "seed", "codebleu-cn"]].copy()
                    df.rename(columns={"codebleu-cn":"codebleu"}, inplace=True)
                else:
                    df = df[["model", "tuning_method", "task",  "seed", "bleu-cn"]].copy().round(2)

                temp_list.append(df)

    df = pd.concat(temp_list)
    return df

In [9]:
df = create_df(output_paths)

temp_df = df.groupby(["model", "tuning_method", "task", "seed"], as_index=False).mean()
temp_std = temp_df.groupby(["model", "tuning_method", "task"], as_index=False)["codebleu"].std().round(2)

df.drop(columns=["seed"], inplace=True)

# Calculate the mean and standard deviation for each group
temp_mean = df.groupby(["model", "tuning_method", "task"], as_index=False).mean().round(2)

# Add a suffix to the columns to distinguish between mean and std
temp_mean = temp_mean.add_suffix('_mean')
temp_std = temp_std.add_suffix('_std')

# Merge mean and std DataFrames
df_metric = pd.merge(temp_mean, temp_std, left_on=["model_mean", "tuning_method_mean", "task_mean"], 
                         right_on=["model_std", "tuning_method_std", "task_std"])

# Drop redundant columns after merge
df_metric.drop(columns=["model_std", "tuning_method_std", "task_std", "task_mean"], inplace=True)

# Rename columns for clarity
df_metric.rename(columns={"model_mean": "backbone_model", "tuning_method_mean": "tuning_method"} , inplace=True)
df_metric["tuning_method"] = df_metric["tuning_method"].apply(lambda x: RENAME_TUNING_METHOD_DICT[x])


df_metric

Unnamed: 0,backbone_model,tuning_method,codebleu_mean,codebleu_std
0,codet5p-220m,GVE + ABF,82.32,0.3
1,codet5p-220m,Linear,83.08,0.07
2,codet5p-220m,No ABF,83.14,0.04
3,codet5p-220m,No GVE,77.07,8.62
4,codet5p-770m,GVE + ABF,81.16,0.71
5,codet5p-770m,Linear,79.31,4.64
6,codet5p-770m,No ABF,78.79,2.51
7,codet5p-770m,No GVE,83.16,0.01


# Perform Statistical Test

In [10]:
def read_csv(root_path: str)->list:
    output_paths = []
    for filename in os.listdir(root_path):
        filepath = os.path.join(root_path, filename)
        output_paths.append(filepath)
    return output_paths

output_paths = read_csv(TASK)
output_paths

['assert_generation/codet5p-770m_concatpervector_no_gve.csv',
 'assert_generation/codet5p-220m_concatpervector_no_gve.csv',
 'assert_generation/codet5p-220m_concatpervector.csv',
 'assert_generation/codet5p-770m_concatpervector_no_abf.csv',
 'assert_generation/codet5p-770m_concatpervector.csv',
 'assert_generation/codet5p-220m_concatpervector_no_abf.csv',
 'assert_generation/codet5p-770m_concatpervector_linear.csv',
 'assert_generation/codet5p-220m_concatpervector_linear.csv']

In [13]:
def create_dict_for_stat_test(paths: list)->pd.DataFrame:
    gaft_dict = {}
    baseline_dict = {}
    for path in paths:
        if "ipynb_checkpoints" not in path:
            filename = os.path.basename(path)
            backbone_model = filename.split("_")[0]
            tuning_method = "_".join(os.path.splitext(filename)[0].split("_")[1:])
    
            df = pd.read_csv(path)

            # filter seed and similar ids
            temp_task = TASK if TASK != "code_repair" else "code_repair_long"
            ids_path = os.path.join(DATASET_BASEPATH, f"{temp_task}/included_ids.csv")
            included_ids = pd.read_csv(ids_path)
            mask_ids = df["idx.1"].isin(included_ids["idx"])
            mask_seed = df["seed"].isin(SEEDS)
            df = df[(mask_seed) & (mask_ids)].copy()
            if tuning_method == "concatpervector":
                if backbone_model not in gaft_dict:
                    gaft_dict[backbone_model] = {}
                gaft_dict[backbone_model][tuning_method] = df
            else:
                if backbone_model not in baseline_dict:
                    baseline_dict[backbone_model] = {}
                baseline_dict[backbone_model][tuning_method] = df
    return gaft_dict, baseline_dict 

In [14]:
gaft_dict, baseline_dict = create_dict_for_stat_test(output_paths)

In [15]:
temp = gaft_dict["codet5p-770m"]["concatpervector"]
idx = 0
temp[temp["bleu-cn"] < 100].iloc[idx].preds, temp[temp["bleu-cn"] < 100].iloc[idx].labels

('org. junit. Assert. assertEquals ( FileSystemKind. OBJECT_STORE, s3. getKind ( ) ) org org org org',
 'org. junit. Assert. assertEquals ( FileSystemKind. OBJECT_STORE, s3. getKind ( ) )')

In [16]:
def perform_wilcoxon_test(gaft_dict, baseline_dict, metric_name):
    p_val_dict = {}
    r_val_dict = {}
    for backbone_model, tuning_method_dict in baseline_dict.items():
        df_reference = gaft_dict[backbone_model]["concatpervector"]
        group1 = df_reference[metric_name].to_list()
        p_val_dict[backbone_model] = {}
        r_val_dict[backbone_model] = {}
        for tuning_method, df_baseline in tuning_method_dict.items():    
            group2 = df_baseline[metric_name].to_list()
            assert len(group1) == len(group2)
            print(f"Performing test | {len(group1)} samples | {backbone_model} | concatpervector vs {tuning_method}")

            w, alternative, p, rbc, CLES = wilcoxon(group1, group2, **{"zero_method": "zsplit"}).iloc[0].to_list()

            # w, alternative, p = wilcoxon(group1, group2, alternative="two-sided", zero_method="zsplit")
            p_val_dict[backbone_model][tuning_method] = p
            r_val_dict[backbone_model][tuning_method] = rbc

    return p_val_dict, r_val_dict

In [17]:
NAME_MAPPING_INVERSE = {v:k for k,v in NAME_MAPPING.items()}
RENAME_TUNING_METHOD_INVERSE = {v:k for k,v in RENAME_TUNING_METHOD_DICT.items()}

In [19]:
RENAME_TUNING_METHOD_INVERSE

{'Full Fine-Tuning': 'full-finetuning',
 'Linear Adapter': 'no-gnn',
 'GVE + ABF': 'concatpervector',
 'LoRA': 'lora',
 'Prompt-Tuning': 'prompt-tuning',
 'Prefix-Tuning': 'prefix-tuning',
 'No Fine-Tuning': 'no-finetuning',
 'Linear': 'concatpervector_linear',
 'No GVE': 'concatpervector_no_gve',
 'No ABF': 'concatpervector_no_abf'}

In [20]:
def combine_df(df_input, p_val_dict, col_name):
    df = df_input.copy()
    df[col_name] = 0
    for idx, row in df.iterrows():
        backbone_model = row["backbone_model"]
        tuning_method = RENAME_TUNING_METHOD_INVERSE[row["tuning_method"]]
        if tuning_method in p_val_dict.get(backbone_model, {}):
            df.at[idx, col_name] = p_val_dict[backbone_model][tuning_method]
    return df

In [21]:
p_val_dict, r_val_dict = perform_wilcoxon_test(gaft_dict, baseline_dict, "codebleu-cn")
df_final = combine_df(df_metric, p_val_dict, "p_val")
df_final = combine_df(df_final, r_val_dict, "rbc")

Performing test | 6710 samples | codet5p-770m | concatpervector vs concatpervector_no_gve
Performing test | 6710 samples | codet5p-770m | concatpervector vs concatpervector_no_abf
Performing test | 6710 samples | codet5p-770m | concatpervector vs concatpervector_linear
Performing test | 6710 samples | codet5p-220m | concatpervector vs concatpervector_no_gve
Performing test | 6710 samples | codet5p-220m | concatpervector vs concatpervector_no_abf
Performing test | 6710 samples | codet5p-220m | concatpervector vs concatpervector_linear


  df.at[idx, col_name] = p_val_dict[backbone_model][tuning_method]
  df.at[idx, col_name] = p_val_dict[backbone_model][tuning_method]


In [23]:
ordering = ['backbone_model', 'tuning_method', 
            'codebleu_mean', 'codebleu_std','p_val', 'rbc'
        ]
df_final = df_final[ordering].copy()
df_final["backbone_model"] = df_final["backbone_model"].apply(lambda x: NAME_MAPPING[x]) 
df_final

Unnamed: 0,backbone_model,tuning_method,codebleu_mean,codebleu_std,p_val,rbc
0,CodeT5+ 220M,GVE + ABF,82.32,0.3,0.0,0.0
1,CodeT5+ 220M,Linear,83.08,0.07,5.413614999999999e-56,-0.978018
2,CodeT5+ 220M,No ABF,83.14,0.04,6.432163e-72,-0.976961
3,CodeT5+ 220M,No GVE,77.07,8.62,1.20516e-24,0.665405
4,CodeT5+ 770M,GVE + ABF,81.16,0.71,0.0,0.0
5,CodeT5+ 770M,Linear,79.31,4.64,0.3188278,0.383527
6,CodeT5+ 770M,No ABF,78.79,2.51,4.155117e-05,0.278437
7,CodeT5+ 770M,No GVE,83.16,0.01,6.780522e-30,-0.990473


In [24]:
df_final.to_csv(f"t_test_{TASK}.csv")