In [1]:

import os
import pandas as pd
from pingouin import wilcoxon
from tqdm import tqdm
# Initialize tqdm for pandas
tqdm.pandas()

In [2]:
TASK = "code_repair"
NAME_MAPPING = {
    "codet5p-220m": "CodeT5+ 220M",
    "codet5p-770m": "CodeT5+ 770M"
}
RENAME_TUNING_METHOD_DICT = {
    "full-finetuning": "Full Fine-Tuning",
    "no-gnn": "Linear Adapter",
    "concatpervector": "Transducer Tuning",
    "lora": "LoRA",
    "prompt-tuning": "Prompt-Tuning",
    "prefix-tuning": "Prefix-Tuning",
    "no-finetuning": "No Fine-Tuning"
}

SEEDS = ("seed_18_1", "seed_99_1")
DATASET_BASEPATH = "/data/datasets/fix/"

# Read Results

In [3]:
def read_csv(root_path: str)->list:
    output_paths = []
    for filename in os.listdir(root_path):
        filepath = os.path.join(root_path, filename)
        output_paths.append(filepath)
    return output_paths

output_paths = read_csv(TASK)
output_paths[:5]

['code_repair/codet5p-220m_prompt-tuning.csv',
 'code_repair/codet5p-220m_concatpervector.csv',
 'code_repair/codet5p-770m_full-finetuning.csv',
 'code_repair/codet5p-770m_lora.csv',
 'code_repair/codet5p-220m_prefix-tuning.csv']

In [4]:
def create_df(paths: list)->pd.DataFrame:
    temp_list = []
    for path in paths:
        if "ipynb_checkpoints" not in path:
            filename = os.path.basename(path)
            filename = filename.split("_")
            model = filename[0]
            if model in NAME_MAPPING.keys():
                
                df = pd.read_csv(path)

                task = path.split("/")[0]
                df["task"] = task

                # filter seed and similar ids
                temp_task = task if task != "code_repair" else "code_repair_long"
                ids_path = os.path.join(DATASET_BASEPATH, f"{temp_task}/included_ids.csv")
                included_ids = pd.read_csv(ids_path)
                mask_ids = df["idx.1"].isin(included_ids["idx"])
                mask_seed = df["seed"].isin(SEEDS)
                df = df[(mask_seed) & (mask_ids)].copy()
                df["model"] = model

                tuning_method = os.path.splitext(filename[1])[0]
                df["tuning_method"] = tuning_method

                if task != "summarization":
                    # df.drop(columns=["codebleu_stat"], inplace=True)
                    df = df[["model", "tuning_method", "task", "seed", "codebleu-cn"]].copy()
                    df.rename(columns={"codebleu-cn":"codebleu"}, inplace=True)
                else:
                    df = df[["model", "tuning_method", "task",  "seed", "bleu-cn"]].copy().round(2)

                temp_list.append(df)

    df = pd.concat(temp_list)
    return df

In [5]:
df = create_df(output_paths)

temp_df = df.groupby(["model", "tuning_method", "task", "seed"], as_index=False).mean()
temp_std = temp_df.groupby(["model", "tuning_method", "task"], as_index=False)["codebleu"].std().round(2)

df.drop(columns=["seed"], inplace=True)

# Calculate the mean and standard deviation for each group
temp_mean = df.groupby(["model", "tuning_method", "task"], as_index=False).mean().round(2)

# Add a suffix to the columns to distinguish between mean and std
temp_mean = temp_mean.add_suffix('_mean')
temp_std = temp_std.add_suffix('_std')

# Merge mean and std DataFrames
df_metric = pd.merge(temp_mean, temp_std, left_on=["model_mean", "tuning_method_mean", "task_mean"], 
                         right_on=["model_std", "tuning_method_std", "task_std"])

# Drop redundant columns after merge
df_metric.drop(columns=["model_std", "tuning_method_std", "task_std", "task_mean"], inplace=True)

# Rename columns for clarity
df_metric.rename(columns={"model_mean": "backbone_model", "tuning_method_mean": "tuning_method"} , inplace=True)
df_metric["tuning_method"] = df_metric["tuning_method"].apply(lambda x: RENAME_TUNING_METHOD_DICT[x])


df_metric

Unnamed: 0,backbone_model,tuning_method,codebleu_mean,codebleu_std
0,codet5p-220m,Transducer Tuning,98.1,0.39
1,codet5p-220m,Full Fine-Tuning,99.87,0.0
2,codet5p-220m,LoRA,99.87,0.0
3,codet5p-220m,No Fine-Tuning,96.0,0.0
4,codet5p-220m,Linear Adapter,99.31,0.71
5,codet5p-220m,Prefix-Tuning,99.87,0.0
6,codet5p-220m,Prompt-Tuning,97.46,1.13
7,codet5p-770m,Transducer Tuning,96.75,3.94
8,codet5p-770m,Full Fine-Tuning,99.87,0.0
9,codet5p-770m,LoRA,99.87,0.0


# Merge with Trainable Param

In [6]:
param_dict = {
    "codet5p-220m": {
        "Transducer Tuning": 30728,
        "Prefix-Tuning": 184320,
        "Prompt-Tuning": 38400,
        "LoRA": 884736,
        "Full Fine-Tuning": 222882048,
        "Linear Adapter": 589824
    },
    "codet5p-770m": {
        "Transducer Tuning": 37128,
        "Prefix-Tuning": 491520,
        "Prompt-Tuning": 102400,
        "LoRA": 2359296,
        "Full Fine-Tuning": 737639424,
        "Linear Adapter": 1048576
    },
}

In [7]:
# Initialize the new dictionary
converted_param_dict = {
    "backbone_model": [],
    "tuning_method": [],
    "trainable_param": []
}

# Iterate through the original dictionary to populate the new one
for model, approaches in param_dict.items():
    for approach, param in approaches.items():
        converted_param_dict["backbone_model"].append(model)
        converted_param_dict["tuning_method"].append(approach)
        converted_param_dict["trainable_param"].append(param)

In [8]:
usage = pd.DataFrame(converted_param_dict)
usage

Unnamed: 0,backbone_model,tuning_method,trainable_param
0,codet5p-220m,Transducer Tuning,30728
1,codet5p-220m,Prefix-Tuning,184320
2,codet5p-220m,Prompt-Tuning,38400
3,codet5p-220m,LoRA,884736
4,codet5p-220m,Full Fine-Tuning,222882048
5,codet5p-220m,Linear Adapter,589824
6,codet5p-770m,Transducer Tuning,37128
7,codet5p-770m,Prefix-Tuning,491520
8,codet5p-770m,Prompt-Tuning,102400
9,codet5p-770m,LoRA,2359296


In [9]:
df_metric = pd.merge(left=df_metric, right=usage, on=["backbone_model", "tuning_method"], how="outer")
df_metric.fillna(0, inplace=True)
df_metric

Unnamed: 0,backbone_model,tuning_method,codebleu_mean,codebleu_std,trainable_param
0,codet5p-220m,Transducer Tuning,98.1,0.39,30728.0
1,codet5p-220m,Full Fine-Tuning,99.87,0.0,222882048.0
2,codet5p-220m,LoRA,99.87,0.0,884736.0
3,codet5p-220m,No Fine-Tuning,96.0,0.0,0.0
4,codet5p-220m,Linear Adapter,99.31,0.71,589824.0
5,codet5p-220m,Prefix-Tuning,99.87,0.0,184320.0
6,codet5p-220m,Prompt-Tuning,97.46,1.13,38400.0
7,codet5p-770m,Transducer Tuning,96.75,3.94,37128.0
8,codet5p-770m,Full Fine-Tuning,99.87,0.0,737639424.0
9,codet5p-770m,LoRA,99.87,0.0,2359296.0


# Perform Statistical Test

In [10]:
def read_csv(root_path: str)->list:
    output_paths = []
    for filename in os.listdir(root_path):
        filepath = os.path.join(root_path, filename)
        output_paths.append(filepath)
    return output_paths

output_paths = read_csv(TASK)
output_paths

['code_repair/codet5p-220m_prompt-tuning.csv',
 'code_repair/codet5p-220m_concatpervector.csv',
 'code_repair/codet5p-770m_full-finetuning.csv',
 'code_repair/codet5p-770m_lora.csv',
 'code_repair/codet5p-220m_prefix-tuning.csv',
 'code_repair/codet5p-770m_prefix-tuning.csv',
 'code_repair/codet5p-770m_no-finetuning.csv',
 'code_repair/codet5p-770m_concatpervector.csv',
 'code_repair/codet5p-220m_no-finetuning.csv',
 'code_repair/codet5p-220m_full-finetuning.csv',
 'code_repair/codet5p-220m_lora.csv',
 'code_repair/codet5p-220m_no-gnn.csv',
 'code_repair/codet5p-770m_no-gnn.csv',
 'code_repair/codet5p-770m_prompt-tuning.csv']

In [11]:
def create_dict_for_stat_test(paths: list)->pd.DataFrame:
    gaft_dict = {}
    baseline_dict = {}
    for path in paths:
        if "ipynb_checkpoints" not in path:
            filename = os.path.basename(path)
            filename = filename.split("_")
            backbone_model = filename[0]
            tuning_method = os.path.splitext(filename[1])[0]
    
            df = pd.read_csv(path)

            # filter seed and similar ids
            temp_task = TASK if TASK != "code_repair" else "code_repair_long"
            ids_path = os.path.join(DATASET_BASEPATH, f"{temp_task}/included_ids.csv")
            included_ids = pd.read_csv(ids_path)
            mask_ids = df["idx.1"].isin(included_ids["idx"])
            mask_seed = df["seed"].isin(SEEDS)
            df = df[(mask_seed) & (mask_ids)].copy()
            if tuning_method == "concatpervector":
                if backbone_model not in gaft_dict:
                    gaft_dict[backbone_model] = {}
                gaft_dict[backbone_model][tuning_method] = df
            else:
                if backbone_model not in baseline_dict:
                    baseline_dict[backbone_model] = {}
                baseline_dict[backbone_model][tuning_method] = df
    return gaft_dict, baseline_dict 

In [12]:
gaft_dict, baseline_dict = create_dict_for_stat_test(output_paths)

In [13]:
temp = gaft_dict["codet5p-770m"]["concatpervector"]
idx = 0
temp[temp["bleu-cn"] < 100].iloc[idx].preds, temp[temp["bleu-cn"] < 100].iloc[idx].labels

('private void METHOD_1 ( java.util.List < TYPE_1 > parameters, TYPE_2 VAR_1 ) { while ( VAR_1. METHOD_2 ( ) ) { TYPE_3 VAR_2 = METHOD_3 ( VAR_1 ) ; if ( VAR_2 == null ) { break ; } if ( VAR_2. METHOD_4 ( ) ) { METHOD_6 ( parameters, METHOD_7 ( VAR_2 ) ) ; } VAR_1 = ( ( TYPE_2 ) ( VAR_2. METHOD_8 ( ) ) ) ; } } METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD METHOD',
 'private void METHOD_1 ( java.util.List < TYPE_1 > parameters, TYPE_2 VAR_1 ) { while ( VAR_1. ME

In [14]:
def perform_wilcoxon_test(gaft_dict, baseline_dict, metric_name):
    p_val_dict = {}
    r_val_dict = {}
    for backbone_model, tuning_method_dict in baseline_dict.items():
        df_reference = gaft_dict[backbone_model]["concatpervector"]
        group1 = df_reference[metric_name].to_list()
        p_val_dict[backbone_model] = {}
        r_val_dict[backbone_model] = {}
        for tuning_method, df_baseline in tuning_method_dict.items():    
            group2 = df_baseline[metric_name].to_list()
            assert len(group1) == len(group2)
            print(f"Performing test | {len(group1)} samples | {backbone_model} | concatpervector vs {tuning_method}")

            w, alternative, p, rbc, CLES = wilcoxon(group1, group2, **{"zero_method": "zsplit"}).iloc[0].to_list()

            # w, alternative, p = wilcoxon(group1, group2, alternative="two-sided", zero_method="zsplit")
            p_val_dict[backbone_model][tuning_method] = p
            r_val_dict[backbone_model][tuning_method] = rbc

    return p_val_dict, r_val_dict

In [15]:
NAME_MAPPING_INVERSE = {v:k for k,v in NAME_MAPPING.items()}
RENAME_TUNING_METHOD_INVERSE = {v:k for k,v in RENAME_TUNING_METHOD_DICT.items()}

In [16]:
def combine_df(df_input, p_val_dict, col_name):
    df = df_input.copy()
    df[col_name] = 0
    for idx, row in df.iterrows():
        backbone_model = row["backbone_model"]
        tuning_method = RENAME_TUNING_METHOD_INVERSE[row["tuning_method"]]
        if tuning_method in p_val_dict.get(backbone_model, {}):
            df.at[idx, col_name] = p_val_dict[backbone_model][tuning_method]
    return df

In [17]:
p_val_dict, r_val_dict = perform_wilcoxon_test(gaft_dict, baseline_dict, "codebleu-cn")
df_final = combine_df(df_metric, p_val_dict, "p_val")
df_final = combine_df(df_final, r_val_dict, "rbc")

Performing test | 5672 samples | codet5p-220m | concatpervector vs prompt-tuning
Performing test | 5672 samples | codet5p-220m | concatpervector vs prefix-tuning
Performing test | 5672 samples | codet5p-220m | concatpervector vs no-finetuning
Performing test | 5672 samples | codet5p-220m | concatpervector vs full-finetuning
Performing test | 5672 samples | codet5p-220m | concatpervector vs lora
Performing test | 5672 samples | codet5p-220m | concatpervector vs no-gnn
Performing test | 5672 samples | codet5p-770m | concatpervector vs full-finetuning
Performing test | 5672 samples | codet5p-770m | concatpervector vs lora
Performing test | 5672 samples | codet5p-770m | concatpervector vs prefix-tuning
Performing test | 5672 samples | codet5p-770m | concatpervector vs no-finetuning
Performing test | 5672 samples | codet5p-770m | concatpervector vs no-gnn
Performing test | 5672 samples | codet5p-770m | concatpervector vs prompt-tuning


  df.at[idx, col_name] = p_val_dict[backbone_model][tuning_method]
  df.at[idx, col_name] = p_val_dict[backbone_model][tuning_method]


In [18]:
ordering = ['backbone_model', 'tuning_method', 'trainable_param', 
            'codebleu_mean', 'codebleu_std','p_val', 'rbc'
        ]
df_final = df_final[ordering].copy()
df_final["backbone_model"] = df_final["backbone_model"].apply(lambda x: NAME_MAPPING[x]) 
df_final

Unnamed: 0,backbone_model,tuning_method,trainable_param,codebleu_mean,codebleu_std,p_val,rbc
0,CodeT5+ 220M,Transducer Tuning,30728.0,98.1,0.39,0.0,0.0
1,CodeT5+ 220M,Full Fine-Tuning,222882048.0,99.87,0.0,1.853589e-46,-1.0
2,CodeT5+ 220M,LoRA,884736.0,99.87,0.0,1.853589e-46,-1.0
3,CodeT5+ 220M,No Fine-Tuning,0.0,96.0,0.0,1.2210329999999999e-26,0.796826
4,CodeT5+ 220M,Linear Adapter,589824.0,99.31,0.71,2.205944e-25,-0.63346
5,CodeT5+ 220M,Prefix-Tuning,184320.0,99.87,0.0,1.853589e-46,-1.0
6,CodeT5+ 220M,Prompt-Tuning,38400.0,97.46,1.13,0.2148741,0.212282
7,CodeT5+ 770M,Transducer Tuning,37128.0,96.75,3.94,0.0,0.0
8,CodeT5+ 770M,Full Fine-Tuning,737639424.0,99.87,0.0,5.500745e-40,-1.0
9,CodeT5+ 770M,LoRA,2359296.0,99.87,0.0,5.500745e-40,-1.0


In [19]:
df_final.to_csv(f"t_test_{TASK}.csv")