In [2]:
import re

def extract_correctness(text):
    """Return the first standalone "correct" / "incorrect" word found in ``text``.

    The search is case-insensitive and matches whole words only, so
    "incorrectly" does not count. The match is returned with the casing it
    has in ``text`` (e.g. ``"Incorrect"``), not normalised.

    Args:
        text: Text to scan. Non-string values (for example ``None`` or a
            float NaN read from an empty cell) are treated as having no verdict.

    Returns:
        The matched substring, or ``None`` when ``text`` is not a string or
        contains neither word.
    """
    # re.search raises TypeError on non-string input; a missing value simply has no verdict.
    if not isinstance(text, str):
        return None
    match = re.search(r'\b(correct|incorrect)\b', text, re.IGNORECASE)
    return match.group(0) if match else None

In [3]:
# Attack names to aggregate results for.
attacks = [
    'naive',
    'alter',
    'irrelevant',
    'paraphrase',
]

# Target model names to aggregate results for.
models = [
    'deepseek_7b',
    'deepseek_14b',
    'deepseek_32b',
    'qwen_2_5_7b',
    'qwq_32b',
    'llama_3_1_8B_it',
]

In [7]:
import os
from tqdm import tqdm
import pandas as pd

# Aggregate per-run evaluation CSVs into one accuracy table (one row per attack/model pair).
columns = ['attack', 'model', 'judge_model', 'overall_acc', 'easy_acc', 'med_acc', 'hard_acc']
results_df_path = 'all_results.csv'

judge_model = "llama_3_1_8B_it"


def _accuracy_pct(is_correct):
    """Return the percentage of True values in a boolean Series, or NaN when it is empty.

    An empty bucket (e.g. a results file with no 'Hard' rows) yields NaN
    instead of raising ZeroDivisionError.
    """
    if len(is_correct) == 0:
        return float('nan')
    return (int(is_correct.sum()) / len(is_correct)) * 100


# Paths of expected result files that were not found on disk.
not_exist = []
# Nothing is appended here: the "fewer than 300 rows" completeness check is disabled.
# The list is kept so that cells displaying it keep working.
not_complete = []
# One dict per processed file; turned into a DataFrame in one go instead of growing it row by row.
result_rows = []
for attack in tqdm(attacks):
    for model in tqdm(models):
        csv_path = f'target_{model}_judge_{judge_model}_attack_{attack}.csv'
        if not os.path.exists(csv_path):
            not_exist.append(csv_path)
            continue

        df = pd.read_csv(csv_path)
        # Remove duplicate rows based on 'orig_question' and 'ground_truth'
        df = df.drop_duplicates(subset=['orig_question', 'ground_truth'], keep='first')

        # A row counts as correct only when eval_result is exactly the string 'correct'.
        is_correct = df['eval_result'] == 'correct'
        difficulty = df['difficulty']

        overall_acc = _accuracy_pct(is_correct)
        easy_acc = _accuracy_pct(is_correct[difficulty == 'Easy'])
        med_acc = _accuracy_pct(is_correct[difficulty == 'Medium'])
        hard_acc = _accuracy_pct(is_correct[difficulty == 'Hard'])

        print("*" * 50)
        print(f"len(df): {len(df)}")
        print(f"attack: {attack}")
        print(f"model: {model}")
        print(f"judge_model: {judge_model}")
        print(f"overall_acc: {overall_acc}")
        print(f"easy_acc: {easy_acc}")
        print(f"med_acc: {med_acc}")
        print(f"hard_acc: {hard_acc}")
        print("*" * 50)

        result_rows.append({
            'attack': attack,
            'model': model,
            'judge_model': judge_model,
            'overall_acc': overall_acc,
            'easy_acc': easy_acc,
            'med_acc': med_acc,
            'hard_acc': hard_acc,
        })
        # Checkpoint after every file so partial results survive an interrupted run.
        pd.DataFrame(result_rows, columns=columns).to_csv(results_df_path, index=False)

results_df = pd.DataFrame(result_rows, columns=columns)
# Remove duplicates from the final results dataframe based on 'attack', 'model', and 'judge_model'
results_df = results_df.drop_duplicates(subset=['attack', 'model', 'judge_model'], keep='first')
results_df.to_csv(results_df_path, index=False)

100%|██████████| 6/6 [00:00<00:00, 44.44it/s]
 25%|██▌       | 1/4 [00:00<00:00,  7.28it/s]

**************************************************
len(df): 109
attack: naive
model: deepseek_7b
judge_model: llama_3_1_8B_it
overall_acc: 34.862385321100916
easy_acc: 18.181818181818183
med_acc: 33.33333333333333
hard_acc: 55.00000000000001
**************************************************
**************************************************
len(df): 75
attack: naive
model: deepseek_14b
judge_model: llama_3_1_8B_it
overall_acc: 25.333333333333336
easy_acc: 20.0
med_acc: 23.25581395348837
hard_acc: 37.5
**************************************************
**************************************************
len(df): 46
attack: naive
model: deepseek_32b
judge_model: llama_3_1_8B_it
overall_acc: 15.217391304347828
easy_acc: 30.0
med_acc: 7.6923076923076925
hard_acc: 11.11111111111111
**************************************************
**************************************************
len(df): 300
attack: naive
model: qwen_2_5_7b
judge_model: llama_3_1_8B_it
overall_acc: 9.0
easy_acc: 11.42857



**************************************************
len(df): 74
attack: alter
model: deepseek_7b
judge_model: llama_3_1_8B_it
overall_acc: 47.2972972972973
easy_acc: 20.0
med_acc: 50.0
hard_acc: 68.75
**************************************************
**************************************************
len(df): 50
attack: alter
model: deepseek_14b
judge_model: llama_3_1_8B_it
overall_acc: 40.0
easy_acc: 40.0
med_acc: 37.93103448275862
hard_acc: 40.0
**************************************************
**************************************************
len(df): 25
attack: alter
model: deepseek_32b
judge_model: llama_3_1_8B_it
overall_acc: 8.0
easy_acc: 14.285714285714285
med_acc: 8.333333333333332
hard_acc: 0.0
**************************************************


100%|██████████| 6/6 [00:00<00:00, 40.48it/s]
 50%|█████     | 2/4 [00:00<00:00,  6.88it/s]

**************************************************
len(df): 300
attack: alter
model: qwen_2_5_7b
judge_model: llama_3_1_8B_it
overall_acc: 5.0
easy_acc: 5.714285714285714
med_acc: 4.046242774566474
hard_acc: 7.142857142857142
**************************************************
**************************************************
len(df): 90
attack: alter
model: qwq_32b
judge_model: llama_3_1_8B_it
overall_acc: 14.444444444444443
easy_acc: 11.11111111111111
med_acc: 17.307692307692307
hard_acc: 10.526315789473683
**************************************************
**************************************************
len(df): 300
attack: alter
model: llama_3_1_8B_it
judge_model: llama_3_1_8B_it
overall_acc: 19.0
easy_acc: 24.285714285714285
med_acc: 17.91907514450867
hard_acc: 14.285714285714285
**************************************************




**************************************************
len(df): 108
attack: irrelevant
model: deepseek_7b
judge_model: llama_3_1_8B_it
overall_acc: 37.03703703703704
easy_acc: 13.636363636363635
med_acc: 36.92307692307693
hard_acc: 60.0
**************************************************
**************************************************
len(df): 78
attack: irrelevant
model: deepseek_14b
judge_model: llama_3_1_8B_it
overall_acc: 25.64102564102564
easy_acc: 20.0
med_acc: 22.22222222222222
hard_acc: 41.17647058823529
**************************************************
**************************************************
len(df): 40
attack: irrelevant
model: deepseek_32b
judge_model: llama_3_1_8B_it
overall_acc: 5.0
easy_acc: 10.0
med_acc: 4.545454545454546
hard_acc: 0.0
**************************************************
**************************************************
len(df): 300
attack: irrelevant
model: qwen_2_5_7b
judge_model: llama_3_1_8B_it
overall_acc: 9.666666666666666
easy_acc: 8.5714

100%|██████████| 6/6 [00:00<00:00, 50.01it/s]
 75%|███████▌  | 3/4 [00:00<00:00,  7.40it/s]

**************************************************
len(df): 122
attack: irrelevant
model: qwq_32b
judge_model: llama_3_1_8B_it
overall_acc: 22.950819672131146
easy_acc: 21.428571428571427
med_acc: 25.0
hard_acc: 19.047619047619047
**************************************************
**************************************************
len(df): 300
attack: irrelevant
model: llama_3_1_8B_it
judge_model: llama_3_1_8B_it
overall_acc: 23.0
easy_acc: 28.57142857142857
med_acc: 19.653179190751445
hard_acc: 25.0
**************************************************




**************************************************
len(df): 116
attack: paraphrase
model: deepseek_7b
judge_model: llama_3_1_8B_it
overall_acc: 25.0
easy_acc: 16.0
med_acc: 21.73913043478261
hard_acc: 47.61904761904761
**************************************************
**************************************************
len(df): 72
attack: paraphrase
model: deepseek_14b
judge_model: llama_3_1_8B_it
overall_acc: 29.166666666666668
easy_acc: 6.666666666666667
med_acc: 35.0
hard_acc: 37.5
**************************************************
**************************************************
len(df): 42
attack: paraphrase
model: deepseek_32b
judge_model: llama_3_1_8B_it
overall_acc: 9.523809523809524
easy_acc: 10.0
med_acc: 12.5
hard_acc: 0.0
**************************************************
**************************************************
len(df): 300
attack: paraphrase
model: qwen_2_5_7b
judge_model: llama_3_1_8B_it
overall_acc: 8.333333333333332
easy_acc: 8.571428571428571
med_acc: 8.09

100%|██████████| 6/6 [00:00<00:00, 50.42it/s]
100%|██████████| 4/4 [00:00<00:00,  7.48it/s]

**************************************************
len(df): 300
attack: paraphrase
model: llama_3_1_8B_it
judge_model: llama_3_1_8B_it
overall_acc: 22.333333333333332
easy_acc: 28.57142857142857
med_acc: 20.809248554913296
hard_acc: 19.642857142857142
**************************************************





In [6]:
# Show the result files flagged as incomplete. The row-count check that would
# append to this list is currently disabled, so it stays empty.
not_complete

[]

In [6]:
# Show the expected per-run result CSV paths that were not found on disk and were skipped.
not_exist

[]