In [1]:
import re

def pattern_matching(output_text):
    """
    Extracts correctness and reasoning score from output_text using regex.
    Handles both markdown-style (bold `**text:**`) and plain text formats.
    """
    # Regex patterns to capture numbers and words after "Reasoning Score:" and "Correctness:"
    reasoning_pattern = r'(?:\*\*Reasoning Score:\*\*|Reasoning Score:)\s*(\d+)'
    correctness_pattern = r'(?:\*\*Correctness:\*\*|Correctness:)\s*(\w+)'

    # Search for Reasoning Score
    score_match = re.search(reasoning_pattern, output_text)
    reasoning_score = int(score_match.group(1)) if score_match else 0  # Return 0 instead of NaN

    # Search for Correctness
    correctness_match = re.search(correctness_pattern, output_text)
    correctness = correctness_match.group(1) if correctness_match else "unknown"  # Default to "unknown" if missing

    return correctness, reasoning_score

In [2]:
import re
import glob
import pandas as pd
from tqdm import tqdm
import math

all_csv = 'all_results_reasoning.csv'
results_df = pd.DataFrame(columns=['csv_path', 'len(df)', 'correctness_acc', 'easy_acc', 'med_acc', 'hard_acc', 'avg_reasoning_score', 'avg_easy_reasoning_score', 'avg_med_reasoning_score', 'avg_hard_reasoning_score'])

csv_paths = glob.glob("*.csv")
for csv_path in tqdm(csv_paths):
    if csv_path == all_csv:
        continue

    easy_corr_list, med_corr_list, hard_corr_list = [], [], []
    easy_reas_list, med_reas_list, hard_reas_list = [], [], []

    correctness_list = []
    reasoning_scores = []

    df = pd.read_csv(csv_path)
    for idx, row in df.iterrows():
        difficulty = row['difficulty']
        correctness = row['correctness']
        reasoning_score = row['reasoning_score']
        output_text = row['eval_result']
        # print(f'{output_text}')

        # if reasoning_score is None or math.isnan(reasoning_score):
        correctness, reasoning_score = pattern_matching(output_text)
        correctness_list.append(correctness)
        reasoning_scores.append(reasoning_score)
        if difficulty == 'Easy':
            easy_corr_list.append(correctness)
            easy_reas_list.append(reasoning_score)
        elif difficulty == 'Medium':
            med_corr_list.append(correctness)
            med_reas_list.append(reasoning_score)
        elif difficulty == 'Hard':
            hard_corr_list.append(correctness)
            hard_reas_list.append(reasoning_score)

        df.at[idx, 'correctness'] = correctness
        df.at[idx, 'reasoning_score'] = reasoning_score
        df.to_csv(csv_path)

        # print(f'correctness: {correctness}, {type(correctness)}')
        # print(f'reasoning_score: {reasoning_score}, {type(reasoning_score)}')
    
    correctness_acc = (correctness_list.count('correct') / len(correctness_list)) * 100
    easy_acc = (easy_corr_list.count('correct') / len(easy_corr_list)) * 100
    med_acc = (med_corr_list.count('correct') / len(med_corr_list)) * 100
    hard_acc = (hard_corr_list.count('correct') / len(hard_corr_list)) * 100

    avg_reasoning_score = (sum(reasoning_scores) / len(reasoning_scores))
    avg_easy_reasoning_score = (sum(easy_reas_list) / len(easy_reas_list))
    avg_med_reasoning_score = (sum(med_reas_list) / len(med_reas_list))
    avg_hard_reasoning_score = (sum(hard_reas_list) / len(hard_reas_list))


    print('\n')
    print("*"*50)
    print(f"csv_path: {csv_path}")
    print(f"len(df): {len(df)}")
    print(f"correctness_acc: {correctness_acc}")
    print(f"easy_acc: {easy_acc}")
    print(f"med_acc: {med_acc}")
    print(f"hard_acc: {hard_acc}")
    print('\n')
    print(f"avg_reasoning_score: {avg_reasoning_score}")
    print(f"avg_easy_reasoning_score: {avg_easy_reasoning_score}")
    print(f"avg_med_reasoning_score: {avg_med_reasoning_score}")
    print(f"avg_hard_reasoning_score: {avg_hard_reasoning_score}")
    print("*"*50)
    print('\n')

    new_row = {
        "csv_path": csv_path,
        "len(df)": len(df),
        "correctness_acc": correctness_acc,
        "easy_acc": easy_acc,
        "med_acc": med_acc,
        "hard_acc": hard_acc,
        "avg_reasoning_score": avg_reasoning_score,
        "avg_easy_reasoning_score": avg_easy_reasoning_score,
        "avg_med_reasoning_score": avg_med_reasoning_score,
        "avg_hard_reasoning_score": avg_hard_reasoning_score,
    }

    new_idx = len(results_df)
    results_df.loc[new_idx] = new_row
    results_df.to_csv(all_csv)


  8%|▊         | 2/25 [00:32<06:08, 16.00s/it]



**************************************************
csv_path: target_deepseek_14b_judge_llama_3_1_8B_it_attack_alter.csv
len(df): 193
correctness_acc: 96.37305699481865
easy_acc: 98.33333333333333
med_acc: 97.32142857142857
hard_acc: 85.71428571428571


avg_reasoning_score: 4.694300518134715
avg_easy_reasoning_score: 4.633333333333334
avg_med_reasoning_score: 4.741071428571429
avg_hard_reasoning_score: 4.619047619047619
**************************************************




 12%|█▏        | 3/25 [00:48<05:52, 16.04s/it]



**************************************************
csv_path: target_deepseek_14b_judge_llama_3_1_8B_it_attack_irrelevant.csv
len(df): 235
correctness_acc: 100.0
easy_acc: 100.0
med_acc: 100.0
hard_acc: 100.0


avg_reasoning_score: 4.965957446808511
avg_easy_reasoning_score: 4.955223880597015
avg_med_reasoning_score: 4.978260869565218
avg_hard_reasoning_score: 4.931034482758621
**************************************************




 16%|█▌        | 4/25 [01:10<06:28, 18.50s/it]



**************************************************
csv_path: target_deepseek_14b_judge_llama_3_1_8B_it_attack_naive.csv
len(df): 271
correctness_acc: 99.63099630996311
easy_acc: 100.0
med_acc: 99.37888198757764
hard_acc: 100.0


avg_reasoning_score: 4.974169741697417
avg_easy_reasoning_score: 5.0
avg_med_reasoning_score: 4.968944099378882
avg_hard_reasoning_score: 4.948717948717949
**************************************************




 20%|██        | 5/25 [01:48<08:26, 25.32s/it]



**************************************************
csv_path: target_deepseek_14b_judge_llama_3_1_8B_it_attack_paraphrase.csv
len(df): 269
correctness_acc: 96.6542750929368
easy_acc: 95.71428571428572
med_acc: 98.10126582278481
hard_acc: 92.5


avg_reasoning_score: 4.921933085501859
avg_easy_reasoning_score: 4.8428571428571425
avg_med_reasoning_score: 4.981012658227848
avg_hard_reasoning_score: 4.825
**************************************************




 24%|██▍       | 6/25 [02:02<06:44, 21.30s/it]



**************************************************
csv_path: target_deepseek_32b_judge_llama_3_1_8B_it_attack_alter.csv
len(df): 204
correctness_acc: 97.05882352941177
easy_acc: 95.08196721311475
med_acc: 97.45762711864407
hard_acc: 100.0


avg_reasoning_score: 4.745098039215686
avg_easy_reasoning_score: 4.672131147540983
avg_med_reasoning_score: 4.762711864406779
avg_hard_reasoning_score: 4.875
**************************************************




 28%|██▊       | 7/25 [02:20<06:07, 20.40s/it]



**************************************************
csv_path: target_deepseek_32b_judge_llama_3_1_8B_it_attack_irrelevant.csv
len(df): 274
correctness_acc: 98.54014598540147
easy_acc: 100.0
med_acc: 98.10126582278481
hard_acc: 97.77777777777777


avg_reasoning_score: 4.916058394160584
avg_easy_reasoning_score: 4.942857142857143
avg_med_reasoning_score: 4.8924050632911396
avg_hard_reasoning_score: 4.955555555555556
**************************************************




 32%|███▏      | 8/25 [02:41<05:47, 20.46s/it]



**************************************************
csv_path: target_deepseek_32b_judge_llama_3_1_8B_it_attack_naive.csv
len(df): 286
correctness_acc: 99.3006993006993
easy_acc: 98.57142857142858
med_acc: 99.41176470588235
hard_acc: 100.0


avg_reasoning_score: 4.937062937062937
avg_easy_reasoning_score: 4.9
avg_med_reasoning_score: 4.9411764705882355
avg_hard_reasoning_score: 4.977777777777778
**************************************************




 36%|███▌      | 9/25 [03:01<05:27, 20.44s/it]



**************************************************
csv_path: target_deepseek_32b_judge_llama_3_1_8B_it_attack_paraphrase.csv
len(df): 289
correctness_acc: 98.26989619377161
easy_acc: 98.57142857142858
med_acc: 98.22485207100591
hard_acc: 97.95918367346938


avg_reasoning_score: 4.882352941176471
avg_easy_reasoning_score: 4.9714285714285715
avg_med_reasoning_score: 4.846153846153846
avg_hard_reasoning_score: 4.877551020408164
**************************************************




 40%|████      | 10/25 [03:08<04:02, 16.20s/it]



**************************************************
csv_path: target_deepseek_7b_judge_llama_3_1_8B_it_attack_alter.csv
len(df): 130
correctness_acc: 93.84615384615384
easy_acc: 91.30434782608695
med_acc: 93.84615384615384
hard_acc: 100.0


avg_reasoning_score: 4.630769230769231
avg_easy_reasoning_score: 4.565217391304348
avg_med_reasoning_score: 4.6
avg_hard_reasoning_score: 4.888888888888889
**************************************************




 44%|████▍     | 11/25 [03:16<03:15, 13.94s/it]



**************************************************
csv_path: target_deepseek_7b_judge_llama_3_1_8B_it_attack_irrelevant.csv
len(df): 195
correctness_acc: 99.48717948717949
easy_acc: 100.0
med_acc: 99.11504424778761
hard_acc: 100.0


avg_reasoning_score: 4.9282051282051285
avg_easy_reasoning_score: 4.982456140350878
avg_med_reasoning_score: 4.902654867256637
avg_hard_reasoning_score: 4.916666666666667
**************************************************




 48%|████▊     | 12/25 [03:25<02:40, 12.34s/it]



**************************************************
csv_path: target_deepseek_7b_judge_llama_3_1_8B_it_attack_naive.csv
len(df): 194
correctness_acc: 99.48453608247422
easy_acc: 100.0
med_acc: 99.09909909909909
hard_acc: 100.0


avg_reasoning_score: 4.938144329896907
avg_easy_reasoning_score: 4.967213114754099
avg_med_reasoning_score: 4.936936936936937
avg_hard_reasoning_score: 4.857142857142857
**************************************************




 52%|█████▏    | 13/25 [03:34<02:14, 11.18s/it]



**************************************************
csv_path: target_deepseek_7b_judge_llama_3_1_8B_it_attack_paraphrase.csv
len(df): 194
correctness_acc: 99.48453608247422
easy_acc: 98.4126984126984
med_acc: 100.0
hard_acc: 100.0


avg_reasoning_score: 4.963917525773196
avg_easy_reasoning_score: 4.968253968253968
avg_med_reasoning_score: 4.972477064220183
avg_hard_reasoning_score: 4.904761904761905
**************************************************




 56%|█████▌    | 14/25 [03:40<01:46,  9.73s/it]



**************************************************
csv_path: target_llama_3_1_8B_it_judge_llama_3_1_8B_it_attack_alter.csv
len(df): 300
correctness_acc: 96.66666666666667
easy_acc: 95.71428571428572
med_acc: 96.53179190751445
hard_acc: 98.21428571428571


avg_reasoning_score: 4.773333333333333
avg_easy_reasoning_score: 4.757142857142857
avg_med_reasoning_score: 4.77456647398844
avg_hard_reasoning_score: 4.785714285714286
**************************************************




 60%|██████    | 15/25 [03:46<01:26,  8.65s/it]



**************************************************
csv_path: target_llama_3_1_8B_it_judge_llama_3_1_8B_it_attack_irrelevant.csv
len(df): 300
correctness_acc: 99.0
easy_acc: 100.0
med_acc: 99.42196531791907
hard_acc: 96.42857142857143


avg_reasoning_score: 4.913333333333333
avg_easy_reasoning_score: 4.928571428571429
avg_med_reasoning_score: 4.92485549132948
avg_hard_reasoning_score: 4.857142857142857
**************************************************




 64%|██████▍   | 16/25 [03:55<01:18,  8.69s/it]



**************************************************
csv_path: target_llama_3_1_8B_it_judge_llama_3_1_8B_it_attack_naive.csv
len(df): 300
correctness_acc: 99.0
easy_acc: 100.0
med_acc: 98.26589595375722
hard_acc: 100.0


avg_reasoning_score: 4.9
avg_easy_reasoning_score: 4.885714285714286
avg_med_reasoning_score: 4.913294797687861
avg_hard_reasoning_score: 4.875
**************************************************




 68%|██████▊   | 17/25 [04:02<01:05,  8.17s/it]



**************************************************
csv_path: target_llama_3_1_8B_it_judge_llama_3_1_8B_it_attack_paraphrase.csv
len(df): 300
correctness_acc: 99.33333333333333
easy_acc: 97.14285714285714
med_acc: 100.0
hard_acc: 100.0


avg_reasoning_score: 4.883333333333334
avg_easy_reasoning_score: 4.928571428571429
avg_med_reasoning_score: 4.861271676300578
avg_hard_reasoning_score: 4.892857142857143
**************************************************




 72%|███████▏  | 18/25 [04:09<00:55,  7.86s/it]



**************************************************
csv_path: target_qwen_2_5_7b_judge_llama_3_1_8B_it_attack_alter.csv
len(df): 300
correctness_acc: 95.66666666666667
easy_acc: 88.57142857142857
med_acc: 98.84393063583815
hard_acc: 94.64285714285714


avg_reasoning_score: 4.626666666666667
avg_easy_reasoning_score: 4.571428571428571
avg_med_reasoning_score: 4.6531791907514455
avg_hard_reasoning_score: 4.607142857142857
**************************************************




 76%|███████▌  | 19/25 [04:16<00:45,  7.53s/it]



**************************************************
csv_path: target_qwen_2_5_7b_judge_llama_3_1_8B_it_attack_irrelevant.csv
len(df): 300
correctness_acc: 97.33333333333334
easy_acc: 94.28571428571428
med_acc: 98.84393063583815
hard_acc: 96.42857142857143


avg_reasoning_score: 4.753333333333333
avg_easy_reasoning_score: 4.685714285714286
avg_med_reasoning_score: 4.791907514450867
avg_hard_reasoning_score: 4.714285714285714
**************************************************




 80%|████████  | 20/25 [04:22<00:36,  7.28s/it]



**************************************************
csv_path: target_qwen_2_5_7b_judge_llama_3_1_8B_it_attack_naive.csv
len(df): 300
correctness_acc: 98.33333333333333
easy_acc: 98.57142857142858
med_acc: 98.26589595375722
hard_acc: 98.21428571428571


avg_reasoning_score: 4.8133333333333335
avg_easy_reasoning_score: 4.771428571428571
avg_med_reasoning_score: 4.791907514450867
avg_hard_reasoning_score: 4.928571428571429
**************************************************




 84%|████████▍ | 21/25 [04:29<00:28,  7.11s/it]



**************************************************
csv_path: target_qwen_2_5_7b_judge_llama_3_1_8B_it_attack_paraphrase.csv
len(df): 300
correctness_acc: 96.66666666666667
easy_acc: 95.71428571428572
med_acc: 97.6878612716763
hard_acc: 94.64285714285714


avg_reasoning_score: 4.8
avg_easy_reasoning_score: 4.771428571428571
avg_med_reasoning_score: 4.815028901734104
avg_hard_reasoning_score: 4.785714285714286
**************************************************




 88%|████████▊ | 22/25 [04:39<00:23,  7.78s/it]



**************************************************
csv_path: target_qwq_32b_judge_llama_3_1_8B_it_attack_alter.csv
len(df): 289
correctness_acc: 98.6159169550173
easy_acc: 98.57142857142858
med_acc: 98.7878787878788
hard_acc: 98.11320754716981


avg_reasoning_score: 4.865051903114187
avg_easy_reasoning_score: 4.857142857142857
avg_med_reasoning_score: 4.878787878787879
avg_hard_reasoning_score: 4.830188679245283
**************************************************




 92%|█████████▏| 23/25 [04:48<00:16,  8.32s/it]



**************************************************
csv_path: target_qwq_32b_judge_llama_3_1_8B_it_attack_irrelevant.csv
len(df): 291
correctness_acc: 97.59450171821305
easy_acc: 98.57142857142858
med_acc: 97.0059880239521
hard_acc: 98.11320754716981


avg_reasoning_score: 4.962199312714777
avg_easy_reasoning_score: 4.957142857142857
avg_med_reasoning_score: 4.970059880239521
avg_hard_reasoning_score: 4.943396226415095
**************************************************




 96%|█████████▌| 24/25 [04:55<00:07,  7.94s/it]



**************************************************
csv_path: target_qwq_32b_judge_llama_3_1_8B_it_attack_naive.csv
len(df): 298
correctness_acc: 97.6510067114094
easy_acc: 98.57142857142858
med_acc: 96.51162790697676
hard_acc: 100.0


avg_reasoning_score: 4.939597315436242
avg_easy_reasoning_score: 4.9
avg_med_reasoning_score: 4.965116279069767
avg_hard_reasoning_score: 4.909090909090909
**************************************************




100%|██████████| 25/25 [05:09<00:00, 12.38s/it]



**************************************************
csv_path: target_qwq_32b_judge_llama_3_1_8B_it_attack_paraphrase.csv
len(df): 296
correctness_acc: 99.32432432432432
easy_acc: 98.57142857142858
med_acc: 100.0
hard_acc: 98.14814814814815


avg_reasoning_score: 4.949324324324325
avg_easy_reasoning_score: 4.942857142857143
avg_med_reasoning_score: 4.941520467836257
avg_hard_reasoning_score: 4.981481481481482
**************************************************





