# Task 2

In [1]:
import pandas as pd
import os
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

In [2]:
data = pd.read_csv('data/answerList_data.csv')
data_tp = data[data['TP'] == True]
data_tp_group_by_method = data_tp.groupby('FailingMethod')

In [None]:
# Write explanations to file
for method, group in data_tp_group_by_method:
    for i, (index, row) in enumerate(group.iterrows(), 1):
        os.makedirs('data/correct_explainations', exist_ok=True)
        with open(f'data/correct_explainations/{method}.txt', 'a') as f:
            f.write(f"{i}: {row['Answer.explanation']}\n")

In [3]:
correct_explainations = {}

for method, group in data_tp_group_by_method:
    correct_explainations[method] = []
    for i, (index, row) in enumerate(group.iterrows(), 1):
        correct_explainations[method].append(row['Answer.explanation'])

In [4]:
llm_consolidated_explanations = {}
dir = 'data/llm_consolidated_explanations'
variant = ['1', '2', '3', '4']


for variant_name in variant:
    llm_consolidated_explanations[variant_name] = {}  # Initialize dict for each variant
    for filename in os.listdir(f'{dir}/{variant_name}'):
        if filename.endswith('.txt'):
            method = filename[:-4]  # Remove .txt extension
            with open(f'{dir}/{variant_name}/{filename}', 'r') as f:
                llm_consolidated_explanations[variant_name][method] = f.read().strip()


In [6]:
# Calculate BLEU score for each method

weights = (0.5, 0.5, 0, 0)
bleu_scores = {}

for variant_name in variant:
    bleu_scores[variant_name] = {}
    for method in llm_consolidated_explanations[variant_name]:
        references = []
        for explanation in correct_explainations[method]:
            references.append(explanation.lower().split(" "))
        candidate = llm_consolidated_explanations[variant_name][method].lower().split(" ")
        score = sentence_bleu(references, candidate, weights=weights)
        bleu_scores[variant_name][method] = score

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [7]:
# Calculate ROUGE score for each method

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = {}

for variant_name in variant:
    rouge_scores[variant_name] = {}
    for method in llm_consolidated_explanations[variant_name]:
        scores = {key: [] for key in ['rouge1', 'rouge2', 'rougeL']}
        candidate = llm_consolidated_explanations[variant_name][method].lower()
        
        for reference in correct_explainations[method]:
            temp_scores = scorer.score(reference.lower(), candidate)
            for key in temp_scores:
                scores[key].append(temp_scores[key])
        
        rouge_scores[variant_name][method] = {}
        for key in scores:
            avg_score = sum(score.fmeasure for score in scores[key]) / len(scores[key])
            rouge_scores[variant_name][method][key] = avg_score


In [8]:
# Print scores as a table
print(f"{'Method':<12}{'Variant':<10}{'BLEU':<10}{'ROUGE1':<10}{'ROUGE2':<10}{'ROUGEL':<10}")

# Get all unique methods
all_methods = set()
for variant_name in variant:
    all_methods.update(llm_consolidated_explanations[variant_name].keys())

prev_method = None
for method in sorted(all_methods):
    if prev_method is not None:
        print() # Add blank line between methods
    for variant_name in variant:
        if method in llm_consolidated_explanations[variant_name]:
            print(f"{method:<12}{variant_name:<10}{bleu_scores[variant_name][method]:.4f}{' ':<6}{rouge_scores[variant_name][method]['rouge1']:.4f}{' ':<6}{rouge_scores[variant_name][method]['rouge2']:.4f}{' ':<6}{rouge_scores[variant_name][method]['rougeL']:.4f}")
    prev_method = method

Method      Variant   BLEU      ROUGE1    ROUGE2    ROUGEL    
HIT01_8     1         0.5592      0.2584      0.0425      0.1754

HIT02_24    1         0.2887      0.2394      0.0265      0.1526

HIT03_6     2         0.4436      0.1796      0.0355      0.1143

HIT04_7     3         0.2887      0.1467      0.0188      0.0887

HIT05_35    3         0.3717      0.2052      0.0306      0.1402

HIT06_51    3         0.3354      0.1891      0.0368      0.1370

HIT07_33    4         0.4488      0.2361      0.0698      0.1701

HIT08_54    4         0.5707      0.2239      0.0617      0.1367


In [13]:
# Calculate average BLEU scores for each variant
print("Average BLEU scores:")
for variant_name in variant:
    avg_bleu = sum(bleu_scores[variant_name].values()) / len(bleu_scores[variant_name])
    print(f"  {variant_name}: {avg_bleu:.4f}")

print("\nAverage ROUGE-1 scores:")
for variant_name in variant:
    rouge1_scores = [scores['rouge1'] for scores in rouge_scores[variant_name].values()]
    avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
    print(f"  {variant_name}: {avg_rouge1:.4f}")

print("\nAverage ROUGE-2 scores:")
for variant_name in variant:
    rouge2_scores = [scores['rouge2'] for scores in rouge_scores[variant_name].values()]
    avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
    print(f"  {variant_name}: {avg_rouge2:.4f}")

print("\nAverage ROUGE-L scores:")
for variant_name in variant:
    rougeL_scores = [scores['rougeL'] for scores in rouge_scores[variant_name].values()]
    avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)
    print(f"  {variant_name}: {avg_rougeL:.4f}")


Average BLEU scores:
  1: 0.4239
  2: 0.4436
  3: 0.3319
  4: 0.5097

Average ROUGE-1 scores:
  1: 0.2489
  2: 0.1796
  3: 0.1803
  4: 0.2300

Average ROUGE-2 scores:
  1: 0.0345
  2: 0.0355
  3: 0.0288
  4: 0.0658

Average ROUGE-L scores:
  1: 0.1640
  2: 0.1143
  3: 0.1220
  4: 0.1534
