In [21]:
import pandas as pd
import ast
import re
import random

# Import generated explanations

In [22]:
# Load the study CSV into `df`; the cells below read and extend this frame.
# The path is relative, so the file is resolved against the kernel's working directory.
filename = 'NLEs_FaithfulnessStudy.csv'
df = pd.read_csv(filename)

In [35]:
def clean_words(word_set):
    """Normalise a collection of tokens for set-based word comparison.

    Every token is lower-cased and stripped of all characters other than
    ASCII letters and digits (so punctuation, and also non-ASCII letters,
    are removed). Tokens that are empty after stripping are discarded.

    Args:
        word_set: Iterable of string tokens.

    Returns:
        set: The distinct, non-empty normalised tokens.
    """
    cleaned_set = {re.sub(r"[^a-zA-Z0-9]", "", word.lower()) for word in word_set}
    # A token consisting only of punctuation (e.g. "-" or "...") collapses to "".
    # Keeping it would count the empty string as a word in any overlap statistics
    # computed from this set, so drop it.
    cleaned_set.discard("")
    return cleaned_set

def _overlap_scores(reference_words, candidate_words):
    """Score how well the set `candidate_words` recovers the set `reference_words`.

    Returns:
        tuple: (precision, recall, F1, accuracy). "accuracy" is
        TP / (TP + FP + FN), i.e. the size of the intersection over the size
        of the union; no true negatives are involved. Any ratio whose
        denominator is zero is reported as 0.
    """
    true_pos = len(reference_words.intersection(candidate_words))
    false_pos = len(candidate_words) - true_pos    # candidate words absent from the reference
    false_neg = len(reference_words) - true_pos    # reference words the candidate missed

    precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) > 0 else 0
    recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    union_size = true_pos + false_pos + false_neg
    accuracy = true_pos / union_size if union_size > 0 else 0
    return precision, recall, f1, accuracy


def calculate_metrics_and_accuracy(row):
    """Compare the words of both explanations in `row` with the rationale words.

    The reference set is parsed from the string in `row['pred_rationales_words']`
    with `ast.literal_eval`. Each explanation (`row['NLE_with_rationale']` and
    `row['NLE_no_rationale']`) is split on whitespace. All three word
    collections are normalised with `clean_words` before being compared as sets.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the three
            entries named above.

    Returns:
        tuple: Eight values in this order - precision, recall, F1 and accuracy
        for the explanation with rationale, then the same four for the
        explanation without rationale. See `_overlap_scores` for definitions.

    NOTE(review): both explanation entries must be strings; a missing (NaN)
    cell would raise AttributeError on `.split()`.
    """
    # clean_words lower-cases every token and returns a set, so no extra
    # lower()/set() pass is needed before calling it.
    reference_words = clean_words(ast.literal_eval(row['pred_rationales_words']))  # ground truth
    words_with_rationale = clean_words(row['NLE_with_rationale'].split())
    words_no_rationale = clean_words(row['NLE_no_rationale'].split())

    # Same scoring for both explanations; concatenating the two 4-tuples keeps
    # the flat 8-value layout callers unpack into columns.
    return (
        _overlap_scores(reference_words, words_with_rationale)
        + _overlap_scores(reference_words, words_no_rationale)
    )

# Score every row, then spread the per-row 8-tuples over one new column per metric
results = df.apply(calculate_metrics_and_accuracy, axis=1)

# Column order must match the order of the values returned by calculate_metrics_and_accuracy
metric_columns = [
    'precision_rationale',
    'recall_rationale',
    'F1_rationale',
    'accuracy_rationale',
    'precision_no_rationale',
    'recall_no_rationale',
    'F1_no_rationale',
    'accuracy_no_rationale',
]
df[metric_columns] = pd.DataFrame(results.tolist(), index=df.index)


In [34]:
# Metric columns to summarise over all rows
calculate_mean = [
    'precision_rationale',
    'recall_rationale',
    'F1_rationale',
    'accuracy_rationale',
    'precision_no_rationale',
    'recall_no_rationale',
    'F1_no_rationale',
    'accuracy_no_rationale',
]

# Print the mean and standard deviation of each metric column, one block per metric
for metric in calculate_mean:
    metric_values = df[metric]
    average_metric = metric_values.mean()
    std_metric = metric_values.std()
    print(f'The overall average for {metric} is: {average_metric} (Std={std_metric})')
    print('___')

The overall average for precision_rationale is: 0.08929025303222575 (Std=0.018691945218619253)
___
The overall average for recall_rationale is: 0.9705555555555554 (Std=0.07297902681224139)
___
The overall average for F1_rationale is: 0.16301102645603405 (Std=0.031413930598948814)
___
The overall average for accuracy_rationale is: 0.08905690939849895 (Std=0.01876971463176385)
___
The overall average for precision_no_rationale is: 0.1161111111111111 (Std=0.14979997434643783)
___
The overall average for recall_no_rationale is: 0.014479128913588019 (Std=0.01854177976717904)
___
The overall average for F1_no_rationale is: 0.02565966725161319 (Std=0.0328344908915196)
___
The overall average for accuracy_no_rationale is: 0.013279552663983231 (Std=0.017131671662237436)
___


# Qualitative Control of Faithfulness

In [12]:
# Read the study CSV again. This rebinds `df` to a fresh frame, so columns that
# earlier cells added to the previous `df` are not present from here on.
filename = 'NLEs_FaithfulnessStudy.csv'
df = pd.read_csv(filename)

In [13]:
# Define a function to select the rationale column based on the prediction for easier comparison
def select_rationale(row):
    """Return the formatted rationale that belongs to the row's predicted class.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'pred' entry
            and the 'formatted_<class>_rationales' entry that gets selected.

    Returns:
        The value of 'formatted_politics_rationales' when 'pred' is
        'politics', of 'formatted_science_rationales' when it is 'science',
        and of 'formatted_leisure_rationales' for every other 'pred' value.
    """
    predicted_label = row['pred']
    explicit_columns = (
        ('politics', 'formatted_politics_rationales'),
        ('science', 'formatted_science_rationales'),
    )
    for label, column in explicit_columns:
        if predicted_label == label:
            return row[column]
    # Any label other than politics/science falls through to the leisure column.
    return row['formatted_leisure_rationales']

# Attach, for every row, the rationale of its predicted class
df['selected_rationale'] = df.apply(select_rationale, axis=1)

# Keep only the two explanations and the chosen rationale
comparison_columns = ['NLE_with_rationale', 'NLE_no_rationale', 'selected_rationale']
filtered_df = df[comparison_columns]

# Draw 50 rows at random; the fixed random_state makes the draw repeatable
sampled_filtered_df = filtered_df.sample(n=50, random_state=1)

In [15]:
# Write the sampled rows to a CSV in the working directory; index=False leaves
# the DataFrame index out of the file.
sampled_filtered_df.to_csv('Qualitative_Assessment.csv', index=False)