In [6]:
import pandas as pd
import numpy as np
import textstat
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings

# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Worker answers, plus the ground-truth explanations with their text column
# renamed so it cannot be confused with the workers' explanations.
raw_data = pd.read_csv("data.csv")
ground_truth_df = pd.read_csv("ground_truths_per_failing_method_llm.csv").rename(
    columns={'Explanation': 'ground_truth_explanation'}
)

# Left join: every answer row is kept; answers whose FailingMethod has no
# ground-truth row get NaN in 'ground_truth_explanation'.
marged_data = raw_data.merge(
    ground_truth_df[['FailingMethod', 'ground_truth_explanation']],
    on='FailingMethod',
    how='left',
)

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding per ground-truth row, keyed by failing method. If a method
# appears more than once, the last row wins.
gt_embeddings = {
    method: embedding_model.encode(gt_text, convert_to_tensor=False)
    for method, gt_text in zip(ground_truth_df['FailingMethod'],
                               ground_truth_df['ground_truth_explanation'])
}

def compute_readability(text):
    """Return the Flesch Reading Ease score of ``text``, or NaN if it cannot be scored.

    Args:
        text: The explanation to score. Non-string values are converted with
            ``str`` before scoring.

    Returns:
        The value returned by ``textstat.flesch_reading_ease``. ``np.nan`` is
        returned instead when ``text`` is missing (``None`` or a float NaN),
        contains no non-whitespace characters, or when textstat raises.
    """
    # A missing cell must not be stringified: str() would turn it into the
    # literal word "nan"/"None", and that word would then receive a score.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return np.nan

    text = str(text)
    # The Flesch formula is built from words-per-sentence and
    # syllables-per-word ratios, which are undefined for text with no words.
    if not text.strip():
        return np.nan

    try:
        return textstat.flesch_reading_ease(text)
    except Exception:
        # Deliberate best effort: one unscorable answer must not abort the run.
        return np.nan


def compute_similarity(answer_text, gt_emb):
    """Return the cosine similarity between an answer and a ground-truth embedding.

    Args:
        answer_text: The worker's explanation. It is converted with ``str``
            and encoded with the module-level ``embedding_model``.
        gt_emb: Precomputed embedding of the ground-truth explanation, or
            ``None`` when no ground truth is available for the answer.

    Returns:
        The cosine similarity as a scalar. ``np.nan`` is returned when the
        ground-truth embedding or the answer is missing, or when encoding or
        the comparison raises.
    """
    # Nothing to compare against; say so explicitly instead of relying on the
    # comparison failing further down.
    if gt_emb is None:
        return np.nan
    # A missing answer must not be embedded as the literal word "nan"/"None",
    # which would yield a real-looking similarity score.
    if answer_text is None or (isinstance(answer_text, float) and np.isnan(answer_text)):
        return np.nan

    try:
        answer_emb = embedding_model.encode(str(answer_text), convert_to_tensor=False)
        # cosine_similarity works on 2-D inputs and returns a matrix; with one
        # vector on each side the single entry is at [0][0].
        return cosine_similarity([answer_emb], [gt_emb])[0][0]
    except Exception:
        # Deliberate best effort: one failing answer must not abort the run.
        return np.nan

def _row_similarity(record):
    """Compare one answer with the ground-truth embedding of its failing method."""
    answer = record['Answer.explanation']
    # .get() yields None for methods that have no ground-truth embedding.
    reference = gt_embeddings.get(record['FailingMethod'])
    return compute_similarity(answer, reference)


# Readability depends on the answer text alone; similarity needs the whole row
# so the matching ground-truth embedding can be looked up.
marged_data['readability'] = marged_data['Answer.explanation'].apply(compute_readability)
marged_data['similarity'] = marged_data.apply(_row_similarity, axis=1)

# Inclusive lower bounds an answer must reach on both metrics to be kept.
MIN_READABILITY = 40
MIN_SIMILARITY = 0.7

# NaN scores compare as False, so unscorable answers are filtered out here.
readable_enough = marged_data['readability'].ge(MIN_READABILITY)
similar_enough = marged_data['similarity'].ge(MIN_SIMILARITY)
filtered_df = marged_data[readable_enough & similar_enough]

def shannon_entropy(series):
    """Return the Shannon entropy (natural log, i.e. in nats) of a column's values.

    Args:
        series: A pandas Series of categorical values. ``value_counts`` is
            called with its defaults, which leave missing values out of the
            distribution.

    Returns:
        The entropy as a non-negative float: ``0.0`` when the column holds a
        single distinct value or no values at all, larger for more evenly
        spread distributions.
    """
    counts = series.value_counts()
    probabilities = counts / counts.sum()
    entropy = -np.sum(probabilities * np.log(probabilities))
    # Each p * log(p) term is <= 0, so the negated sum is >= 0, but for a
    # single category (or an empty column) the negation produces IEEE negative
    # zero, which formats as "-0.000". abs() only clears that sign.
    return abs(entropy)


# Worker attributes over which diversity is measured and the subset is spread.
diversity_columns = ['Worker.profession', 'Worker.gender', 'Worker.country', 'Worker.programmingLanguage']


def _print_entropy_report(frame, heading):
    """Print ``heading``, then the Shannon entropy of every diversity column present in ``frame``."""
    print(heading)
    for col in diversity_columns:
        if col in frame.columns:
            print(f" - {col}: {shannon_entropy(frame[col]):.3f}")


_print_entropy_report(filtered_df, "\nDiversity Metrics (Entropy) for Filtered Data:")

# Keep the single most similar answer for each distinct combination of worker
# attributes. The row is located with a per-group idxmax instead of
# groupby.apply: apply only kept the grouping columns in its output because
# they were passed into the applied function, which pandas 2.2 deprecates (the
# module-wide warning filter hides that). idxmax also breaks ties
# deterministically by taking the first row with the maximum.
if not filtered_df.empty and all(col in filtered_df.columns for col in diversity_columns):
    # A fresh positional index guarantees that each label found by idxmax
    # maps to exactly one row.
    candidates = filtered_df.reset_index(drop=True)
    # Groups come out sorted by key; rows with a missing value in any
    # diversity column belong to no group (groupby default) and are left out.
    best_rows = candidates.groupby(diversity_columns)['similarity'].idxmax()
    diverse_subset = candidates.loc[best_rows.tolist()].reset_index(drop=True)
else:
    # Nothing to group (no rows, or a diversity column is absent): keep the
    # filtered answers as they are.
    diverse_subset = filtered_df.copy()

# Best scores over ALL answers, not only those that passed the filter.
max_readability = marged_data['readability'].max()
max_similarity = marged_data['similarity'].max()
print(f"\nMax readability in dataset: {max_readability:.2f}")
print(f"Max semantic similarity in dataset: {max_similarity:.3f}")

# "Near max" = within 5% of the best similarity.
# NOTE(review): scaling by 0.95 assumes the maximum similarity is positive.
similarity_threshold_near_max = max_similarity * 0.95
near_max_df = marged_data[marged_data['similarity'] >= similarity_threshold_near_max]

_print_entropy_report(near_max_df, "\nDiversity metrics for answers with near-max semantic similarity:")

output_filename = "diverse_explanations.csv"
diverse_subset.to_csv(output_filename, index=False)
print(f"\nDiverse explanations saved to '{output_filename}'")


Diversity Metrics (Entropy) for Filtered Data:
 - Worker.profession: 1.379
 - Worker.gender: 0.464
 - Worker.country: 1.565
 - Worker.programmingLanguage: 3.395

Max readability in dataset: 206.84
Max semantic similarity in dataset: 0.856

Diversity metrics for answers with near-max semantic similarity:
 - Worker.profession: 0.662
 - Worker.gender: -0.000
 - Worker.country: 0.900
 - Worker.programmingLanguage: 1.494

Diverse explanations saved to 'diverse_explanations.csv'
