# Task 3: Diverse Explanations

## Loading Data & Preparation

In [1]:
# Load bug reports explanations

import os
from pathlib import Path
import pandas as pd
import json

# Answer table; here it is only used to enumerate the distinct failing methods.
data = pd.read_csv("../../data/answerList_data.csv")

# Maps each failing method to the list of its non-empty explanation lines.
bug_reports_data = {}

# one text file per failing method, one explanation per line
for failing_method in data['FailingMethod'].unique():
    report_path = Path(f"../exercise2/{failing_method}.txt")
    with open(report_path, 'r') as report_file:
        raw_text = report_file.read()
    # blank lines separate nothing useful, so they are dropped
    bug_reports_data[failing_method] = [line for line in raw_text.split("\n") if line != ""]

print(bug_reports_data["HIT01_8"])

['Minutes are set to -15; which is less then 0 and it throws illegal arg exception', 'The code never gets that far. The problem is at line 279 which prevents a negative minutes value being accepted even though the programmer comments indicate that since version 2.3 negative minutes up to -59 are acceptable. The @throws IllegalArgumentException comment is also referring to versions before 2.3.', 'In the code there is a check that 0 <= minutes < 60 and the minutesOffset is -15 which does not fall into these prarmeters thus throwing an Exception', 'There is a logical check for if minuteOffset is less than 0 or greater than 59 causing it to throw an exception because the value is out of bounds (negative number)', 'YES. The issue is on line 279 (as I explained in my first question; of which I misunderstood that I was only being asked about the specific issue; not generalized issue). On line 279 the variable "minutesOffSet" is parameterized to throw an exception if it is < 0 or > 59. Line 27

In [2]:
# Read the ground truth explanations from the JSON file in the working directory.
with open("ground_truth_explanations.json", "r") as ground_truth_file:
    ground_truth_explanations = json.loads(ground_truth_file.read())

In [3]:
# calculate readability scores for the ground truth explanations

import textstat
# Flesch reading-ease score of each ground truth explanation, keyed by method
readability_scores = {
    method: textstat.flesch_reading_ease(explanation)
    for method, explanation in ground_truth_explanations.items()
}

# rank the methods from most to least readable
sorted_explanations = sorted(readability_scores.items(), key=lambda item: item[1], reverse=True)

# print readability scores
for method, score in sorted_explanations:
    print(f"{method}: {score}")

# per-method readability threshold: 80% of the ground truth score, rounded to 3 decimals
readability_threshold = {
    method: round(score * 0.8, 3) for method, score in readability_scores.items()
}

# print readability thresholds
for method, score in readability_threshold.items():
    print(f"{method}: {score}")

HIT03_6: 64.75
HIT05_35: 58.62
HIT07_33: 53.04
HIT02_24: 51.18
HIT04_7: 50.16
HIT08_54: 46.81
HIT06_51: 43.02
HIT01_8: 28.67
HIT01_8: 22.936
HIT02_24: 40.944
HIT03_6: 51.8
HIT04_7: 40.128
HIT05_35: 46.896
HIT06_51: 34.416
HIT07_33: 42.432
HIT08_54: 37.448


In [4]:
# define similarity
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Sentence-embedding model shared by every semantic-similarity computation below
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def compute_similarity(ground_truth, user_explanation):
    """
    Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded in a single `model.encode` call; the resulting
    one-element similarity tensor is converted to a plain Python number.

    Args:
        ground_truth (str): Reference explanation.
        user_explanation (str): Explanation to compare against the reference.
    """
    reference_embedding, candidate_embedding = model.encode(
        [ground_truth, user_explanation], convert_to_tensor=True
    )
    return util.pytorch_cos_sim(reference_embedding, candidate_embedding).item()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Compute similarity for all user explanations and set threshold

# Percentile of the per-method similarity distribution used as the threshold
SIMILARITY_PERCENTILE = 70

# method -> similarity of every user explanation to that method's ground truth
similarity_scores = {
    method: [compute_similarity(ground_truth, user_exp) for user_exp in bug_reports_data.get(method, [])]
    for method, ground_truth in ground_truth_explanations.items()
}

# A method without user explanations has an empty score list; averaging it would
# divide by zero and a percentile is undefined, so such methods are reported and
# left out of the aggregates instead of crashing the cell.
methods_without_scores = [method for method, scores in similarity_scores.items() if not scores]
if methods_without_scores:
    print(f"No user explanations found for: {methods_without_scores}")

# average similarity per failing method
average_similarity_scores = {
    method: sum(scores) / len(scores)
    for method, scores in similarity_scores.items()
    if scores
}

for method, score in average_similarity_scores.items():
    print(f"{method}: {score}")

# set similarity threshold to the 70th percentile of each method's scores
similarity_threshold = {
    method: np.percentile(scores, SIMILARITY_PERCENTILE)
    for method, scores in similarity_scores.items()
    if scores
}

for method, score in similarity_threshold.items():
    print(f"{method}: {score}")

HIT01_8: 0.5436896428233012
HIT02_24: 0.3701791672501713
HIT03_6: 0.25586818529409355
HIT04_7: 0.2914702493418008
HIT05_35: 0.42632644800469277
HIT06_51: 0.30135327534129225
HIT07_33: 0.4918408831271032
HIT08_54: 0.3938455417131384
HIT01_8: 0.7063461601734161
HIT02_24: 0.5649046778678893
HIT03_6: 0.32944928407669066
HIT04_7: 0.38390407562255857
HIT05_35: 0.5162835419178009
HIT06_51: 0.4160767793655395
HIT07_33: 0.6244548857212067
HIT08_54: 0.4770480155944824


In [7]:
# define function to generate summaries
from openai import OpenAI
import json

#load api key
from dotenv import load_dotenv
# Let python-dotenv populate the environment, then read the key from it
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

# OpenAI-style client configured with the OpenRouter base URL
client = OpenAI(api_key=openai_api_key, base_url="https://openrouter.ai/api/v1")

# Function to generate summaries using LLaMA 3.2
def generate_summary(method, explanations, max_input_length=4000, max_tokens=200):
    """
    Ask the LLM for one consolidated summary of several bug-report explanations.

    Args:
        method: Identifier of the failing method. It is not used in the prompt;
            the parameter is kept so existing callers keep working.
        explanations (list[str]): Explanations to summarise; they are joined
            with single spaces before being sent.
        max_input_length (int): Maximum number of *characters* of the joined
            text that is sent to the model.
        max_tokens (int): Upper bound on the length of the generated summary,
            in tokens.

    Returns:
        The message content of the first returned choice, or the string "-1"
        when the response contains no choices.
    """
    combined_text = " ".join(explanations)

    # Truncate by characters as a cheap stand-in for a token limit
    combined_text = combined_text[:max_input_length]

    # OpenRouter API call
    completion = client.chat.completions.create(
        model="meta-llama/llama-3.2-1b-instruct:free",
        messages=[
            {
                "role": "system",
                "content": "You are an expert summarizer. Given detailed bug reports, provide a single, comprehensive explanation that includes all necessary and sufficient information to understand and fix the bug.",
            },
            {
                "role": "user",
                "content": f"Summarize the following bug reports:\n\n{combined_text}",
            },
        ],
        max_tokens=max_tokens  # Adjust to control summary length
    )

    # `choices` may be None or an empty list; indexing [0] is only safe when
    # at least one choice came back.
    if not completion.choices:
        return "-1"
    return completion.choices[0].message.content

In [23]:
# define utility functions

import re
from pathlib import Path
import enchant

def contains_english_word(text):
    """
    Tell whether at least one token of `text` is accepted by the en_US dictionary.

    Every character that is neither an ASCII letter nor whitespace is replaced
    by a space, the result is split on whitespace, and each token is looked up
    in lower case.

    Args:
        text (str): Input string to check

    Returns:
        bool: True as soon as one token passes the dictionary check,
        False if none does (including when there are no tokens).
    """
    english_dictionary = enchant.Dict("en_US")

    # Digits, punctuation and other symbols become token separators
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text)

    for candidate in letters_only.split():
        if english_dictionary.check(candidate.lower()):
            return True
    return False

## Diversity of Explanations

(Q 3.1) There are many different ways to measure the diversity of explanations. We could, for example, use the type-token ratio (TTR) or the number of unique words to measure the diversity of the words used in the explanations. Since we did not get great results with the TTR in previous experiments, we will not use this approach. Instead, we want to consider the Shannon Entropy of the explanations. The Shannon Entropy is a measure of the uncertainty in a random variable. In our case, the random variable is the choice of words in the explanations. The Shannon Entropy is defined as follows:

$$H(X) = - \sum_{i=1}^{n} p(x_i) \cdot \log_2(p(x_i))$$

where $p(x_i)$ is the relative frequency of the $i$-th distinct word in the explanation and $n$ is the number of distinct words. Shannon entropy measures the randomness or uncertainty in a distribution—in this case, the distribution of words in a text. Higher entropy means more unpredictability, while lower entropy suggests redundancy or repetition. The minimum entropy is 0, which occurs when all words are the same. The maximum entropy is $\log_2(n)$, the base-2 logarithm of the number of distinct words in the text, which occurs when all words are equally likely. For normal written English, entropy usually ranges between 4 and 8 bits per word, depending on vocabulary richness.

We could also measure diversity by looking at the semantic diversity of explanations using the embedding distance (mean pairwise cosine similarity) between the explanations. Since this directly contradicts a high similarity between explanations, we decided against this approach.
In the broader range of diversity, we could look at more features than the explanation itself, e.g. at associated categorical features like the bug reporters' demographic data, their experience level or the country that they are from. However, this makes it harder to define exact thresholds for diversity.

As the Shannon Entropy seems like the most straightforward approach, we will use this to measure the diversity of explanations in the following.

In [10]:
from collections import Counter
import math

def shannon_entropy(text):
    """
    Compute the word-level Shannon entropy of a text in bits.

    The text is split on whitespace; tokens are compared as-is, so the
    measure is case-sensitive and punctuation attached to a word makes it a
    different token.

    Args:
        text (str): Text to analyse.

    Returns:
        float: Entropy in bits per word. 0.0 for an empty text and for a
        text that consists of a single distinct word.
    """
    words = text.split()
    total_words = len(words)
    if total_words == 0:
        # No words means no uncertainty; return a float like every other path
        return 0.0

    word_counts = Counter(words)
    # Subtracting from 0.0 instead of negating avoids returning -0.0 when the
    # sum is exactly zero; every non-zero result is unchanged.
    return 0.0 - sum(
        (count / total_words) * math.log2(count / total_words)
        for count in word_counts.values()
    )

In [None]:
# Shannon entropy of each ground truth explanation, keyed by method
ground_truth_entropy = {
    method: shannon_entropy(explanation)
    for method, explanation in ground_truth_explanations.items()
}

# rank the methods from highest to lowest entropy
sorted_entropy = sorted(ground_truth_entropy.items(), key=lambda item: item[1], reverse=True)

# print entropy scores
for method, score in sorted_entropy:
    print(f"{method}: {score}")

HIT08_54: 5.358714497742255
HIT07_33: 5.25585347326784
HIT06_51: 5.115114023681427
HIT01_8: 5.10341455748809
HIT03_6: 4.999664476749764
HIT05_35: 4.898153434632013
HIT02_24: 4.8219280948873635
HIT04_7: 4.812209613812088


We can see that all our ground truth explanations achieve a Shannon Entropy of 4-6 bits per word. We will use this as a reference point for the diversity of explanations. This moderate Entropy is in general desirable, as it indicates a good balance between clarity and lexical variety.

The max readability and max similarity values independent of the diversity per bug report are as follows:

In [28]:
# Process all methods in a single pass to find highest similarity and readability scores/explanations
highest_similarity_scores = {}
highest_readability_scores = {}
most_similar_explanations = {}
most_readable_explanations = {}

for method, explanations in bug_reports_data.items():
    # max() of an empty list raises, so methods without explanations are skipped
    if not explanations:
        continue

    # The per-method lists get their own names so that the module-level dicts
    # `similarity_scores` and `readability_scores` are not overwritten here.
    method_similarities = [
        compute_similarity(ground_truth_explanations[method], explanation)
        for explanation in explanations
    ]
    # index(max(...)) selects the first explanation that reaches the maximum
    best_similarity_index = method_similarities.index(max(method_similarities))

    # Store highest similarity score and corresponding explanation
    highest_similarity_scores[method] = method_similarities[best_similarity_index]
    most_similar_explanations[method] = explanations[best_similarity_index]

    # Explanations without any token or without an English word score 0
    # instead of being passed to textstat.
    method_readabilities = [
        textstat.flesch_reading_ease(explanation)
        if explanation.split() and contains_english_word(explanation)
        else 0
        for explanation in explanations
    ]
    best_readability_index = method_readabilities.index(max(method_readabilities))

    # Store highest readability score and corresponding explanation
    highest_readability_scores[method] = method_readabilities[best_readability_index]
    most_readable_explanations[method] = explanations[best_readability_index]

# Print scores: both result dicts are filled for the same methods in the same order
for method, similarity in highest_similarity_scores.items():
    print(f"{method}: {similarity}, {highest_readability_scores[method]}")

HIT01_8: 0.820572555065155, 99.23
HIT02_24: 0.7392517328262329, 121.22
HIT03_6: 0.6439655423164368, 121.22
HIT04_7: 0.7673725485801697, 120.21
HIT05_35: 0.7223565578460693, 104.64
HIT06_51: 0.6698489785194397, 119.19
HIT07_33: 0.7574557662010193, 118.18
HIT08_54: 0.7468466758728027, 120.21


The readability scores are quite high because some people put very simple "explanations" like "yes" or "no" as their answer.

Yes


In [None]:
# choose the high readability and high similarity explanations and 
# see if they are also high entropy