In [17]:
import pandas as pd
def combine_explanations(data):
    """Concatenate the ground-truth explanations of every failing method.

    Args:
        data: DataFrame with the columns ``GroundTruth``, ``FailingMethod``
            and ``Answer.explanation``.

    Returns:
        DataFrame with one row per ``FailingMethod`` (in groupby order) whose
        ``Answer.explanation`` column holds all explanations of the rows with
        ``GroundTruth == 1``, each cast to ``str`` and joined by one space.
    """
    def _join_as_text(explanations):
        # Cast first so non-string cells cannot break str.join.
        return ' '.join(explanations.astype(str))

    is_ground_truth = data['GroundTruth'] == 1
    ground_truth_rows = data[is_ground_truth]

    explanations_by_method = ground_truth_rows.groupby('FailingMethod')['Answer.explanation']
    return explanations_by_method.apply(_join_as_text).reset_index()

# Load the raw answers, join the ground-truth explanations per failing method,
# and persist the result for the LLM consolidation step that reads it back.
data = pd.read_csv('data.csv')
combined_explanations = combine_explanations(data)
# index=False keeps the pandas row index out of the written CSV.
combined_explanations.to_csv('combined_explanations.csv', index=False)


In [20]:
import os
import re
import time

from groq import Groq

# The API key is read from the environment so that no credential is stored in
# the notebook (or leaked when it is shared/committed). A missing variable
# raises KeyError here instead of failing later on the first request.
client = Groq(
    api_key=os.environ["GROQ_API_KEY"],
)

# One row per failing method with its concatenated ground-truth explanations.
data = pd.read_csv('combined_explanations.csv')

def _retry_delay_seconds(error_msg, default=20):
    """Return the number of seconds to wait before retrying a rate-limited call.

    Parses the "try again in <duration>" hint of the error message. Besides a
    plain ``7.5s`` it also understands compound durations such as ``1m26.4s``
    or ``844ms``; one extra second is added as a safety margin. When no hint
    can be parsed, ``default`` is returned unchanged.
    """
    hint = re.search(r"try again in ((?:\d+(?:\.\d+)?(?:ms|h|m|s))+)", error_msg)
    if not hint:
        return default

    seconds_per_unit = {"h": 3600.0, "m": 60.0, "s": 1.0, "ms": 0.001}
    # "ms" is listed before "m"/"s" so that milliseconds are not misread.
    parts = re.findall(r"(\d+(?:\.\d+)?)(ms|h|m|s)", hint.group(1))
    return sum(float(value) * seconds_per_unit[unit] for value, unit in parts) + 1


def process_explanation(task, explanation, max_retries=10):
    """Ask the LLM to consolidate ``explanation`` following ``task``.

    Args:
        task: Instruction text placed in front of the explanation.
        explanation: The (concatenated) explanation text to consolidate.
        max_retries: How many times a rate-limited request is retried before
            giving up. The original implementation retried without bound and
            could hang the notebook indefinitely.

    Returns:
        The stripped model answer, or the sentinel string
        ``"Error processing this explanation."`` when the request fails with a
        non-rate-limit error or the retries are exhausted.
    """
    for attempt in range(max_retries + 1):
        try:
            response = client.chat.completions.create(
                model="qwen-2.5-32b",
                messages=[
                    {"role": "system", "content": "You are a senior software engineer tasked with consolidating multiple bug explanations into a clear and concise answer."},
                    {"role": "user", "content": f"{task}\n\nExplanation: {explanation}"}
                ],
                max_tokens=300,
                temperature=0.5
            )
            return response.choices[0].message.content.strip()

        except Exception as e:
            error_msg = str(e)
            print(error_msg)
            # Only rate-limit errors are worth retrying; stop on anything else
            # and once the retry budget is used up (no pointless final sleep).
            if "Rate limit reached" not in error_msg or attempt >= max_retries:
                break
            time.sleep(_retry_delay_seconds(error_msg, 20))

    return "Error processing this explanation."

# Instruction text passed as the `task` argument of process_explanation for
# every row; it is sent verbatim to the model, so edits here change the prompt.
task = ("""
You are a senior software engineer responsible for reviewing and consolidating explanations of a software bug. Using the provided explanations and data, create a concise explanation (3-4 sentences) that accurately describes the root cause of the bug and provides sufficient guidance for fixing it. Ensure the explanation includes:

A clear description of the bug,
Its underlying cause, and
A suggested solution or troubleshooting step.
The explanation should be easy to understand for a developer familiar with the codebase and focus only on necessary and sufficient details."""
)

# Consolidate the concatenated explanations of each failing method with the
# LLM and collect one record per method.
processed_data = []
for failing_method, explanation in zip(data['FailingMethod'], data['Answer.explanation']):
    consolidated = process_explanation(task, explanation)
    print(f"Processing data for {failing_method}")
    processed_data.append({
        'FailingMethod': failing_method,
        'Explanation': consolidated
    })

result_df = pd.DataFrame(processed_data)
print(result_df)

# These consolidated texts are read back later as the per-method ground truth.
result_df.to_csv('ground_truths_per_failing_method_llm.csv', index=False)

Processing data for HIT01_8
Processing data for HIT02_24
Processing data for HIT03_6
Processing data for HIT04_7
Processing data for HIT05_35
Processing data for HIT06_51
Processing data for HIT07_33
Processing data for HIT08_54
  FailingMethod                                        Explanation
0       HIT01_8  The bug occurs due to an incorrect validation ...
1      HIT02_24  The bug occurs when a negative value is passed...
2       HIT03_6  The bug occurs due to the `pos` variable being...
3       HIT04_7  **Bug Explanation:**\nThe bug occurs due to an...
4      HIT05_35  The bug occurs when both `array` and `element`...
5      HIT06_51  The bug results in the incorrect handling of `...
6      HIT07_33  The bug occurs due to a `NullPointerException`...
7      HIT08_54  The bug occurs due to an incorrect input forma...


## Task 2.1
For readability, the Flesch Reading Ease Score is used as a key metric. This score considers the average sentence length and the average number of syllables per word to determine how easy a text is to read. The score ranges from 0 to 100, with higher scores indicating easier readability. For this explanation, a threshold of 40 is chosen, reflecting the assumption that readers will have a certain level of computer knowledge and familiarity with technical terms.

| **Score Range** | **Readability Level**        | **Intended Audience**              |
|----------------|-------------------------------|------------------------------------|
| 90 - 100        | Very Easy                     | 5th-grade level or younger        |
| 80 - 89         | Easy                          | 6th-grade level                   |
| 70 - 79         | Fairly Easy                   | 7th-grade level                   |
| 60 - 69         | Standard                      | 8th to 9th-grade level            |
| 50 - 59         | Fairly Difficult              | High school students              |
| 30 - 49         | Difficult                     | College students                  |
| 0 - 29          | Very Difficult                | Advanced readers (e.g., academics)|

We chose a threshold of 40 because this is an explanation of a bug inside a computer program: the person reading it will presumably have a certain level of computer education and be familiar with the jargon.

Additionally, to ensure that the consolidated bug description aligns closely with the ground truth, we use cosine similarity as a metric to measure the semantic similarity between the two texts. The text is first converted into vector representations using SentenceTransformer embeddings, which capture the meaning of the sentences. Cosine similarity then calculates the angle between these vectors, with a value close to 1 indicating high similarity. A threshold of 0.7 ensures that the merged output remains relevant and aligned with the ground truth.

In [28]:
import textstat
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


# Minimum Flesch Reading Ease score (computed with textstat) for a merged text.
READABILITY_THRESHOLD = 40
# Minimum cosine similarity between a merged text and its ground truth.
SIMILARITY_THRESHOLD = 0.7

# Sentence-embedding model used by compute_similarity to encode both texts.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


def merge_texts(existing_text, new_explanation):
    """Ask the LLM to merge ``new_explanation`` into ``existing_text``.

    Args:
        existing_text: The explanation merged so far.
        new_explanation: The additional explanation to fold in.

    Returns:
        The stripped model answer. If all three attempts fail, the unchanged
        ``existing_text`` is returned so the caller can carry on.
    """
    prompt = (
        """You are a senior software engineer responsible for reviewing and consolidating explanations of a software bug. Using the provided explanations and data, create a concise explanation (3-4 sentences) that accurately describes the root cause of the bug and provides sufficient guidance for fixing it. Ensure the explanation includes:
        - A clear description of the bug,
        - Its underlying cause, and
        - A suggested solution or troubleshooting step.

        The explanation should be easy to understand for a developer familiar with the codebase and focus only on necessary and sufficient details.
        """
    )

    # Send the two explanations as separate, labelled paragraphs. They used to
    # be concatenated back to back, which fused the last word of one text with
    # the first word of the next and hid the boundary from the model.
    user_content = (
        f"{prompt}\n\nExplanation: {existing_text}"
        f"\n\nExplanation: {new_explanation}"
    )

    for attempt in range(3):
        try:
            response = client.chat.completions.create(
                model="qwen-2.5-32b",
                messages=[
                    {"role": "system", "content": "You are a senior software engineer tasked with consolidating multiple bug explanations into a clear and concise answer."},
                    {"role": "user", "content": user_content}
                ],
                temperature=0.5,
                max_tokens=300,
            )
            return response.choices[0].message.content.strip()

        except Exception as e:
            print(f"Merge attempt {attempt+1} failed: {e}")
            # Short pause before the next attempt.
            time.sleep(1)

    # Best effort: keep what we already had instead of failing the whole run.
    return existing_text

def compute_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded separately with the module-level ``embedding_model``;
    the result is the single entry of the 1x1 similarity matrix.
    """
    vectors = [
        embedding_model.encode(text, convert_to_tensor=False)
        for text in (text1, text2)
    ]
    # cosine_similarity expects 2-D inputs, hence the one-element lists.
    similarity_matrix = cosine_similarity(vectors[:1], vectors[1:])
    return similarity_matrix[0][0]
    
def iterative_merge(explanations, ground_truth, method="METHOD_ID", readability_threshold=READABILITY_THRESHOLD, similarity_threshold=SIMILARITY_THRESHOLD):
    """Merge explanations one by one until the quality thresholds are met.

    Starts from the first explanation and keeps merging the next one (via
    ``merge_texts``) while the current text is below either threshold or until
    the explanations run out.

    Args:
        explanations: List of explanation texts for one failing method.
        ground_truth: Reference text the merged result is compared against.
        method: Identifier used in log output and in the returned record.
        readability_threshold: Minimum Flesch Reading Ease score to stop.
        similarity_threshold: Minimum cosine similarity to ``ground_truth``
            to stop.

    Returns:
        dict with the keys ``method``, ``final_text``, ``readability_score``,
        ``similarity_score`` and ``explanations_used``. For an empty
        ``explanations`` list the text is ``""``, both scores are ``None`` and
        the count is 0. Reaching the end of the list does not imply that the
        thresholds were met; compare the returned scores to find out.
    """
    if not explanations:
        return {
            "method": method,
            "final_text": "",
            "readability_score": None,
            "similarity_score": None,
            "explanations_used": 0
        }

    merged_text = explanations[0]
    used_count = 1

    readability = textstat.flesch_reading_ease(merged_text)
    similarity = compute_similarity(merged_text, ground_truth)
    print(f"[{method}] Initial readability: {readability:.2f}, similarity: {similarity:.2f}")

    for explanation in explanations[1:]:
        # Use the thresholds passed by the caller. The module-level constants
        # were compared here before, which silently ignored both arguments.
        if readability >= readability_threshold and similarity >= similarity_threshold:
            print(f"[{method}] Quality thresholds met. Stopping merge.")
            break

        print(f"[{method}] Merging explanation {used_count+1}...")
        merged_text = merge_texts(merged_text, explanation)
        # Counted per merge attempt, whatever merge_texts returned.
        used_count += 1

        readability = textstat.flesch_reading_ease(merged_text)
        similarity = compute_similarity(merged_text, ground_truth)
        print(f"[{method}] Updated readability: {readability:.2f}, similarity: {similarity:.2f}")

    return {
        "method": method,
        "final_text": merged_text,
        "readability_score": readability,
        "similarity_score": similarity,
        "explanations_used": used_count
    }


# Inputs: the LLM-written ground truth per failing method, and the raw answers
# restricted to the rows flagged as ground truth.
ground_truths = pd.read_csv("ground_truths_per_failing_method_llm.csv")
raw_data = pd.read_csv("data.csv")
raw_data = raw_data.loc[raw_data["GroundTruth"] == 1]

consolidated_results = []

for method, ground_truth_explanation in zip(ground_truths["FailingMethod"], ground_truths["Explanation"]):
    belongs_to_method = raw_data["FailingMethod"] == method
    explanations = raw_data.loc[belongs_to_method, "Answer.explanation"].tolist()

    # Methods without any explanation are skipped entirely.
    if explanations:
        print(f"\nProcessing method: {method}")
        result = iterative_merge(explanations, ground_truth_explanation, method=method)
        consolidated_results.append(result)

results_df = pd.DataFrame(consolidated_results)
output_filename = "consolidated_bug_explanations_all.csv"
results_df.to_csv(output_filename, index=False)
print(f"\nConsolidated data for {len(results_df)} bug reports written to '{output_filename}'")



Processing method: HIT01_8
[HIT01_8] Initial readability: 80.62, similarity: 0.70
[HIT01_8] Merging explanation 2...
[HIT01_8] Updated readability: 44.95, similarity: 0.82
[HIT01_8] Quality thresholds met. Stopping merge.

Processing method: HIT02_24
[HIT02_24] Initial readability: 62.64, similarity: 0.35
[HIT02_24] Merging explanation 2...
[HIT02_24] Updated readability: 52.70, similarity: 0.75
[HIT02_24] Quality thresholds met. Stopping merge.

Processing method: HIT03_6
[HIT03_6] Initial readability: 69.28, similarity: 0.45
[HIT03_6] Merging explanation 2...
[HIT03_6] Updated readability: 45.39, similarity: 0.70
[HIT03_6] Merging explanation 3...
[HIT03_6] Updated readability: 52.23, similarity: 0.68
[HIT03_6] Merging explanation 4...
[HIT03_6] Updated readability: 44.41, similarity: 0.80
[HIT03_6] Quality thresholds met. Stopping merge.

Processing method: HIT04_7
[HIT04_7] Initial readability: 43.06, similarity: 0.65
[HIT04_7] Merging explanation 2...
[HIT04_7] Updated readabilit

In [29]:
consolidated_datas = pd.read_csv('consolidated_bug_explanations_all.csv')

for index, row in consolidated_datas.iterrows():
    # iterative_merge also stops when it simply runs out of explanations, so
    # the stored scores are checked instead of assuming the thresholds were met.
    passed = (
        row['readability_score'] >= READABILITY_THRESHOLD
        and row['similarity_score'] >= SIMILARITY_THRESHOLD
    )
    if passed:
        print(f"FailingMethod {row['method']} required {row['explanations_used']} explanations to pass the threshold")
    else:
        print(f"FailingMethod {row['method']} used all {row['explanations_used']} explanations without passing the threshold")


FailingMethod HIT01_8 required 2 explanations to pass the threshold
FailingMethod HIT02_24 required 2 explanations to pass the threshold
FailingMethod HIT03_6 required 4 explanations to pass the threshold
FailingMethod HIT04_7 required 93 explanations to pass the threshold
FailingMethod HIT05_35 required 2 explanations to pass the threshold
FailingMethod HIT06_51 required 22 explanations to pass the threshold
FailingMethod HIT07_33 required 7 explanations to pass the threshold
FailingMethod HIT08_54 required 36 explanations to pass the threshold
