In [1]:
import pandas as pd
import numpy as np
from openai import OpenAI
import openai

In [2]:
def initialize_client(api_key):
    return OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)


In [3]:
# Function to query the LLM and get both reasoning and Yes/No answer
def query_llm(client, model_name, paragraph, question):
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": f"Paragraph: \"{paragraph}\"\nQuestion: \"{question}\"\nAnswer strictly in 'Yes' or 'No', followed by reasoning."
                }
            ],
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error querying LLM: {e}")
        return None

In [None]:
import re
#using regex to match yes/no at the start or in the middle
def extract_answer_and_reasoning(response):
    if not response:
        return None, None

    answer_match = re.search(r"\b(yes|no)\b", response, re.IGNORECASE)
    reasoning = None

    if answer_match:
        answer = answer_match.group(1).lower() 
        
        reasoning_start = response.lower().find(answer) + len(answer)
        raw_reasoning = response[reasoning_start:].strip()

        lines = raw_reasoning.splitlines()
        filtered_lines = [line.strip() for line in lines if line.strip() not in ("**", "")]

        cleaned_lines = [re.sub(r"^reasoning:\s*", "", line, flags=re.IGNORECASE) for line in filtered_lines]

        reasoning = " ".join(cleaned_lines).strip()
    else:
        answer = "invalid"

    return answer, reasoning


In [5]:
def evaluate_ambiguity(client, model_name, paragraph, question):
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": f"Paragraph: \"{paragraph}\"\nQuestion: \"{question}\"\nClassify the question's clarity in relation to the paragraph as 'Clear' or 'Ambiguous'. Provide reasoning for your classification."
                }
            ],
        )
        result = response.choices[0].message.content.strip()
        label_match = re.search(r"\b(clear|ambiguous)\b", result, re.IGNORECASE)
        label = label_match.group(1).lower() if label_match else "invalid"
        reasoning = result[len(label_match.group(0)) + 1:].strip() if label_match else None
        return label, reasoning
    except Exception as e:
        print(f"Error evaluating ambiguity: {e}")
        return "invalid", None


In [6]:
def grade_explanation_quality(client, model_name, paragraph, reasoning):
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {
                    "role": "user",
                    "content": f"Paragraph: \"{paragraph}\"\nReasoning: \"{reasoning}\"\nRate the quality of the reasoning on a scale of 1 to 5 (1: Poor, 5: Excellent). Provide a brief justification for the rating."
                }
            ],
        )
        result = response.choices[0].message.content.strip()
        grade_match = re.search(r"\b[1-5]\b", result)
        grade = int(grade_match.group(0)) if grade_match else -1
        justification = result[grade_match.end():].strip() if grade_match else None
        return grade, justification
    except Exception as e:
        print(f"Error grading explanation quality: {e}")
        return -1, None


In [None]:
def evaluate_dataset(models, dataset):
    all_results = []

    for model_info in models:
        api_key = model_info["api_key"]
        model_name = model_info["model_name"]
        
        client = initialize_client(api_key)
        print(f"Evaluating model: {model_name}")

        for entry in dataset:
            paragraph = entry["paragraph"]
            for question_entry in entry["questions"]:
                question = question_entry["question"]
                ground_truth = question_entry["ground_truth"].lower()

                llm_response = query_llm(client, model_name, paragraph, question)
                llm_answer, reasoning = extract_answer_and_reasoning(llm_response)

                ambiguity_label, ambiguity_reasoning = evaluate_ambiguity(client, model_name, paragraph, question)

                quality_grade, quality_justification = grade_explanation_quality(client, model_name, paragraph, reasoning)

                is_correct = llm_answer == ground_truth

                all_results.append({
                    "model_name": model_name,
                    "paragraph": paragraph,
                    "question": question,
                    "ground_truth": ground_truth,
                    "llm_answer": llm_answer,
                    "reasoning": reasoning,
                    "ambiguity_label": ambiguity_label,
                    "ambiguity_reasoning": ambiguity_reasoning,
                    "quality_grade": quality_grade,
                    "quality_justification": quality_justification,
                    "is_correct": is_correct,
                })

                print(f"Paragraph: {paragraph}")
                print(f"Question: {question}")
                print(f"Ground Truth: {ground_truth}")
                print(f"LLM Answer: {llm_answer}")
                print(f"Reasoning: {reasoning}")
                print(f"Ambiguity Label: {ambiguity_label}")
                print(f"Ambiguity Reasoning: {ambiguity_reasoning}")
                print(f"Quality Grade: {quality_grade}")
                print(f"Quality Justification: {quality_justification}")
                print(f"Correct: {is_correct}\n")

    return pd.DataFrame(all_results)


In [8]:
dataset = [
    {
        "paragraph": "The company launched its new software with features like data encryption and automatic backups. Pricing information is available, but there's no mention of customer support options.",
        "questions": [
            {
                "question": "Does it mention pricing?",
                "ground_truth": "yes",
            },
            {
                "question": "Is customer support discussed?",
                "ground_truth": "no",
            },
            {
                "question": "Does it talk about data encryption?",
                "ground_truth": "yes",
            },
        ],
    },
    {
        "paragraph": "The smartphone boasts a 6.5-inch display and comes with a dual-lens camera system. However, the battery capacity and processor details were not revealed.",
        "questions": [
            {
                "question": "Is the display size mentioned?",
                "ground_truth": "yes",
            },
            {
                "question": "Does it discuss battery capacity?",
                "ground_truth": "no",
            },
            {
                "question": "Are camera features included?",
                "ground_truth": "yes",
            },
        ],
    },
]


In [None]:
models = [
    {
        "model_name": "meta-llama/llama-3.2-3b-instruct:free",
        "api_key": "ap_key_1",  
    },
    {
        "model_name": "nvidia/llama-3.1-nemotron-70b-instruct",
        "api_key": "api_key_2", 
    },
]

In [None]:
df_results = evaluate_dataset(models, dataset)

print(df_results)

Evaluating model: meta-llama/llama-3.2-3b-instruct:free
Paragraph: The company launched its new software with features like data encryption and automatic backups. Pricing information is available, but there's no mention of customer support options.
Question: Does it mention pricing?
Ground Truth: yes
LLM Answer: no
Reasoning: , it does not mention pricing. The text states "Pricing information is available", which implies that it is mentioned, but does not provide the details itself.
Ambiguity Label: ambiguous
Ambiguity Reasoning: assify the question's clarity as 'Ambiguous'.

The question asks if "it mentions pricing", but the paragraph specifically states that "Pricing information is available", which clearly mentions the existence of pricing information. The question is ambiguous because it doesn't specify what "it" refers to, which could be the software, the company, or something else. To answer the question accurately, it would be necessary to know what "it" refers to in the contex

In [None]:
def calculate_combined_score(df):
    total_questions = len(df)
    correct_answers = df["is_correct"].sum()
    accuracy_score = (correct_answers / total_questions) * 100 if total_questions > 0 else 0

    clear_questions = (df["ambiguity_label"] == "clear").sum()
    ambiguity_handling_score = (clear_questions / total_questions) * 100 if total_questions > 0 else 0

    avg_quality_grade = df["quality_grade"].mean()
    explanation_quality_score = (avg_quality_grade / 5) * 100 if avg_quality_grade > 0 else 0

    
    combined_score = (
        0.5 * accuracy_score
        + 0.3 * ambiguity_handling_score
        + 0.2 * explanation_quality_score
    )

    return {
        "accuracy_score": accuracy_score,
        "ambiguity_handling_score": ambiguity_handling_score,
        "explanation_quality_score": explanation_quality_score,
        "combined_score": combined_score,
    }


# Filter rows where ambiguity_label is "ambiguous" or quality_grade < 3
def human_evaluation_check(df):
    human_check_df = df[
        (df["ambiguity_label"] == "ambiguous") | (df["quality_grade"] < 3)
    ]
    return human_check_df


final_results = []

for model_name in df_results["model_name"].unique():
    model_df = df_results[df_results["model_name"] == model_name]
    scores = calculate_combined_score(model_df)
    scores["model_name"] = model_name
    final_results.append(scores)

scores_df = pd.DataFrame(final_results)

human_check_df = human_evaluation_check(df_results)

best_model = scores_df.sort_values(by="combined_score", ascending=False).iloc[0]


In [14]:
print(human_check_df)
print(best_model)

                               model_name  \
0   meta-llama/llama-3.2-3b-instruct:free   
1   meta-llama/llama-3.2-3b-instruct:free   
5   meta-llama/llama-3.2-3b-instruct:free   
8  nvidia/llama-3.1-nemotron-70b-instruct   

                                           paragraph  \
0  The company launched its new software with fea...   
1  The company launched its new software with fea...   
5  The smartphone boasts a 6.5-inch display and c...   
8  The company launched its new software with fea...   

                              question ground_truth llm_answer  \
0             Does it mention pricing?          yes         no   
1       Is customer support discussed?           no         no   
5        Are camera features included?          yes        yes   
8  Does it talk about data encryption?          yes        yes   

                                           reasoning ambiguity_label  \
0  , it does not mention pricing. The text states...       ambiguous   
1  , The paragraph

In [15]:
human_check_df.to_csv("human_check_list.csv", index=False)


In [11]:
df_results.to_csv("model_comparison_results.csv", index=False)
