# Golden Dataset Evaluation with a Standalone Language Model

In this notebook, a golden dataset is processed with a standalone large language model (LLM) and the generated answers are then evaluated automatically. The goal is to examine the model's performance in terms of accuracy, consistency with the reference answers, and linguistic quality.

## Procedure:
1. **Reading in the golden dataset**  
   The golden dataset contains predefined inputs (e.g., questions or prompts) and the expected answers (ground truth).

2. **Generating model answers**  
   The questions/prompts are passed to a standalone LLM one after the other. The model's outputs are saved.

3. **Evaluation**  
   The generated answers are compared with the reference answers, e.g., using metrics such as BLEU, ROUGE, or similarity measures.

4. **Analysis**  
   Results are evaluated quantitatively and, if necessary, qualitatively.

#### 1.1 Imports

In [None]:
# Imports
import json
import openai
import os
import time
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI

# === Local project modules ===
# Add the directory two levels above the current working directory to the
# import path (presumably the repository root that holds `ipynb_notebooks`).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

from ipynb_notebooks.evaluation_datasets.generation_eval.generation_metrics import run_generation_evaluation
from ipynb_notebooks.evaluation_datasets.generation_eval.llm_as_a_judge import run_llm_judge_parallel, run_llm_rejudge_parallel, calculate_and_visualize_scores_of_evaluation_scheme

#### 1.2 Configurations

In [None]:
# Configurations
# Populate the process environment from a .env file; the project directory is
# expected to contain one that holds the API keys.
load_dotenv()

# Read the OpenAI API key from the environment and create the client that the
# query helper uses. If your .env file uses a different variable name, change
# "OPENAI_API_KEY" accordingly.
openai.api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI(api_key=openai.api_key)

#### 2. Enriching the Golden Dataset with Generated Responses from the Standalone LLM

In [None]:
# Model configuration (GPT-4o-mini)
model_name = "gpt-4o-mini"
temperature = 0.0  # Zero temperature to keep outputs as deterministic as possible for evaluation

# Load the golden dataset (a JSON list of query objects)
with open("eval_datasets/golden_qa_evalset_generation.json", encoding="utf-8") as f:
    dataset = json.loads(f.read())

# Query the configured chat model with a bounded number of attempts
def query_gpt4o_safe(prompt: str, retries: int = 3, delay: float = 2) -> str:
    """Send *prompt* to the chat model and return the stripped answer text.

    The request uses the module-level ``client``, ``model_name`` and
    ``temperature``. Any exception raised during an attempt leads to another
    attempt after ``delay`` seconds; catching ``Exception`` is a deliberate
    best-effort boundary so that one failing prompt does not abort a batch.

    Args:
        prompt: Text sent as the single user message.
        retries: Total number of attempts (not the number of extra retries).
        delay: Seconds to sleep after a failed attempt before trying again.

    Returns:
        The answer with surrounding whitespace removed. An empty string is
        returned when every attempt failed, when ``retries`` is smaller than
        1 (no request is made), or when the response carries no text content.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature
            )
            # The message content may be None; treat that as an empty answer
            # instead of failing on `.strip()` and retrying needlessly.
            content = response.choices[0].message.content
            return (content or "").strip()
        except Exception as e:
            if attempt < retries - 1:
                time.sleep(delay)
            else:
                print(f"Error for prompt: {prompt[:50]}... – {e}")
                return ""
    # Reached only when the loop body never ran (retries < 1): keep the
    # return type consistent instead of falling through to None.
    return ""

In [None]:
# Fan the dataset queries out over a thread pool and persist the answers
def generate_answers_parallel(golden_dataset_json, max_workers=5, output_json="golden_qa_evalset_generation_with_answers.json"):
    """Generate a model answer for every entry of a golden dataset file.

    The input file is read from ``eval_datasets/``; each entry's ``"query"``
    is passed to ``query_gpt4o_safe`` on a worker thread. The answer is stored
    under ``"generated_response"`` in the entry, and the enriched list is
    written back to ``eval_datasets/`` under ``output_json``.

    Args:
        golden_dataset_json: File name of the input dataset inside
            ``eval_datasets/``.
        max_workers: Size of the thread pool.
        output_json: File name of the enriched dataset inside
            ``eval_datasets/``.

    Returns:
        The path of the written output file.
    """
    source_path = f"eval_datasets/{golden_dataset_json}"
    with open(source_path, "r", encoding="utf-8") as f_in:
        records = json.load(f_in)

    # Pre-sized so each answer lands at the position of its entry, no matter
    # in which order the futures complete.
    answers = [None] * len(records)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        position_by_future = {}
        for position, record in enumerate(records):
            submitted = pool.submit(query_gpt4o_safe, record["query"])
            position_by_future[submitted] = position

        progress = tqdm(as_completed(position_by_future), total=len(records), desc="Querying GPT-4o-mini")
        for finished in progress:
            position = position_by_future[finished]
            try:
                answers[position] = finished.result()
            except Exception as err:
                print(f"Error at index {position}: {err}")
                answers[position] = ""

    # Attach each answer to the entry it was generated for
    for position, record in enumerate(records):
        record["generated_response"] = answers[position]

    output_path = f"eval_datasets/{output_json}"

    # Persist the enriched dataset
    with open(output_path, "w", encoding="utf-8") as f_out:
        json.dump(records, f_out, ensure_ascii=False, indent=4)

    print(f"\n✅ Done. Answers written to: {output_path}")
    return output_path

In [None]:
# File name of the golden dataset inside eval_datasets/
golden_dataset_json = "golden_qa_evalset_generation.json"

# Generate the answers with 20 worker threads; the call returns the path of
# the enriched dataset file.
llm_enriched_golden_dataset = generate_answers_parallel(
    golden_dataset_json,
    max_workers=20,
)

#### 3. Evaluation of Generated Responses

In [None]:
# Run the generation-metrics evaluation on the enriched dataset. Only the
# file name (the part after the last "/") is passed, not the full path.
golden_dataset_generation_results = run_generation_evaluation(
    json_filename=llm_enriched_golden_dataset.rsplit("/", 1)[-1],
    model_name=model_name,
    # evaluation_mode="final_eval"
)
display(golden_dataset_generation_results)

#### LLM-as-a-Judge for Claim Support 

In [None]:
# Number of parallel workers used for both judge passes
max_workers = 10

# Input: the golden dataset enriched with the generated responses
input_path = "eval_datasets/golden_qa_evalset_generation_with_answers.json"

# Outputs: results of the first judge pass and of the final re-judge pass
output_path = "eval_results/golden_qa_evalset_standalone_llm_as_a_judge_results.json"
final_rejudge_output_path = "eval_results/golden_qa_evalset_standalone_llm_as_a_judge_final_rejudge_results.json"

In [None]:
# LLM-as-a-Judge for Comparison and Further Justification

# First pass: judge the generated responses
llm_as_a_judge_first_eval_results_path = run_llm_judge_parallel(
    input_path=input_path,
    output_path=output_path,
    max_workers=max_workers,
)

# Second pass: re-judge, using the value returned by the first pass as input
llm_as_a_judge_rejudge_results_path = run_llm_rejudge_parallel(
    input_path=llm_as_a_judge_first_eval_results_path,
    output_path=final_rejudge_output_path,
    max_workers=max_workers,
)

In [None]:
# Output names passed to the score calculation/visualization for each judge pass
output_file_name_LLMaaJ_first = "llm_as_a_judge_first_results"
output_file_name_LLMaaJ_rejudge = "llm_as_a_judge_rejudge_results"

# `manual_results_path` and `output_file_name_manual` are not defined in the
# visible part of this notebook. Referencing them unconditionally raises
# NameError and stops the cell before the LLM-as-a-judge scores are computed,
# so the manual results are scored only when both names have been provided
# (e.g. by an earlier cell).
if "manual_results_path" in globals() and "output_file_name_manual" in globals():
    manual_eval_scores = calculate_and_visualize_scores_of_evaluation_scheme(manual_results_path, output_file_name_manual)
else:
    manual_eval_scores = None
    print("Skipping manual evaluation scores: manual_results_path / output_file_name_manual are not defined.")

# Scores for the first judge pass and for the final re-judge pass
llm_as_a_judge_first_eval_scores = calculate_and_visualize_scores_of_evaluation_scheme(llm_as_a_judge_first_eval_results_path, output_file_name_LLMaaJ_first)
llm_as_a_judge_final_rejudge_eval_scores = calculate_and_visualize_scores_of_evaluation_scheme(llm_as_a_judge_rejudge_results_path, output_file_name_LLMaaJ_rejudge)