# Import

In [None]:
import os

### MUST SET YOUR OWN API KEYS: either export them in the environment before starting
### the kernel, or replace the '...' placeholders below. Never commit real keys.
API_KEY_PLACEHOLDER = '...'
API_KEYS = {
    "OPENAI_API_KEY": '...',
    "ANTHROPIC_API_KEY": '...',
    "MISTRAL_API_KEY": '...',
    # NOTE(review): "LLMANA" may be a misspelling of "LLAMA" — confirm the variable
    # name that model_api_client actually reads before renaming it.
    "LLMANA_API_KEY": '...',
}


def set_api_keys(keys, environ=None, placeholder=API_KEY_PLACEHOLDER):
    """Copy API keys into the environment without clobbering real ones.

    A value that was actually filled in always wins. A value still equal to
    ``placeholder`` is written only when the variable is not set at all, so a
    key exported before the kernel started is never replaced by '...'.

    Args:
        keys: Mapping of environment variable name -> key string.
        environ: Mapping to update; defaults to ``os.environ``.
        placeholder: Marker meaning "no key was filled in for this entry".
    """
    if environ is None:
        environ = os.environ
    for name, value in keys.items():
        if value != placeholder or name not in environ:
            environ[name] = value


set_api_keys(API_KEYS)

import numpy as np
import pandas as pd
# import random
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from tqdm import tqdm 
from model_api_client import ModelAPIClient


In [3]:
!pip install litellm
!pip install mistralai
!pip install anthropic


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
# OpenAI SDK client, built from the OPENAI_API_KEY environment variable set in the first cell.
# NOTE(review): `client` is not referenced by any other cell visible here — confirm it is still needed.
from openai import Client
client = Client(api_key=os.environ["OPENAI_API_KEY"])

# Provider SDKs installed by the pip cell above.
# NOTE(review): none of these names is used directly in the visible cells; presumably
# they are needed by model_api_client — verify before removing.
from litellm import completion
from mistralai import Mistral
import anthropic

# Load Data

In [5]:
# Locate the data folder relative to the working directory: one level up when the
# kernel runs inside a "notebooks" directory, the current directory otherwise.
current_dir = os.getcwd()
prefix = "../" if current_dir.endswith("notebooks") else "./"

base_dir = os.path.join(prefix, "summeval-data")
print(base_dir)

# The dataset is read as JSON Lines (lines=True).
dataset_path = os.path.join(base_dir, "summeval_processed_full.jsonl")
full_df = pd.read_json(dataset_path, lines=True)


../summeval-data


In [6]:
# Quick look at what was loaded: the column labels, then the number of rows.
print(full_df.columns, len(full_df), sep="\n")

Index(['summary', 'expert_annotations', 'turker_annotations', 'references',
       'model_id', 'raw', 'mistral_relevance', 'mistral_fluency',
       'mistral_coherence', 'mistral_consistency', 'all_annotations',
       'scores_coherence_expert', 'scores_coherence_turker',
       'scores_coherence_all', 'var_coherence_expert', 'var_coherence_turker',
       'var_coherence_all', 'mean_coherence_expert', 'mean_coherence_turker',
       'mean_coherence_all', 'var_coherence_expert_disc',
       'var_coherence_turker_disc', 'var_coherence_all_disc', 'diff_coherence',
       'scores_consistency_expert', 'scores_consistency_turker',
       'scores_consistency_all', 'var_consistency_expert',
       'var_consistency_turker', 'var_consistency_all',
       'mean_consistency_expert', 'mean_consistency_turker',
       'mean_consistency_all', 'var_consistency_expert_disc',
       'var_consistency_turker_disc', 'var_consistency_all_disc',
       'diff_consistency', 'scores_fluency_expert', 'scores_flu

In [7]:
N_SUBSET = 300

# Draw a fixed-seed random subset so the same rows are selected on every run.
# The sampled frame keeps the original row labels of full_df.
selected_df = full_df.sample(n=N_SUBSET, random_state=42)
selected_indices = selected_df.index

print(len(selected_df))
print(selected_indices)

300
Index([ 526,  354,  168,  135,  937, 1544, 1253,  237,  478,  650,
       ...
        163, 1296,  266, 1005,  873,  692, 1450, 1263,  192,  548],
      dtype='int64', length=300)


# Build Scoring Prompt

In [16]:
def scoring_prompt_with_explanation(article, summary, assertion_text):
    """Build the binary-scoring prompt for one (article, summary, criterion) triple.

    Args:
        article: Text of the source news article.
        summary: Summary to be judged.
        assertion_text: The single criterion the summary is judged against.

    Returns:
        The prompt string. It asks for a reply of the form
        "Score: [0 or 1]" followed by "Explanation: [one-sentence explanation]".
    """
    # The opening sentence ends with a space before the newline; it is kept in an
    # ordinary string literal so that the trailing space cannot be lost in editing.
    opening = "You will be given one summary written for a news article. \n"
    instructions = f"""Your task is to evaluate the summary on a specific criterion.
Please read and understand these instructions carefully.

Evaluation Criterion:
{assertion_text}

Evaluation Steps:
1. Read the article and summary carefully.
2. Determine whether the summary meets the specified criterion.
3. Provide:
    - A score of 1 if the criterion is met, or 0 if it is not.
    - A brief explanation justifying your score.

RESPONSE FORMAT:
Score: [0 or 1]
Explanation: [one-sentence explanation]

Begin below:

Article: {article}
Summary: {summary}"""

    return opening + instructions


In [17]:
# Baseline assertion set: {criterion: {assertion_id: assertion text}}.
# Each assertion text is inserted verbatim into the scoring prompt, so any wording
# change here changes what the model is asked; re-run scoring after editing.
# Insertion order matters: it fixes each assertion's column in the result tables.
assertion_dictionary_og = {
    'fluency': {
        'C1-A1': 'Fluency measures the quality of individual sentences, are they well-written and grammatically correct. Consider the quality of individual sentences.',
        'C1-A2': 'Each sentence is free from grammatical errors and awkward phrasing.',
        # C1-A3 is phrased negatively: a score of 1 means the flaw IS present.
        'C1-A3': 'Contains sentences that are incomplete or lack a clear subject-verb-object structure',
        # 'C1-A3': 'Sentences contains grammatical errors and awkward phrasing.', ### negation example
    },
    'relevance': {
        'C2-A1': 'Relevance measures how well the summary captures the key points of the article. Consider whether all and only the important aspects are contained in the summary.',
        'C2-A2': 'Contains no irrelevant or extraneous information unrelated to the article\'s main points',
        'C2-A3': 'Includes all context necessary for understanding key events or claims',
        'C2-A4': 'Includes absolutely all information that could reasonably be necessary to evaluate events or claims, even if not central to the article’s key points.',
        'C2-A5': 'Includes at least some information needed to understand key events or claims.',
    },
    'coherence': {
        'C3-A1': 'Coherence measures the quality of all sentences collectively, do they fit together and sound natural. Consider the quality of the summary as a whole.',
        'C3-A2': 'Sentences in the summary logically progress from one to another without introducing conflicting or unrelated information.',
        'C3-A3': 'Maintains logical progression without conflicting or contradictory information',
    },
    'consistency': {
        'C4-A1': "Consistency measures whether the facts in the summary are consistent with the facts in the original article. Consider whether the summary does reproduce all facts accurately and does not make up untrue information.",
        'C4-A2': 'The summary includes no fabricated details or misrepresented facts compared to the original article.',
        'C4-A3': 'Summary contains only verifiable facts directly present in the original article.',
    }
}

In [18]:

# Alternative assertion set with the same {group: {assertion_id: assertion text}} layout.
# Assertion ids restart at C1 here, so they overlap with the ids of the baseline set.
assertion_dictionary_advanced = {
    "ambiguity": {
        "C1-A1": "The summary demonstrates appropriate level of detail for the intended reader.",
        "C1-A2": "The summary includes the main claim and supporting evidence from each paragraph of the source article.",
    },
    "complexity": {
        # C2-A1 bundles several requirements; C2-A2 and C2-A3 each state a single one.
        "C2-A1": "The summary accurately captures the article's key points, maintains neutral tone throughout, demonstrates coherent organization between sentences, and avoids both redundancy and important omissions while preserving the original meaning.",
        "C2-A2": "The summary accurately captures the article's key points.",
        "C2-A3": "The summary maintains neutral tone throughout.",
    },
    "language": {
        "C3-A1": "The summary demonstrates high extractive fidelity while maintaining abstractive coherence across semantic boundaries.",
        "C3-A2": "The summary uses words from the original article while connecting ideas in a logical way.",
    },
}

In [None]:
### Change as needed: selects which assertion set the cells below score
### (assertion_dictionary_og or assertion_dictionary_advanced).

assertion_dictionary = assertion_dictionary_og

# Run Scoring Code

Models Used:
* {'provider': 'openai', 'model': 'gpt-4o-mini-2024-07-18'} - Recommended # threads: 10
* {'provider': 'openai', 'model': 'gpt-3.5-turbo'} - Recommended # threads: 10
* {'provider': 'mistral', 'model': 'mistral-small-latest'} - Recommended # threads: 5
* {'provider': 'mistral', 'model': 'mistral-medium-latest'} - Recommended # threads: 4
* {'provider': 'anthropic', 'model': 'claude-3-5-haiku-20241022'} - Recommended # threads: 7


In [20]:
N_SAMPLES = 1 ### number of trials for each data point/ assertion

# Total number of assertions across all groups of the active dictionary.
n_assertions = sum(map(len, assertion_dictionary.values()))

# Result tables are indexed [row position, assertion position, trial].
# -1 in score_table marks a cell that has no valid score yet.
table_shape = (N_SUBSET, n_assertions, N_SAMPLES)
score_table = np.full(table_shape, -1, dtype=int)
explain_table = np.full(table_shape, "", dtype=object)

print(score_table.shape)


(300, 14, 1)


In [21]:
# Registry of supported models, keyed by the short name that MODEL_KEY selects.
# "threads" is the recommended number of concurrent worker threads for that model.
MODELS = {
    "gpt-4o-mini": dict(provider="openai", model="gpt-4o-mini-2024-07-18", threads=10),
    "gpt-3.5": dict(provider="openai", model="gpt-3.5-turbo", threads=10),
    "mistral-small": dict(provider="mistral", model="mistral-small-latest", threads=5),
    "mistral-medium": dict(provider="mistral", model="mistral-medium-latest", threads=4),
    "claude-haiku": dict(provider="anthropic", model="claude-3-5-haiku-20241022", threads=7),
}

In [22]:
MODEL_KEY = "gpt-4o-mini" ### Change as needed

# Unpack the registry entry for the chosen model into the constants used below.
selected_model = MODELS[MODEL_KEY]
PROVIDER = selected_model["provider"]
MODEL = selected_model["model"]
RECOMMENDED_THREADS = selected_model["threads"]

# Both values are forwarded to ModelAPIClient.call_api by score_from_prompt.
MAX_TOKENS = 1000
MOCK = False

In [None]:
### Up to max_retries attempts (always at least one); the wait doubles after each retryable error

def score_from_prompt(prompt, max_retries=3, initial_delay=4, call_api=None, sleep=time.sleep):
    """Send one scoring prompt to the model and parse the reply.

    Args:
        prompt: Full prompt text to send.
        max_retries: Maximum number of attempts. Values below 1 are treated as 1,
            so the API is always called at least once.
        initial_delay: Seconds to wait after the first retryable failure; the wait
            doubles after every further retryable failure.
        call_api: Optional callable ``prompt -> reply text``. When omitted,
            ModelAPIClient.call_api is used with the module-level PROVIDER, MODEL,
            MAX_TOKENS and MOCK settings.
        sleep: Callable used for every pause (defaults to time.sleep); replaceable
            so the function can be exercised without real waiting.

    Returns:
        A ``(score, explanation)`` tuple. ``score`` is 0 or 1, or -1 when the reply
        contains no valid score, a non-retryable error occurs, or all attempts fail.
        On errors ``explanation`` carries an "Error: ..." message.
    """
    # An error is retried only if its message contains one of these markers.
    retryable_markers = ("429", "Rate limit", "Too Many Requests", "Empty response")

    if call_api is None:
        def call_api(text):
            return ModelAPIClient.call_api(
                text, provider=PROVIDER, model=MODEL, max_tokens=MAX_TOKENS, mock=MOCK
            )

    attempts = max(1, max_retries)
    delay = initial_delay

    for attempt in range(1, attempts + 1):
        try:
            if attempt > 1:
                print("Attempting Retry #", attempt - 1)
            sleep(2)  # fixed pause before every request, on top of any backoff

            content = call_api(prompt)

            if not content:
                raise ValueError("Empty response")

            content = content.strip()

            # Accept only a lone 0 or 1, optionally wrapped in brackets or markdown
            # bold ("Score: [1]", "**Score:** 1"). Reject other numbers ("10", "0.5")
            # and an echoed format line ("Score: [0 or 1]").
            score_match = re.search(r'Score:[\s*]*\[?\s*([01])(?!\d|\.\d|\s*or\b)', content)
            explanation_match = re.search(r'Explanation:\**\s*(.+)', content, re.DOTALL)

            score = int(score_match.group(1)) if score_match else -1
            explanation = explanation_match.group(1).strip() if explanation_match else "No explanation found"

            if score == -1:
                print("-1 DETECTED")
            return score, explanation

        except Exception as e:
            err_str = str(e)
            print(f"[Attempt {attempt}] Error: {err_str}")

            if not any(marker in err_str for marker in retryable_markers):
                return -1, f"Error: {err_str}"

            # Back off only when another attempt will actually follow.
            if attempt < attempts:
                sleep(delay)
                delay *= 2  # Exponential backoff

    return -1, "Error: Max retries exceeded"
    


In [None]:
START, END = 0, N_SUBSET ### Change the range as needed to run portions of dataset

# Flatten the nested {group: {assertion_id: text}} mapping. An assertion's position
# in this list is its column index (a_idx) in the result tables.
assertion_texts = [
    text
    for assertions in assertion_dictionary.values()
    for text in assertions.values()
]

# One task per (row, assertion, trial). `i` is the row's position within
# selected_df, so it can be used directly as the first index of the result tables.
tasks = []
rows = selected_df.iloc[START:END].itertuples(index=False)
for i, row in enumerate(rows, start=START):
    for a_idx, assertion in enumerate(assertion_texts):
        for s in range(N_SAMPLES):
            tasks.append((i, a_idx, s, row.raw, row.summary, assertion))


In [None]:
max_threads = RECOMMENDED_THREADS ### Change as needed, refer to recommended value above

def scoring_task(i, a_idx, s, raw, summary, assertion):
    """Score one (row, assertion, trial) task.

    The three indices are passed straight through to the result so the caller can
    place the score even though tasks finish in arbitrary order.

    Returns:
        (i, a_idx, s, score, explanation)
    """
    prompt = scoring_prompt_with_explanation(article=raw, summary=summary, assertion_text=assertion)
    ### score_from_prompt also accepts max_retries= and initial_delay= overrides
    score, explanation = score_from_prompt(prompt)
    return i, a_idx, s, score, explanation

with ThreadPoolExecutor(max_workers=max_threads) as executor:
    # Queue every task up front, remembering which task each future belongs to.
    future_to_task = {}
    for task in tasks:
        future_to_task[executor.submit(scoring_task, *task)] = task

    # Results arrive in completion order; each one carries its own table indices.
    progress = tqdm(as_completed(future_to_task), total=len(future_to_task), desc="Scoring")
    for finished in progress:
        row_idx, assertion_idx, sample_idx, score, explanation = finished.result()
        cell = (row_idx, assertion_idx, sample_idx)
        score_table[cell] = int(score)
        explain_table[cell] = explanation or ""


Scoring: 100%|██████████| 140/140 [01:19<00:00,  1.77it/s]


In [33]:
### Sanity check code, especially if experimenting with max_threads

def _all_filled(table, label):
    """Print whether `table` is free of -1 entries, listing their indices if not.

    Returns the flag that was printed.
    """
    missing = table == -1
    filled = ~(missing.any())
    print(label, filled)
    if not filled:
        print("Indices with errors:", np.argwhere(missing))
    return filled

# First the rows below END, then the whole table. Rows at or beyond END are
# expected to still hold -1 when only a portion of the dataset has been run.
no_errors = _all_filled(score_table[:END, :, :], "Scores up until END are filled appropriately:")
no_errors_all = _all_filled(score_table[:, :, :], "All scores are filled appropriately:")

Scores up until END are filled appropriately: True
All scores are filled appropriately: False
Indices with errors: [[ 10   0   0]
 [ 10   1   0]
 [ 10   2   0]
 ...
 [299  11   0]
 [299  12   0]
 [299  13   0]]


In [24]:
### Save results

# Anchor the output folder with the same `prefix` used to locate the data, so the
# results land in the same place whether the kernel runs from "notebooks" or from
# its parent directory.
# NOTE(review): the folder is named "adv" while assertion_dictionary is set to
# assertion_dictionary_og above — confirm the folder matches the active set.
output_dir = os.path.join(prefix, "results", "adv") ### Change to create a new folder as needed
os.makedirs(output_dir, exist_ok=True)

np.save(os.path.join(output_dir, f"score_table_{MODEL}.npy"), score_table)
# explain_table has dtype=object, so np.save pickles it; read it back with
# np.load(..., allow_pickle=True).
np.save(os.path.join(output_dir, f"explain_table_{MODEL}.npy"), explain_table)