In [None]:
import os

### MUST SET YOUR OWN OPENAI API KEY HERE
os.environ["OPENAI_API_KEY"] = '...'

import numpy as np
import pandas as pd
import random

import textwrap
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from openai import Client
client = Client(api_key=os.environ["OPENAI_API_KEY"])


# Set Up

In [4]:
current_dir = os.getcwd()
if current_dir.endswith("notebooks"):
    prefix = "../"
else:
    prefix = "./"

base_dir = os.path.join(prefix, "summeval-data")
print(base_dir)

full_df = pd.read_json(os.path.join(base_dir, "summeval_processed_full.jsonl"), lines=True)


../summeval-data


In [6]:
N_SUBSET = 300

selected_df = full_df.sample(n=N_SUBSET, random_state=42)
print(len(selected_df))

selected_indices = selected_df.index
print(selected_indices)

300
Index([ 526,  354,  168,  135,  937, 1544, 1253,  237,  478,  650,
       ...
        163, 1296,  266, 1005,  873,  692, 1450, 1263,  192,  548],
      dtype='int64', length=300)


In [7]:
models = [
    "gpt-4o-mini-2024-07-18",
    "gpt-3.5-turbo",
    "mistral-small-latest",
    "mistral-medium-latest",
    "claude-3-5-haiku-20241022"
]

In [8]:
output_dir = "../results/og"  ### Change as needed
score_tables = {}
explain_tables = {}

for model in models:

    score_path = os.path.join(output_dir, f"score_table_{model}.npy")
    explain_path = os.path.join(output_dir, f"explain_table_{model}.npy")

    score_tables[model] = np.load(score_path, allow_pickle=True)
    explain_tables[model] = np.load(explain_path, allow_pickle=True)

    print(f"{model} → Scores shape: {score_tables[model].shape}, Explains shape: {explain_tables[model].shape}")
    
    if np.all(score_tables[model] != -1) and score_tables[model].size > 0 and explain_tables[model].size > 0:
        print(f"[OK]: Scores & explanations loaded successfully.\n")
    else:
        print(f"[WARN]: Missing or invalid data.\n")

gpt-4o-mini-2024-07-18 → Scores shape: (300, 14, 1), Explains shape: (300, 14, 1)
[OK]: Scores & explanations loaded successfully.

gpt-3.5-turbo → Scores shape: (300, 14, 1), Explains shape: (300, 14, 1)
[OK]: Scores & explanations loaded successfully.

mistral-small-latest → Scores shape: (300, 14, 1), Explains shape: (300, 14, 1)
[OK]: Scores & explanations loaded successfully.

mistral-medium-latest → Scores shape: (300, 14, 1), Explains shape: (300, 14, 1)
[OK]: Scores & explanations loaded successfully.

claude-3-5-haiku-20241022 → Scores shape: (300, 14, 1), Explains shape: (300, 14, 1)
[OK]: Scores & explanations loaded successfully.



In [9]:
properties_og = ["fluency", "relevance", "coherence", "consistency"]

assertion_dictionary_og = {
    'fluency': {
        'C1-A1': 'Fluency measures the quality of individual sentences, are they well-written and grammatically correct. Consider the quality of individual sentences.',
        'C1-A2': 'Each sentence is free from grammatical errors and awkward phrasing.',
        'C1-A3': 'Contains sentences that are incomplete or lack a clear subject-verb-object structure',
        # 'C1-A3': 'Sentences contains grammatical errors and awkward phrasing.', ### negation example
    },
    'relevance': {
        'C2-A1': 'Relevance measures how well the summary captrues the key points of the article. Consider whether all and only the important aspects are contained in the summary.',
        'C2-A2': 'Contains no irrelevant or extraneous information unrelated to the article\'s main points',
        'C2-A3': 'Includes all context necessary for understanding key events or claims',
        'C2-A4': 'Includes absolutely all information that could reasonably be necessary to evaluate events or claims, even if not central to the article’s key points.',
        'C2-A5': 'Includes at least some information needed to understand key events or claims.',
    },
    'coherence': {
        'C3-A1': 'Coherence measures the quality of all sentences collectively, to the fit togheter and soound naturally. Consider the quality of the summary as a whole.',
        'C3-A2': 'Sentences in the summary logically progress from one to another without introducing conflicting or unrelated information.',
        'C3-A3': 'Maintains logical progression without conflicting or contradictory information',
    },
    'consistency': {
        'C4-A1': "Consistency measures whether the facts in the summary are consistent with the facts in the original article. COnsider whether the summary does reproduce all facts accurately and does not make up untrue information.",
        'C4-A2': 'The summary includes no fabricated details or misrepresented facts compared to the original article.',
        'C4-A3': 'Summary contains only verifiable facts directly present in the original article.',
    }
}

In [None]:
### Change as needed

properties = properties_og
assertion_dictionary = assertion_dictionary_og

In [14]:
def _build_assertion_labels(assertion_dictionary):
    assertion_labels = []
    wrapped_labels = []
    for prop, assertions in assertion_dictionary.items():
        for a_id, assertion_text in assertions.items():
            label = f"{prop.capitalize()} - {a_id}"
            wrapped = textwrap.fill(assertion_text, width=50)
            full_label = f"{label}\n{wrapped}"
            assertion_labels.append(label)
            wrapped_labels.append(full_label)
    return assertion_labels, wrapped_labels

assertion_labels, wrapped_labels = _build_assertion_labels(assertion_dictionary)
print(assertion_labels)
print(wrapped_labels)

['Fluency - C1-A1', 'Fluency - C1-A2', 'Fluency - C1-A3', 'Relevance - C2-A1', 'Relevance - C2-A2', 'Relevance - C2-A3', 'Relevance - C2-A4', 'Relevance - C2-A5', 'Coherence - C3-A1', 'Coherence - C3-A2', 'Coherence - C3-A3', 'Consistency - C4-A1', 'Consistency - C4-A2', 'Consistency - C4-A3']
['Fluency - C1-A1\nFluency measures the quality of individual\nsentences, are they well-written and grammatically\ncorrect. Consider the quality of individual\nsentences.', 'Fluency - C1-A2\nEach sentence is free from grammatical errors and\nawkward phrasing.', 'Fluency - C1-A3\nContains sentences that are incomplete or lack a\nclear subject-verb-object structure', 'Relevance - C2-A1\nRelevance measures how well the summary captrues\nthe key points of the article. Consider whether\nall and only the important aspects are contained\nin the summary.', "Relevance - C2-A2\nContains no irrelevant or extraneous information\nunrelated to the article's main points", 'Relevance - C2-A3\nIncludes all contex

In [11]:
correlation_cases = ["strong", "weak", "inverse"]

# 1: Assertion Selection

### Strong Correlation

C4-A1, C4-A2: score 0.93

### Weak Correlation

C2-A2, C2-A3: score 0.76

### Inverse Correlation

C1-A2, C1-A3: score -0.57

In [15]:
correlation_values = {
    "strong": 0.93,
    "weak": 0.76,
    "inverse": -0.57
}

# 2: Sorted Case View

In [16]:
assertion_pair_indices = {
    "strong": (
        assertion_labels.index("Consistency - C4-A1"),
        assertion_labels.index("Consistency - C4-A2")
    ),
    "weak": (
        assertion_labels.index("Relevance - C2-A2"),
        assertion_labels.index("Relevance - C2-A3")
    ),
    "inverse": (
        assertion_labels.index("Fluency - C1-A2"),
        assertion_labels.index("Fluency - C1-A3")
    )
}

print(assertion_pair_indices)

{'strong': (11, 12), 'weak': (4, 5), 'inverse': (1, 2)}


In [17]:
avg_scores_across_models = np.mean(np.stack([score_table_model for score_table_model in score_tables.values()]), axis=0)  # shape: (300, 12)
avg_scores_across_models = avg_scores_across_models.squeeze()
print(avg_scores_across_models.shape)

(300, 14)


In [19]:
def get_strong_items(scores, a_idx1, a_idx2, sample_n=5):
    cond = (scores[:, a_idx1] == scores[:, a_idx2])
    full_indices = np.where(cond)[0]

    if len(full_indices) >= sample_n:
        sampled_indices = np.random.choice(full_indices, size=sample_n, replace=False)
    else:
        sampled_indices = full_indices  

    return full_indices, sampled_indices

INV_THRESHOLD_LOW = 0.2
INV_THRESHOLD_HIGH = 0.8

def get_inverse_items(scores, a_idx1, a_idx2, sample_n=5):
    cond = ((scores[:, a_idx1] >= INV_THRESHOLD_HIGH) & (scores[:, a_idx2] <= INV_THRESHOLD_LOW)) | \
           ((scores[:, a_idx1] <= INV_THRESHOLD_LOW) & (scores[:, a_idx2] >= INV_THRESHOLD_HIGH))
    full_indices = np.where(cond)[0]

    if len(full_indices) >= sample_n:
        sampled_indices = np.random.choice(full_indices, size=sample_n, replace=False)
    else:
        sampled_indices = full_indices

    return full_indices, sampled_indices

def get_weak_items(scores, a_idx1, a_idx2, sample_n=5):
    total = scores.shape[0]
    return np.random.choice(total, size=sample_n, replace=False)

In [21]:
a_idx_strong = assertion_pair_indices["strong"]
a_idx_weak = assertion_pair_indices["weak"]
a_idx_inverse = assertion_pair_indices["inverse"]

# Run and display results
strong_all, strong_sample = get_strong_items(avg_scores_across_models, *a_idx_strong)
print(f"Strong {a_idx_strong}: {strong_sample} (Sampled) (Total: {len(strong_all)})")

inverse_all, inverse_sample = get_inverse_items(avg_scores_across_models, *a_idx_inverse)
print(f"Inverse {a_idx_inverse}: {inverse_sample} (Sampled) (Total: {len(inverse_all)})")

weak_sample = get_weak_items(avg_scores_across_models, *a_idx_weak)
print(f"Weak {a_idx_weak}: {weak_sample} (Sampled) (Total: {len(weak_sample)})")

Strong (11, 12): [191 240 134 279 264] (Sampled) (Total: 220)
Inverse (1, 2): [201  16  13 123  96] (Sampled) (Total: 127)
Weak (4, 5): [ 30  10 215  13 241] (Sampled) (Total: 5)


# 3: P-value/ Confidence Interval

In [23]:
from scipy.stats import pearsonr

for label, (a_idx1, a_idx2) in assertion_pair_indices.items():
    a1_scores = avg_scores_across_models[:, a_idx1]
    a2_scores = avg_scores_across_models[:, a_idx2]
    
    corr, p_value = pearsonr(a1_scores, a2_scores)
    print(f"\n{label.capitalize()} Correlation:")
    print(f"  Assertions: {assertion_labels[a_idx1]} vs. {assertion_labels[a_idx2]}")
    print(f"  Pearson r: {corr:.3f}")
    print(f"  p-value: {p_value}")


Strong Correlation:
  Assertions: Consistency - C4-A1 vs. Consistency - C4-A2
  Pearson r: 0.932
  p-value: 9.92992240443276e-134

Weak Correlation:
  Assertions: Relevance - C2-A2 vs. Relevance - C2-A3
  Pearson r: 0.765
  p-value: 7.98319050865786e-59

Inverse Correlation:
  Assertions: Fluency - C1-A2 vs. Fluency - C1-A3
  Pearson r: -0.567
  p-value: 6.71202857035294e-27


# 4: Diagnostic Prompt

## 4.1: Build Diagnostic Prompt

In [24]:
def build_hypothesis_prompt_iter2(df, input_col, output_col, 
                             assertion_text_1, assertion_text_2,
                             assertion_type_1, assertion_type_2,
                             correlation_strength, correlation_value, p_value,
                             assertion_scores, item_indices, *,
                             include_categories=True):
    
    prompt = f"""Your job is to help understand statistical relationships between two binary assertions used to evaluate AI system behavior.

Assertion Details:
* A1: {assertion_text_1}
* A2: {assertion_text_2}
* Assertion types: {assertion_type_1} and {assertion_type_2}

Correlation Analysis:
* Correlation coefficient: {correlation_value:.3f}
* Correlation strength: {correlation_strength}
* Statistical significance: p = {p_value:.4g}

Based on the following list of Generative AI system inputs and outputs and corresponding assertion values, generate hypotheses for why the assertions exhibit this correlation pattern. 
Hypotheses should describe very specific logical or semantic relationships between assertions that describe correlation patterns. 

For example, hypotheses might identify patterns such as: 
- The two assertions capture fundamentally different concepts
- One assertion is a logical negation, subset, or superset of another
- The assertions capture similar or redundant information about system behavior
- The assertions tend to fail together or in opposing patterns under certain conditions

Hypotheses should not state vague or general observations, such as the assertions are related to one another. They should be much more specific. 

Example hypotheses:
* "The two assertions capture different concepts with minimal overlap"
* "One assertion is a logical negation of another, creating inverse correlation"
* "Assertion 1 is more sensitive to edge cases than Assertion 2"
* "The assertions have different thresholds, with A1 being more restrictive than A2"
"""

    # Add the 5 data items
    for idx, row_idx in enumerate(item_indices, 1):
        row = df.iloc[row_idx]
        a1_score, a2_score = assertion_scores[row_idx]
        prompt += f"""
Item {idx}: 
Input: {getattr(row, input_col)}
Output: {getattr(row, output_col)}
Assertion 1: {a1_score:.1f}
Assertion 2: {a2_score:.1f}
"""
    prompt += """
Instructions: Return 5 hypotheses for the observed correlation pattern, formatted as "Hypothesis {#}: {hypothesis statement}".
"""

    return prompt

In [26]:
assertion_types = {
    "strong": ("performance-related", "performance-related"),
    "weak": ("performance-related", "performance-related"),
    "inverse": ("performance-related", "performance-related"),
}

def get_assertion_text(label):
    concept, a_id = label.split(" - ")
    return assertion_dictionary[concept.lower()][a_id]

In [27]:
diagnostic_prompts = {}

for case in correlation_cases:
    idx1, idx2 = assertion_pair_indices[case]
    a1_label, a2_label = assertion_labels[idx1], assertion_labels[idx2]
    
    a1_text = get_assertion_text(a1_label)
    a2_text = get_assertion_text(a2_label)

    scores = avg_scores_across_models[:, [idx1, idx2]]

    if case == "strong":
        item_indices = strong_sample
    elif case == "weak":
        item_indices = weak_sample
    else:
        item_indices = inverse_sample

    r_val = correlation_values[case]

    _, p_val = pearsonr(avg_scores_across_models[:, idx1], avg_scores_across_models[:, idx2])

    type1, type2 = assertion_types[case]

    prompt = build_hypothesis_prompt_iter2(
        df=selected_df,
        input_col="raw",
        output_col="summary",
        assertion_text_1=a1_text,
        assertion_text_2=a2_text,
        assertion_type_1=type1,
        assertion_type_2=type2,
        correlation_strength=case,
        correlation_value=r_val,
        p_value=p_val,
        assertion_scores=scores,
        item_indices=item_indices,
        include_categories=True, ### toggle 
    )

    diagnostic_prompts[case] = prompt

print(diagnostic_prompts["inverse"])

Your job is to help understand statistical relationships between two binary assertions used to evaluate AI system behavior.

Assertion Details:
* A1: Each sentence is free from grammatical errors and awkward phrasing.
* A2: Contains sentences that are incomplete or lack a clear subject-verb-object structure
* Assertion types: performance-related and performance-related

Correlation Analysis:
* Correlation coefficient: -0.570
* Correlation strength: inverse
* Statistical significance: p = 6.712e-27

Based on the following list of Generative AI system inputs and outputs and corresponding assertion values, generate hypotheses for why the assertions exhibit this correlation pattern. 
Hypotheses should describe very specific logical or semantic relationships between assertions that describe correlation patterns. 

For example, hypotheses might identify patterns such as: 
- The two assertions capture fundamentally different concepts
- One assertion is a logical negation, subset, or superset 

## 4.2: Run Diagnositic Prompt - OpenAI

In [28]:
import re

def parse_hypotheses_to_list(hypotheses_text):
    chunks = re.split(r'\bHypothesis\s*\d+\s*:\s*', hypotheses_text)
    
    parsed = [chunk.strip() for chunk in chunks if chunk.strip()]
    
    return parsed

In [29]:
dict_hypotheses = {case: [] for case in correlation_cases}

for i, case in enumerate(correlation_cases):
    print("Case: ", case)
    prompt = diagnostic_prompts[case]
    response = client.chat.completions.create(
        model="gpt-4o",  
        messages=[
            {"role": "system", "content": "You are an expert in analyzing evaluation criteria and interpreting statistical correlations in AI behavior assessments."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=1000,
    )
    hypotheses = response.choices[0].message.content.strip()
    hypotheses_list = parse_hypotheses_to_list(hypotheses)
    dict_hypotheses[case] = hypotheses_list
    
print(dict_hypotheses)


Case:  strong
Case:  weak
Case:  inverse
{'strong': ['The two assertions capture similar or redundant information about system behavior, as both focus on the accuracy and truthfulness of the summary in relation to the original article, leading to a high correlation.', 'Assertion 2 is a logical subset of Assertion 1. While Assertion 1 includes both consistency and truthfulness, Assertion 2 specifically highlights the absence of fabricated details, which is inherently a component of overall consistency.', 'The assertions tend to fail together under conditions where the AI system introduces any factual inaccuracies or makes up information, as any fabrication would inherently affect both consistency and the presence of fabricated details.', 'The strong correlation may be due to both assertions operating under similar thresholds, where even minor inconsistencies or fabrications are flagged, resulting in similar assertion values across different inputs.', 'The assertions are semantically ali

In [38]:
for case in correlation_cases:
    idx1, idx2 = assertion_pair_indices[case]
    a1_label, a2_label = assertion_labels[idx1], assertion_labels[idx2]

    a1_text = get_assertion_text(a1_label)
    a2_text = get_assertion_text(a2_label)

    hyps = dict_hypotheses.get(case, []) or []

    print(f"\n=== {case.upper()} ===")
    print(f"Old Assertion 1 ({a1_label}):  {a1_text}")
    print(f"Old Assertion 2 ({a2_label}):  {a2_text}\n")

    for i, h in enumerate(hyps[:5], start=1):
        print(f"Hypothesis {i}: {h}")


=== STRONG ===
Old Assertion 1 (Consistency - C4-A1):  Consistency measures whether the facts in the summary are consistent with the facts in the original article. COnsider whether the summary does reproduce all facts accurately and does not make up untrue information.
Old Assertion 2 (Consistency - C4-A2):  The summary includes no fabricated details or misrepresented facts compared to the original article.

Hypothesis 1: The two assertions capture similar or redundant information about system behavior, as both focus on the accuracy and truthfulness of the summary in relation to the original article, leading to a high correlation.
Hypothesis 2: Assertion 2 is a logical subset of Assertion 1. While Assertion 1 includes both consistency and truthfulness, Assertion 2 specifically highlights the absence of fabricated details, which is inherently a component of overall consistency.
Hypothesis 3: The assertions tend to fail together under conditions where the AI system introduces any factua

# 5: Refinement Prompt

## 5.1 Build Refinement Prompt

In [39]:
def build_combined_assertion_prompt_iter2(assertion_1_text,assertion_2_text,correlation_strength,correlation_value,p_value,diagnosed_issue,specific_hypothesis,example_assertions=None):
    prompt = f"""Your job is to help understand statistical relationships between two binary assertions used to evaluate AI system behavior. Based on a hypothesized relationship between these assertions, your task is to generate new assertions that resolve the identified issues. 

Assertion Details:
A1: {assertion_1_text}
A2: {assertion_2_text}

Correlation Analysis:
Correlation coefficient: {correlation_value:.3f}
Correlation strength: {correlation_strength}
Statistical significance: p = {p_value:.4g}

Diagnosed issue (general): {diagnosed_issue}
Specific hypothesis: {specific_hypothesis}

Generate five new assertions that combine A1 and A2 while preserving the main evaluation goals. Assertions should be binary statements that can be evaluated for each target system input-output pair."""
    
    if example_assertions:
        prompt += "\n\nFor example:"
        for ex in example_assertions:
            prompt += f"\n{ex}"

    prompt += "\n\nFormat your response as:\nAssertion: <assertion text>"

    return prompt

In [40]:
def build_refinement_prompt_comprehensiveness_iter2(assertion_1_text, assertion_2_text, correlation_strength, correlation_value, p_value, diagnosed_issue, specific_hypothesis, example_assertions=None):
    prompt = f"""Your job is to help understand statistical relationships between two binary assertions used to evaluate AI system behavior. Based on a hypothesized relationship between these assertions, your task is to generate new, **refined assertions** that better distinguish between different dimensions of system performance.

Assertion Details:
A1: {assertion_1_text}  
A2: {assertion_2_text}

Correlation Analysis:
Correlation coefficient: {correlation_value:.3f}  
Correlation strength: {correlation_strength}  
Statistical significance: p = {p_value:.4g}  

Diagnosed issue (general): {diagnosed_issue}
Specific hypothesis: {specific_hypothesis}

Generate five ideas for refined assertions that separate or clarify the evaluation criteria expressed in A1 and A2. These refinements should help evaluators more clearly distinguish **specific aspects** of output quality."""

    if example_assertions:
            prompt += "\n\nFor example:"
            for ex in example_assertions:
                prompt += f"\n{ex}"

    prompt += "\n\nFormat your response as:\nAssertion: <refined assertion text>"

    return prompt

In [None]:
### Manually selected from dict_hypotheses

selected_hypotheses = {
    "strong": "Assertion 2 is a subset of Assertion 1, as Assertion 1 requires overall factual consistency, which inherently includes the absence of fabricated details covered by Assertion 2, resulting in a strong correlation when both assertions are satisfied.",
    "weak": "The two assertions, A1 and A2, capture overlapping but distinct concepts, where A1 focuses on excluding irrelevant information, while A2 emphasizes the inclusion of necessary context, leading to scenarios where outputs can satisfy one without necessarily satisfying the other.",
    "inverse": "Assertion A1 is a logical negation of Assertion A2, as a sentence free from grammatical errors and awkward phrasing (A1) inherently cannot contain incomplete or structurally deficient elements (A2), creating an inverse relationship."
}

In [43]:
### Manually selected from matching selected_hypotheses to their best fit diagnosis 
### (refer to structural validity workflow design doc/ figma) 

selected_diagnoses = {
    "strong": "Each assertion captures redundant properties of system outputs.",
    "weak": "The assertions capture complementary aspects of the property.",
    "inverse": "One assertion is a logical negation of another."
}

In [44]:
example_assertions = [
    "Each sentence in the summary accurately conveys a complete thought without grammatical or structural errors.",
    "The summary only includes key information directly relevant to the article's main claims or events.",
    "Information in the summary is presented in a logically coherent order, with no contradictory or disjointed transitions."
]

In [45]:
refinement_prompts = {}

for case in correlation_cases:
    idx1, idx2 = assertion_pair_indices[case]
    a1_label, a2_label = assertion_labels[idx1], assertion_labels[idx2]
    
    a1_text = get_assertion_text(a1_label)
    a2_text = get_assertion_text(a2_label)

    r_val = correlation_values[case]
    _, p_val = pearsonr(avg_scores_across_models[:, idx1], avg_scores_across_models[:, idx2])

    diagnosis = selected_diagnoses[case]
    hypothesis = selected_hypotheses[case]
    
    prompt = build_refinement_prompt_comprehensiveness_iter2( ### change function here
        assertion_1_text=a1_text,
        assertion_2_text=a2_text,
        correlation_strength=case,
        correlation_value=corr,
        p_value=p_val,
        diagnosed_issue=diagnosis,
        specific_hypothesis=hypothesis,
        example_assertions=example_assertions
    )

    refinement_prompts[case] = prompt

print(refinement_prompts["inverse"])

Your job is to help understand statistical relationships between two binary assertions used to evaluate AI system behavior. Based on a hypothesized relationship between these assertions, your task is to generate new, **refined assertions** that better distinguish between different dimensions of system performance.

Assertion Details:
A1: Each sentence is free from grammatical errors and awkward phrasing.  
A2: Contains sentences that are incomplete or lack a clear subject-verb-object structure

Correlation Analysis:
Correlation coefficient: -0.567  
Correlation strength: inverse  
Statistical significance: p = 6.712e-27  

Diagnosed issue (general): One assertion is a logical negation of another.
Specific hypothesis: Assertion A1 is a logical negation of Assertion A2, as a sentence free from grammatical errors and awkward phrasing (A1) inherently cannot contain incomplete or structurally deficient elements (A2), creating an inverse relationship.

Generate five ideas for refined asserti

## 5.2 Run Refinement Prompts - OpenAI

In [46]:
def parse_assertions_to_list(text):
    chunks = re.split(r'\bAssertion\s*\d+\s*:\s*', text)
    
    parsed = [chunk.strip() for chunk in chunks if chunk.strip()]
    
    return parsed

In [49]:
dict_assertions = {case: [] for case in correlation_cases}

for case in correlation_cases:
    print("Case: ", case)

    idx1, idx2 = assertion_pair_indices[case]
    a1_label, a2_label = assertion_labels[idx1], assertion_labels[idx2]
    a1_text = get_assertion_text(a1_label)
    a2_text = get_assertion_text(a2_label)

    prompt = refinement_prompts[case]
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an expert in designing evaluation criteria for AI behavior analysis. Your job is to create new, precise, and binary assertions."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7,
        max_tokens=1000,
    )

    assertions_text = response.choices[0].message.content.strip()
    assertions_list = parse_assertions_to_list(assertions_text)
    dict_assertions[case] = assertions_list

Case:  strong
Case:  weak
Case:  inverse


In [50]:
for case in correlation_cases:
    print(f"\n=== {case.upper()} ===")
    print("Hypothesis: ", selected_hypotheses[case])
    print("Diagnosed Issue: ", selected_diagnoses[case])

    idx1, idx2 = assertion_pair_indices[case]
    a1_label, a2_label = assertion_labels[idx1], assertion_labels[idx2]
    a1_text = get_assertion_text(a1_label)
    a2_text = get_assertion_text(a2_label)

    print(f"Old Assertion 1 ({a1_label}): {a1_text}")
    print(f"Old Assertion 2 ({a2_label}): {a2_text}\n")

    for i, a in enumerate(dict_assertions.get(case, [])[:5], start=1):
        print(f"New Assertion {i}: {a}")


=== STRONG ===
Hypothesis:  Assertion 2 is a subset of Assertion 1, as Assertion 1 requires overall factual consistency, which inherently includes the absence of fabricated details covered by Assertion 2, resulting in a strong correlation when both assertions are satisfied.
Diagnosed Issue:  Each assertion captures redundant properties of system outputs.
Old Assertion 1 (Consistency - C4-A1): Consistency measures whether the facts in the summary are consistent with the facts in the original article. COnsider whether the summary does reproduce all facts accurately and does not make up untrue information.
Old Assertion 2 (Consistency - C4-A2): The summary includes no fabricated details or misrepresented facts compared to the original article.

New Assertion 1: Assertion: The summary accurately reflects the chronological order of events as presented in the original article, ensuring temporal consistency.

Assertion: The summary excludes any editorializing or subjective interpretations th

# Appendix A: Archived Prompts

In [None]:
# def build_hypothesis_prompt_iter1(df, input_col, output_col, 
#                              assertion_text_1, assertion_text_2,
#                              assertion_type_1, assertion_type_2,
#                              correlation_strength, correlation_value, p_value,
#                              assertion_scores, item_indices, *,
#                              include_categories=True):
    
#     prompt = f"""Your job is to help understand statistical relationships between two binary assertions used to evaluate AI system behavior.

# Assertion Details:
# * A1: {assertion_text_1}
# * A2: {assertion_text_2}
# * Assertion types: {assertion_type_1} and {assertion_type_2}

# Correlation Analysis:
# * Correlation coefficient: {correlation_value:.3f}
# * Correlation strength: {correlation_strength}
# * Statistical significance: p = {p_value:.4g}

# Based on the following list of Generative AI system inputs and outputs and corresponding assertion values, generate hypotheses for why the assertions exhibit this correlation pattern.
# """

#     # Add the 5 data items
#     for idx, row_idx in enumerate(item_indices, 1):
#         row = df.iloc[row_idx]
#         a1_score, a2_score = assertion_scores[row_idx]
#         prompt += f"""
# Item {idx}: 
# Input: {getattr(row, input_col)}
# Output: {getattr(row, output_col)}
# Assertion 1: {a1_score:.1f}
# Assertion 2: {a2_score:.1f}
# """

#     if include_categories:
#         prompt += """
# Instructions: Return 5 hypotheses for the observed correlation pattern, formatted as "Hypothesis {#}: {hypothesis statement}". Consider these categories when generating hypotheses:

# Conceptual Relationship Categories:
# * Conceptual Independence: The two assertions capture fundamentally different concepts
# * Logical Dependency: One assertion is a logical negation, subset, or superset of another
# * Information Overlap: The assertions capture similar or redundant information about system behavior

# Measurement Sensitivity Categories:
# * Threshold Differences: The assertions have different sensitivity thresholds for positive evaluation
# * Edge Case Sensitivity: One assertion is more sensitive to edge cases, outliers, or boundary conditions
# * Granularity Mismatch: The assertions operate at different levels of detail or specificity

# Input-Output Relationship Categories:
# * Input vs. Output Focus: One assertion primarily evaluates input characteristics while the other focuses on output quality
# * Context Dependency: The assertions respond differently to specific input contexts or domains
# * Temporal Sensitivity: One assertion is more sensitive to sequence, timing, or order effects

# System Behavior Categories:
# * Performance Trade-offs: The assertions represent competing objectives or trade-offs in system performance
# * Failure Mode Correlation: The assertions tend to fail together or in opposing patterns under certain conditions

# Example hypotheses:
# * "The two assertions capture different concepts with minimal overlap"
# * "One assertion is a logical negation of another, creating inverse correlation"
# * "Assertion 1 is more sensitive to edge cases than Assertion 2"
# * "The assertions have different thresholds, with A1 being more restrictive than A2"
# """
#     else:
#         prompt += """
# Instructions: Return 5 hypotheses for the observed correlation pattern, formatted as "Hypothesis {#}: {hypothesis statement}".
# """

#     return prompt

In [None]:
# def build_combined_assertion_prompt_iter1(assertion_1_text,assertion_2_text,correlation_strength,correlation_value,p_value,diagnosed_issue,example_assertions=None):
#     prompt = f"""Your job is to help understand statistical relationships between two binary assertions used to evaluate AI system behavior. Based on a hypothesized relationship between these assertions, your task is to generate new assertions that resolve the identified issues. 

# Assertion Details:
# A1: {assertion_1_text}
# A2: {assertion_2_text}

# Correlation Analysis:
# Correlation coefficient: {correlation_value:.3f}
# Correlation strength: {correlation_strength}
# Statistical significance: p = {p_value:.4g}
# Diagnosed issue: {diagnosed_issue}

# Generate five new assertions that combine A1 and A2 while preserving the main evaluation goals. Assertions should be binary statements that can be evaluated for each target system input-output pair."""
    
#     if example_assertions:
#         prompt += "\n\nFor example:"
#         for ex in example_assertions:
#             prompt += f"\n{ex}"

#     prompt += "\n\nFormat your response as:\nAssertion: <assertion text>"

#     return prompt

In [None]:
# def build_refinement_prompt_completeness_iter1(assertion_1_text, assertion_2_text, correlation_strength, correlation_value, p_value, diagnosed_issue, example_assertions=None):
#     prompt = f"""Your job is to help understand statistical relationships between two binary assertions used to evaluate AI system behavior. Based on a hypothesized relationship between these assertions, your task is to generate new, **refined assertions** that better distinguish between different dimensions of system performance.

# Assertion Details:
# A1: {assertion_1_text}  
# A2: {assertion_2_text}

# Correlation Analysis:
# Correlation coefficient: {correlation_value:.3f}  
# Correlation strength: {correlation_strength}  
# Statistical significance: p = {p_value:.4g}  
# Diagnosed issue: {diagnosed_issue}

# Generate five ideas for refined assertions that separate or clarify the evaluation criteria expressed in A1 and A2. These refinements should help evaluators more clearly distinguish **specific aspects** of output quality."""

#     if example_assertions:
#             prompt += "\n\nFor example:"
#             for ex in example_assertions:
#                 prompt += f"\n{ex}"

#     prompt += "\n\nFormat your response as:\nAssertion: <refined assertion text>"

#     return prompt