# LLM-based mPURIFY

## LLM-as-a-judge 

In [None]:
#!/usr/bin/env python3
import os
import json
import time
import decimal
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from openai import AzureOpenAI
from tqdm import tqdm
import threading
import traceback
import tempfile
import shutil

 # ── GLOBAL CONFIG ──────────────────────────────────────────────────────────────
# Root of all your data files. The relative 'source_dir' and 'json_dir' values
# read from each metadata row are joined onto this directory in process_row.
ROOT_DIR = r"C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data"

# used to guard writes to df_meta and global_used
# (df_lock is also held by load_source_csv_safe while it fills the CSV cache)
df_lock = threading.Lock()
# serialises the read-modify-write of the cost ledger in save_cost_record
cost_lock = threading.Lock()

# cache for source CSVs so we don't reload them per row
# (keyed by the full CSV path; populated by load_source_csv_safe)
_source_csv_cache: dict[str, pd.DataFrame] = {}

# ── HELPER: extract first JSON block or fallback ───────────────────────────────
def _scan_balanced(raw: str, start: int, open_c: str, close_c: str,
                   skip_strings: bool) -> str:
    """Return the balanced span that opens at raw[start], or "" if it never closes.

    When skip_strings is true, delimiters that occur inside double-quoted
    string literals (honouring backslash escapes) are not counted.
    """
    depth = 0
    in_string = False
    escaped = False
    for i in range(start, len(raw)):
        ch = raw[i]
        if in_string:
            if escaped:
                escaped = False          # this char was escaped; consume it
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if skip_strings and ch == '"':
            in_string = True
        elif ch == open_c:
            depth += 1
        elif ch == close_c:
            depth -= 1
            if depth == 0:
                return raw[start:i + 1]
    return ""  # malformed/unclosed


def extract_json_block(raw: str) -> str:
    """Extract the first balanced JSON object or array embedded in raw.

    The block starts at whichever of '{' or '[' appears first and ends at its
    matching closer. Delimiters inside JSON string literals are ignored, so a
    value such as "}" no longer truncates the block. If the string-aware scan
    finds no balanced block (e.g. malformed text with a stray quote), the
    plain delimiter count is used as a fallback so such input still yields a
    best-effort snippet.

    Args:
        raw: Arbitrary text that may contain a JSON object/array.

    Returns:
        The extracted substring, or "" when there is nothing JSON-like or the
        block is never closed.
    """
    # find first object or array
    idx_obj = raw.find('{')
    idx_arr = raw.find('[')
    if idx_obj == -1 and idx_arr == -1:
        return ""  # nothing JSON-like at all

    # choose earliest brace/bracket
    if idx_obj == -1 or 0 <= idx_arr < idx_obj:
        start, open_c, close_c = idx_arr, '[', ']'
    else:
        start, open_c, close_c = idx_obj, '{', '}'

    return (
        _scan_balanced(raw, start, open_c, close_c, skip_strings=True)
        or _scan_balanced(raw, start, open_c, close_c, skip_strings=False)
    )


# ── RATE LIMIT CONFIG ─────────────────────────────────────────────────────────
# Maximum number of API calls per minute and the resulting minimum spacing
# (in seconds) between two consecutive calls.
RATE_LIMIT_PER_MINUTE = 358
MIN_INTERVAL = 60.0 / RATE_LIMIT_PER_MINUTE
# time.time() value of the most recently granted call slot; starts one interval
# in the past so the very first call does not wait.
_last_api_call = time.time() - MIN_INTERVAL
# Guards _last_api_call so concurrent workers cannot claim the same slot.
_rate_limit_lock = threading.Lock()

def _acquire_rate_limit():
    """Block until the caller may issue the next API call.

    Each caller reserves the next free slot (at least MIN_INTERVAL after the
    previously reserved one) while holding the lock, then sleeps outside the
    lock until that slot arrives. Without the lock, several threads could read
    the same _last_api_call and all proceed at once, exceeding the limit.
    """
    global _last_api_call
    with _rate_limit_lock:
        now = time.time()
        slot = max(now, _last_api_call + MIN_INTERVAL)
        _last_api_call = slot
    wait = slot - now
    if wait > 0:
        time.sleep(wait)

# ── AZURE OPENAI CLIENT SETUP ─────────────────────────────────────────────────
# Module-level AzureOpenAI client.
# NOTE(review): api_version, azure_endpoint and api_key are blank placeholders
# here (presumably redacted); they need real values before any request can
# succeed. Consider reading them from environment variables rather than
# hard-coding secrets in the source.
client = AzureOpenAI(
    api_version="",
    azure_endpoint=" ",
    api_key=""
)

# ── COST CONFIG (OPTIONAL) ─────────────────────────────────────────────────────
# Per-1K prices for input and output, stored as Decimal values.
# NOTE(review): presumably a currency amount per 1,000 tokens — confirm the
# unit and currency against the deployed model's pricing.
COST_PER_1K_INPUT  = decimal.Decimal("0.002")
COST_PER_1K_OUTPUT = decimal.Decimal("0.008")

# ── CHARACTERISTICS MAPPING FOR FAKE NEWS ───────────────────────────────────────
# Maps a tactic key (string "1".."36") to its display name. build_prompt looks
# up the first two technique keys of a fake_news row here and inserts the names
# into FAKE_PROMPT; unknown keys fall back to a generic label there.
characteristics = {
    "1":  "Sensational Appeal",
    "2":  "Emotionally Charged",
    "3":  "Psychologically Manipulative",
    "4":  "Misleading Statistics",
    "5":  "Fabricated Evidence",
    "6":  "Source Masking & Fake Credibility",
    "7":  "Source Obfuscation",
    "8":  "Targeted Audiences and Polarization",
    "9":  "Highly Shareable & Virality-Oriented",
    "10": "Weaponized for Political, Financial, or Social Gains",
    "11": "Simplistic, Polarizing Narratives",
    "12": "Conspiracy Framing",
    "13": "Exploits Cognitive Biases",
    "14": "Impersonation",
    "15": "Narrative Coherence Over Factual Accuracy",
    "16": "Malicious Contextual Reframing",
    "17": "False Attribution & Deceptive Endorsements",
    "18": "Exploitation of Trust in Authorities",
    "19": "Data Voids & Information Vacuum Exploitation",
    "20": "False Dichotomies & Whataboutism",
    "21": "Pseudoscience & Junk Science",
    "22": "Black Propaganda & False Flags",
    "23": "Censorship Framing & Fake Persecution",
    "24": "Astroturfing",
    "25": "Gaslighting",
    "26": "Hate Speech & Incitement",
    "27": "Information Overload & Fatigue",
    "28": "Jamming & Keyword Hijacking",
    "29": "Malinformation",
    "30": "Narrative Laundering",
    "31": "Obfuscation & Intentional Vagueness",
    "32": "Panic Mongering",
    "33": "Quoting Out of Context",
    "34": "Rumor Bombs",
    "35": "Scapegoating",
    "36": "Trolling & Provocation"
}

# ── TECHNIQUES & DEGREE FOR REAL NEWS ────────────────────────────────────────────
# Maps a real-news technique key ("0", "1", "2") to the description that
# build_prompt substitutes for the {tech} placeholder of REAL_PROMPT.
# The description text is sent to the model verbatim.
real_technique = {
    "2":"it rewrite text comprehensively paraphrasing and use of natural language refinement, altering wording and sentence structures",
    "1":"it polish the original content, refining clarity, flow, and readability without significantly altering the structure or factual content",
    "0":"it perform minor content editing of the original text to improve quality, correcting inaccuracies, grammar, and enhance readability"
}

# ── DEGREE MAPPINGS ────────────────────────────────────────────────────────────
# Human-readable labels for the degree keys of fake-news rows.
# NOTE(review): neither map is referenced by build_prompt in the visible part
# of the file — confirm whether they are still used elsewhere.
fake_degree_map = {
    "minor":"Inconspicuous",
    "medium":"Moderate",
    "critical":"Alarming"
}
# Human-readable labels for the degree keys of real-news rows.
real_degree_map = {
    "light":    "light change (10–20%) changes",
    "moderate": "moderate change (30–50%) changes",
    "complete": "complete change (100%)"
}

# ── PROMPT TEMPLATES ───────────────────────────────────────────────────────────
# FAKE_PROMPT is a str.format template for evaluating fake-news generations.
# Literal JSON braces are doubled ({{ / }}); the real placeholders are
# {article}, {json_chain}, {tactic1} and {tactic2} (filled by build_prompt).
# The rendered template is valid JSON as long as the substituted values are
# JSON-escaped: the Topic Match "options" entry used to be the unquoted
# fragment  label: ["matched", "mismatched"]  which made the whole prompt
# unparseable while instructing the model to answer in strict JSON.
FAKE_PROMPT = '''{{
  "GREEN_CoIAgenticF3": {{
    "description": "You are a master evaluator. Provided with a new article and Json generation Chains. Your task is to evaluate for consistency, change validity, translation quality, and manipulation detection based on comparison of the original text and specific chain outputs. Ensure all output is strict JSON with no extra commentary. Use single quotes in generated text to avoid parser issues.",
    "EvaluationChains": [
      {{
        "chain": "Chain [1] – Consistency Evaluator",
        "role": " You are a Evaluator/Consistency expert: Evaluate how consistent the generated content in ChainOutputs Chain [6] is with the Original Article in terms of factual correctness, logical flow, preserved meaning, tone, topic, and sentiment.",
        "guidelines": "Provide a 1–5 rating for each dimension unless otherwise noted. Match each evaluation against the Original Article versus Chain [6]. Penalize where misalignment or contradiction is introduced.",
        "evaluation": {{
          "Factual Consistency": {{
            "definition": "Accuracy of facts and details as stated in the original article.",
            "task": "Rate factual accuracy between Original Article and Chain [6]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label: [inconsistent, partially consistent, consistent]",
            "score": "",
            "label": "_____",
            "justification": ""}},
          "Logical Consistency": {{
            "definition": "Absence of contradictions and maintenance of logical structure.",
            "task": "Rate logical alignment and absence of contradiction between Original Article and Chain [6]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree) ; label: [inconsistent, partially consistent, consistent]",
            "score": "",
            "label": "_____",
            "justification": ""}},
          "Semantic Consistency": {{
            "definition": "Preservation of key meaning and intent.",
            "task": "Rate preservation of key meaning between Original Article and Chain [6]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label: [inconsistent, partially consistent, consistent]",
            "score": "",
            "label": "_____",
            "justification": ""}},
          "Contextual Consistency": {{
            "definition": "Alignment with the broader context and tone of the original.",
            "task": "Rate consistency of broader context and tone between Original Article and Chain [6]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label: [inconsistent, partially consistent, consistent]",
            "score": "",
            "label": "_____",
            "justification": ""}},
          "Topic Match": {{
            "definition": "Agreement of main topic in both texts; must be concise (1–2 words only).",
            "task": "Identify and compare main topic labels from Original Article and Chain [6]. Indicate matched verdict if they match or mismatched for diverge.",
            "options": "label: [matched, mismatched]",
            "original_topic_label": "_____ or _____  _____",
            "LLM_topic_label": "_____ or _____ _____",
            "verdict": "_____",
            "justification": ""
          }},
          "Sentiment Match": {{
            "definition": "Emotional alignment between the two versions.",
            "task": "Select sentiment polarity of Original Article and Chain [6] and compare alignment. Indicate matched verdict if they match or mismatched for diverge.",
            "options": ["positive", "neutral", "negative"],
            "original_sentiment_label": "_____",
            "LLM_sentiment_label": "_____",
            "verdict": "_____",
            "justification": ""
          }}
        }}
      }},
      {{
        "chain": "Chain [2] – Change Validity Evaluator",
        "role": "Evaluator/Change Validity Agent: Determine whether all changes logged earlier (Chain [3]) and infused tactics were correctly reflected in Chain [6] compared to the Original Article.",
        "guidelines": "All comparisons should be between the Original Article and Chain [6] with reference to changes identified in Chain [3].",
        "evaluation": {{
          "Change Validity": {{
            "definition": "Whether the documented changes were accurately applied.",
            "task": "Rate if the changes listed in chain  [3] were applied accurately and fully in Chain [6]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label: [inaccurate, partially accurate, fully accurate]",
            "score": "",
            "label": "_____",
            "justification": ""
          }},
          "Degree of modification": {{
            "definition": "Extent of deviation from the original: Inconspicuous (minor), Moderate (medium), Alarming (critical).",
            "task": "Rate the overall extent of modification in Chain [6] relative to the Original Article",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree);label:[Inconspicuous (minor), Moderate (medium), Alarming (critical)]",
            "score": "",
            "label": "_____",
            "justification": ""
          }},
          "Edit Validation": {{
            "definition": "Contextual accuracy of individual edits.",
            "task": "Rate if individual edits in Chain [6] are contextually correct",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label:[inaccuracy, partially accuracy, fully accuracy]",
            "score": "",
            "label": "_____",
            "justification": ""
          }},
          "Technique Confirmation": {{
            "definition": "Presence of specific manipulation tactics requested by prompt including.",
            "task": "Rate whether Chain [6] includes specific rhetorical or stylistic tactics that were instructed. Also, indicate if (1) {tactic1} and (2) {tactic2} are present",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label:[one, both, none]",
            "score": "",
            "label": "",
            "justification": ""
          }}
        }}
      }},
      {{
        "chain": "Chain [3] – Translation Quality Evaluator",
        "role": "Evaluator/Translation Quality Agent: Compare the original English content in Chain [6] with its translated output in Chain [7], assessing quality and detecting correct language code.",
        "guidelines": "Base all ratings on comparison of Chain [6] (English) and Chain [7] (Translated). Penalize inaccurate or overly literal translations.",
        "evaluation": {{
          "Accurate Translation": {{
            "definition": "How precisely the meaning is retained.",
            "task": "Rate whether Chain [7] faithfully captures the meaning in Chain [6]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Fluency": {{
            "definition": "Grammatical and stylistic readability.",
            "task": "Rate fluency and readability of Chain [7]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Terminology Appropriateness": {{
            "definition": "Use of accurate domain-specific vocabulary.",
            "task": "Rate use of correct specialized terms in Chain [7]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Localization and Cultural Relevance": {{
            "definition": "Cultural sensitivity and idiomatic appropriateness.",
            "task": "Rate cultural appropriateness of Chain [7] for its language audience",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Coherence": {{
            "definition": "Logical structure and flow.",
            "task": "Rate logical structure and cohesion of Chain [7]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Semantic Quality": {{
            "definition": "Subtle meaning alignment and nuance retention.",
            "task": "Rate nuanced meaning retention in Chain [7] from Chain [6]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Language Identification": {{
            "definition": "Correct identification of language(s) from Chain [7].",
            "task": "Detect and enter ISO 639-3 code(s) for the language(s) in Chain [7]. If two or more languages are detected, return all as a list.",
            "language code": [],
            "justification": ""
          }}
        }}
      }},
      {{
        "chain": "Chain [4] – Manipulation Evaluator",
        "role": "Evaluator/Manipulation Agent: Detect signs of content manipulation, exaggeration, or distortion introduced in Chain [6] that were not present in the Original Article.",
        "guidelines": "Compare Chain [6] directly with the Original Article to flag and rate manipulation severity. Penalize if any fabricated or altered facts are inserted.",
        "evaluation": {{
          "Manipulation Detection": {{
            "definition": "Fabrication or distortion of original meaning, tone, or intent.",
            "task": "Rate manipulation between Original Article and Chain [6]",
            "options": "(1 = No Manipulation, 5 = Extensive Manipulation); label: [no manipulation, partial manipulation, full manipulation]",
            "score": "",
            "label": "_____",
            "patterns_found": [],
            "justification": ""
          }}
        }}
      }}
    ],
    "Original Article": "{article}",
    "ChainOutputs": "{json_chain}"
  }}
}}'''

# REAL_PROMPT is a str.format template for evaluating real-news generations.
# Literal JSON braces are doubled ({{ / }}); the real placeholders are
# {article}, {json_chain} and {tech} (filled by build_prompt).
# The Topic Match "options" entry is a quoted string so that the rendered
# template is valid JSON (it used to be the unquoted fragment
# label: ["matched", "mismatched"], which broke strict-JSON parsing).
REAL_PROMPT = '''{{
  "GREEN_CoIAgenticF3": {{
    "description": "You are a master evaluator. Provided with a new article and Json generation Chains. Your task is to evaluate for consistency, change validity, translation quality, and manipulation detection based on comparison of the original text and specific chain outputs. Ensure all output is strict JSON with no extra commentary. Use single quotes in generated text to avoid parser issues.",
    "EvaluationChains": [
      {{
        "chain": "Chain [1] – Consistency Evaluator",
        "role": " You are a Evaluator/Consistency expert: Evaluate how consistent the generated content in ChainOutputs Chain [4] is with the Original Article in terms of factual correctness, logical flow, preserved meaning, tone, topic, and sentiment.",
        "guidelines": "Provide a 1–5 rating for each dimension unless otherwise noted. Match each evaluation against the Original Article versus Chain [4]. Penalize where misalignment or contradiction is introduced.",
        "evaluation": {{
          "Factual Consistency": {{
            "definition": "Accuracy of facts and details as stated in the original article.",
            "task": "Rate factual accuracy between Original Article and Chain [4]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label: [inconsistent, partially consistent, consistent]",
            "score": "",
            "label": "_____",
            "justification": ""}},
          "Logical Consistency": {{
            "definition": "Absence of contradictions and maintenance of logical structure.",
            "task": "Rate logical alignment and absence of contradiction between Original Article and Chain [4]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label: [inconsistent, partially consistent, consistent]",
            "score": "",
            "label": "_____",
            "justification": ""}},
          "Semantic Consistency": {{
            "definition": "Preservation of key meaning and intent.",
            "task": "Rate preservation of key meaning between Original Article and Chain [4]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label: [inconsistent, partially consistent, consistent]",
            "score": "",
            "label": "_____",
            "justification": ""}},
          "Contextual Consistency": {{
            "definition": "Alignment with the broader context and tone of the original.",
            "task": "Rate consistency of broader context and tone between Original Article and Chain [4]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label: [inconsistent, partially consistent, consistent]",
            "score": "",
            "label": "_____",
            "justification": ""}},
          "Topic Match": {{
            "definition": "Agreement of main topic in both texts; must be concise (1–2 words only).",
            "task": "Identify and compare main topic labels from Original Article and Chain [4]. Indicate matched verdict if they match or mismatched for diverge.",
            "options": "label: [matched, mismatched]",
            "original_topic_label": "_____ or _____  _____",
            "LLM_topic_label": "_____ or _____ _____",
            "verdict": "_____",
            "justification": ""
          }},
          "Sentiment Match": {{
            "definition": "Emotional alignment between the two versions.",
            "task": "Select sentiment polarity of Original Article and Chain [4] and compare alignment. Indicate matched verdict if they match or mismatched for diverge.",
            "options": ["positive", "neutral", "negative"],
            "original_sentiment_label": "_____",
            "LLM_sentiment_label": "_____",
            "verdict": "_____",
            "justification": ""
          }}
        }}
      }},
      {{
        "chain": "Chain [2] – Change Validity Evaluator",
        "role": "Evaluator/Change Validity Agent: Determine whether all changes logged earlier (Chain [3]) and infused tactics were correctly reflected in Chain [4] compared to the Original Article.",
        "guidelines": "All comparisons should be between the Original Article and Chain [4] with reference to changes identified in Chain [3].",
        "evaluation": {{
          "Change Validity": {{
            "definition": "Whether the documented changes were accurately applied.",
            "task": "Rate if the changes listed in chain  [3] were applied accurately and fully in Chain [4]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Degree of modification": {{
            "definition": "Extent of deviation from the original real news version.",
            "task": "Rate the overall extent of modification in Chain [4] relative to the Original Article",
            "options": "label: [light = light change (10–20%), moderate = moderate change (30–50%), complete = complete change (100%)]",
            "score": "",
            "label": "_____",
            "justification": ""
          }},
          "Edit Validation": {{
            "definition": "Contextual accuracy of individual edits.",
            "task": "Rate if individual edits in Chain [4] are contextually correct",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label: [inaccuracy, partially accuracy, fully accuracy]",
            "score": "",
            "label": "_____",
            "justification": ""
          }},
          "Technique Confirmation": {{
            "definition": "Presence of specific editing techniques as instructed (e.g., polish, rewrite, simplify).",
            "task": "Rate whether Chain [4] includes the requested real news editing technique. Also, indicate if (1) {tech} was done.",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree); label: [not-done, partially done, fully done]",
            "score": "",
            "label": "_____",
            "justification": ""
          }}
        }}
      }},
      {{
        "chain": "Chain [3] – Translation Quality Evaluator",
        "role": "Evaluator/Translation Quality Agent: Compare the original English content in Chain [4] with its translated output in Chain [5], assessing quality and detecting correct language code.",
        "guidelines": "Base all ratings on comparison of Chain [4] (English) and Chain [5] (Translated). Penalize inaccurate or overly literal translations.",
        "evaluation": {{
          "Accurate Translation": {{
            "definition": "How precisely the meaning is retained.",
            "task": "Rate whether Chain [5] faithfully captures the meaning in Chain [4]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Fluency": {{
            "definition": "Grammatical and stylistic readability.",
            "task": "Rate fluency and readability of Chain [5]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Terminology Appropriateness": {{
            "definition": "Use of accurate domain-specific vocabulary.",
            "task": "Rate use of correct specialized terms in Chain [5]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Localization and Cultural Relevance": {{
            "definition": "Cultural sensitivity and idiomatic appropriateness.",
            "task": "Rate cultural appropriateness of Chain [5] for its language audience",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Coherence": {{
            "definition": "Logical structure and flow.",
            "task": "Rate logical structure and cohesion of Chain [5]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Semantic Quality": {{
            "definition": "Subtle meaning alignment and nuance retention.",
            "task": "Rate nuanced meaning retention in Chain [5] from Chain [4]",
            "options": "(1 = Strongly Disagree, 5 = Strongly Agree)",
            "score": "",
            "justification": ""
          }},
          "Language Identification": {{
            "definition": "Correct identification of language(s) from Chain [5].",
            "task": "Detect and enter ISO 639-3 code(s) for the language(s) in Chain [5]. If two or more languages are detected, return all as a list.",
            "language code": [],
            "justification": ""
          }}
        }}
      }},
      {{
        "chain": "Chain [4] – Manipulation Evaluator",
        "role": "Evaluator/Manipulation Agent: Detect signs of content manipulation, exaggeration, or distortion introduced in Chain [4] that were not present in the Original Article.",
        "guidelines": "Compare Chain [4] directly with the Original Article to flag and rate manipulation severity. Penalize if any fabricated or altered facts are inserted.",
        "evaluation": {{
          "Manipulation Detection": {{
            "definition": "Fabrication or distortion of original meaning, tone, or intent.",
            "task": "Rate manipulation between Original Article and Chain [4]",
            "options": "(1 = No Manipulation, 5 = Extensive Manipulation); label: [no manipulation, partial manipulation, full manipulation]",
            "score": "",
            "label": "_____",
            "patterns_found": [],
            "justification": ""
          }}
        }}
      }}
    ],
    "Original Article": "{article}",
    "ChainOutputs": "{json_chain}"
  }}
}}'''

# ── PROMPT BUILDER ─────────────────────────────────────────────────────────────
def build_prompt(article, json_chain, veracity, tactics, degree_key):
    """Render the evaluation prompt for one article.

    Args:
        article: Original article text substituted into the template.
        json_chain: Generation-chain text substituted into the template.
        veracity: 'fake_news' or 'real_news'; selects the template.
        tactics: List of technique-key fragments, possibly still carrying
            list brackets (e.g. ["[8", " 19]"]).
        degree_key: Accepted for the caller's convenience; not used here.

    Returns:
        The formatted FAKE_PROMPT or REAL_PROMPT string.

    Raises:
        ValueError: for an unknown veracity, fewer than two tactics for
            fake news, or no technique for real news.
    """
    # Re-join the fragments, drop the outer brackets/spaces, then split again
    # so every key comes out clean.
    joined = ",".join(tactics).strip("[] ")
    keys = [piece.strip() for piece in joined.split(",") if piece.strip()]

    if veracity == 'fake_news':
        if len(keys) < 2:
            raise ValueError(f"fake_news requires 2 tactics, got: {keys!r}")
        first_key, second_key = keys[:2]
        return FAKE_PROMPT.format(
            article=article,
            json_chain=json_chain,
            tactic1=characteristics.get(first_key, "tactic 1"),
            tactic2=characteristics.get(second_key, "tactic 2"),
        )

    if veracity == 'real_news':
        if not keys:
            raise ValueError(f"real_news requires at least 1 technique, got: {keys!r}")
        return REAL_PROMPT.format(
            article=article,
            json_chain=json_chain,
            tech=real_technique.get(keys[0], "text modification"),
        )

    raise ValueError(f"Unknown veracity: {veracity!r}")


# ── MAIN OUTPUT & STATE TRACKING ───────────────────────────────────────────────
# Directory (relative to the current working directory) that receives all
# evaluation outputs; created at import time if it does not exist yet.
main_output_dir = "./evaluations"
os.makedirs(main_output_dir, exist_ok=True)
# JSON list of already-handled UUIDs; process_row returns early for any UUID
# found in global_used.
used_file = os.path.join(main_output_dir, "global_used_uuids.json")
if os.path.exists(used_file):
    # Resume: load the UUIDs recorded by a previous run.
    with open(used_file, 'r', encoding='utf-8') as f:
        global_used = set(json.load(f))
else:
    # First run: start empty and create the file with an empty list.
    global_used = set()
    with open(used_file, 'w', encoding='utf-8') as f:
        json.dump([], f)

# ── LOAD METADATA ──────────────────────────────────────────────────────────────
# Both metadata CSVs live directly under ROOT_DIR. Deriving the paths from
# ROOT_DIR keeps them in sync with the data root and replaces hand-escaped
# literals that mixed "\\" with the invalid escape sequences "\O" and "\J"
# (deprecated by Python and slated to become a syntax error). On Windows the
# resulting path strings are identical to the previous literals.
metadata_path = os.path.join(ROOT_DIR, "metadata_final_clean.csv")
metadata_with_eval_path = os.path.join(ROOT_DIR, "metadata_final_clean_x_LLM_mPURIFY_eval.csv")

# uuid is forced to str so it compares equal to the string UUIDs used elsewhere.
df_meta = pd.read_csv(metadata_path, dtype={'uuid': str})
# Ensure the column that records where each evaluation was written exists.
if 'evaluation_path' not in df_meta.columns:
    df_meta['evaluation_path'] = None

# ── PROCESS & SAVE ─────────────────────────────────────────────────────────────
def save_output(uuid, eval_type, content):
    """Write one evaluation result as pretty-printed UTF-8 JSON.

    The file is stored as
    <main_output_dir>/<eval_type>_eval/<uuid>_<eval_type>eval.json and the
    sub-directory is created on demand.

    Args:
        uuid: Identifier used in the file name.
        eval_type: Evaluation category; names the sub-directory and file.
        content: JSON-serialisable object to store.

    Returns:
        The path of the file that was written.
    """
    target_dir = os.path.join(main_output_dir, f"{eval_type}_eval")
    os.makedirs(target_dir, exist_ok=True)

    out_path = os.path.join(target_dir, f"{uuid}_{eval_type}eval.json")
    with open(out_path, 'w', encoding='utf-8') as handle:
        json.dump(content, handle, indent=2, ensure_ascii=False)

    return out_path

# ── IMPROVED COST LEDGER WITH ATOMIC WRITES ──
# Single JSON ledger of the form {"total_cost": <number>, "calls": [<record>, ...]}.
cost_file = os.path.join(main_output_dir, "overall_costs.json")

def save_cost_record(record: dict):
    """
    Append a single call's cost record to the ledger.

    The ledger is re-read, updated and rewritten under cost_lock. The new
    content is first written to a temporary file in the same directory and
    then swapped in with os.replace, so readers never observe a half-written
    ledger (shutil.move is not an atomic overwrite on Windows, where it falls
    back to copy + delete when the destination already exists).

    Args:
        record: Cost record for one call; its "cost" entry (default 0.0) is
            added to the running total.

    On a JSON decode error or an OS-level I/O error the operation is retried
    up to three times; after the last failure the record is written to a
    timestamped cost_backup_*.json file instead of being lost.
    """
    with cost_lock:
        max_retries = 3
        for attempt in range(max_retries):
            tmp_name = None  # set while a temp file exists that we must clean up
            try:
                # Try to load existing data
                if os.path.exists(cost_file):
                    with open(cost_file, "r", encoding="utf-8") as f:
                        data = json.load(f)
                else:
                    data = {"total_cost": 0.0, "calls": []}

                # Add new record
                data["calls"].append(record)
                data["total_cost"] += record.get("cost", 0.0)

                # Write to a temp file next to the ledger. The encoding must be
                # explicit: the ledger is read back as UTF-8 and is dumped with
                # ensure_ascii=False, which a locale default such as cp1252
                # cannot always encode.
                with tempfile.NamedTemporaryFile(mode='w', delete=False,
                                                 dir=main_output_dir,
                                                 suffix='.tmp',
                                                 encoding='utf-8') as tmp_f:
                    tmp_name = tmp_f.name
                    json.dump(data, tmp_f, indent=2, ensure_ascii=False)

                # Atomic swap (overwrites an existing ledger on every platform)
                os.replace(tmp_name, cost_file)
                tmp_name = None  # the temp file has become the ledger
                return  # Success

            except (json.JSONDecodeError, OSError) as e:
                print(f"[WARN] Cost file error (attempt {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    print(f"[ERROR] Failed to save cost record after {max_retries} attempts")
                    # Create backup record
                    backup_file = os.path.join(main_output_dir, f"cost_backup_{int(time.time())}.json")
                    with open(backup_file, 'w', encoding='utf-8') as f:
                        json.dump([record], f, indent=2)
                else:
                    time.sleep(0.1)  # Brief delay before retry

            finally:
                # Never leave an orphaned temp file behind after a failure.
                if tmp_name is not None:
                    try:
                        os.remove(tmp_name)
                    except OSError:
                        pass


def load_source_csv_safe(src_csv_path):
    """Return the DataFrame for src_csv_path, reading it at most once.

    The lookup and the load both happen while holding df_lock. A successful
    read (with the 'uuid' column forced to str) is stored in
    _source_csv_cache; a failed read is reported, is not cached, and yields
    None.
    """
    with df_lock:
        frame = _source_csv_cache.get(src_csv_path)
        if frame is None:
            try:
                frame = pd.read_csv(src_csv_path, dtype={'uuid': str})
            except Exception as e:
                print(f"[ERROR] Failed to load CSV {src_csv_path}: {e}")
                return None
            _source_csv_cache[src_csv_path] = frame
        return frame

# ── PROCESS EACH ROW WITH BETTER ERROR HANDLING ───────────────────────────────────────────────────────────
def process_row(idx, row):
    """Evaluate one metadata row with the LLM judge and checkpoint the result.

    Steps, in order: look up the article text in the row's source CSV, read
    the row's JSON chain file, build the prompt, call the chat-completions
    API, record the token cost, persist the evaluation via ``save_output``,
    then (under ``df_lock``) store the output path in ``df_meta``, add the
    uuid to ``global_used`` and rewrite both checkpoint files.

    Every failure is reported with ``print`` and ends processing of the row;
    no exception propagates to the caller. A row that fails before
    ``save_output`` returns is not added to ``global_used``.

    Args:
        idx: Index label of the row in ``df_meta`` (used with ``df_meta.at``).
        row: Row providing ``uuid``, ``source_dir``, ``json_dir``,
            ``technique_keys``, ``degree`` and ``veracity``.

    Returns:
        None.
    """
    global df_meta, global_used

    uuid = str(row['uuid'])
    if uuid in global_used:
        return

    try:
        # Load article from source CSV with thread safety. Leading path
        # separators are stripped before joining onto ROOT_DIR.
        src_rel = row['source_dir'].lstrip("/\\")
        src_csv_path = os.path.join(ROOT_DIR, src_rel)
        df_src = load_source_csv_safe(src_csv_path)
        if df_src is None:
            return

        try:
            content = df_src.loc[df_src['uuid'] == uuid, 'content'].iloc[0]
        except IndexError:
            print(f"[ERROR] UUID {uuid} not found in source CSV {src_csv_path}")
            return
        if not isinstance(content, str):
            # e.g. NaN for an empty cell; there is no text to evaluate.
            print(f"[ERROR] Article content for {uuid} in {src_csv_path} is not text: {content!r}")
            return
        article = content.replace('"', '\\"')

        # Load JSON chain with better error handling
        json_rel = row['json_dir'].lstrip("/\\")
        json_path = os.path.join(ROOT_DIR, json_rel)
        try:
            with open(json_path, 'r', encoding='utf-8-sig') as f:
                raw = f.read()
        except FileNotFoundError:
            print(f"[ERROR] JSON file not found at {json_path}")
            return
        except Exception as e:
            print(f"[ERROR] Failed to read JSON file {json_path}: {e}")
            return

        # Prefer a re-serialised (normalised) JSON chain; degrade to the raw
        # snippet, then to the whole file text, when parsing is impossible.
        snippet = extract_json_block(raw)
        if snippet:
            try:
                obj = json.loads(snippet)
                json_chain = json.dumps(obj).replace('"', '\\"')
            except json.JSONDecodeError:
                print(f"[WARN] Extracted JSON malformed in {json_path}; using raw snippet.")
                json_chain = snippet.replace('"', '\\"')
        else:
            print(f"[WARN] No JSON block in {json_path}; loading full content as text.")
            json_chain = raw.replace('"', '\\"')

        # Build prompt with error handling
        tactics = [t.strip() for t in str(row['technique_keys']).split(',')]
        degree_key = row['degree']
        veracity = row['veracity']

        try:
            prompt = build_prompt(article, json_chain, veracity, tactics, degree_key)
        except Exception as e:
            print(f"[ERROR] Failed to build prompt for {uuid}: {e}")
            return

        messages = [
            {"role": "system", "content": "You are a master evaluator. Return strict JSON only."},
            {"role": "user", "content": prompt}
        ]

        # API call with rate limiting. Only the request itself is guarded
        # here, so a bookkeeping failure below can never be misreported as an
        # API failure and cause a completed response to be thrown away.
        _acquire_rate_limit()
        try:
            resp = client.chat.completions.create(model="gpt-4.1", messages=messages)
        except Exception as e:
            print(f"[ERROR] API call failed for {uuid}: {e!r}")
            return

        # Track cost (best effort: never blocks saving the evaluation)
        try:
            pt = resp.usage.prompt_tokens
            ct = resp.usage.completion_tokens
            cost = float(
                (decimal.Decimal(pt) / 1000) * COST_PER_1K_INPUT +
                (decimal.Decimal(ct) / 1000) * COST_PER_1K_OUTPUT
            )
            save_cost_record({
                "uuid": uuid,
                "prompt_tokens": pt,
                "completion_tokens": ct,
                "cost": cost,
                "timestamp": time.time()
            })
        except AttributeError:
            print(f"[WARN] API response missing usage data for {uuid}")
        except Exception as e:
            print(f"[WARN] Failed to record cost for {uuid}: {e!r}")

        # Parse response
        reply = resp.choices[0].message.content
        if reply is None:
            # No text came back; leave the row unprocessed so a later run can
            # retry it.
            print(f"[ERROR] Empty response content for {uuid}; skipping")
            return
        parsed, is_json = _parse_model_reply(reply)
        if not is_json:
            print(f"[WARN] Response not valid JSON for {uuid}, storing raw content")

        # Save output and update metadata
        try:
            out_path = save_output(uuid, veracity, parsed)
            with df_lock:
                df_meta.at[idx, 'evaluation_path'] = out_path
                global_used.add(uuid)

                # Save used UUIDs atomically
                _atomic_json_dump(list(global_used), used_file, main_output_dir)

                # Save metadata
                # NOTE(review): this rewrites the whole CSV for every row while
                # holding df_lock; consider checkpointing every N rows.
                df_meta.to_csv(metadata_with_eval_path, index=False)

        except Exception as e:
            print(f"[ERROR] Failed to save output for {uuid}: {e!r}")

    except Exception as e:
        print(f"[ERROR] Unexpected error processing {uuid}: {e!r}")
        traceback.print_exc()


def _parse_model_reply(content):
    """Parse a model reply as JSON, tolerating a surrounding Markdown fence.

    Returns:
        ``(value, True)`` when ``content`` (or the payload inside a single
        enclosing ``` fence) is valid JSON, otherwise ``(content, False)``
        with the reply returned unchanged.
    """
    if not isinstance(content, str):
        return content, False

    try:
        return json.loads(content), True
    except json.JSONDecodeError:
        pass

    stripped = content.strip()
    if len(stripped) >= 6 and stripped.startswith("```") and stripped.endswith("```"):
        inner = stripped[3:-3]
        # Drop an optional language tag (e.g. "json") on the opening fence line.
        head, sep, tail = inner.partition("\n")
        candidate = tail if sep and not head.lstrip().startswith(("{", "[")) else inner
        try:
            return json.loads(candidate), True
        except json.JSONDecodeError:
            pass

    return content, False


def _atomic_json_dump(payload, target_path, tmp_dir):
    """Write ``payload`` as JSON to ``target_path`` via a temp file in ``tmp_dir``.

    ``os.replace`` overwrites an existing destination in a single rename,
    including on Windows, where ``shutil.move`` would fall back to a
    non-atomic copy + delete. ``shutil.move`` is kept as a fallback for the
    cases ``os.replace`` cannot handle (e.g. a different filesystem). The
    temp file is removed if anything fails.
    """
    tmp_name = None
    try:
        with tempfile.NamedTemporaryFile(mode='w', delete=False,
                                         dir=tmp_dir, suffix='.tmp') as tmp_f:
            tmp_name = tmp_f.name
            json.dump(payload, tmp_f)
        try:
            os.replace(tmp_name, target_path)
        except OSError:
            shutil.move(tmp_name, target_path)
    except Exception:
        if tmp_name is not None and os.path.exists(tmp_name):
            try:
                os.unlink(tmp_name)
            except OSError:
                pass  # best-effort cleanup; the original error is re-raised
        raise

# ── EXECUTION WITH REDUCED THREAD COUNT ─────────────────────────────────────────────────────────────────
if __name__ == '__main__':
    # Fan every metadata row out to a bounded thread pool and drive a progress
    # bar from task completion. process_row reports its own errors, so the
    # completed futures are only counted, never inspected.
    row_count = len(df_meta)

    # 20 workers (reduced from 60) to minimize contention.
    with ThreadPoolExecutor(max_workers=20) as pool:
        pending = []
        for idx, row in df_meta.iterrows():
            pending.append(pool.submit(process_row, idx, row))

        progress = tqdm(as_completed(pending), total=row_count, desc="Evaluating News")
        for _finished in progress:
            pass

    print(f"All evaluations complete. Latest metadata at {metadata_with_eval_path}")

  df_meta = pd.read_csv(metadata_path, dtype={'uuid': str})
Evaluating News:  89%|████████▊ | 59842/67531 [00:57<03:34, 35.79it/s]  

[WARN] Response not valid JSON for af330011-6fb3-4a61-ad0c-9855deb2004b, storing raw content


Evaluating News:  90%|████████▉ | 60678/67531 [02:17<26:25,  4.32it/s]

[WARN] Response not valid JSON for bad7923a-9dd9-46d5-8a2f-70e4f9f7423e, storing raw content


Evaluating News:  91%|█████████ | 61229/67531 [03:04<15:38,  6.71it/s]

[WARN] Response not valid JSON for 4a944f87-8a0b-4be3-a8b9-29b9f9c8af2a, storing raw content


Evaluating News:  92%|█████████▏| 62156/67531 [04:58<1:58:19,  1.32s/it]

[WARN] Response not valid JSON for 43311ef1-9367-4548-989e-55ccf50261e4, storing raw content


Evaluating News:  93%|█████████▎| 62602/67531 [15:23<1:48:52,  1.33s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_f\deepseek-ai_DeepSeek-R1-Distill-Llama-70B\lit\004660dc-4d52-4cef-a384-9df3c0761a48_lit_minor.json; using raw snippet.


Evaluating News:  93%|█████████▎| 62720/67531 [18:03<1:46:44,  1.33s/it]

[WARN] No JSON block in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_f\deepseek-ai_DeepSeek-R1-Distill-Llama-70B\ara\33bb73f5-c15f-4929-997f-9700671de46c_ara_critical.json; loading full content as text.


Evaluating News:  93%|█████████▎| 62746/67531 [18:40<1:44:11,  1.31s/it]

[WARN] Response not valid JSON for 1a991c48-6320-4fc9-923e-98fadf18b475, storing raw content


Evaluating News:  93%|█████████▎| 62759/67531 [19:00<1:54:18,  1.44s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_f\deepseek-ai_DeepSeek-R1-Distill-Llama-70B\sin\2cae8be6-b8c8-432a-ac22-f148b6badb7b_sin_critical.json; using raw snippet.


Evaluating News:  93%|█████████▎| 62879/67531 [21:42<1:53:27,  1.46s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_f\deepseek-ai_DeepSeek-R1-Distill-Llama-70B\spa\88950fd8-b5c0-4ab0-b79b-20604912623f_spa_critical.json; using raw snippet.


Evaluating News:  93%|█████████▎| 63053/67531 [26:06<1:40:24,  1.35s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_f\deepseek-ai_DeepSeek-R1-Distill-Llama-70B\hau\e5263312-152d-4860-8276-efb7cb2fd22e_hau_critical.json; using raw snippet.


Evaluating News:  94%|█████████▎| 63239/67531 [30:15<1:55:05,  1.61s/it]

[WARN] Response not valid JSON for 22d104fb-a113-43c5-bfe8-44d19c3a3bee, storing raw content
[WARN] Response not valid JSON for db63ff55-b72d-4d22-88e1-c85ce1b61e8f, storing raw content


Evaluating News:  94%|█████████▍| 63474/67531 [36:02<1:29:47,  1.33s/it]

[WARN] Response not valid JSON for 15cf8162-0b3f-489b-b529-cb7f7c9234d4, storing raw content


Evaluating News:  94%|█████████▍| 63749/67531 [42:20<1:27:28,  1.39s/it]

[WARN] Response not valid JSON for c807b8b5-ea2b-4382-bc1f-109e857407ee, storing raw content


Evaluating News:  95%|█████████▍| 63907/67531 [46:10<1:43:16,  1.71s/it]

[WARN] Response not valid JSON for 27d71dcc-6d25-4a9f-9144-562cc154c9cd, storing raw content


Evaluating News:  97%|█████████▋| 65206/67531 [1:18:50<50:45,  1.31s/it]  

[WARN] Response not valid JSON for f774495a-150b-4b37-90dc-4e27967848c5, storing raw content


Evaluating News:  97%|█████████▋| 65324/67531 [1:21:47<56:49,  1.54s/it]  

[WARN] Response not valid JSON for bab2e9ac-e9e1-493a-a24c-b1213b596a18, storing raw content


Evaluating News:  97%|█████████▋| 65519/67531 [1:26:46<49:54,  1.49s/it]  

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-2.0-flash-thinking-exp-01-21\nor\CoI_54756163-ffa8-400b-979b-5edf9e6510e0_nor_polish.json; using raw snippet.


Evaluating News:  97%|█████████▋| 65520/67531 [1:26:49<1:03:58,  1.91s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-2.0-flash-thinking-exp-01-21\pan\CoI_0903b25e-9068-4e10-80b4-5183b640209d_pan_polish.json; using raw snippet.


Evaluating News:  98%|█████████▊| 65878/67531 [1:35:34<42:34,  1.55s/it]  

[WARN] Response not valid JSON for 7ce64e8c-6681-4a05-99c2-9fb88b7e31f7, storing raw content


Evaluating News:  98%|█████████▊| 66150/67531 [1:42:16<30:48,  1.34s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gpt-4.1\ban\CoI_9b1e3d4c-0669-46e9-979a-38e29ff89250_ban_polish.json; using raw snippet.


Evaluating News:  98%|█████████▊| 66369/67531 [1:47:43<38:44,  2.00s/it]

[WARN] Response not valid JSON for df476da4-6c7c-4cda-be53-158a20ceff42, storing raw content


Evaluating News:  99%|█████████▊| 66604/67531 [1:54:05<21:48,  1.41s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\deepseek-ai_DeepSeek-R1-Turbo\ban\CoI_919362f6-e847-43e0-ae82-5cfb5b652a65_ban_polish.json; using raw snippet.


Evaluating News:  99%|█████████▊| 66632/67531 [1:54:47<21:13,  1.42s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-1.5-pro\bos\CoI_aa38eb51-4886-4b0c-bb9d-e8c9462f9ca5_bos_rewrite_moderate.json; using raw snippet.


Evaluating News:  99%|█████████▊| 66634/67531 [1:54:50<21:40,  1.45s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-1.5-pro\bos\CoI_257fc082-84a7-4f09-a31f-fbde984cda24_bos_rewrite_moderate.json; using raw snippet.


Evaluating News:  99%|█████████▊| 66635/67531 [1:54:52<21:21,  1.43s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-1.5-pro\ban\CoI_2feb817a-2b56-42c2-9bdf-235bf1a73160_ban_rewrite_light.json; using raw snippet.
[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-1.5-pro\ban\CoI_62b102ba-0198-4190-9d72-7f3d90251563_ban_edit.json; using raw snippet.
[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-1.5-pro\ban\CoI_78f386d3-5c41-4473-9789-fd3901b8b47b_ban_edit.json; using raw snippet.


Evaluating News:  99%|█████████▊| 66637/67531 [1:54:55<22:30,  1.51s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-1.5-pro\ban\CoI_35c9eb64-9619-4101-8ded-8900e42bb265_ban_rewrite_light.json; using raw snippet.


Evaluating News:  99%|█████████▊| 66639/67531 [1:54:58<23:37,  1.59s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-1.5-pro\ban\CoI_70bd9534-cde0-4610-8f43-ea95932e71f5_ban_edit.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66750/67531 [1:57:21<18:03,  1.39s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\meta-llama_Llama-4-Maverick-17B-128E-Instruct-FP8\bos\CoI_84575d13-7974-41d2-b8bb-11f2d49132c3_bos_edit.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66772/67531 [1:57:52<17:12,  1.36s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\meta-llama_Llama-4-Maverick-17B-128E-Instruct-FP8\ban\CoI_5a24da8a-a638-4aa6-89e8-6cfaeadc010d_ban_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66773/67531 [1:57:54<17:05,  1.35s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\meta-llama_Llama-4-Maverick-17B-128E-Instruct-FP8\ban\CoI_41eb5fbc-c5cd-4bac-bc9f-c39f1a498240_ban_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66821/67531 [1:59:07<16:21,  1.38s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\meta-llama_Llama-4-Maverick-17B-128E-Instruct-FP8\srp\CoI_d8961092-6ca1-4f1a-a21a-2bed70ab2d9b_srp_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66822/67531 [1:59:09<17:12,  1.46s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\meta-llama_Llama-4-Maverick-17B-128E-Instruct-FP8\srp\CoI_55df6dfb-3f6d-4ccc-8a60-ea980501b704_srp_edit.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66848/67531 [1:59:57<25:03,  2.20s/it]

[WARN] Response not valid JSON for cd9451c0-363a-404e-b713-90c79585ef41, storing raw content


Evaluating News:  99%|█████████▉| 66868/67531 [2:00:27<16:55,  1.53s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\tel\CoI_039f877e-4c7d-44a2-9cf5-3b94075bf00a_tel_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66888/67531 [2:01:02<16:04,  1.50s/it]

[WARN] No JSON block in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\vie\CoI_67676cec-b625-4447-ae26-888a25213c63_vie_polish.json; loading full content as text.


Evaluating News:  99%|█████████▉| 66890/67531 [2:01:06<16:29,  1.54s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\vie\CoI_256835f4-b0ab-4ff6-9e80-99de77d3367f_vie_rewrite_light.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66893/67531 [2:01:10<15:44,  1.48s/it]

[WARN] No JSON block in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\fra\CoI_896d37c0-d4c7-499a-801d-8817e573bd07_fra_polish.json; loading full content as text.


Evaluating News:  99%|█████████▉| 66900/67531 [2:01:24<19:45,  1.88s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\mya\CoI_07e7bc73-941a-4a1b-b334-70f8eb1d10b0_mya_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66940/67531 [2:02:29<14:37,  1.48s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\kur\CoI_8cb00b9f-e03b-4b79-9949-e182e152d946_kur_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66943/67531 [2:02:34<15:55,  1.63s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\deu\CoI_ac086f77-287f-4873-8b43-0847fdc2f0cc_deu_edit.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66966/67531 [2:03:12<12:52,  1.37s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\nor\CoI_17de3548-047a-4481-894c-51951fd99d45_nor_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66969/67531 [2:03:16<12:57,  1.38s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\lav\CoI_fd02c778-ebd5-49cb-b732-f200d04189e1_lav_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 66970/67531 [2:03:19<16:48,  1.80s/it]

[WARN] Response not valid JSON for 32495a7d-3a46-43ca-8656-91e6d62d1c8c, storing raw content


Evaluating News:  99%|█████████▉| 67008/67531 [2:04:19<11:39,  1.34s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\sqi\CoI_83870ef3-d5de-4e68-a8b2-15f4a2c6df30_sqi_edit.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67009/67531 [2:04:20<11:48,  1.36s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\sqi\CoI_5dfc166a-57cf-4121-98ef-93237186456d_sqi_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67011/67531 [2:04:23<11:53,  1.37s/it]

[WARN] No JSON block in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\tam\CoI_715b981f-d328-484d-9dec-29e9f92f1c7e_tam_edit.json; loading full content as text.


Evaluating News:  99%|█████████▉| 67015/67531 [2:04:29<12:06,  1.41s/it]

[WARN] No JSON block in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\swe\CoI_38905679-5725-42d3-8753-812580417194_swe_rewrite_light.json; loading full content as text.
[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\urd\CoI_ba96fa33-a119-4ef7-b102-38e1f862264a_urd_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67038/67531 [2:05:04<10:49,  1.32s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\ell\CoI_023fb968-42bf-4d78-8f16-c55976b51192_ell_rewrite_complete.json; using raw snippet.
[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\ell\CoI_03cd7b07-942a-45b1-a7e2-955e67bd0328_ell_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67040/67531 [2:05:07<11:07,  1.36s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\ell\CoI_dc57699c-52d3-4cff-ba20-50d092dadd91_ell_rewrite_moderate.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67084/67531 [2:06:19<11:09,  1.50s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\hrv\CoI_074cfbe6-e87c-4386-aba2-3654512d2602_hrv_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67085/67531 [2:06:20<11:23,  1.53s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\mal\CoI_91029a41-0e8b-47c2-a3e2-0d0d584aa663_mal_rewrite_moderate.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67109/67531 [2:06:56<09:15,  1.32s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\nep\CoI_0b8dff37-404b-4f71-a4a4-25657b6a41a8_nep_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67110/67531 [2:06:57<09:33,  1.36s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\ben\CoI_d8b8606c-9702-42a4-b28e-87b087fed6a7_ben_edit.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67136/67531 [2:07:35<09:25,  1.43s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\grn\CoI_45a6ea23-5020-4b52-8ff8-c6234e9c2596_grn_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67138/67531 [2:07:38<09:09,  1.40s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\grn\CoI_5c9040d3-73dd-459d-9455-905d383aa335_grn_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67141/67531 [2:07:42<09:08,  1.41s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\grn\CoI_772d4aa2-8462-4f03-82d2-d2b6952784eb_grn_rewrite_moderate.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67142/67531 [2:07:44<09:23,  1.45s/it]

[WARN] No JSON block in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\grn\CoI_e90c0c49-f54b-495c-88b0-e90ade5cd82a_grn_rewrite_complete.json; loading full content as text.


Evaluating News:  99%|█████████▉| 67154/67531 [2:08:00<08:17,  1.32s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\hau\CoI_cbb133a8-a906-4c42-a087-25600b0f31de_hau_edit.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67158/67531 [2:08:06<09:15,  1.49s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\hau\CoI_73cc3709-6b26-4f19-a71a-566552ce043f_hau_polish.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67159/67531 [2:08:08<09:16,  1.50s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\hau\CoI_27998003-e3da-4a96-8f6f-255b4d43102b_hau_edit.json; using raw snippet.


Evaluating News:  99%|█████████▉| 67167/67531 [2:08:20<09:33,  1.58s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\ban\CoI_8e773a75-d08d-4809-bd31-dd2b54462d7d_ban_polish.json; using raw snippet.


Evaluating News: 100%|█████████▉| 67204/67531 [2:09:13<07:41,  1.41s/it]

[WARN] No JSON block in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\swa\CoI_5c79db19-5575-4227-9c2a-6ef92806651c_swa_polish.json; loading full content as text.


Evaluating News: 100%|█████████▉| 67211/67531 [2:09:23<07:19,  1.37s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\tgl\CoI_35394b2f-d863-4a0c-bc29-4b736a1b6331_tgl_polish.json; using raw snippet.
[WARN] No JSON block in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\tgl\CoI_0aa6c56c-26ce-4bd2-8fde-689629c98116_tgl_edit.json; loading full content as text.


Evaluating News: 100%|█████████▉| 67231/67531 [2:09:53<06:55,  1.39s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\microsoft_Phi-4-multimodal-instruct\pan\CoI_3b408b3c-2aac-4551-9435-7b3c68bf4c6c_pan_rewrite_complete.json; using raw snippet.


Evaluating News: 100%|█████████▉| 67252/67531 [2:10:23<06:36,  1.42s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-1.5-flash\pol\CoI_298cc26d-5130-4239-9e09-3e9d6a3c604b_pol_polish.json; using raw snippet.


Evaluating News: 100%|█████████▉| 67259/67531 [2:10:33<06:04,  1.34s/it]

[WARN] Response not valid JSON for 298cc26d-5130-4239-9e09-3e9d6a3c604b, storing raw content


Evaluating News: 100%|█████████▉| 67285/67531 [2:11:10<05:46,  1.41s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\gemini-1.5-flash\pan\CoI_5a32ca24-ca58-481d-bb95-571dfd943f0c_pan_rewrite_complete.json; using raw snippet.


Evaluating News: 100%|█████████▉| 67292/67531 [2:11:20<05:32,  1.39s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\o1-2024-12-17\kur\CoI_b6a4eee8-2682-4f5a-9474-e689adf10d2b_kur_edit.json; using raw snippet.


Evaluating News: 100%|█████████▉| 67300/67531 [2:11:33<06:07,  1.59s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\o1-2024-12-17\ban\CoI_b8a3108b-6ea0-4c01-b8bd-d1ba57e37f30_ban_polish.json; using raw snippet.


Evaluating News: 100%|█████████▉| 67301/67531 [2:11:35<06:24,  1.67s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\o1-2024-12-17\ban\CoI_fa45e04c-755a-4ffd-bff8-d101c127fdcf_ban_rewrite_moderate.json; using raw snippet.


Evaluating News: 100%|█████████▉| 67304/67531 [2:11:39<05:49,  1.54s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\o1-2024-12-17\srp\CoI_5121d018-460a-40b1-892f-b30630ec685a_srp_edit.json; using raw snippet.


Evaluating News: 100%|█████████▉| 67306/67531 [2:11:42<05:27,  1.45s/it]

[WARN] Extracted JSON malformed in C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\eng_x_r\o1-2024-12-17\srp\CoI_e56840f8-0d09-4dba-b146-1c43e59fb0ba_srp_polish.json; using raw snippet.


Evaluating News: 100%|█████████▉| 67400/67531 [2:14:08<03:08,  1.44s/it]

[WARN] Response not valid JSON for 94dfae14-783e-4128-98cf-a8b376fa784f, storing raw content


Evaluating News: 100%|█████████▉| 67432/67531 [2:15:01<02:45,  1.67s/it]

[WARN] Response not valid JSON for f32cea45-4e06-42a7-afe0-ad0f19793a39, storing raw content


Evaluating News: 100%|██████████| 67531/67531 [2:17:41<00:00,  8.17it/s]

All evaluations complete. Latest metadata at C:\Users\jsl5710\OneDrive - The Pennsylvania State University\JasonL Research Projects\BLUFF\BLUFF_final_data\metadata_final_clean_x_LLM_mPURIFY_eval.csv



