In [None]:
import os
import time
import random
import re
import json
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI, RateLimitError, APIError, Timeout

# Read variables from a local .env file into the process environment so the
# API key does not have to be written into the notebook itself.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Labelled evaluation set. Later cells read its "snippet" and "label" columns
# and use an "id" column when one is present.
df = pd.read_csv("ground_truth.csv")

# System prompt placed first in the messages list by `make_prompt` for every
# prompt variant. It defines the three provenance classes, the JSON reply
# schema, and the loan-document file names allowed in `loan_doc_id`.
# The surrounding newlines of the literal are removed by `.strip()`.
SYS_MSG = """
You are an expert provenance auditor.

Return **ONLY** a JSON object that follows this exact schema
{
  "class"       : "LoanDoc | PublicWeb | Synthesised",
  "confidence"  : 0-100,                  // subjective probability
  "loan_doc_id" : "file_name.pdf"         // required only if class = LoanDoc
}

──────────────── DECISION RULES ────────────────
1. **LoanDoc** Verbatim or lightly edited excerpt from any internal loan file in the list below.
2. **PublicWeb** Appears on freely available websites, blogs, press releases, gov/data portals.
3. **Synthesised** Blends or paraphrases two or more sources, or was written from scratch by an AI.

If you choose **"LoanDoc"**, set `loan_doc_id` to the best match from:

[TS1.pdf]   Term Sheet  
[APP1.pdf]  Appraisal  
[OM1.pdf]   Offering Memo  

(Leave `loan_doc_id` as an empty string for all other classes.)

OUTPUT POLICY  
• No extra keys, comments, or prose - **only** the JSON object.
""".strip()

# Few-shot block that `make_prompt` prepends to the user message when
# variant == "fewshot".
# NOTE(review): each example consists of a reasoning comment plus the expected
# JSON only; no example input text is included. Confirm that omitting the
# example snippets is intentional.
FEW_SHOT = """
Example 1
<!-- internal reasoning: appraisal language + dollar value should map to LoanDoc -->
{"class":"LoanDoc","confidence":92,"loan_doc_id":"APP1.pdf"}

Example 2
<!-- internal reasoning: covenant with $ amounts belongs in a term sheet -->
{"class":"LoanDoc","confidence":91,"loan_doc_id":"TS1.pdf"}

Example 3
<!-- internal reasoning: covenant-style clause appears only in term sheets -->
{"class":"LoanDoc","confidence":92,"loan_doc_id":"TS1.pdf"}

Example 4
<!-- internal reasoning: appraisal narrative language; no dollar or percent signs -->
{"class":"LoanDoc","confidence":90,"loan_doc_id":"APP1.pdf"}

Example 5
<!-- internal reasoning: generic definition easily found on public finance sites -->
{"class":"PublicWeb","confidence":85,"loan_doc_id":""}

Example 6
<!-- internal reasoning: blends Fannie-Mae DSCR guideline with sponsor equity requirement; not verbatim from any one doc -->
{"class":"Synthesised","confidence":80,"loan_doc_id":""}
""".strip()
# Step-by-step ("think silently") instruction block.
# NOTE(review): no cell visible here references COT; `make_prompt` only knows
# the "fewshot" and "zeroshot" variants.
# NOTE(review): the `<!--` opened inside the string is never closed with `-->`;
# the final `→` may be a mangled `-->`. Confirm before wiring this prompt in.
COT = """
Think silently, then output ONLY the JSON.

<!--
STEP 1: Identify surface cues  
- origin hints (numbers? legal verbs? valuation jargon?)  
- presence of web tell-tales (URL, “according to”, etc.)  
- any words unique to TS1, APP1, OM1

STEP 2: Decide provisional class + doc_id  
- choose class LoanDoc / PublicWeb / Synthesised  
- if LoanDoc: pick TS1 / APP1 / OM1 or "" when unsure  
- note confidence 0-100

STEP 3: Self-check  
- Does evidence contradict class?  
- If confidence <60, consider next-best label.  

END STEPS  
→
""".strip()

In [2]:
def make_prompt(variant: str, snippet: str) -> list[dict]:
    """Build the Chat Completions `messages` list for a single snippet.

    Args:
        variant: "fewshot" (the FEW_SHOT block is placed before the snippet)
            or "zeroshot" (the snippet alone).
        snippet: Text to classify; it is wrapped in triple quotes and followed
            by a "Prediction:" cue.

    Returns:
        A two-element list: the system message (SYS_MSG) and the user message.

    Raises:
        ValueError: If `variant` is neither "fewshot" nor "zeroshot".
    """
    if variant not in ("fewshot", "zeroshot"):
        raise ValueError(f"Unknown variant: {variant}")

    user_content = f'Text: """{snippet}"""\nPrediction:'
    if variant == "fewshot":
        # The examples come first, separated from the query by one newline.
        user_content = FEW_SHOT + "\n" + user_content

    return [
        {"role": "system", "content": SYS_MSG},
        {"role": "user", "content": user_content},
    ]

In [3]:
def call_chat_completion(
    messages: list[dict],
    variant: str,
    model: str = "gpt-4.1-2025-04-14",
    max_retries: int = 6,
    base_delay: float = 1.0,
):
    """Call `client.chat.completions.create` with exponential back-off.

    Args:
        messages: Chat messages, e.g. as produced by `make_prompt`.
        variant: Prompt variant name. For "zeroshot" and "fewshot" the request
            asks for JSON mode via `response_format`.
        model: Model identifier passed to the API.
        max_retries: Maximum number of attempts (values below 1 are treated
            as 1 so that the request is always sent at least once).
        base_delay: Base delay in seconds; attempt `k` (0-based) waits
            `base_delay * 2**k` plus up to one second of random jitter.

    Returns:
        The response object returned by `client.chat.completions.create`.

    Raises:
        APIError: The last error once all attempts are used up, or immediately
            for errors that a retry cannot fix (HTTP 4xx other than
            408/409/429, such as authentication or bad-request errors).
    """
    # The request is identical on every attempt, so build it once.
    kwargs = dict(model=model, messages=messages, temperature=0)
    if variant in ("zeroshot", "fewshot"):
        # Force JSON-only replies
        kwargs["response_format"] = {"type": "json_object"}

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            return client.chat.completions.create(**kwargs)

        # `APIError` is the base class of the SDK's rate-limit, timeout and
        # connection errors, so it covers everything worth retrying here.
        # `openai.Timeout` is deliberately NOT listed: in the 1.x SDK it is the
        # httpx timeout *configuration* class, not an exception, and Python
        # raises TypeError when a non-exception class appears in `except`.
        except APIError as e:
            status = getattr(e, "status_code", None)
            # No status code means a connection/timeout style failure.
            retryable = (
                status is None
                or status in (408, 409, 429)
                or status >= 500
            )
            if not retryable or attempt == attempts - 1:
                raise
            wait = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"{type(e).__name__}: retrying in {wait:.1f}s…")
            time.sleep(wait)


In [4]:
def _extract_json(raw: str):
    """Return the first JSON object (as a dict) embedded in `raw`, else None.

    Scans from each "{" and lets the JSON decoder find the matching end, so
    replies with leading/trailing prose or nested braces are handled.
    """
    decoder = json.JSONDecoder()
    start = raw.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(raw, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = raw.find("{", start + 1)
    return None


RESULT_COLUMNS = [
    "id",
    "prompt_variant",
    "ground_truth",
    "predicted",
    "confidence",
    "loan_doc_pred",
    "raw_response",
]

records = []
skipped = 0  # replies that contained no usable JSON object
variants = ["fewshot"]  # Add "zeroshot" here if you want to compare

for idx, row in df.iterrows():
    sample_id = row.get("id", idx)  # fall back to the row index if no "id" column
    snippet = row["snippet"]
    true_lbl = row["label"]

    for variant in variants:
        messages = make_prompt(variant, snippet)
        rsp = call_chat_completion(messages, variant)

        # `content` is optional in the SDK response; treat a missing body as
        # empty text instead of crashing on `.strip()`.
        raw = (rsp.choices[0].message.content or "").strip()

        # Robust JSON extraction (handles stray text and malformed replies)
        out = _extract_json(raw)
        if out is None:
            skipped += 1
            print(f"⚠ No JSON for id={sample_id}, variant={variant}\n{raw[:120]}…")
            continue

        records.append({
            "id": sample_id,
            "prompt_variant": variant,
            "ground_truth": true_lbl,
            "predicted": out.get("class"),
            "confidence": out.get("confidence"),
            "loan_doc_pred": out.get("loan_doc_id", ""),
            "raw_response": raw,
        })

# Passing the column list keeps the schema stable even if no record was parsed.
results = pd.DataFrame(records, columns=RESULT_COLUMNS)
results.to_csv("run_log.csv", index=False)
if skipped:
    # Skipped samples are absent from the metrics, so make that visible.
    print(f"⚠ {skipped} response(s) had no parseable JSON and are missing from run_log.csv")
print("✓ Completed — results saved to run_log.csv")

✓ Completed — results saved to run_log.csv


In [5]:
from sklearn.metrics import classification_report, confusion_matrix

y_true = results["ground_truth"]
y_pred = results["predicted"]

# Per-class precision / recall / F1 plus overall accuracy.
print(classification_report(y_true, y_pred, digits=3))

# Use the sorted ground-truth labels for both axes so that rows (true class)
# and columns (predicted class) line up.
class_names = sorted(y_true.unique())
cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=class_names),
    index=class_names,
    columns=class_names,
)
cm.to_csv("confusion_matrix.csv")
cm


              precision    recall  f1-score   support

     LoanDoc      1.000     0.625     0.769         8
   PublicWeb      1.000     1.000     1.000         6
 Synthesised      0.667     1.000     0.800         6

    accuracy                          0.850        20
   macro avg      0.889     0.875     0.856        20
weighted avg      0.900     0.850     0.848        20



Unnamed: 0,LoanDoc,PublicWeb,Synthesised
LoanDoc,5,0,3
PublicWeb,0,6,0
Synthesised,0,0,6
