In [1]:
import os, time, random, json, pandas as pd
from dotenv import load_dotenv
import re, json
from openai import OpenAI, RateLimitError, APIError, Timeout 
# Load settings from a local .env file (python-dotenv) so the API key never
# has to be written into the notebook itself.
load_dotenv()

# One shared API client, reused by every request made in this notebook.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Labelled evaluation set. Later cells read the "snippet" and "label" columns
# and, when present, an "id" column.
df = pd.read_csv("ground_truth.csv")

# System prompt used as the first message of every request built by
# make_prompt(), regardless of prompt variant. It contains, in order:
#   * a sketch of the expected JSON object (class / confidence / loan_doc_id),
#   * the "prime directive" telling the model to answer with JSON only,
#   * the three decision rules (LoanDoc / PublicWeb / Synthesised),
#   * the list of known loan document ids to choose from for LoanDoc answers.
# The literal is sent verbatim, so every character (including blank lines and
# trailing spaces) is part of the prompt.
# NOTE(review): the schema sketch sits *before* the "SYSTEM" heading although
# the directive refers to "the schema below", and the sketch itself uses `//`
# comments while the directive forbids comments in the output — confirm this
# ordering/wording is intentional before changing it, since any edit alters
# model behaviour.
SYS_MSG = (
    """
{
  "class"        : "LoanDoc | PublicWeb | Synthesised",
  "confidence"   : 0-100,            // subjective probability
  "loan_doc_id"  : "file_name.pdf"   // only when class = LoanDoc
}
SYSTEM (prime directive)
You are an expert provenance auditor.  
Return ONLY valid JSON matching the schema below—no extra keys,
no comments, no explanations.

DECISION RULES
1. "LoanDoc": The text is a verbatim or lightly edited excerpt
   from internal loan paperwork you have already seen.
2. "PublicWeb": It appears on freely available websites,
   press releases, blogs, or public data portals.
3. "Synthesised": It blends or paraphrases two or more sources,
   or it was written from scratch by an analyst or AI.

If you choose "LoanDoc", include the best matching
`loan_doc_id` from this list ⬇️
=================================================================
[TS_2025-001.pdf]  Term Sheet, 101 Main St.
[APP_2025-001.pdf] Appraisal, 101 Main St.
[OM_SterlingPlaza.pdf] Offering Memo, Sterling Plaza
=================================================================
Output schema → {class, confidence, loan_doc_id}

"""
)

# Few-shot block that make_prompt() prepends to the user message for the
# "fewshot" variant. It holds three example answers, one per class, each
# preceded by a comment line describing the reasoning behind it.
# NOTE(review): the examples show only the expected output object — no example
# input text accompanies them — and they contain `//` and `/* */` comments even
# though SYS_MSG tells the model to emit no comments. Confirm this is the
# intended few-shot format; the literal is sent verbatim, so editing it changes
# model behaviour.
FEW_SHOT = """
// Example 1
/* internal reasoning: exact phrase match to Appraisal */
{"class":"LoanDoc","confidence":93,"loan_doc_id":"APP_2025-001.pdf"}

// Example 2
/* internal reasoning: appears on HUD.gov */
{"class":"PublicWeb","confidence":87,"loan_doc_id":""}

// Example 3
/* reasoning: combines NOI from TS and cap-rate from Appraisal */
{"class":"Synthesised","confidence":78,"loan_doc_id":""}
"""

def make_prompt(variant: str, snippet: str) -> list[dict]:
    """Build the chat messages for one snippet under a given prompt variant.

    Args:
        variant: One of "zeroshot", "fewshot" or "cot".
        snippet: The text to classify; it is embedded between triple quotes.

    Returns:
        A two-element list: the system message (SYS_MSG) followed by a single
        user message whose wording depends on ``variant``.

    Raises:
        ValueError: If ``variant`` is not one of the three supported names.
    """
    # The snippet is always presented the same way: wrapped in triple quotes.
    quoted = f'"""{snippet}"""'

    if variant == "zeroshot":
        # Plain instruction followed by the text.
        user_content = "Classify the following text and output JSON only:\n" + quoted
    elif variant == "fewshot":
        # Worked examples first, then the text and a cue for the answer.
        user_content = FEW_SHOT + "\nText: " + quoted + "\nPrediction:"
    elif variant == "cot":
        # Ask for step-by-step reasoning hidden in HTML comments, then the JSON.
        user_content = (
            "\n"
            "First, think step-by-step and wrap every reasoning line in an HTML comment (<!-- -->).\n"
            "After the reasoning, output **only** the JSON object described above.\n"
            "\nText: " + quoted
        )
    else:
        raise ValueError(f"Unknown variant: {variant}")

    return [
        {"role": "system", "content": SYS_MSG},
        {"role": "user", "content": user_content},
    ]

In [2]:
def call_chat_completion(messages: list[dict],
                          variant: str,
                          model: str = "gpt-4o-mini",
                          max_retries: int = 6,
                          base_delay: float = 1.0):
    """
    Wrapper around client.chat.completions.create with exponential back-off.
    For zeroshot / fewshot we enforce JSON-only responses via response_format.

    Args:
        messages: Chat messages as produced by make_prompt().
        variant: Prompt variant name; "zeroshot" and "fewshot" request
            ``response_format={"type": "json_object"}``, any other value
            (e.g. "cot") leaves the response format unconstrained.
        model: Model name passed to the API.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so the request is always sent at least once.
        base_delay: Base of the exponential back-off, in seconds.

    Returns:
        The chat completion object returned by the client.

    Raises:
        openai.APIError: When the error is not worth retrying (a 4xx status
            other than 408/409/429) or when the last attempt fails.
    """
    # The request is identical on every attempt, so build it once.
    kwargs = dict(model=model,
                  messages=messages,
                  temperature=0)
    if variant in ("zeroshot", "fewshot"):
        kwargs["response_format"] = {"type": "json_object"}

    # Guarantee at least one attempt; otherwise the loop would be skipped and
    # the function would silently return None.
    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            return client.chat.completions.create(**kwargs)

        # Catch only APIError: in the v1 SDK it is the base class of
        # RateLimitError and of the connection/timeout errors. `openai.Timeout`
        # is NOT an exception (it is the httpx timeout config class); listing it
        # in an `except` tuple makes Python raise TypeError as soon as any
        # exception has to be matched, which defeated the retry logic.
        except APIError as e:
            # Status-carrying errors expose `status_code`; connection and
            # timeout errors do not, and are always considered transient.
            status = getattr(e, "status_code", None)
            retryable = (status is None
                         or status in (408, 409, 429)
                         or status >= 500)
            if not retryable or attempt == attempts - 1:
                raise
            # Exponential back-off with up to 1s of jitter.
            wait = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"{type(e).__name__}: retrying in {wait:.1f}s…")
            time.sleep(wait)

In [None]:
def _parse_prediction(raw: str):
    """Return the first JSON object found in a model reply, or None.

    The whole reply is tried first. If that fails (or is not an object), the
    text is scanned for the first position where a complete JSON object can be
    decoded. HTML comments are removed before the first scan because the "cot"
    prompt asks the model to put its reasoning inside them, and braces in that
    reasoning must not be mistaken for the answer; the untouched reply is
    scanned afterwards as a last resort.
    """
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    decoder = json.JSONDecoder()
    without_comments = re.sub(r"<!--.*?-->", "", raw, flags=re.S)
    for text in (without_comments, raw):
        for brace in re.finditer(r"\{", text):
            try:
                # raw_decode handles nested objects and ignores trailing text.
                candidate, _ = decoder.raw_decode(text, brace.start())
            except json.JSONDecodeError:
                continue
            if isinstance(candidate, dict):
                return candidate
    return None


records = []

for idx, row in df.iterrows():
    # Fall back to the row index when the CSV has no "id" column.
    sample_id = row.get("id", idx)
    snippet   = row["snippet"]
    true_lbl  = row["label"]

    for variant in ("zeroshot", "fewshot", "cot"):
        prompt = make_prompt(variant, snippet)
        rsp    = call_chat_completion(prompt, variant)

        # `content` is optional in the SDK's response type, so guard against
        # None before stripping.
        raw = (rsp.choices[0].message.content or "").strip()

        out = _parse_prediction(raw)
        if out is None:
            # Unparseable replies are reported and skipped rather than
            # aborting the run and losing every record collected so far.
            print(f"⚠ No JSON for id={sample_id}, variant={variant}\n{raw[:120]}…")
            continue

        records.append({
            "id"           : sample_id,
            "prompt_variant": variant,
            "ground_truth" : true_lbl,
            "predicted"    : out.get("class"),
            "confidence"   : out.get("confidence"),
            "loan_doc_pred": out.get("loan_doc_id", ""),
            "raw_response" : raw
        })

# Explicit columns keep the log's schema stable even if no reply was parseable.
results = pd.DataFrame(records, columns=[
    "id", "prompt_variant", "ground_truth", "predicted",
    "confidence", "loan_doc_pred", "raw_response",
])
results.to_csv("run_log.csv", index=False)
print("✓ Completed — results saved to run_log.csv")

✓ Completed — results saved to run_log.csv


In [4]:
from sklearn.metrics import classification_report, confusion_matrix

# Metrics below pool the rows of every prompt variant in `results`.
y_true = results["ground_truth"]

# A reply whose JSON has no "class" key is logged with a missing prediction.
# Give it an explicit label so it is counted as an error instead of making the
# metric functions fail on mixed None/str values.
y_pred = results["predicted"].fillna("Unparsed").astype(str)

print(classification_report(y_true, y_pred, digits=3))

# Index the matrix by every label that occurs on either side, so a prediction
# outside the ground-truth label set shows up in its own column rather than
# being dropped. When all predictions are known labels this is the same list
# as the sorted ground-truth labels.
labels = sorted(set(y_true) | set(y_pred))

cm = pd.DataFrame(
    confusion_matrix(y_true, y_pred, labels=labels),
    index=labels,
    columns=labels,
)
cm.to_csv("confusion_matrix.csv")
cm

              precision    recall  f1-score   support

     LoanDoc      1.000     0.444     0.615         9
   PublicWeb      0.600     1.000     0.750         9
 Synthesised      1.000     0.833     0.909         6

    accuracy                          0.750        24
   macro avg      0.867     0.759     0.758        24
weighted avg      0.850     0.750     0.739        24



Unnamed: 0,LoanDoc,PublicWeb,Synthesised
LoanDoc,4,5,0
PublicWeb,0,9,0
Synthesised,0,1,5
