# 🚀🔎 Literature Screening – Model Inference  
Run an Azure OpenAI deployment on either the **train** or **test** split created in *data_preparation.ipynb* and save all responses.

Expected directory structure after running (`<MODEL_NAME>` is the value of `DEPLOYMENT_NAME`, or `unknown_model` if that variable is unset):  
```
outputs/  
└── <MODEL_NAME>/  
    ├── train/  
    │   ├── predictions/  
    │   ├── misclassified/  
    │   └── unparsed/  
    └── test/  
        ├── predictions/  
        ├── misclassified/  
        └── unparsed/
```

In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 1 – Imports and environment setup 🌍      ║
# ╚════════════════════════════════════════════════╝
import os, json, random, shutil, time
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd

from dotenv import load_dotenv
from openai import AzureOpenAI
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tqdm.auto import tqdm

# One fixed seed for both the stdlib and the NumPy random generators.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Pull settings from a local .env file into the process environment.
# Later cells read ENDPOINT_URL, DEPLOYMENT_NAME and AZURE_OPENAI_API_KEY
# (AZURE_OPENAI_API_VERSION is optional and has a default).
load_dotenv()


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 2 – Azure OpenAI helper 🤖               ║
# ╚════════════════════════════════════════════════╝
def make_client() -> AzureOpenAI:
    """Build an ``AzureOpenAI`` client from environment variables.

    Reads ``AZURE_OPENAI_API_KEY`` and ``ENDPOINT_URL`` (passed through as
    ``None`` when unset) and ``AZURE_OPENAI_API_VERSION``, which falls back
    to ``"2025-01-01-preview"``.

    Returns:
        The configured client instance.
    """
    settings = {
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "azure_endpoint": os.getenv("ENDPOINT_URL"),
        "api_version": os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview"),
    }
    return AzureOpenAI(**settings)

# Single client instance shared by every request in this notebook.
client = make_client()

# Deployment to query; an unset variable becomes the empty string.
DEPLOYMENT_NAME = os.environ.get("DEPLOYMENT_NAME", "").strip()
print(f"Using deployment: [{DEPLOYMENT_NAME if DEPLOYMENT_NAME else 'NOT SET'}]")


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 3 – Select evaluation split 🎛️          ║
# ╚════════════════════════════════════════════════╝
# Choose "train" or "test" before running the notebook
EVAL_SPLIT = "test"   # change to "train" if desired

# The split name selects which CSV under outputs/datasets gets loaded.
dataset_path = Path("outputs", "datasets", f"{EVAL_SPLIT}_dataset.csv")
if dataset_path.exists():
    df_all = pd.read_csv(dataset_path)
else:
    # Stop here with a clear message instead of failing inside pandas.
    raise FileNotFoundError(f"Dataset file not found: {dataset_path}")

print(f"Loaded {len(df_all):,} rows from {dataset_path.name}")


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 4 – Prompt template builder ✍️          ║
# ╚════════════════════════════════════════════════╝
# Prompt template containing the {TITLE} and {ABSTRACT} placeholders that
# build_prompt() fills in; read once and reused for every row.
with Path("prompts/v5.txt").open(encoding="utf-8") as template_file:
    template_text = template_file.read()

def build_prompt(title: str, abstract: str) -> str:
    """Fill the prompt template with one paper's title and abstract.

    Newlines inside each field are replaced by spaces and surrounding
    whitespace is stripped before the field is substituted for its
    ``{TITLE}`` / ``{ABSTRACT}`` placeholder in ``template_text``.

    Args:
        title: Paper title. ``None``/NaN is treated as an empty string.
        abstract: Paper abstract. ``None``/NaN is treated as an empty string.

    Returns:
        The template text with both placeholders substituted.
    """
    def _clean(value) -> str:
        # pd.read_csv turns empty cells into NaN (a float), which has no
        # .replace(); treat missing values as empty text so a paper without
        # an abstract does not abort the whole run.
        if not isinstance(value, str):
            value = "" if pd.isna(value) else str(value)
        return value.replace("\n", " ").strip()

    return (template_text
            .replace("{TITLE}",    _clean(title))
            .replace("{ABSTRACT}", _clean(abstract)))


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 5 – Output directories 📂                ║
# ╚════════════════════════════════════════════════╝
# Everything for this run lives under outputs/<deployment>/<split>/;
# "unknown_model" is used when no deployment name is configured.
MODEL_DIR = Path("outputs", DEPLOYMENT_NAME or "unknown_model", EVAL_SPLIT)

predictions_dir   = MODEL_DIR.joinpath("predictions")
misclassified_dir = MODEL_DIR.joinpath("misclassified")
unparsed_dir      = MODEL_DIR.joinpath("unparsed")

# Create the three folders up front; existing folders are left as they are.
for out_dir in [predictions_dir, misclassified_dir, unparsed_dir]:
    out_dir.mkdir(parents=True, exist_ok=True)

print("Predictions saved to:", predictions_dir.resolve())


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 6 – LLM call with retry 💬               ║
# ╚════════════════════════════════════════════════╝
# reraise=True makes the final failure surface as the underlying API error
# rather than tenacity's RetryError wrapper, so callers that log the
# exception text see the real cause.
@retry(wait=wait_random_exponential(min=1, max=20),
       stop=stop_after_attempt(5),
       reraise=True)
def call_llm(prompt: str) -> str:
    """Send *prompt* to the configured deployment and return the reply text.

    The request is first made with ``max_completion_tokens``; if the service
    rejects that parameter as unsupported, the same request is repeated with
    ``max_tokens`` instead. Any exception triggers a retry with randomised
    exponential back-off (up to 5 attempts in total).

    Args:
        prompt: Full user prompt text.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped, or ``""`` when the response carries no content.

    Raises:
        Exception: The last error raised by the API call once all attempts
            are exhausted.
    """
    messages = [
        {"role": "system",
         "content": [{"type": "text",
                      "text": "You are an AI assistant that helps people find information."}]},
        {"role": "user",
         "content": [{"type": "text", "text": prompt}]}
    ]
    # Arguments shared by the primary request and the fallback request.
    request = {
        "model"      : DEPLOYMENT_NAME,
        "messages"   : messages,
        "temperature": 1,
        "top_p"      : 1,
    }
    try:
        resp = client.chat.completions.create(max_completion_tokens=800, **request)
    except Exception as e:
        detail = str(e)
        # Only the "unsupported parameter" rejection is handled here;
        # everything else propagates to the retry decorator.
        if "max_completion_tokens" not in detail or "unsupported_parameter" not in detail:
            raise
        resp = client.chat.completions.create(max_tokens=800, **request)

    # content may be None (no text returned); calling .strip() on it would
    # raise AttributeError and burn the remaining retry attempts.
    content = resp.choices[0].message.content
    return (content or "").strip()


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 7 – Inference loop 🚀                    ║
# ╚════════════════════════════════════════════════╝
# Query the model once per dataset row, parse the JSON reply into a label and
# a rationale, write one JSON file per paper and keep a summary row for df_pred.
pred_rows        = []
unparsed_counter = 0

progress = tqdm(df_all.iterrows(),
                total=len(df_all),
                desc=f"Calling {DEPLOYMENT_NAME} ({EVAL_SPLIT})")

for _, row in progress:
    prompt = build_prompt(row["title"], row["abstract"])

    # A failed call is stored as text; it will not parse as JSON below and is
    # therefore counted as an unparsed response.
    try:
        raw_output = call_llm(prompt)
    except Exception as e:
        raw_output = f"CALL_ERROR: {e}"

    # Expected reply: a JSON object with "classification" and "rationale".
    try:
        parsed     = json.loads(raw_output)
        label_key  = parsed.get("classification", "").strip().lower()
        rationale  = parsed.get("rationale", "").strip()

        if label_key not in ("included", "excluded"):
            raise ValueError("Unknown classification value")
        pred_label = label_key.capitalize()   # "Included" / "Excluded"
    except Exception as e:
        # Anything unexpected (invalid JSON, wrong types, unknown label)
        # is recorded as a ParseError together with the reason.
        pred_label        = "ParseError"
        rationale         = f"PARSE_ERROR: {e}"
        unparsed_counter += 1

    outcome = {
        "ground_truth": row["label"],
        "prediction"  : pred_label,
        "rationale"   : rationale,
    }

    # Per-paper JSON file, including the raw model response
    out_path = predictions_dir / f"{row['id']}.json"
    with out_path.open("w", encoding="utf-8") as fh:
        json.dump({**outcome, "raw_response": raw_output},
                  fh, ensure_ascii=False, indent=2)

    # Summary row (without the raw response) for the results table
    pred_rows.append({"id": row["id"], **outcome})

df_pred = pd.DataFrame(pred_rows)
print(f"Finished. Unparsed responses: {unparsed_counter}")


In [None]:
# ╔════════════════════════════════════════════════╗
# ║ Cell 8 – Organise misclassified / unparsed 📦 ║
# ╚════════════════════════════════════════════════╝
# Rows whose prediction differs from the ground truth (ParseError rows differ
# too unless the ground truth is literally "ParseError"), and rows whose
# response could not be parsed.
mis_df      = df_pred[df_pred["ground_truth"] != df_pred["prediction"]]
unparsed_df = df_pred[df_pred["prediction"] == "ParseError"]

# Copy each affected paper's prediction file into the matching folder.
# NOTE(review): files already present in the target folders are never removed
# here, so copies from earlier runs stay in place.
for subset, target_dir in ((mis_df, misclassified_dir), (unparsed_df, unparsed_dir)):
    for paper_id in subset["id"]:
        src = predictions_dir / f"{paper_id}.json"
        if not src.exists():
            continue
        shutil.copy(src, target_dir / src.name)

print(f"Misclassified copied: {len(mis_df)}, Unparsed copied: {len(unparsed_df)}")


## ✔️ Inference complete  
Run once for `EVAL_SPLIT="train"` and once for `EVAL_SPLIT="test"` to populate both folders.