# Fraud PoC â€” Robust LLM ingestion, parsing and repair

This notebook contains integrated, ready-to-run cells to:
- Backup the DuckDB file
- Provide robust streaming assembly for Ollama responses
- Parse numeric `risk_score` reliably
- Insert per-transaction LLM results (one LLM call per tx)
- Reprocess rows with missing/NaN `risk_score` (repair)

Update DB_PATH below if your DB file is located elsewhere.

In [10]:
# CONFIG
import os
DB_PATH = os.environ.get('FRAUD_DB_PATH', 'fraud_poc.duckdb')   # change if needed
OLLAMA_URL = os.environ.get('OLLAMA_URL', 'http://localhost:11434/api/generate')
MODEL = os.environ.get('LLM_MODEL', 'olmo-3')
print('DB_PATH =', DB_PATH)
print('OLLAMA_URL =', OLLAMA_URL)
print('MODEL =', MODEL)


DB_PATH = fraud_poc.duckdb
OLLAMA_URL = http://localhost:11434/api/generate
MODEL = olmo-3


In [11]:
# Backup the DB file (run in notebook)
import shutil
if os.path.exists(DB_PATH):
    bak = DB_PATH + '.bak'
    shutil.copy2(DB_PATH, bak)
    print(f'Backup created: {bak}')
else:
    raise FileNotFoundError(f'DB not found at {DB_PATH}; set DB_PATH correctly and run this cell again.')


Backup created: fraud_poc.duckdb.bak


In [12]:
# Imports and robust parsing helpers
import duckdb, json, re, math, datetime, uuid, requests
import numpy as np

def parse_risk_score(value):
    """Return float in 0..1 or math.nan if not parseable."""
    if value is None:
        return math.nan
    # numeric types
    if isinstance(value, (int, float, np.integer, np.floating)):
        v = float(value)
        return math.nan if math.isnan(v) else v
    s = str(value).strip()
    # try JSON content
    try:
        obj = json.loads(s)
        if isinstance(obj, dict):
            # common keys
            for key in ("risk_score","score","risk","riskScore"):
                if key in obj:
                    return parse_risk_score(obj[key])
        elif isinstance(obj, (int, float)):
            return float(obj)
    except Exception:
        pass
    low = s.lower()
    if low in ("","null","none","n/a","na","nan"):
        return math.nan
    # percent like 82%
    m = re.search(r'(-?\d+(?:[.,]\d+)?)\s*%', s)
    if m:
        try:
            num = float(m.group(1).replace(',','.'))
            return num/100.0
        except:
            return math.nan
    # find first numeric token
    m = re.search(r'(-?\d+(?:[.,]\d+)?)', s)
    if m:
        try:
            num = float(m.group(1).replace(',','.'))
        except:
            return math.nan
        if num < 0:
            return math.nan
        if num > 1 and num <= 100:
            return num/100.0
        return float(num)
    return math.nan

def extract_final_text_from_response(raw):
    """Attempt to get final textual output from a streaming llm_response field.
    If raw contains newline-separated JSON lines, parse last JSON that has 'response' or numeric keys.
    Otherwise return the last non-empty line or entire text as fallback.
    """
    if raw is None:
        return ""
    text = str(raw)
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    # search from last line backwards
    for ln in reversed(lines):
        try:
            obj = json.loads(ln)
            if isinstance(obj, dict):
                # direct numeric key
                for key in ("risk_score","score","risk","riskScore"):
                    if key in obj:
                        return obj[key]
                if obj.get('response'):
                    return obj['response']
                if obj.get('thinking'):
                    return obj['thinking']
            elif isinstance(obj, (int,float)):
                return obj
        except Exception:
            # not JSON; consider this line as candidate
            if len(ln) > 0:
                return ln
    return text


In [13]:
# Streaming wrapper to call Ollama and assemble final text (per prompt)
def call_ollama_stream(prompt, model=MODEL, ollama_url=OLLAMA_URL, timeout=300):
    payload = {"model": model, "prompt": prompt, "temperature": 0.0, "max_tokens": 512}
    resp = requests.post(ollama_url, json=payload, stream=True, timeout=timeout)
    resp.raise_for_status()
    assembled = ""
    raw_lines = []
    for line in resp.iter_lines(decode_unicode=True):
        if not line:
            continue
        try:
            chunk = json.loads(line)
            raw_lines.append(chunk)
        except Exception:
            raw_lines.append({'text': line})
            continue
        if chunk.get('response'):
            assembled += chunk['response']
        elif chunk.get('thinking'):
            assembled += chunk['thinking']
        if chunk.get('done'):
            break
    return assembled, raw_lines

In [14]:
# DB insert helper that stores parsed_response, llm_response, raw_lines, and flags needs_review
def safe_insert_llm_result(con, row_id, tx_id, model, assembled, raw_lines, parsed_val, needs_review, now):
    parsed_json = {"parsed_risk": None if math.isnan(parsed_val) else float(parsed_val)}
    # Ensure needs_review column exists; add if missing
    try:
        con.execute("ALTER TABLE llm_results ADD COLUMN IF NOT EXISTS needs_review BOOLEAN DEFAULT FALSE")
    except Exception:
        pass
    con.execute("""
        INSERT INTO llm_results (id, tx_id, llm_model, llm_response, parsed_response, risk_score, needs_review, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """, (
        row_id,
        tx_id,
        model,
        assembled,
        json.dumps(parsed_json),
        (None if math.isnan(parsed_val) else float(parsed_val)),
        needs_review,
        now
    ))


In [15]:
# Example: process unprocessed transactions (one LLM call per tx)
con = duckdb.connect(DB_PATH)
unprocessed_txs = con.execute("""
    SELECT t.tx_id, t.account_id, t.amount, t.currency, t.merchant, t.description
    FROM transactions t
    LEFT JOIN llm_results l ON t.tx_id = l.tx_id
    WHERE l.id IS NULL
    LIMIT 10
""").fetchall()
print(f"Found {len(unprocessed_txs)} unprocessed txs (limit 10).")
for tx in unprocessed_txs:
    tx_id, account_id, amount, currency, merchant, description = tx
    prompt = f"Transaction: account={account_id} amount={amount} {currency} merchant={merchant} description={description}\n\nReturn a numeric risk_score between 0 and 1 and a short explanation." 
    try:
        assembled, raw_lines = call_ollama_stream(prompt)
    except Exception as exc:
        print(f"LLM call failed for tx {tx_id}: {exc}")
        # insert placeholder row marked for review
        row_id = str(uuid.uuid4())
        now = datetime.datetime.utcnow()
        safe_insert_llm_result(con, row_id, tx_id, MODEL, "", [{"error": str(exc)}], math.nan, True, now)
        continue
    parsed_val = parse_risk_score(assembled)
    needs_review = False
    if math.isnan(parsed_val) or parsed_val < 0 or parsed_val > 1:
        needs_review = True
    else:
        parsed_val = max(0.0, min(1.0, float(parsed_val)))
    row_id = str(uuid.uuid4())
    now = datetime.datetime.utcnow()
    safe_insert_llm_result(con, row_id, tx_id, MODEL, assembled, raw_lines, parsed_val, needs_review, now)
    print(f"Inserted: tx_id={tx_id} id={row_id} risk_score={parsed_val} needs_review={needs_review}")

Found 10 unprocessed txs (limit 10).
Inserted: tx_id=tx_000011 id=77715f6b-ae5a-4783-8734-e2216bdc1e87 risk_score=-0.15 needs_review=True
Inserted: tx_id=tx_000012 id=5a340584-1dcf-45e1-ae5e-47a26191dd8c risk_score=0.0 needs_review=False
Inserted: tx_id=tx_000013 id=176b33d0-5b22-4a7c-bc3f-bc6f0b0dccbf risk_score=0.0 needs_review=False
Inserted: tx_id=tx_000014 id=37f03f1b-840a-4b09-9415-d98ea251156e risk_score=0.3 needs_review=False
Inserted: tx_id=tx_000015 id=96569138-3ebd-4909-9a68-4c444984a079 risk_score=0.0 needs_review=False
Inserted: tx_id=tx_000016 id=f1f0792d-8653-4018-b210-29201423c886 risk_score=-0.2 needs_review=True
Inserted: tx_id=tx_000017 id=a440783a-16c7-402f-b310-c99b1825146c risk_score=0.0 needs_review=False
Inserted: tx_id=tx_000018 id=0f566012-09e0-46a7-9e89-cab62a17a0dc risk_score=0.41 needs_review=False
Inserted: tx_id=tx_000019 id=0c411dda-d190-45b2-8859-f81d1f42e62c risk_score=0.0 needs_review=False
Inserted: tx_id=tx_000020 id=78795e71-4675-41cb-a3cb-2497c338

In [16]:
# Repair: reprocess rows with NULL/NaN risk_score if you have tx data available
con = duckdb.connect(DB_PATH)
to_reprocess = con.execute("""
SELECT l.id, l.tx_id, t.account_id, t.amount, t.currency, t.merchant, t.description
FROM llm_results l
LEFT JOIN transactions t ON l.tx_id = t.tx_id
WHERE l.risk_score IS NULL OR (l.risk_score != l.risk_score)
LIMIT 100
""").fetchall()
print(f"Rows to reprocess: {len(to_reprocess)}")
for row in to_reprocess:
    llm_id, tx_id, account_id, amount, currency, merchant, description = row
    if tx_id is None:
        print(f"No tx data for llm result {llm_id}; skipping")
        continue
    prompt = f"Transaction: account={account_id} amount={amount} {currency} merchant={merchant} description={description}\n\nReturn a numeric risk_score between 0 and 1 and a short explanation." 
    try:
        assembled, raw_lines = call_ollama_stream(prompt)
    except Exception as exc:
        print(f"Reprocess failed for llm row {llm_id}: {exc}")
        continue
    parsed_val = parse_risk_score(assembled)
    needs_review = False
    if math.isnan(parsed_val) or parsed_val < 0 or parsed_val > 1:
        needs_review = True
    else:
        parsed_val = max(0.0, min(1.0, float(parsed_val)))
    # update existing row
    con.execute("""
        UPDATE llm_results
        SET llm_response = ?, parsed_response = ?, risk_score = ?, needs_review = ?
        WHERE id = ?
    """, (assembled, json.dumps({"parsed_risk": None if math.isnan(parsed_val) else parsed_val}), (None if math.isnan(parsed_val) else parsed_val), needs_review, llm_id))
    print(f"Reprocessed {llm_id}: risk_score={parsed_val} needs_review={needs_review}")


Rows to reprocess: 3
Reprocessed ea39f16f-25f5-414a-9f94-5e362757bee3: risk_score=1.0 needs_review=False
Reprocessed 93b0a2e5-8a25-4a9c-b59b-0fd6cbe9d9ce: risk_score=0.7 needs_review=False
Reprocessed 65f50de2-c08e-409a-bdff-a7e0f1b50c63: risk_score=0.0 needs_review=False


In [17]:
# Diagnostics: show remaining missing / flagged rows
con = duckdb.connect(DB_PATH)
print('Total llm_results rows:', con.execute('SELECT COUNT(*) FROM llm_results').fetchone()[0])
print('Missing/NaN risk_score count:', con.execute("SELECT COUNT(*) FROM llm_results WHERE risk_score IS NULL OR (risk_score != risk_score)").fetchone()[0])
print('Needs review count:', con.execute("SELECT COUNT(*) FROM llm_results WHERE needs_review = TRUE").fetchone()[0])
print('\nSample needs_review rows:')
rows = con.execute("SELECT id, tx_id, risk_score, parsed_response, SUBSTR(llm_response,1,200) FROM llm_results WHERE needs_review = TRUE ORDER BY created_at DESC LIMIT 10").fetchall()
for r in rows:
    print(r)

Total llm_results rows: 23
Missing/NaN risk_score count: 0
Needs review count: 2

Sample needs_review rows:
('f1f0792d-8653-4018-b210-29201423c886', 'tx_000016', -0.2, '{"parsed_risk": -0.2}', 'Okay, so I need to figure out a risk score between 0 and 1 for this transaction. Let me start by looking at the details given. The account is acct_0024, and the amount is -48.94 USD. The merchant is o')
('77715f6b-ae5a-4783-8734-e2216bdc1e87', 'tx_000011', -0.15, '{"parsed_risk": -0.15}', 'Okay, so I need to figure out a risk score between 0 and 1 for this transaction. Let me start by looking at the details provided. The transaction is from account acct_0038 with an amount of $113.88 US')


Notes:
- The notebook assumes your `llm_results` table has columns: id, tx_id, llm_model, llm_response, parsed_response, risk_score, needs_review, created_at. If your schema differs, adjust the SQL and column names accordingly.
- The `call_ollama_stream` function uses streaming to assemble text. It must be used once per transaction (or use batch prompts if you prefer).
- After running this notebook, you should see NaNs reduced. Any rows where parsing still fails will be marked needs_review.
If you want, I can also produce a PR that replaces the notebook in your repository with this version. Tell me whether you want a PR (and which target branch), or whether you'd prefer to paste these cells into your existing notebook yourself.