In [None]:
# cell: diagnose_and_repair_risk_scores
import duckdb, json, re, math, pandas as pd, numpy as np
DB_PATH = "fraud_poc.duckdb"

# Robust parser to extract a numeric risk score in 0.0-1.0
def parse_risk_score(value):
    if value is None:
        return math.nan
    if isinstance(value, (int, float, np.integer, np.floating)):
        v = float(value)
        if math.isnan(v):
            return math.nan
        return v
    s = str(value).strip()
    # Try JSON decode to find keys
    try:
        obj = json.loads(s)
        if isinstance(obj, dict):
            for key in ("risk_score","score","risk","riskScore"):
                if key in obj:
                    return parse_risk_score(obj[key])
        elif isinstance(obj, (int, float)):
            return float(obj)
    except Exception:
        pass
    lower = s.lower()
    if lower in ("", "null", "none", "n/a", "na", "nan"):
        return math.nan
    # percent like "82%" -> 0.82
    m = re.search(r'(-?\d+(?:[.,]\d+)?)\s*%', s)
    if m:
        try:
            num = float(m.group(1).replace(',', '.'))
            return num / 100.0
        except:
            return math.nan
    # find first numeric token
    m = re.search(r'(-?\d+(?:[.,]\d+)?)', s)
    if m:
        try:
            num = float(m.group(1).replace(',', '.'))
        except:
            return math.nan
        if num < 0:
            return math.nan
        # If number > 1 and <=100 assume percent
        if num > 1 and num <= 100:
            return float(num) / 100.0
        # If between 0 and 1, keep
        return float(num)
    return math.nan

# Connect to DuckDB file
con = duckdb.connect(DB_PATH)

# 1) Show table schema (duckdb)
print("Table schema (llm_results):")
print(con.execute("PRAGMA table_info('llm_results')").df())

# 2) Load candidate rows to inspect - INCLUDE ALL COLUMNS YOU NEED
df = con.execute("""
    SELECT id, tx_id, llm_model, llm_provider, risk_score, 
           llm_response, parsed_response, created_at 
    FROM llm_results
""").df()
print(f"Total rows in llm_results: {len(df)}")

# Helper to detect missing/NaN-like stored values
def stored_is_missing(v):
    if v is None:
        return True
    # If it's numeric and NaN
    try:
        f = float(v)
        return math.isnan(f)
    except Exception:
        # string values like 'NaN', 'nan', '' considered missing
        s = str(v).strip().lower()
        return s in ("", "nan", "none", "null", "na", "n/a")

# Filter rows that appear missing/NaN
df['stored_missing'] = df['risk_score'].apply(stored_is_missing)
missing_df = df[df['stored_missing']].copy()
print("Rows with missing/NaN risk_score:", len(missing_df))

if len(missing_df) == 0:
    print("No missing rows to repair. Done.")
else:
    # Try to parse from llm_response first, then parsed_response
    parsed = []
    for _, row in missing_df.iterrows():
        # Try llm_response and parsed_response (actual column names)
        raw = row.get('llm_response') or row.get('parsed_response') or ""
        parsed_val = parse_risk_score(raw)
        parsed.append(parsed_val)
    missing_df['parsed_risk'] = parsed

    # Show examples where parse succeeded
    ok = missing_df[~missing_df['parsed_risk'].isnull() & (missing_df['parsed_risk'] == missing_df['parsed_risk'])]
    print("Parsed successfully (candidates for update):", len(ok))
    if len(ok) > 0:
        # Use actual column names from your schema
        display_cols = ['id','created_at','llm_model','risk_score','parsed_risk','llm_response']
        print(ok[display_cols].head(20).to_string(index=False))

        # Prepare small repairs table and update in DuckDB
        repairs = ok[['id','parsed_risk']].copy()
        # Ensure parsed_risk is a plain Python float (no numpy types)
        repairs['parsed_risk'] = repairs['parsed_risk'].astype(float)
        con.register('repairs_df', repairs)
        # If risk_score column is numeric, this will update numerically.
        # If it's text, DuckDB will store numeric as text; we handle type conversion below if needed.
        con.execute("""
            UPDATE llm_results
            SET risk_score = repairs_df.parsed_risk
            FROM repairs_df
            WHERE llm_results.id = repairs_df.id
        """)
        print("Updated", len(repairs), "rows in llm_results.")

    else:
        print("No parsable numeric values found in llm_response/parsed_response for missing rows.")
        # Show samples for manual inspection - use actual column names
        display_cols = ['id','created_at','llm_model','risk_score','llm_response']
        # Only display columns that exist
        display_cols = [c for c in display_cols if c in missing_df.columns]
        print("Sample missing rows (first 10):")
        print(missing_df[display_cols].head(10).to_string(index=False))

# Final verification: how many still missing?
df_after = con.execute("SELECT id, risk_score FROM llm_results").df()
def is_missing_after(v):
    if v is None:
        return True
    try:
        f = float(v)
        return math.isnan(f)
    except:
        s = str(v).strip().lower()
        return s in ("", "nan", "none", "null", "na", "n/a")
print("Rows still missing/NaN after repair:", df_after['risk_score'].apply(is_missing_after).sum())

Table schema (llm_results):
    cid             name       type  notnull         dflt_value     pk
0     0               id    VARCHAR     True               None   True
1     1            tx_id    VARCHAR    False               None  False
2     2        llm_model    VARCHAR    False               None  False
3     3     llm_provider    VARCHAR    False               None  False
4     4  llm_prompt_hash    VARCHAR    False               None  False
5     5       llm_prompt    VARCHAR    False               None  False
6     6     llm_response    VARCHAR    False               None  False
7     7  parsed_response       JSON    False               None  False
8     8       risk_score     DOUBLE    False               None  False
9     9  evidence_tx_ids       JSON    False               None  False
10   10  call_latency_ms    INTEGER    False               None  False
11   11       provenance       JSON    False               None  False
12   12       created_at  TIMESTAMP    False  cur

In [None]:
import duckdb, json, re, math, pandas as pd, numpy as np
DB_PATH = "fraud_poc.duckdb"
con = duckdb.connect(DB_PATH)

# existing simpler parser (keeps your logic)
def parse_numeric_from_string(s):
    if s is None:
        return math.nan
    s = str(s).strip()
    if s == "":
        return math.nan
    # percent like "82%" -> 0.82
    m = re.search(r'(-?\d+(?:[.,]\d+)?)\s*%', s)
    if m:
        try:
            num = float(m.group(1).replace(',', '.'))
            return num/100.0
        except:
            return math.nan
    # first numeric token
    m = re.search(r'(-?\d+(?:[.,]\d+)?)', s)
    if m:
        try:
            num = float(m.group(1).replace(',', '.'))
        except:
            return math.nan
        if num < 0:
            return math.nan
        if num > 1 and num <= 100:
            return num/100.0
        return float(num)
    return math.nan

# new function: extract last JSON object or last meaningful line from streaming llm_response
def extract_final_text_from_response(raw):
    if raw is None:
        return ""
    text = str(raw)
    # If raw contains newline-separated JSON objects, try to parse last JSON
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    # Try last-to-first lines to find a JSON object containing response/risk_score
    for ln in reversed(lines):
        try:
            obj = json.loads(ln)
            # if object contains direct numeric key
            if isinstance(obj, dict):
                for key in ("risk_score","score","risk","riskScore"):
                    if key in obj:
                        return obj[key]   # could be numeric or nested
                # If it has 'response' or 'thinking', join them
                if 'response' in obj and obj['response']:
                    return obj['response']
                if 'thinking' in obj and obj['thinking']:
                    return obj['thinking']
            elif isinstance(obj, (int,float)):
                return obj
        except Exception:
            # not a JSON line; treat ln as candidate text
            # attempt to return the textual line
            if len(ln) > 0:
                return ln
    # fallback: return whole string if nothing parsed
    return text

# Inspect missing rows and attempt to parse
missing_rows = con.execute("""
SELECT id, llm_response, parsed_response, risk_score
FROM llm_results
WHERE risk_score IS NULL
ORDER BY created_at DESC
LIMIT 200
""").df()

print("Missing rows loaded:", len(missing_rows))
candidates = []

for _, row in missing_rows.iterrows():
    id_ = row['id']
    raw = row.get('llm_response') or row.get('parsed_response') or ""
    extracted = extract_final_text_from_response(raw)
    parsed_val = None
    # If extract returned a dict-like numeric already (from JSON), handle it
    if isinstance(extracted, (int,float,np.integer,np.floating)):
        parsed_val = float(extracted)
    else:
        parsed_val = parse_numeric_from_string(extracted)
    candidates.append((id_, extracted, parsed_val))

cand_df = pd.DataFrame(candidates, columns=['id','extracted_text','parsed_val'])
# show those parsed successfully
ok = cand_df[~cand_df['parsed_val'].isnull() & (cand_df['parsed_val'] == cand_df['parsed_val'])]
print("Parsable candidates count:", len(ok))
if len(ok) > 0:
    print(ok.head(20).to_string(index=False))
    # prepare update
    ok['parsed_val'] = ok['parsed_val'].astype(float)
    con.register('ok_repairs', ok[['id','parsed_val']])
    con.execute("""
    UPDATE llm_results
    SET risk_score = ok_repairs.parsed_val
    FROM ok_repairs
    WHERE llm_results.id = ok_repairs.id
    """)
    print("Updated", len(ok), "rows.")
else:
    print("No parsable numeric values found with streaming-aware extractor. Show sample extracted texts:")
    print(cand_df.head(20).to_string(index=False))

Missing rows loaded: 3
Parsable candidates count: 0
No parsable numeric values found with streaming-aware extractor. Show sample extracted texts:
                                  id extracted_text  parsed_val
65f50de2-c08e-409a-bdff-a7e0f1b50c63              }         NaN
93b0a2e5-8a25-4a9c-b59b-0fd6cbe9d9ce              }         NaN
ea39f16f-25f5-414a-9f94-5e362757bee3            "]}         NaN


In [None]:
import duckdb
DB="fraud_poc.duckdb"
con = duckdb.connect(DB)

ids = [
  '65f50de2-c08e-409a-bdff-a7e0f1b50c63',
  '93b0a2e5-8a25-4a9c-b59b-0fd6cbe9d9ce',
  'ea39f16f-25f5-414a-9f94-5e362757bee3'
]

for id_ in ids:
    row = con.execute(
        "SELECT id, LENGTH(llm_response) AS len, llm_response, LENGTH(parsed_response) AS len_parsed, parsed_response FROM llm_results WHERE id = ?",
        [id_]
    ).fetchall()
    print("ROW", id_, row)

ROW 65f50de2-c08e-409a-bdff-a7e0f1b50c63 [('65f50de2-c08e-409a-bdff-a7e0f1b50c63', 283171, '{"model":"olmo-3","created_at":"2026-01-09T14:45:31.724355211Z","response":"","thinking":"Okay","done":false}\n{"model":"olmo-3","created_at":"2026-01-09T14:45:31.900774664Z","response":"","thinking":",","done":false}\n{"model":"olmo-3","created_at":"2026-01-09T14:45:32.076435372Z","response":"","thinking":" let","done":false}\n{"model":"olmo-3","created_at":"2026-01-09T14:45:32.249815011Z","response":"","thinking":"\'s","done":false}\n{"model":"olmo-3","created_at":"2026-01-09T14:45:32.421546337Z","response":"","thinking":" tackle","done":false}\n{"model":"olmo-3","created_at":"2026-01-09T14:45:32.634394438Z","response":"","thinking":" this","done":false}\n{"model":"olmo-3","created_at":"2026-01-09T14:45:32.812822977Z","response":"","thinking":" problem","done":false}\n{"model":"olmo-3","created_at":"2026-01-09T14:45:33.002765186Z","response":"","thinking":" step","done":false}\n{"model":"olmo-

In [None]:
import os, requests, json, math, duckdb
DB="fraud_poc.duckdb"
con = duckdb.connect(DB)
OLLAMA_URL = os.environ.get("OLLAMA_URL","http://localhost:11434/api/generate")
MODEL = "olmo-3"

# robust parser reused (parse_numeric_from_string or parse_risk_score from earlier)
# assume parse_numeric_from_string() already defined in your notebook

rows = con.execute("""
SELECT id, tx_id, llm_prompt
FROM llm_results
""").fetchall()

for id_, tx_id, prompt in rows:
    payload = {"model": MODEL, "prompt": prompt, "temperature": 0.0, "max_tokens": 256}
    resp = requests.post(OLLAMA_URL, json=payload, timeout=300)
    resp.raise_for_status()
    text = resp.text
    # If the server streams lines, you should call with stream=True and assemble lines.
    try:
        obj = resp.json()
        # If API returns final JSON, find risk_score key first
        candidate = obj if isinstance(obj, dict) else text
    except Exception:
        candidate = text
    extracted = extract_final_text_from_response(candidate)  # use extractor from earlier cell
    parsed_val = parse_numeric_from_string(extracted)        # use parser from earlier
    if not math.isnan(parsed_val):
        con.execute("UPDATE llm_results SET risk_score = ?, llm_response = ? WHERE id = ?", (parsed_val, text, id_))
    else:
        # keep needs_review True for manual check
        con.execute("UPDATE llm_results SET llm_response = ? WHERE id = ?", (text, id_))

In [None]:
import uuid
import datetime
import requests
import json
import math

# Assuming you have the streaming response collection
resp = requests.post(OLLAMA_URL, json=payload, stream=True, timeout=600)
resp.raise_for_status()

assembled = ""
raw_lines = []

for line in resp.iter_lines(decode_unicode=True):
    if not line: 
        continue
    try:
        chunk = json.loads(line)
        raw_lines.append(chunk)  # Save each chunk
    except:
        continue
    
    if chunk.get("response"):
        assembled += chunk["response"]
    elif chunk.get("thinking"):
        assembled += chunk["thinking"]
    
    if chunk.get("done"):
        break

# Now parse and insert
parsed = parse_risk_score(assembled)  # returns float or nan
needs_review = math.isnan(parsed)
# Find transactions that don't have LLM results yet
unprocessed_txs = con.execute("""
    SELECT t.tx_id, t.account_id, t.amount, t.currency, t.merchant, t.description
    FROM transactions t
    LEFT JOIN llm_results l ON t.tx_id = l.tx_id
    WHERE l.id IS NULL
    LIMIT 10
""").fetchall()

for tx_row in unprocessed_txs:
    incoming = {
        'tx_id': tx_row[0],
        'account_id': tx_row[1],
        'amount': tx_row[2],
        'currency': tx_row[3],
        'merchant': tx_row[4],
        'description': tx_row[5],
    }

    # Generate INSERT values
    row_id = str(uuid.uuid4())

    model = 'olmo-3'
    now = datetime.datetime.utcnow()

    # Assuming you have 'assembled' from Ollama response
    parsed = parse_risk_score(assembled) if 'assembled' in locals() else math.nan

    con.execute("""
    INSERT INTO llm_results (id, tx_id, llm_model, llm_response, parsed_response, risk_score, created_at)
    VALUES (?,?,?,?,?,?,?)
    """, (
        row_id,
        incoming['tx_id'],
        model,
        assembled if 'assembled' in locals() else '',
        json.dumps({}),
        (parsed if not math.isnan(parsed) else None),
        now
    ))
    print(f"Inserted llm_results id={row_id}, risk_score={parsed}, needs_review={needs_review}")
# Now 'assembled' contains the final text you should parse and store

Inserted llm_results id=ec167f3b-9718-4868-a0d0-3f4b59ea6a48, risk_score=0.0, needs_review=False
Inserted llm_results id=c0fc9d88-2307-41fe-a682-ba3c97f4b111, risk_score=0.0, needs_review=False
Inserted llm_results id=30e960f8-477f-425a-b116-3f8b4d46b654, risk_score=0.0, needs_review=False
Inserted llm_results id=054a91ff-c0c1-4957-b677-e97114567e49, risk_score=0.0, needs_review=False
Inserted llm_results id=7835fcfb-74ce-4aad-b5e2-814fb2094e26, risk_score=0.0, needs_review=False
Inserted llm_results id=16e9aeb8-5452-43e8-ad3f-ed82b26b3aaf, risk_score=0.0, needs_review=False
Inserted llm_results id=63ef833a-0325-40d6-aab2-849828349267, risk_score=0.0, needs_review=False
Inserted llm_results id=00f9403a-fa9a-48e7-a0c9-afdd206d89be, risk_score=0.0, needs_review=False
Inserted llm_results id=43680733-ed79-4942-a464-e7c62f8ea784, risk_score=0.0, needs_review=False
Inserted llm_results id=e3cc1ff1-513b-4841-84f9-ebf64605c6ac, risk_score=0.0, needs_review=False


In [None]:
df_llm = con.execute('SELECT id, tx_id, llm_model, llm_provider, risk_score, created_at FROM llm_results ORDER BY created_at DESC LIMIT 10').df()
df_llm

Unnamed: 0,id,tx_id,llm_model,llm_provider,risk_score,created_at
0,e3cc1ff1-513b-4841-84f9-ebf64605c6ac,tx_000010,olmo-3,,0.0,2026-01-09 16:35:46.963937
1,43680733-ed79-4942-a464-e7c62f8ea784,tx_000009,olmo-3,,0.0,2026-01-09 16:35:46.959811
2,00f9403a-fa9a-48e7-a0c9-afdd206d89be,tx_000008,olmo-3,,0.0,2026-01-09 16:35:46.956415
3,63ef833a-0325-40d6-aab2-849828349267,tx_000007,olmo-3,,0.0,2026-01-09 16:35:46.951915
4,16e9aeb8-5452-43e8-ad3f-ed82b26b3aaf,tx_000006,olmo-3,,0.0,2026-01-09 16:35:46.947778
5,7835fcfb-74ce-4aad-b5e2-814fb2094e26,tx_000005,olmo-3,,0.0,2026-01-09 16:35:46.944346
6,054a91ff-c0c1-4957-b677-e97114567e49,tx_000004,olmo-3,,0.0,2026-01-09 16:35:46.940164
7,30e960f8-477f-425a-b116-3f8b4d46b654,tx_000003,olmo-3,,0.0,2026-01-09 16:35:46.936024
8,c0fc9d88-2307-41fe-a682-ba3c97f4b111,tx_000002,olmo-3,,0.0,2026-01-09 16:35:46.927706
9,ec167f3b-9718-4868-a0d0-3f4b59ea6a48,tx_000001,olmo-3,,0.0,2026-01-09 16:35:46.836211


In [1]:
import duckdb
con = duckdb.connect("fraud_poc.duckdb")  # or DB_PATH you use

# list tables
print("Tables in DB:")
print(con.execute("SHOW TABLES").fetchall())

# show table schemas (duckdb PRAGMA)
for row in con.execute("SHOW TABLES").fetchall():
    tbl = row[0]
    print("\nSchema for", tbl)
    try:
        print(con.execute(f"PRAGMA table_info('{tbl}')").df())
    except Exception as e:
        print("Could not fetch schema:", e)

print("llm_results schema:")
print(con.execute("PRAGMA table_info('llm_results')").df())
print("Sample rows from llm_results (limit 5):")
print(con.execute("SELECT * FROM llm_results LIMIT 5").df())



Tables in DB:
[]
llm_results schema:


CatalogException: Catalog Error: Table with name llm_results does not exist!
Did you mean "pragma_database_list"?

In [None]:
# show any tables with 'tx' or 'transaction' in the name
tables = [r[0] for r in con.execute("SHOW TABLES").fetchall()]
candidates = [t for t in tables if 'tx' in t.lower() or 'trans' in t.lower()]
print("Transaction-like candidate tables:", candidates)
for t in candidates:
    print(t, con.execute(f"PRAGMA table_info('{t}')").df())

Transaction-like candidate tables: ['transactions']
transactions     cid              name       type  notnull            dflt_value     pk
0     0             tx_id    VARCHAR     True                  None   True
1     1        account_id    VARCHAR    False                  None  False
2     2            amount     DOUBLE    False                  None  False
3     3          currency    VARCHAR    False                  None  False
4     4          merchant    VARCHAR    False                  None  False
5     5       description    VARCHAR    False                  None  False
6     6         timestamp  TIMESTAMP    False                  None  False
7     7  ingestion_job_id    VARCHAR    False                  None  False
8     8        raw_source    VARCHAR    False                  None  False
9     9        pii_masked    BOOLEAN    False  CAST('f' AS BOOLEAN)  False
10   10        created_at  TIMESTAMP    False     current_timestamp  False


In [1]:
import os
print("cwd:", os.getcwd())
print("DB_PATH env:", os.environ.get('FRAUD_DB_PATH', 'fraud_poc.duckdb'))
print("exists:", os.path.exists("notebooks/fraud_poc.duckdb"))
!ls -la
!ls -la notebooks || true

cwd: /content
DB_PATH env: fraud_poc.duckdb
exists: False
total 16
drwxr-xr-x 1 root root 4096 Dec  9 14:41 .
drwxr-xr-x 1 root root 4096 Jan 10 18:17 ..
drwxr-xr-x 4 root root 4096 Dec  9 14:41 .config
drwxr-xr-x 1 root root 4096 Dec  9 14:42 sample_data
ls: cannot access 'notebooks': No such file or directory


In [2]:

try:
    from google.colab import files, drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False
    print("Not running in Colab - using local file system")
drive.mount('/content/drive')
# after mounting, copy the file (adjust source path)
!cp /content/drive/MyDrive/path/to/fraud_poc.duckdb notebooks/fraud_poc.duckdb

KeyboardInterrupt: 