# Fraud PoC — DuckDB + Embeddings + Ollama

This notebook demonstrates an end-to-end PoC:
- create canonical tables in DuckDB
- ingest synthetic transactions
- compute embeddings (sentence-transformers)
- build a local ANN index (faiss / hnswlib / sklearn fallback)
- perform retrieval, assemble prompt and call Ollama (local LLM)
- persist LLM result and provenance back into DuckDB

Notes:
- Ensure your environment has the dependencies from `requirements.txt` installed.
- Ensure Ollama is running and reachable at OLLAMA_URL (default http://localhost:11434/api/generate).
- The notebook automatically falls back to hnswlib, and then to scikit-learn's brute-force `NearestNeighbors`, if faiss is unavailable.

In [None]:
# Configuration & imports
import os
import json
import uuid
import time
import hashlib
from typing import List, Tuple, Dict, Any

import duckdb
import pandas as pd
import numpy as np
import requests

# Embedding lib
from sentence_transformers import SentenceTransformer

# Optional ANN backends: probe each one and record whether it can be used.
# Any failure while importing (not only a missing package) marks the
# backend as unavailable so later cells can pick the next option.
try:
    import faiss
except Exception:
    HAS_FAISS = False
else:
    HAS_FAISS = True

try:
    import hnswlib
except Exception:
    HAS_HNSW = False
else:
    HAS_HNSW = True

from sklearn.neighbors import NearestNeighbors

# Runtime configuration. Each value can be overridden through an environment
# variable of the same name; the literal is the default when it is unset.

# Ollama generate endpoint and the name of a locally pulled model.
OLLAMA_URL = os.getenv('OLLAMA_URL', 'http://localhost:11434/api/generate')
OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')

# DuckDB database file used by every cell below.
DB_PATH = os.getenv('DB_PATH', 'fraud_poc.duckdb')


In [None]:
# Utilities: prompt hashing and calling Ollama
def prompt_hash(prompt_text: str) -> str:
    """Return the hex-encoded SHA-256 digest of the UTF-8 encoded prompt."""
    hasher = hashlib.sha256()
    hasher.update(prompt_text.encode('utf-8'))
    return hasher.hexdigest()

def call_ollama(model: str, prompt_text: str, temperature: float = 0.0, timeout: int = 60) -> Dict[str, Any]:
    """Send a single, non-streaming generation request to the Ollama endpoint.

    Args:
        model: Name of the model to run (must already be pulled locally).
        prompt_text: Full prompt to send.
        temperature: Sampling temperature, passed through Ollama's ``options``.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        A dict with the provider/model names, the extracted response text
        (``llm_response_raw``), the decoded JSON body when it is an object
        (``llm_response_json``), the request latency in milliseconds, the
        SHA-256 hash of the prompt and a fresh UUID ``call_id``.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
    """
    payload = {
        'model': model,
        'prompt': prompt_text,
        # Ollama streams newline-delimited JSON chunks unless told otherwise;
        # a single JSON object is required for resp.json() below to work.
        'stream': False,
        # Sampling parameters are read from 'options', not the top level.
        'options': {'temperature': temperature},
    }
    start = time.perf_counter()
    resp = requests.post(OLLAMA_URL, json=payload, timeout=timeout)
    elapsed_ms = int((time.perf_counter() - start) * 1000)
    resp.raise_for_status()

    try:
        data = resp.json()
    except ValueError:
        # Body was not JSON at all: keep the raw text under a known key.
        data = {'text': resp.text}

    # Normalize text extraction (common keys); a non-object JSON body
    # (list, string, number) has no keys to probe, so use the raw text.
    if isinstance(data, dict):
        text = data.get('text') or data.get('response') or data.get('content') or resp.text
    else:
        text = resp.text

    return {
        'llm_provider': 'ollama_local',
        'llm_model': model,
        'llm_response_raw': text,
        'llm_response_json': data if isinstance(data, dict) else None,
        'latency_ms': elapsed_ms,
        'prompt_hash': prompt_hash(prompt_text),
        'call_id': str(uuid.uuid4()),
    }

def try_parse_json(s: str):
    """Decode ``s`` as JSON, returning ``None`` when decoding fails for any reason."""
    try:
        decoded = json.loads(s)
    except Exception:
        return None
    else:
        return decoded


In [None]:
# Open (or create) the DuckDB file and make sure the three PoC tables exist.
# Every statement uses CREATE TABLE IF NOT EXISTS, so re-running is harmless.
con = duckdb.connect(DB_PATH)

# Canonical transaction records as ingested.
CREATE_TRANSACTIONS = """
CREATE TABLE IF NOT EXISTS transactions (
  tx_id VARCHAR PRIMARY KEY,
  account_id VARCHAR,
  amount DOUBLE,
  currency VARCHAR,
  merchant VARCHAR,
  description VARCHAR,
  timestamp TIMESTAMP,
  ingestion_job_id VARCHAR,
  raw_source VARCHAR,
  pii_masked BOOLEAN DEFAULT FALSE,
  created_at TIMESTAMP DEFAULT current_timestamp
);
"""

# One embedding per transaction, stored as a JSON-encoded list of floats.
CREATE_EMBEDDINGS = """
CREATE TABLE IF NOT EXISTS embeddings (
  tx_id VARCHAR PRIMARY KEY,
  emb_json TEXT,
  emb_model VARCHAR,
  emb_created_at TIMESTAMP DEFAULT current_timestamp,
  emb_job_id VARCHAR
);
"""

# One row per LLM call, including the prompt, response and provenance.
CREATE_LLM = """
CREATE TABLE IF NOT EXISTS llm_results (
  id VARCHAR PRIMARY KEY,
  tx_id VARCHAR,
  llm_model VARCHAR,
  llm_provider VARCHAR,
  llm_prompt_hash VARCHAR,
  llm_prompt VARCHAR,
  llm_response VARCHAR,
  parsed_response JSON,
  risk_score DOUBLE,
  evidence_tx_ids JSON,
  call_latency_ms INTEGER,
  provenance JSON,
  created_at TIMESTAMP DEFAULT current_timestamp
);
"""

for _ddl in (CREATE_TRANSACTIONS, CREATE_EMBEDDINGS, CREATE_LLM):
    con.execute(_ddl)
print('Created tables (if not existing) in', DB_PATH)


In [None]:
# Create synthetic transactions and insert them into DuckDB.
# The cell is safe to re-run: rows whose tx_id already exists are skipped.
import datetime

n = 200
rng = np.random.default_rng(42)
merchants = ['store_a', 'store_b', 'online_shop', 'gas_station', 'restaurant']

# Naive UTC "now", captured once so every row is offset from the same instant.
# tzinfo is stripped because the target column is a plain TIMESTAMP.
now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

txs = []
for i in range(n):
    tx_id = f"tx_{i+1:06d}"
    account = f"acct_{rng.integers(1,50):04d}"
    # Normal(50, 120) deliberately produces some negative amounts as well.
    amount = float(np.round(rng.normal(50, 120), 2))
    merchant = str(rng.choice(merchants))
    desc = f"purchase at {merchant} for ${amount:.2f}"
    ts = now_utc - datetime.timedelta(minutes=int(rng.integers(0, 60*24)))
    txs.append((tx_id, account, amount, 'USD', merchant, desc, ts, 'ingest_0', 'synthetic', False))

df = pd.DataFrame(txs, columns=['tx_id','account_id','amount','currency','merchant','description','timestamp','ingestion_job_id','raw_source','pii_masked'])

con.register('df_tx', df)
rows_before = con.execute('SELECT count(*) FROM transactions').fetchone()[0]
# The table has an extra defaulted column (created_at), so the target columns
# must be named explicitly; a bare "SELECT *" supplies too few values.
con.execute(
    """
    INSERT OR IGNORE INTO transactions
        (tx_id, account_id, amount, currency, merchant, description, "timestamp", ingestion_job_id, raw_source, pii_masked)
    SELECT tx_id, account_id, amount, currency, merchant, description, "timestamp", ingestion_job_id, raw_source, pii_masked
    FROM df_tx
    """
)
rows_after = con.execute('SELECT count(*) FROM transactions').fetchone()[0]
print('Inserted', rows_after - rows_before, 'transactions',
      f'({len(df) - (rows_after - rows_before)} already present)')


In [None]:
# Compute embeddings with sentence-transformers and store them in the
# embeddings table. Re-running refreshes the stored vector for each tx_id.
embed_model_name = 'all-MiniLM-L6-v2'
print('Loading embedding model:', embed_model_name)
embedder = SentenceTransformer(embed_model_name)

# Load transactions into a DataFrame (from DuckDB); a fixed order keeps
# repeated runs reproducible.
df_tx = con.execute('SELECT tx_id, description FROM transactions ORDER BY tx_id').df()
# A NULL description would reach the encoder as None/NaN; embed '' instead.
texts = df_tx['description'].fillna('').astype(str).tolist()
vectors = embedder.encode(texts, convert_to_numpy=True, show_progress_bar=True)

# Each vector is stored as a JSON-encoded list of floats.
rows = [
    (tx_id, json.dumps(vec.tolist()), embed_model_name, None)
    for tx_id, vec in zip(df_tx['tx_id'].tolist(), vectors)
]

df_emb = pd.DataFrame(rows, columns=['tx_id','emb_json','emb_model','emb_job_id'])
con.register('df_emb', df_emb)
# The table has a defaulted emb_created_at column between emb_model and
# emb_job_id, so the target columns are named explicitly rather than relying
# on "SELECT *" positions. emb_job_id is all-NULL here, hence the cast.
con.execute(
    """
    INSERT OR REPLACE INTO embeddings (tx_id, emb_json, emb_model, emb_job_id)
    SELECT tx_id, emb_json, emb_model, CAST(emb_job_id AS VARCHAR)
    FROM df_emb
    """
)
print('Stored', len(df_emb), 'embeddings in DuckDB')


In [None]:
# Build an in-memory ANN index (faiss preferred, then hnswlib, then sklearn).
#
# ids and vectors are read with ONE query so that row i of emb_matrix always
# belongs to ids[i]. Ordering by tx_id (the primary key) is deterministic;
# emb_created_at is identical for rows inserted by the same statement and
# therefore cannot define an order.
emb_df = con.execute('SELECT tx_id, emb_json FROM embeddings ORDER BY tx_id').df()
if emb_df.empty:
    raise RuntimeError('No embeddings found in DuckDB; run the embedding cell first.')

ids = emb_df['tx_id'].tolist()
emb_matrix = np.vstack([np.asarray(json.loads(x), dtype='float32') for x in emb_df['emb_json']])
id_to_index = {tx_id: idx for idx, tx_id in enumerate(ids)}

ann_index = None
ann_backend = None
n_vectors, dim = emb_matrix.shape

if HAS_FAISS:
    # Normalize a copy so that inner product equals cosine similarity.
    xb = emb_matrix.copy()
    faiss.normalize_L2(xb)
    index = faiss.IndexFlatIP(dim)
    index.add(xb)
    ann_index = index
    ann_backend = 'faiss'
    print('Built FAISS IndexFlatIP with', xb.shape[0], 'vectors')
elif HAS_HNSW:
    p = hnswlib.Index(space='cosine', dim=dim)
    p.init_index(max_elements=n_vectors, ef_construction=200, M=16)
    # Labels are row positions, so they map straight back into ids.
    p.add_items(emb_matrix, np.arange(n_vectors))
    p.set_ef(50)
    ann_index = p
    ann_backend = 'hnswlib'
    print('Built hnswlib index with', n_vectors, 'vectors')
else:
    # sklearn brute-force as fallback; the default neighbour count is capped
    # at the number of stored vectors.
    nbrs = NearestNeighbors(n_neighbors=min(10, n_vectors), metric='cosine', algorithm='brute').fit(emb_matrix)
    ann_index = nbrs
    ann_backend = 'sklearn'
    print('Built sklearn NearestNeighbors fallback (brute)')


In [None]:
# Retrieval helper
def retrieve_topk(query_vec: np.ndarray, k: int = 5) -> List[Tuple[str, float]]:
    """Return up to ``k`` nearest stored transactions for ``query_vec``.

    Args:
        query_vec: Query embedding, either 1-D ``(dim,)`` or 2-D ``(1, dim)``.
        k: Maximum number of neighbours; capped at the number of indexed ids.

    Returns:
        ``(tx_id, score)`` pairs, best match first. ``score`` is cosine
        similarity (higher is closer) for every backend: hnswlib and sklearn
        report cosine distance, which is converted with ``1 - distance``.
    """
    # hnswlib and sklearn raise when asked for more neighbours than exist,
    # and faiss pads with -1 labels, so cap k up front.
    k = min(int(k), len(ids))
    if k <= 0:
        return []

    # Every backend accepts a (1, dim) float32 matrix.
    q = np.ascontiguousarray(query_vec, dtype='float32').reshape(1, -1)

    if ann_backend == 'faiss':
        # normalize_L2 works in place on a 2-D matrix; copy so the caller's
        # array is never modified.
        qn = q.copy()
        faiss.normalize_L2(qn)
        sims, labels = ann_index.search(qn, k)
        # The index holds normalized vectors, so inner product is cosine
        # similarity. A label of -1 means "no result" and is skipped.
        return [(ids[int(lbl)], float(sim)) for lbl, sim in zip(labels[0], sims[0]) if lbl >= 0]

    if ann_backend == 'hnswlib':
        labels, distances = ann_index.knn_query(q, k=k)
        return [(ids[int(lbl)], 1.0 - float(dist)) for lbl, dist in zip(labels[0], distances[0])]

    distances, labels = ann_index.kneighbors(q, n_neighbors=k, return_distance=True)
    return [(ids[int(lbl)], 1.0 - float(dist)) for lbl, dist in zip(labels[0], distances[0])]


In [None]:
# Simulate an incoming transaction, retrieve similar historical cases, call
# Ollama, and persist the result together with its provenance.
incoming = {
    'tx_id': 'tx_live_0001',
    'account_id': 'acct_9999',
    'amount': 399.99,
    'currency': 'USD',
    'merchant': 'online_shop',
    'description': 'large purchase at online_shop',
}
# compute embedding for incoming description
q_vec = embedder.encode([incoming['description']], convert_to_numpy=True)[0].astype('float32')

topk = retrieve_topk(q_vec, k=5)
retrieved_ids = [tx for tx, _ in topk]
print('Retrieved top-k examples:', topk)

# assemble a RAG prompt (simple template); one line per retrieved transaction,
# kept in retrieval order. Ids missing from the transactions table are skipped.
retrieved_texts = []
for txid in retrieved_ids:
    hit = con.execute(
        "SELECT tx_id, merchant, amount, description FROM transactions WHERE tx_id = ?",
        (txid,),
    ).fetchone()
    if hit is not None:
        hit_id, hit_merchant, hit_amount, hit_desc = hit
        retrieved_texts.append(f"- {hit_id} | {hit_merchant} | ${hit_amount:.2f} | {hit_desc}")

prompt = f"You are a fraud analyst assistant. Given the incoming transaction and similar historical transactions, output JSON with fields: {json.dumps({'risk_score':'float 0..1','explanation':'string','evidence_tx_ids':'list'})}.\n\n"
prompt += "Incoming transaction:\n"
prompt += f"{incoming['tx_id']} | {incoming['merchant']} | ${incoming['amount']:.2f} | {incoming['description']}\n\n"
prompt += "Retrieved similar historical transactions:\n"
prompt += "\n".join(retrieved_texts)
prompt += "\n\nReturn only valid JSON."

print('\nPrompt preview:\n', prompt[:1000])

# Call Ollama
resp = call_ollama(OLLAMA_MODEL, prompt, temperature=0.0, timeout=60)
print('Ollama call latency (ms):', resp['latency_ms'])

# Try to parse JSON from the response. The model output is untrusted: fields
# may be missing or of the wrong type, and that must not prevent persisting
# the call.
parsed = try_parse_json(resp['llm_response_raw'])
risk_score = None
evidence = None
if parsed and isinstance(parsed, dict):
    try:
        raw_score = parsed.get('risk_score')
        risk_score = float(raw_score) if raw_score is not None else None
    except (TypeError, ValueError):
        # e.g. "high" instead of a number: store NULL, keep the raw response.
        risk_score = None
    raw_evidence = parsed.get('evidence_tx_ids')
    evidence = raw_evidence if isinstance(raw_evidence, list) else None
else:
    # if response not JSON, keep raw text
    parsed = {'text': resp['llm_response_raw']}

# Persist into llm_results
row_id = resp['call_id']
provenance = {
    'emb_model': embed_model_name,
    'index_backend': ann_backend,
    'retrieved': retrieved_ids
}
con.execute(
    """
    INSERT INTO llm_results (id, tx_id, llm_model, llm_provider, llm_prompt_hash, llm_prompt, llm_response, parsed_response, risk_score, evidence_tx_ids, call_latency_ms, provenance)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """,
    (
        row_id,
        incoming['tx_id'],
        resp['llm_model'],
        resp['llm_provider'],
        resp['prompt_hash'],
        # Set STORE_PROMPTS to anything but '1' to keep only the prompt hash.
        prompt if os.environ.get('STORE_PROMPTS','1')=='1' else None,
        resp['llm_response_raw'],
        json.dumps(parsed),
        risk_score,
        # Fall back to the retrieved ids when the model named no evidence.
        json.dumps(evidence) if evidence is not None else json.dumps(retrieved_ids),
        int(resp['latency_ms']),
        json.dumps(provenance)
    )
)
print('Inserted llm_results id=', row_id)


In [None]:
# Show the ten most recently persisted LLM results, newest first.
df_llm = con.execute(
    'SELECT id, tx_id, llm_model, llm_provider, risk_score, created_at FROM llm_results ORDER BY created_at DESC LIMIT 10'
).fetchdf()
df_llm

Next steps / notes:
- Tune prompt templates and JSON schema enforcement for robust parsing.
- Add retries, timeouts, and fallback for Ollama calls in production tasks.
- Move index persistence to a vector DB (Milvus/Weaviate) when scaling beyond a single node.
- Mask PII consistently and store PII mappings in a secure vault (HashiCorp Vault) if required.
