# RIFT 2026 — Synthetic Transaction Data Generator

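Generates a realistic synthetic transaction CSV with embedded fraud patterns (cycles, smurfing, shell-network chains) and legitimate-account traps (a high-volume merchant and a payroll employer), plus a ground-truth JSON file listing the fraud accounts and rings.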

In [None]:
import pandas as pd
import numpy as np
import random
import json
from datetime import datetime, timedelta
from pathlib import Path

# Seed both random number generators with a fixed value for reproducible output.
np.random.seed(42)
random.seed(42)

# Reference start time; generated timestamps are computed as offsets from it.
BASE_DATE = datetime(year=2024, month=1, day=1, hour=8, minute=0, second=0)

# Folder that receives the generated files; created here if it is missing.
OUTPUT_DIR = Path('..') / 'data'
OUTPUT_DIR.mkdir(exist_ok=True)

# Accumulators filled in by the pattern-generation cells that follow.
ground_truth_rings = []        # one dict per injected ring
ground_truth_accounts = set()  # IDs of every account that belongs to a ring
transactions = []              # one dict per transaction row
# Single-element list so txn_id() can update the count in place without `global`.
txn_counter = [0]


def txn_id():
    """Advance the shared counter and return the next ID as 'TXN_' + 6 zero-padded digits."""
    serial = txn_counter[0] + 1
    txn_counter[0] = serial
    return 'TXN_{:06d}'.format(serial)

def acc(n):
    """Return the account ID for integer ``n``: 'ACC_' plus ``n`` zero-padded to 5 digits."""
    padded = format(n, '05d')
    return 'ACC_' + padded

def rand_time(base, max_hours=72):
    """Return ``base`` moved forward by a uniformly random offset of 0 to ``max_hours`` hours."""
    offset_hours = random.uniform(0, max_hours)
    return base + timedelta(hours=offset_hours)


# Confirmation that the setup cell finished running.
print('Synthetic data generator ready.')

In [None]:
# ── 1. CYCLE PATTERNS (3-node, 4-node, 5-node rings) ──────────────────────────
# Each config is (range, ring size, type label). Only the length of the range
# is used: it sets how many rings of that size are generated.
cycle_configs = [
    (range(1, 21),   3, 'RING_C3'),   # 20 × 3-node cycles
    (range(101, 116), 4, 'RING_C4'),  # 15 × 4-node cycles
    (range(201, 211), 5, 'RING_C5'),  # 10 × 5-node cycles
]
ring_id = 1       # sequence number used to build the 'RING_###' identifiers
acc_start = 1000  # next unused account number for ring members

for ring_slots, ring_size, ring_type in cycle_configs:
    for _ in ring_slots:
        # Reserve a fresh, contiguous run of account numbers for this ring.
        members = [acc(n) for n in range(acc_start, acc_start + ring_size)]
        acc_start += ring_size
        # The ring starts on a random day within 0-60 days of BASE_DATE.
        t = BASE_DATE + timedelta(days=random.randint(0, 60))
        # Every member pays the next one (the last pays the first), with
        # successive hops spaced two hours apart.
        for hop, payer in enumerate(members):
            amt = round(random.uniform(500, 5000), 2)
            hop_time = t + timedelta(hours=hop * 2)
            transactions.append({
                'transaction_id': txn_id(),
                'sender_id': payer,
                'receiver_id': members[(hop + 1) % ring_size],
                'amount': amt,
                'timestamp': hop_time.strftime('%Y-%m-%d %H:%M:%S'),
            })
        ground_truth_accounts.update(members)
        ground_truth_rings.append({
            'ring_id': f'RING_{ring_id:03d}',
            'type': ring_type,
            'members': members,
        })
        ring_id += 1

print(f'Cycle transactions: {len(transactions)}')

In [None]:
# ── 2. SMURFING PATTERNS (Fan-in + Fan-out) ────────────────────────────────────
for _ in range(15):  # 15 smurfing rings
    # Each ring uses 25 consecutive account numbers:
    # 1 aggregator, then 12 senders, then 12 receivers.
    aggregator = acc(acc_start)
    senders = [acc(n) for n in range(acc_start + 1, acc_start + 13)]
    receivers = [acc(n) for n in range(acc_start + 13, acc_start + 25)]
    acc_start += 25
    t = BASE_DATE + timedelta(days=random.randint(0, 60))

    # Fan-in: every sender pays the aggregator 0-48 hours after t.
    # The amount is drawn before the time offset to keep the RNG sequence fixed.
    for payer in senders:
        fan_in_amount = round(random.uniform(1000, 9999), 2)
        fan_in_time = t + timedelta(hours=random.uniform(0, 48))
        transactions.append({
            'transaction_id': txn_id(),
            'sender_id': payer,
            'receiver_id': aggregator,
            'amount': fan_in_amount,
            'timestamp': fan_in_time.strftime('%Y-%m-%d %H:%M:%S'),
        })

    # Fan-out: the aggregator pays every receiver 48-72 hours after t.
    for payee in receivers:
        fan_out_amount = round(random.uniform(500, 5000), 2)
        fan_out_time = t + timedelta(hours=random.uniform(48, 72))
        transactions.append({
            'transaction_id': txn_id(),
            'sender_id': aggregator,
            'receiver_id': payee,
            'amount': fan_out_amount,
            'timestamp': fan_out_time.strftime('%Y-%m-%d %H:%M:%S'),
        })

    # All 25 accounts are recorded as fraud in the ground truth.
    members = [aggregator, *senders, *receivers]
    ground_truth_accounts.update(members)
    ground_truth_rings.append({
        'ring_id': f'RING_{ring_id:03d}',
        'type': 'smurfing',
        'members': members,
    })
    ring_id += 1

print(f'After smurfing: {len(transactions)} transactions')

In [None]:
# ── 3. SHELL NETWORK PATTERNS ─────────────────────────────────────────────────
for _ in range(10):  # 10 shell chains
    # Four consecutive accounts: source → shell1 → shell2 → destination.
    chain = [acc(acc_start + offset) for offset in range(4)]
    acc_start += 4
    source_acc, shell1, shell2, dest_acc = chain
    t = BASE_DATE + timedelta(days=random.randint(0, 60))

    # One transfer per adjacent pair; hop k happens 6*k hours after t plus
    # up to 3 hours of jitter. The amount is drawn before the jitter.
    for hop, (payer, payee) in enumerate(zip(chain, chain[1:])):
        hop_amount = round(random.uniform(5000, 50000), 2)
        hop_time = t + timedelta(hours=hop * 6 + random.uniform(0, 3))
        transactions.append({
            'transaction_id': txn_id(),
            'sender_id': payer,
            'receiver_id': payee,
            'amount': hop_amount,
            'timestamp': hop_time.strftime('%Y-%m-%d %H:%M:%S'),
        })

    ground_truth_accounts.update(chain)
    ground_truth_rings.append({
        'ring_id': f'RING_{ring_id:03d}',
        'type': 'shell_network',
        'members': chain,
    })
    ring_id += 1

print(f'After shell networks: {len(transactions)} transactions')

In [None]:
# ── 4. LEGITIMATE ACCOUNT TRAPS ───────────────────────────────────────────────
# None of the accounts in this section are added to the ground-truth fraud set.

# High-volume merchant: 200 distinct buyers each pay one small amount to the
# same account, five purchases per day at a random hour of that day.
merchant = acc(9000)
for purchase_no in range(200):
    buyer = acc(9100 + purchase_no)
    purchase_time = BASE_DATE + timedelta(
        days=purchase_no // 5,
        hours=random.uniform(0, 24),
    )
    purchase_amount = round(random.uniform(10, 500), 2)
    transactions.append({
        'transaction_id': txn_id(),
        'sender_id': buyer,
        'receiver_id': merchant,
        'amount': purchase_amount,
        'timestamp': purchase_time.strftime('%Y-%m-%d %H:%M:%S'),
    })

# Payroll account: one employer pays 60 employees on three pay dates spaced
# 30 days apart. The pay timestamps do not depend on the employee, so they
# are formatted once up front.
employer = acc(9500)
paydays = [
    (BASE_DATE + timedelta(days=30 * month, hours=9)).strftime('%Y-%m-%d %H:%M:%S')
    for month in range(3)
]
for employee_no in range(60):
    emp = acc(9600 + employee_no)
    for payday in paydays:
        transactions.append({
            'transaction_id': txn_id(),
            'sender_id': employer,
            'receiver_id': emp,
            'amount': round(random.uniform(3000, 8000), 2),
            'timestamp': payday,
        })

# Random normal transactions: 1500 transfers between two distinct accounts
# drawn from a pool of 300.
normal_accs = [acc(n) for n in range(10000, 10300)]
for _ in range(1500):
    s, r = random.sample(normal_accs, 2)
    # Draw order (day, hour, amount) is kept fixed for reproducibility.
    day_offset = random.randint(0, 90)
    hour_offset = random.uniform(0, 24)
    transfer_time = BASE_DATE + timedelta(days=day_offset, hours=hour_offset)
    transfer_amount = round(random.uniform(50, 10000), 2)
    transactions.append({
        'transaction_id': txn_id(),
        'sender_id': s,
        'receiver_id': r,
        'amount': transfer_amount,
        'timestamp': transfer_time.strftime('%Y-%m-%d %H:%M:%S'),
    })

print(f'Total transactions: {len(transactions)}')

In [None]:
# ── 5. SAVE CSV AND GROUND TRUTH ──────────────────────────────────────────────
# Shuffle in place so rows from the same pattern are not adjacent in the CSV.
random.shuffle(transactions)
df = pd.DataFrame(transactions)
# Keep a single row per transaction ID before writing.
df = df.drop_duplicates(subset=['transaction_id'])

csv_path = OUTPUT_DIR / 'sample_transactions.csv'
df.to_csv(csv_path, index=False)
print(f'Saved CSV: {csv_path} ({len(df)} rows)')

# Answer key: every ring account, the ring definitions, summary counts, and
# the legitimate look-alike accounts that should not be reported as fraud.
ground_truth = {
    # sorted() accepts the set directly and returns a list, so no list() copy is needed.
    'fraud_accounts': sorted(ground_truth_accounts),
    'fraud_rings': ground_truth_rings,
    'total_fraud_accounts': len(ground_truth_accounts),
    'total_rings': len(ground_truth_rings),
    'legitimate_traps': {
        'high_volume_merchant': merchant,
        'payroll_employer': employer,
        'normal_accounts': normal_accs[:10],  # only the first 10 as a sample
    }
}
gt_path = OUTPUT_DIR / 'ground_truth.json'
# Explicit encoding so the file does not depend on the platform's locale default.
with open(gt_path, 'w', encoding='utf-8') as f:
    json.dump(ground_truth, f, indent=2)
print(f'Saved ground truth: {gt_path}')
print(f'Total fraud accounts: {len(ground_truth_accounts)}')
print(f'Total fraud rings: {len(ground_truth_rings)}')
# Last expression of the cell: the notebook displays the first rows.
df.head()