In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def generate_hyper_realistic_rift(n_entries=20000, filename="rift_2026_data.csv"):
    np.random.seed(42)
    random.seed(42)
    
    data = []
    base_date = datetime(2026, 2, 1)
    
    # 1. ACTOR DEFINITION
    merchants = [f"MERCH_HUB_{i:03d}" for i in range(150)]
    users = [f"USER_ACC_{i:05d}" for i in range(4500)]
    employers = [f"CORP_HQ_{i:02d}" for i in range(5)]
    
    # 2. CIRCADIAN RHYTHM LOGIC
    # Probability per hour (peaks at lunch/evening)
    hourly_probs = np.array([0.005, 0.002, 0.002, 0.002, 0.005, 0.015, 0.04, 0.07, 
                             0.08, 0.07, 0.06, 0.08, 0.10, 0.09, 0.07, 0.06, 
                             0.08, 0.09, 0.05, 0.03, 0.02, 0.01, 0.005, 0.005])
    hourly_probs /= hourly_probs.sum()

    # 3. GENERATE NORMAL ECONOMY (17,500 entries)
    print("Simulating legitimate human transactions...")
    for i in range(17500):
        sender = random.choice(users)
        receiver = random.choice(merchants + users[:500])
        amount = round(np.random.lognormal(mean=3.8, sigma=1.0), 2)
        
        # Human timing
        day_offset = random.randint(0, 18)
        hour = np.random.choice(range(24), p=hourly_probs)
        ts = base_date + timedelta(days=day_offset, hours=int(hour), minutes=random.randint(0,59))
        data.append([f"TXN_L_{i:05d}", sender, receiver, amount, ts.strftime('%Y-%m-%d %H:%M:%S')])

    # 4. INJECT FRAUD RINGS (Cycles & Smurfing)
    print("Injecting complex fraud patterns...")
    # 20 Rings (Cycles)
    for r in range(20):
        nodes = [f"FRAUD_MULE_C_{r}_{j}" for j in range(random.randint(3, 5))]
        amt = round(random.uniform(9800, 9995), 2) # Structuring
        start_ts = base_date + timedelta(days=10, hours=10)
        for j in range(len(nodes)):
            ts = start_ts + timedelta(minutes=random.randint(5, 30) * j)
            data.append([f"TXN_FC_{r}_{j}", nodes[j], nodes[(j+1)%len(nodes)], amt, ts.strftime('%Y-%m-%d %H:%M:%S')])

    # 15 Smurfing Aggregators
    for s in range(15):
        hub = f"FRAUD_HUB_S_{s}"
        start_ts = base_date + timedelta(days=12)
        for j in range(25):
            mule = f"FRAUD_MULE_S_{s}_{j}"
            ts = start_ts + timedelta(minutes=random.randint(1, 120) * j)
            data.append([f"TXN_FS_{s}_{j}", mule, hub, round(random.uniform(100, 500), 2), ts.strftime('%Y-%m-%d %H:%M:%S')])

    # 5. INJECT PAYROLL TRAPS (1 Corp -> Many Employees)
    print("Planting high-volume legitimate traps...")
    for p in range(len(employers)):
        boss = employers[p]
        ts = base_date + timedelta(days=14, hours=9)
        for i in range(150):
            data.append([f"TXN_P_{p}_{i}", boss, users[i], 3500.0, ts.strftime('%Y-%m-%d %H:%M:%S')])

    # 6. EXPORT
    df = pd.DataFrame(data, columns=['transaction_id', 'sender_id', 'receiver_id', 'amount', 'timestamp'])
    df = df.sample(frac=1).reset_index(drop=True).head(n_entries)
    df.to_csv(filename, index=False)
    print(f"Dataset generated: {filename} ({len(df)} entries)")

if __name__ == "__main__":
    generate_hyper_realistic_rift()

Simulating legitimate human transactions...
Injecting complex fraud patterns...
Planting high-volume legitimate traps...
Dataset generated: rift_2026_data.csv (18702 entries)


In [2]:
import pandas as pd
import numpy as np
import random
import uuid
from datetime import datetime, timedelta

def generate_evaluation_dataset(filename="rift_deterministic_20k.csv"):
    np.random.seed(42)
    random.seed(42)
    
    data = []
    base_date = datetime(2026, 2, 1)
    
    # --- 1. THE CONTROLLED FRAUD (OPTION 1: Exact Matches) ---
    print("Injecting Controlled Fraud (Deterministic Rings)...")
    
    # RING_001: Cycle of 3 (High Velocity)
    # Predicted Score: ~95.0
    ring1_nodes = ["ACC_90001", "ACC_90002", "ACC_90003"]
    for j in range(3):
        ts = base_date + timedelta(days=5, hours=10, minutes=j*10)
        data.append([f"TXN_RING_001_{j}", ring1_nodes[j], ring1_nodes[(j+1)%3], 4999.0, ts.strftime('%Y-%m-%d %H:%M:%S')])

    # RING_002: Smurfing Fan-In (12 Mules -> 1 Aggregator)
    # Predicted Score: ~85.0
    aggregator = "ACC_AGG_999"
    for j in range(12):
        mule = f"ACC_MULE_S_{j:02d}"
        ts = base_date + timedelta(days=6, hours=12, minutes=j*5) # All within 1 hour
        data.append([f"TXN_SMURF_{j}", mule, aggregator, 150.75, ts.strftime('%Y-%m-%d %H:%M:%S')])

    # RING_003: Layered Chain (A -> B -> C -> D)
    # Testing Shell Account Detection (B and C have only 2 transactions each)
    chain = ["ACC_CH_A", "ACC_CH_B", "ACC_CH_C", "ACC_CH_D"]
    for j in range(3):
        ts = base_date + timedelta(days=7, hours=14, minutes=j*15)
        data.append([f"TXN_CHAIN_{j}", chain[j], chain[j+1], 7000.0, ts.strftime('%Y-%m-%d %H:%M:%S')])

    # --- 2. THE HUB TRAPS (Legitimate High-Volume) ---
    print("Injecting Legitimate Hubs (The Traps)...")
    
    # Trap: Payroll (1 Corp -> 200 Employees) - Should NOT be a ring
    corp_acc = "CORP_PAYROLL_HQ"
    for e in range(200):
        ts = base_date + timedelta(days=0, hours=9) # Feb 1st
        data.append([f"TXN_PAY_{e}", corp_acc, f"ACC_EMP_{e:03d}", 3500.0, ts.strftime('%Y-%m-%d %H:%M:%S')])

    # Trap: High-Volume Merchant (Many -> 1 over weeks) - Should NOT be smurfing
    merchant_acc = "MEGA_RETAIL_STORE"
    for m in range(500):
        ts = base_date + timedelta(days=random.randint(0, 15), hours=random.randint(9, 21))
        data.append([f"TXN_RETAIL_{m}", f"ACC_USER_{random.randint(1000, 2000)}", merchant_acc, random.uniform(20, 150), ts.strftime('%Y-%m-%d %H:%M:%S')])

    # --- 3. THE HYPER-REAL NOISE (Benford's Law & Circadian) ---
    print("Simulating organic background economy...")
    
    hourly_probs = np.array([0.005, 0.002, 0.002, 0.002, 0.005, 0.015, 0.04, 0.07, 0.08, 0.07, 0.06, 0.08, 
                             0.10, 0.09, 0.07, 0.06, 0.08, 0.09, 0.05, 0.03, 0.02, 0.01, 0.005, 0.005])
    hourly_probs /= hourly_probs.sum()

    remaining_entries = 20000 - len(data)
    for i in range(remaining_entries):
        u1, u2 = f"ACC_U_{random.randint(1, 4000)}", f"ACC_U_{random.randint(1, 4000)}"
        while u1 == u2: u2 = f"ACC_U_{random.randint(1, 4000)}"
        
        # Benford's Law distribution for amount
        amount = round(np.random.lognormal(3.7, 0.9), 2)
        
        day = random.randint(0, 18)
        hour = np.random.choice(range(24), p=hourly_probs)
        ts = base_date + timedelta(days=day, hours=int(hour), minutes=random.randint(0,59))
        
        data.append([str(uuid.uuid4())[:8], u1, u2, amount, ts.strftime('%Y-%m-%d %H:%M:%S')])

    # --- 4. EXPORT AND SHUFFLE ---
    df = pd.DataFrame(data, columns=['transaction_id', 'sender_id', 'receiver_id', 'amount', 'timestamp'])
    df = df.sample(frac=1).reset_index(drop=True)
    df.to_csv(filename, index=False)
    
    print(f"\nSUCCESS: Dataset '{filename}' created.")
    print(f"Total Entries: {len(df)}")
    print("-" * 30)
    print("GROUND TRUTH FOR YOUR ALGORITHM:")
    print("1. Cycle Expected: [ACC_90001, ACC_90002, ACC_90003] -> RING_001")
    print("2. Smurf Expected: [ACC_AGG_999] + 12 Mules -> RING_002")
    print("3. Chain Expected: [ACC_CH_A, ACC_CH_B, ACC_CH_C, ACC_CH_D] -> RING_003")
    print("4. Trap Expected: CORP_PAYROLL_HQ (High degree but NO ring/velocity)")

if __name__ == "__main__":
    generate_evaluation_dataset()

Injecting Controlled Fraud (Deterministic Rings)...
Injecting Legitimate Hubs (The Traps)...
Simulating organic background economy...

SUCCESS: Dataset 'rift_deterministic_20k.csv' created.
Total Entries: 20000
------------------------------
GROUND TRUTH FOR YOUR ALGORITHM:
1. Cycle Expected: [ACC_90001, ACC_90002, ACC_90003] -> RING_001
2. Smurf Expected: [ACC_AGG_999] + 12 Mules -> RING_002
3. Chain Expected: [ACC_CH_A, ACC_CH_B, ACC_CH_C, ACC_CH_D] -> RING_003
4. Trap Expected: CORP_PAYROLL_HQ (High degree but NO ring/velocity)


In [3]:
from pathlib import Path

# Base folder
BASE_DIR = Path("backend")

# Files to create
FILES = [
    BASE_DIR / "main.py",
    BASE_DIR / "engine.py",
    BASE_DIR / "schemas.py",
    BASE_DIR / "requirements.txt",
    BASE_DIR / ".gitignore",
]

def main():
    # Create backend directory
    BASE_DIR.mkdir(parents=True, exist_ok=True)

    # Create files
    for file in FILES:
        if not file.exists():
            file.touch()

    print("✅ Backend folder structure created successfully.")

if __name__ == "__main__":
    main()


✅ Backend folder structure created successfully.


In [4]:
import random
import pandas as pd
from datetime import datetime, timedelta
import uuid

# -----------------------------
# Configuration
# -----------------------------
TOTAL_TRANSACTIONS = 40000
START_DATE = datetime(2025, 1, 1)

random.seed(42)

transactions = []
transaction_counter = 1


def random_timestamp():
    delta_days = random.randint(0, 120)
    delta_seconds = random.randint(0, 86400)
    return START_DATE + timedelta(days=delta_days, seconds=delta_seconds)


def add_transaction(sender, receiver, amount, timestamp):
    global transaction_counter
    transactions.append({
        "transaction_id": f"T_{transaction_counter:06d}",
        "sender_id": sender,
        "receiver_id": receiver,
        "amount": round(amount, 2),
        "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S")
    })
    transaction_counter += 1


# -----------------------------
# Generate Legitimate Accounts
# -----------------------------
NUM_USERS = 8000
NUM_MERCHANTS = 300
NUM_PAYROLL = 50

users = [f"USER_{i}" for i in range(NUM_USERS)]
merchants = [f"MERCHANT_{i}" for i in range(NUM_MERCHANTS)]
payrolls = [f"PAYROLL_{i}" for i in range(NUM_PAYROLL)]

# -----------------------------
# 1️⃣ Legitimate Merchant Payments
# -----------------------------
for _ in range(int(TOTAL_TRANSACTIONS * 0.5)):
    sender = random.choice(users)
    receiver = random.choice(merchants)
    amount = random.uniform(10, 500)
    add_transaction(sender, receiver, amount, random_timestamp())

# -----------------------------
# 2️⃣ Payroll Distribution
# -----------------------------
for payroll in payrolls:
    employees = random.sample(users, 200)
    pay_date = START_DATE + timedelta(days=random.randint(0, 120))
    for emp in employees:
        add_transaction(payroll, emp, random.uniform(1000, 5000), pay_date)

# -----------------------------
# 3️⃣ Legitimate P2P Transfers
# -----------------------------
for _ in range(int(TOTAL_TRANSACTIONS * 0.15)):
    sender, receiver = random.sample(users, 2)
    amount = random.uniform(5, 200)
    add_transaction(sender, receiver, amount, random_timestamp())

# -----------------------------
# 4️⃣ Cycle Fraud (3–5 length)
# -----------------------------
for i in range(40):
    size = random.choice([3, 4, 5])
    nodes = [f"CYC_{i}_{j}" for j in range(size)]

    for j in range(size):
        sender = nodes[j]
        receiver = nodes[(j + 1) % size]
        for _ in range(5):
            add_transaction(sender, receiver, random.uniform(500, 2000), random_timestamp())

# -----------------------------
# 5️⃣ Smurfing Fraud
# -----------------------------
for i in range(20):
    aggregator = f"SMURF_AGG_{i}"
    mules = [f"SMURF_MULE_{i}_{j}" for j in range(15)]

    base_time = random_timestamp()

    # Fan-in (within 72h)
    for mule in mules:
        for _ in range(3):
            add_transaction(mule, aggregator, random.uniform(900, 990),
                            base_time + timedelta(hours=random.randint(0, 48)))

    # Fan-out
    final_receiver = f"SMURF_DEST_{i}"
    for _ in range(10):
        add_transaction(aggregator, final_receiver, random.uniform(5000, 8000),
                        base_time + timedelta(hours=random.randint(48, 72)))

# -----------------------------
# 6️⃣ Shell Layering Fraud
# -----------------------------
for i in range(30):
    chain = [f"SHELL_{i}_{j}" for j in range(4)]
    base_time = random_timestamp()

    add_transaction(chain[0], chain[1], random.uniform(2000, 4000), base_time)
    add_transaction(chain[1], chain[2], random.uniform(1900, 3800), base_time + timedelta(hours=3))
    add_transaction(chain[2], chain[3], random.uniform(1800, 3600), base_time + timedelta(hours=6))

# -----------------------------
# 7️⃣ Fraud Trap (Legitimate-looking high volume)
# -----------------------------
for i in range(10):
    legit_like = f"TRAP_{i}"
    senders = random.sample(users, 150)
    for sender in senders:
        add_transaction(sender, legit_like, random.uniform(20, 80), random_timestamp())

# -----------------------------
# Shuffle Transactions
# -----------------------------
random.shuffle(transactions)

df = pd.DataFrame(transactions)

# Trim to exactly 40K rows
df = df.head(TOTAL_TRANSACTIONS)

df.to_csv("transactions_40k_realistic.csv", index=False)

print("✅ 40K realistic dataset generated successfully.")


✅ 40K realistic dataset generated successfully.
