# Synthetic Data Generator — Real-World Reconciliation (v2)

This notebook generates realistic, messy financial data for reconciliation.

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import uuid

## Helper Functions

In [None]:

def random_date(start, end):
    """Return ``start`` moved forward by a random whole number of days.

    The offset is drawn uniformly from 0 up to the number of whole days
    between ``start`` and ``end`` (both bounds inclusive), so the result
    never falls after ``end``. Only whole days are added, which means the
    time-of-day component of ``start`` is carried over unchanged.

    Raises:
        ValueError: from ``random.randint`` when ``end`` is at least one
            whole day earlier than ``start``.
    """
    span_days = (end - start).days
    offset = random.randint(0, span_days)
    return start + timedelta(days=offset)

def random_narration(customer_id, channel):
    """Pick one free-text narration variant for a payment.

    Each supported channel ("CHEQUE", "MPESA", "CASH") has a handful of
    differently worded/abbreviated phrasings that embed ``customer_id``;
    one of them is chosen at random.

    Raises:
        KeyError: if ``channel`` is not one of the supported channels.
    """
    cheque_variants = [
        f"Cheque deposit for member {customer_id}",
        f"CHQ DEP MBR {customer_id}",
        f"Cheque dep member {customer_id}",
    ]
    mpesa_variants = [
        f"Mobile payment member {customer_id}",
        f"MPESA {customer_id}",
        f"M-PESA PAY {customer_id}",
    ]
    cash_variants = [
        f"Cash deposit acc {customer_id}",
        f"CASH DEP {customer_id}",
    ]

    variants_by_channel = {
        "CHEQUE": cheque_variants,
        "MPESA": mpesa_variants,
        "CASH": cash_variants,
    }

    # Plain indexing on purpose: an unknown channel should fail loudly.
    candidates = variants_by_channel[channel]
    return random.choice(candidates)


## Generate Payments System Data

In [None]:

# Seed both global RNGs so the values drawn below repeat from run to run.
np.random.seed(42)
random.seed(42)

# uuid.uuid4() reads OS entropy and ignores the seeds above, which made
# payment_ref differ on every run. References are therefore derived from a
# dedicated seeded generator; keeping it separate from the global `random`
# stream means every other generated column keeps the values it had before.
payment_ref_rng = random.Random(42)

n_payments = 8000
channels = ["CHEQUE", "MPESA", "CASH"]
statuses = ["SUCCESS", "FAILED"]

payments = []

# Payments fall inside Q1 2024 (bounds inclusive).
start_date = datetime(2024, 1, 1)
end_date = datetime(2024, 3, 31)

for _ in range(n_payments):
    cust_id = random.randint(20000, 29999)
    channel = random.choice(channels)
    amount = round(random.uniform(500, 50000), 2)
    # Roughly 90% of payments succeed, 10% fail.
    status = random.choices(statuses, weights=[0.9, 0.1])[0]

    # Same shape as before ("PS-" + first 10 hex digits of a version-4 UUID),
    # but reproducible because the 128 bits come from the seeded generator.
    ref_hex = uuid.UUID(int=payment_ref_rng.getrandbits(128), version=4).hex[:10]

    payments.append({
        "payment_ref": f"PS-{ref_hex}",
        "payment_timestamp": random_date(start_date, end_date),
        # Drawn independently of payment_timestamp, so the two dates are
        # unrelated to each other.
        "value_date": random_date(start_date, end_date).date(),
        "amount": amount,
        "currency": "KES",
        "payment_channel": channel,
        "customer_id": str(cust_id),
        "narration": random_narration(cust_id, channel),
        "payment_status": status,
        "entered_by": random.choice(["TELLER", "SYSTEM", "BATCH"]),
        "branch_code": f"BR{random.randint(1,20):02d}",
        # Fee the payments system expects: 0.2%-1% of the amount.
        "expected_fee": round(amount * random.uniform(0.002, 0.01), 2)
    })

payments_df = pd.DataFrame(payments)
# Trailing expression: previews the first rows when run as a notebook cell.
payments_df.head()


## Generate Bank Statement Data

In [None]:

bank_txns = []

# uuid.uuid4() is not affected by random/numpy seeding, so bank references
# are derived from a dedicated seeded generator to keep the dataset
# reproducible without disturbing the global `random` stream.
bank_ref_rng = random.Random(43)

# Only 85% of payments get a bank-statement line; the rest stay unmatched.
# The sample is taken from all payments, whatever their payment_status.
# NOTE(review): no random_state is passed, so pandas presumably falls back to
# NumPy's global RNG — re-running this cell alone yields a different sample.
for _, row in payments_df.sample(frac=0.85).iterrows():
    lag_days = random.randint(0, 3)
    # Bank fee (0.1%-0.8%) is drawn independently of the payment's expected_fee.
    fee = round(row["amount"] * random.uniform(0.001, 0.008), 2)

    # The bank posts 0-3 days after the payment; posting and value date coincide.
    posted_on = (row["payment_timestamp"] + timedelta(days=lag_days)).date()

    # "BK-" + first 10 hex digits of a version-4 UUID built from seeded bits.
    ref_hex = uuid.UUID(int=bank_ref_rng.getrandbits(128), version=4).hex[:10]

    bank_txns.append({
        "bank_txn_ref": f"BK-{ref_hex}",
        "posting_date": posted_on,
        "value_date": posted_on,
        # The statement shows the amount net of the deducted fee.
        "amount": round(row["amount"] - fee, 2),
        "currency": "KES",
        "debit_credit": "CR",
        # Case-sensitive abbreviations make the bank narration differ from
        # the payment-side wording.
        "narration": row["narration"]
            .replace("member", "MBR")
            .replace("Cheque", "CHQ")
            .replace("deposit", "DEP"),
        "bank_branch": f"BBR{random.randint(1,15):02d}",
        "source_system": row["payment_channel"],
        "fees_deducted": fee
    })

bank_df = pd.DataFrame(bank_txns)
# Trailing expression: previews the first rows when run as a notebook cell.
bank_df.head()


## Generate Refunds Data

In [None]:

refunds = []

# uuid.uuid4() is not affected by random/numpy seeding, so refund references
# are derived from a dedicated seeded generator to keep the dataset
# reproducible without disturbing the global `random` stream.
refund_ref_rng = random.Random(44)

# 70% of the FAILED payments receive a refund record.
failed_payments = payments_df[payments_df["payment_status"] == "FAILED"].sample(frac=0.7)

for _, row in failed_payments.iterrows():
    # "RF-" + first 8 hex digits of a version-4 UUID built from seeded bits.
    ref_hex = uuid.UUID(int=refund_ref_rng.getrandbits(128), version=4).hex[:8]

    refunds.append({
        "refund_ref": f"RF-{ref_hex}",
        # Refund happens 1-5 days after the original payment.
        "refund_timestamp": row["payment_timestamp"] + timedelta(days=random.randint(1,5)),
        # Either the exact amount or a 5% over-refund. Rounded to 2 decimals
        # like every other monetary column; the 1.05 multiplier otherwise
        # leaves more than two decimal places.
        "refund_amount": round(row["amount"] * random.choice([1, 1.05]), 2),
        "currency": "KES",
        "refund_channel": row["payment_channel"],
        "customer_id": row["customer_id"],
        "narration": f"Refund {row['narration']}",
        "refund_reason": random.choice(["FAILED_TXN", "DUPLICATE"]),
        "approved_by": random.choice(["SUPERVISOR", "SYSTEM"]),
        # Only a narration-based hint is stored, not the payment_ref itself.
        "linked_payment_hint": row["narration"],
        "refund_status": "PROCESSED"
    })

refunds_df = pd.DataFrame(refunds)
# Trailing expression: previews the first rows when run as a notebook cell.
refunds_df.head()


## Save Datasets

In [None]:

from pathlib import Path

# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing (no-op when it is already there).
output_dir = Path("C:/Payments Reconciliation")
output_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's positional index out of the CSV files.
payments_df.to_csv(output_dir / "payments_system_v2.csv", index=False)
bank_df.to_csv(output_dir / "bank_statement_v2.csv", index=False)
refunds_df.to_csv(output_dir / "refunds_v2.csv", index=False)

print("Files generated successfully")
