# In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Fix the seed of both random number generators so repeated runs draw the
# same random sequence.
np.random.seed(42)
random.seed(42)

# Name written into the "account_holder" field of every generated row.
ACCOUNT_HOLDER = "John Doe"

# First and last calendar day covered by the statement (both inclusive).
START_DATE = datetime(year=2024, month=1, day=1)
END_DATE = datetime(year=2024, month=6, day=30)

# Amount credited as salary on the 1st of each month.
MONTHLY_INCOME = 60_000

# Merchant descriptions grouped by spending category. Discretionary spend
# picks a category first, then one merchant from that category.
MERCHANTS = {
    "Food": [
        "SWIGGY",
        "ZOMATO",
        "CAFE COFFEE DAY",
    ],
    "Transport": [
        "UBER",
        "OLA",
    ],
    "Shopping": [
        "AMAZON",
        "FLIPKART",
    ],
    "Entertainment": [
        "NETFLIX",
        "SPOTIFY",
    ],
    "Utilities": [
        "ELECTRICITY BILL",
        "MOBILE RECHARGE",
    ],
    "Housing": [
        "HOUSE RENT",
    ],
    "Healthcare": [
        "APOLLO PHARMACY",
    ],
}

# Recurring charges: description -> amount debited each time the charge fires.
FIXED_EXPENSES = {
    "HOUSE RENT": 15_000,
    "NETFLIX": 799,
    "SPOTIFY": 119,
    "MOBILE RECHARGE": 499,
    "ELECTRICITY BILL": 1_200,
}

def random_date(start, end):
    """Return `start` moved forward by a random whole number of days.

    The offset is drawn with ``random.randint`` from 0 up to
    ``(end - start).days``, both bounds inclusive, so the result lies
    between `start` and `end` and keeps `start`'s time of day.
    """
    span_days = (end - start).days
    offset = timedelta(days=random.randint(0, span_days))
    return start + offset

# --- Generate the synthetic statement ---------------------------------------
# Walk the calendar one day at a time from START_DATE to END_DATE (inclusive),
# appending one row per transaction while carrying a running balance.

# Column order of the exported statement. Passed to the DataFrame constructor
# so that a run producing no transactions still has a "date" column to sort on.
STATEMENT_COLUMNS = [
    "account_holder",
    "date",
    "description",
    "debit_amount",
    "credit_amount",
    "balance",
]


def _transaction_row(date, description, debit, credit, balance_after):
    """Build one statement row; `balance_after` is the balance once it is applied."""
    return {
        "account_holder": ACCOUNT_HOLDER,
        "date": date,
        "description": description,
        "debit_amount": debit,
        "credit_amount": credit,
        "balance": balance_after,
    }


transactions = []
balance = 0  # opening balance

# Built once: random.choice needs a sequence and the category set never changes.
categories = list(MERCHANTS)

current_date = START_DATE

while current_date <= END_DATE:
    # Salary credit: exactly one per month, on the 1st.
    if current_date.day == 1:
        balance += MONTHLY_INCOME
        transactions.append(
            _transaction_row(current_date, "SALARY CREDIT", 0, MONTHLY_INCOME, balance)
        )

    # Fixed expenses: every charge fires independently with a 3% chance per
    # day, i.e. about once a month on average; a given month may see none or
    # several. These debits are applied without any balance check.
    for merchant, amount in FIXED_EXPENSES.items():
        if random.random() < 0.03:
            balance -= amount
            transactions.append(
                _transaction_row(current_date, merchant, amount, 0, balance)
            )

    # Discretionary spend: 40% chance per day of one purchase of 100-2500
    # (inclusive) at a merchant picked from a randomly chosen category.
    if random.random() < 0.4:
        category = random.choice(categories)
        merchant = random.choice(MERCHANTS[category])
        amount = random.randint(100, 2500)

        # Drop the purchase if it would leave the balance at zero or below.
        # The random draws above are consumed either way.
        if balance - amount > 0:
            balance -= amount
            transactions.append(
                _transaction_row(current_date, merchant, amount, 0, balance)
            )

    current_date += timedelta(days=1)

df = pd.DataFrame(transactions, columns=STATEMENT_COLUMNS)

# Several rows share the same date, and the running balance is only meaningful
# in insertion order. The default sort algorithm ("quicksort") is not stable
# and may reorder same-day rows, so request a stable sort explicitly.
df = df.sort_values("date", kind="mergesort").reset_index(drop=True)

df.to_csv("synthetic_hdfc_john_doe.csv", index=False)

print("Synthetic dataset generated:", df.shape)


# Output: Synthetic dataset generated: (90, 6)
