In [1]:
import os
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd


In [2]:
# --- Reproducibility -------------------------------------------------------
# The notebook draws from both the stdlib `random` module and NumPy's legacy
# global generator, so both are seeded to make a fresh-kernel re-run produce
# the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# --- Dataset size ----------------------------------------------------------
NUM_USERS = 1000
NUM_MERCHANTS = 200
NUM_TRANSACTIONS = 50000

# --- Time window -----------------------------------------------------------
# Both bounds are midnight datetimes; END_DATE is 00:00 at the start of
# 31 March 2024.
START_DATE = datetime(2024, 1, 1)
END_DATE = datetime(2024, 3, 31)

# --- Categorical vocabularies ----------------------------------------------
CHANNELS = ["card", "upi", "netbanking"]
LOCATIONS = ["urban", "semi-urban", "rural"]

# "none" (index 0) is the label for legitimate transactions; the remaining
# entries are the fraud categories.
FRAUD_TYPES = ["none", "card_testing", "account_takeover", "promo_abuse"]


In [3]:
def random_timestamp(start: datetime, end: datetime) -> datetime:
    """Pick a random datetime between ``start`` and ``end``, both inclusive.

    The offset from ``start`` is a whole number of seconds drawn with
    ``random.randint``, so the result depends on the state of the stdlib
    ``random`` generator.

    Args:
        start: Earliest datetime that can be returned.
        end: Latest datetime that can be returned.

    Returns:
        ``start`` shifted forward by the randomly drawn number of seconds.
    """
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start + offset


In [4]:
# Build the baseline transactions as a list of records. Every row starts out
# labelled as legitimate (fraud_type "none", is_fraud 0).
transactions = []

for i in range(NUM_TRANSACTIONS):
    user_id = random.randint(1, NUM_USERS)
    merchant_id = random.randint(1, NUM_MERCHANTS)

    # Exponentially distributed amounts (mean 150), rounded to 2 decimals.
    amount = np.round(np.random.exponential(scale=150), 2)
    channel = random.choice(CHANNELS)
    location = random.choice(LOCATIONS)

    transaction = {
        "transaction_id": i + 1,  # 1-based sequential id
        "user_id": user_id,
        "merchant_id": merchant_id,
        "amount": amount,
        "channel": channel,
        "location": location,
        "timestamp": random_timestamp(START_DATE, END_DATE),
        "device_change": 0,
        # normal(5, 2) occasionally dips below zero; a distance cannot be
        # negative, so clamp the rounded draw at 0.
        "geo_distance": max(0.0, np.round(np.random.normal(5, 2), 2)),
        "fraud_type": "none",
        "fraud_strategy_version": 0,
        "is_fraud": 0
    }

    transactions.append(transaction)


In [5]:
# Materialise the list of per-transaction records as one DataFrame
# (one row per record, one column per dict key).
df = pd.DataFrame(data=transactions)

# Preview the first five rows.
df.head(n=5)


Unnamed: 0,transaction_id,user_id,merchant_id,amount,channel,location,timestamp,device_change,geo_distance,fraud_type,fraud_strategy_version,is_fraud
0,1,804,135,491.12,netbanking,urban,2024-02-03 00:36:27,0,2.97,none,0,0
1,2,809,163,57.36,card,urban,2024-01-05 04:57:22,0,1.66,none,0,0
2,3,87,16,121.6,netbanking,urban,2024-02-01 13:15:17,0,2.63,none,0,0
3,4,644,180,319.49,card,urban,2024-02-28 01:44:40,0,2.74,none,0,0
4,5,708,195,63.64,card,urban,2024-02-08 10:23:52,0,6.16,none,0,0


In [6]:
# Share of all rows that get relabelled as fraud.
FRAUD_RATE = 0.03

# Timestamp cutoffs separating the three fraud strategy versions; built once
# here rather than on every loop iteration.
V1_END = datetime(2024, 2, 1)
V2_END = datetime(2024, 3, 1)

# Pick the rows to turn into fraud, without replacement.
fraud_indices = np.random.choice(
    df.index, size=int(FRAUD_RATE * len(df)), replace=False
)

for idx in fraud_indices:
    df.at[idx, "is_fraud"] = 1

    # Assign fraud type (index 0 of FRAUD_TYPES is skipped).
    df.at[idx, "fraud_type"] = random.choice(FRAUD_TYPES[1:])

    # Fraud evolution by time: the transaction timestamp selects the strategy
    # version, which fixes the amount multiplier range, the device_change flag
    # and the geo_distance multiplier.
    ts = df.at[idx, "timestamp"]
    if ts < V1_END:
        version, amount_range, device_change, geo_factor = 1, (0.3, 0.7), 0, 1
    elif ts < V2_END:
        version, amount_range, device_change, geo_factor = 2, (1.2, 2.0), 1, 3
    else:
        version, amount_range, device_change, geo_factor = 3, (2.5, 4.0), 1, 5

    # Re-round after scaling so fraud rows keep the same 2-decimal precision
    # as untouched rows; otherwise the extra decimals alone reveal the label.
    scale = np.random.uniform(*amount_range)
    df.at[idx, "amount"] = np.round(df.at[idx, "amount"] * scale, 2)
    df.at[idx, "device_change"] = device_change
    df.at[idx, "geo_distance"] = np.round(df.at[idx, "geo_distance"] * geo_factor, 2)
    df.at[idx, "fraud_strategy_version"] = version


In [7]:
# Row count per fraud strategy version (0 is the value given to non-fraud rows).
df.loc[:, "fraud_strategy_version"].value_counts()


fraud_strategy_version
0    48500
3      517
1      504
2      479
Name: count, dtype: int64

In [9]:
# Make sure the output directory exists before the dataset is written;
# exist_ok=True makes this cell safe to re-run. (`os` is imported with the
# other imports at the top of the notebook.)
os.makedirs("data/synthetic", exist_ok=True)
print("Folder created or already exists")


Folder created or already exists


In [10]:
# Export in chronological order. transaction_id is a secondary sort key so
# rows that share the same timestamp always come out in the same order, and
# `df` is rebound rather than sorted with inplace=True.
df = df.sort_values(["timestamp", "transaction_id"])

# index=False: the DataFrame index is not written to the file.
df.to_csv("data/synthetic/transactions.csv", index=False)

print("Dataset saved successfully!")
print("Shape:", df.shape)


Dataset saved successfully!
Shape: (50000, 12)
