In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import os

# ====== CONFIGURATION ======
# Dataset size: how many distinct users to simulate, and the inclusive
# range of transactions generated for each of them.
NUM_USERS = 5_000
MIN_TXNS, MAX_TXNS = 5, 20

# Destination CSV; "~" is expanded to the current user's home directory.
OUTPUT_PATH = os.path.join(
    os.path.expanduser("~/Desktop/transactionFraud"),
    "fraud_detection_dataset.csv",
)

# ====== INITIAL SETUP ======
faker = Faker()

# Seed every random source with the same fixed value: the stdlib `random`
# module, NumPy's global generator, and Faker's shared generator.
for seed_rng in (random.seed, np.random.seed, Faker.seed):
    seed_rng(42)

# ====== CREATE USERS ======
# Country codes used both as a user's home country and as the country a
# transaction can originate from.
countries = ['IE', 'UK', 'US', 'DE', 'FR', 'IN', 'CN']

# One identifier and one registered device (a Faker UUID) per user.
user_ids = [f"user_{n}" for n in range(NUM_USERS)]
device_ids = [faker.uuid4() for _ in range(NUM_USERS)]

# Map each user id to its profile: the registered device paired with it by
# position, and a home country picked at random from `countries`.
user_profiles = {
    uid: {"device_id": registered_device, "home_country": random.choice(countries)}
    for uid, registered_device in zip(user_ids, device_ids)
}

# ====== GENERATE TRANSACTIONS ======
# Build one dict per transaction; each user contributes a random number of
# transactions between MIN_TXNS and MAX_TXNS (inclusive).
transactions = []
for user_id in user_ids:
    profile = user_profiles[user_id]
    home_device = profile["device_id"]
    home_country = profile["home_country"]
    txn_count = random.randint(MIN_TXNS, MAX_TXNS)

    for _ in range(txn_count):
        # Timestamp somewhere within the last year.
        timestamp = faker.date_time_between(start_date='-1y', end_date='now')
        # Exponential draw (rate 1/50, i.e. mean 50) gives a right-skewed
        # amount distribution; rounded to two decimals.
        amount = round(random.expovariate(1 / 50), 2)
        transaction_country = random.choice(countries)

        # Roughly 90% of transactions use the user's registered device;
        # the rest come from a freshly generated, unknown device id.
        if random.random() < 0.9:
            device_used = home_device
        else:
            device_used = faker.uuid4()

        # A transaction is "suspicious" when it is large, made from an
        # unknown device, or made outside the user's home country.
        suspicious = (
            amount > 500
            or device_used != home_device
            or transaction_country != home_country
        )
        # Suspicious transactions are labelled fraud with probability 0.4.
        # The short-circuit keeps the random draw limited to suspicious rows.
        is_fraud = 1 if suspicious and random.random() < 0.4 else 0

        record = {
            "user_id": user_id,
            "device_id": device_used,
            "transaction_time": timestamp,
            "amount": amount,
            "transaction_country": transaction_country,
            "home_country": home_country,
            "is_fraud": is_fraud,
        }
        transactions.append(record)

# ====== SAVE DATA ======
df = pd.DataFrame(transactions)
# Shuffle the rows so they are no longer grouped by user, then renumber
# the index from zero.
df = df.sample(frac=1).reset_index(drop=True)

# to_csv() does not create missing parent folders, so make sure the target
# directory exists before writing; otherwise the whole run fails at the very
# last step. The guard covers an OUTPUT_PATH that is a bare filename, for
# which dirname() returns "" and makedirs("") would raise.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)

print(f"✅ Dataset created successfully! Saved to: {OUTPUT_PATH}")
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")
print("Preview:")
print(df.head())