# Financial Synthetic Data Example

Generate synthetic financial transaction data for fraud detection model development.

In [None]:
import numpy as np
import pandas as pd

from genesis import SyntheticGenerator, PrivacyConfig, QualityEvaluator, Constraint

## Create Sample Transaction Data

In [None]:
# Fixed seed so the sample dataset is reproducible across runs
np.random.seed(42)
n = 5000

# Transaction amounts (log-normal distribution), clipped to [1, 50000]
amount = np.exp(np.random.normal(4, 1.5, n)).clip(1, 50000)

# Transaction types
tx_types = np.random.choice(
    ['Purchase', 'Transfer', 'Withdrawal', 'Deposit', 'Payment'],
    n,
    p=[0.4, 0.15, 0.15, 0.2, 0.1]
)

# Merchant categories
merchants = np.random.choice(
    ['Retail', 'Restaurant', 'Online', 'Travel', 'Utilities', 'ATM'],
    n,
    p=[0.3, 0.2, 0.2, 0.1, 0.1, 0.1]
)

# Time features: entry i is the relative weight of hour i.
# The hand-written weights add up to 0.99, and np.random.choice raises
# ValueError unless `p` sums to 1, so normalize them before sampling.
hour_weights = np.array([
    0.01, 0.01, 0.01, 0.01, 0.01, 0.02, 0.03, 0.05, 0.06, 0.07,
    0.07, 0.07, 0.08, 0.07, 0.06, 0.06, 0.05, 0.05, 0.05, 0.04,
    0.04, 0.03, 0.02, 0.02
])
hour_weights = hour_weights / hour_weights.sum()
hour = np.random.choice(range(24), n, p=hour_weights)

day_of_week = np.random.choice(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    n,
    p=[0.16, 0.15, 0.15, 0.15, 0.17, 0.12, 0.10]
)

# Account age (months), clipped to [1, 120] and truncated to whole months
account_age = np.random.exponential(24, n).clip(1, 120).astype(int)

# Credit score, clipped to [300, 850]
credit_score = np.random.normal(700, 80, n).clip(300, 850).astype(int)

# Fraud label: 1% base probability, plus 5 percentage points for amounts
# above 5000 and plus 3 percentage points for hours before 06:00
fraud_prob = 0.01 + (amount > 5000) * 0.05 + (hour < 6) * 0.03
is_fraud = np.random.random(n) < fraud_prob

transactions = pd.DataFrame({
    'amount': amount.round(2),
    'transaction_type': tx_types,
    'merchant_category': merchants,
    'hour': hour,
    'day_of_week': day_of_week,
    'account_age_months': account_age,
    'credit_score': credit_score,
    'is_fraud': is_fraud
})

print(f"Transactions: {len(transactions)}")
print(f"Fraud rate: {transactions['is_fraud'].mean()*100:.2f}%")
transactions.head()

## Configure Generator

In [None]:
# Columns handed to the generator as discrete (non-continuous) columns
discrete_cols = [
    'transaction_type',
    'merchant_category',
    'day_of_week',
    'is_fraud',
]

# Privacy settings handed to the generator
privacy = PrivacyConfig(
    epsilon=1.0,
    enable_differential_privacy=True,
    suppress_rare_categories=True,
)

# Business constraints: closed numeric ranges as (column, low, high)
_range_bounds = (
    ('hour', 0, 23),
    ('credit_score', 300, 850),
)

constraints = [
    Constraint.positive('amount'),
    *(Constraint.range(col, low, high) for col, low, high in _range_bounds),
    Constraint.positive('account_age_months'),
]

## Generate Synthetic Transactions

In [None]:
# Build the generator with the 'ctgan' method and the privacy settings,
# then fit it on the real transactions
generator = SyntheticGenerator(privacy=privacy, method='ctgan')
generator.fit(
    transactions,
    constraints=constraints,
    discrete_columns=discrete_cols,
)

# Draw 5000 synthetic rows from the fitted generator
synthetic_transactions = generator.generate(n_samples=5000)

print(f"Synthetic transactions: {len(synthetic_transactions)}")
print(f"Synthetic fraud rate: {synthetic_transactions['is_fraud'].mean()*100:.2f}%")
synthetic_transactions.head()

## Evaluate Quality

In [None]:
# Compare the real and synthetic frames, using the fraud label as the target
evaluator = QualityEvaluator(transactions, synthetic_transactions)
report = evaluator.evaluate(target_column='is_fraud')

# Print the report's text summary
quality_summary = report.summary()
print(quality_summary)

## Train Fraud Detection Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Fixed vocabulary for each categorical column. Levels are listed in sorted
# order so that the integer codes match what a LabelEncoder fitted on data
# containing every level would produce.
CATEGORICAL_LEVELS = {
    'transaction_type': (
        'Deposit', 'Payment', 'Purchase', 'Transfer', 'Withdrawal',
    ),
    'merchant_category': (
        'ATM', 'Online', 'Restaurant', 'Retail', 'Travel', 'Utilities',
    ),
    'day_of_week': (
        'Friday', 'Monday', 'Saturday', 'Sunday',
        'Thursday', 'Tuesday', 'Wednesday',
    ),
}


def prepare_features(df, categories=None):
    """Return a copy of *df* with its categorical columns integer-encoded.

    Each column is encoded against a fixed list of levels instead of being
    fitted on the frame itself. Fitting a separate encoder per frame makes
    the codes depend on which levels that frame happens to contain, so two
    frames (for example real and synthetic data) could assign different
    integers to the same category.

    Args:
        df: DataFrame containing every column named in the vocabulary.
        categories: Optional mapping of column name to an ordered sequence
            of levels. Defaults to ``CATEGORICAL_LEVELS``.

    Returns:
        A new DataFrame; *df* is not modified. Each encoded column holds
        int64 codes equal to the level's position in its vocabulary, or -1
        for values (including missing values) that are not in it.

    Raises:
        KeyError: If a column named in the vocabulary is absent from *df*.
    """
    levels_by_col = CATEGORICAL_LEVELS if categories is None else categories
    df_encoded = df.copy()
    for col, levels in levels_by_col.items():
        codes = pd.Categorical(df_encoded[col], categories=list(levels)).codes
        # .codes is a small signed int array; widen to match LabelEncoder output
        df_encoded[col] = codes.astype('int64')
    return df_encoded

# Encode the categorical columns of the real and the synthetic frame
real_encoded, syn_encoded = (
    prepare_features(frame) for frame in (transactions, synthetic_transactions)
)

# Model inputs; the label column 'is_fraud' is kept separate
feature_cols = [
    'amount',
    'transaction_type',
    'merchant_category',
    'hour',
    'account_age_months',
    'credit_score',
]

# Train on synthetic, test on real (TSTR)
X_train, y_train = syn_encoded[feature_cols], syn_encoded['is_fraud']
X_test, y_test = real_encoded[feature_cols], real_encoded['is_fraud']

model_synthetic = RandomForestClassifier(random_state=42, n_estimators=100)
model_synthetic.fit(X_train, y_train)

# Probability of the second class (column 1) and hard class predictions
y_proba = model_synthetic.predict_proba(X_test)[:, 1]
y_pred = model_synthetic.predict(X_test)

print("TSTR (Train-Synthetic, Test-Real) Results:")
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.3f}")
print(classification_report(y_test, y_pred))

## Compare with Real-Trained Model

In [None]:
# Train on real, test on real (baseline)
# Fraud is a rare class, so stratify on the label to keep its share the same
# in the train and test partitions.
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(
    real_encoded[feature_cols],
    real_encoded['is_fraud'],
    test_size=0.3,
    random_state=42,
    stratify=real_encoded['is_fraud'],
)

model_real = RandomForestClassifier(n_estimators=100, random_state=42)
model_real.fit(X_train_real, y_train_real)

y_pred_real = model_real.predict(X_test_real)
y_proba_real = model_real.predict_proba(X_test_real)[:, 1]

print("TRTR (Train-Real, Test-Real) Results:")
print(f"ROC-AUC: {roc_auc_score(y_test_real, y_proba_real):.3f}")
print(classification_report(y_test_real, y_pred_real))

# Like-for-like comparison: the TSTR score printed earlier is computed on
# every real row, whereas the baseline above only sees this 30% hold-out.
# Score the synthetic-trained model on the same hold-out so the two ROC-AUC
# values are directly comparable.
y_proba_syn_holdout = model_synthetic.predict_proba(X_test_real)[:, 1]

print("TSTR on the same hold-out:")
print(f"ROC-AUC: {roc_auc_score(y_test_real, y_proba_syn_holdout):.3f}")

## Export

In [None]:
# Output locations, relative to the current working directory
csv_path = 'synthetic_transactions.csv'
html_path = 'fraud_data_quality_report.html'

# Write the synthetic rows without the index column, then the HTML report
synthetic_transactions.to_csv(csv_path, index=False)
report.save_html(html_path)

print("Files saved!")