# Fraud Detection EDA
Comprehensive exploratory data analysis for a large-scale transaction dataset with a binary fraud target.

This notebook loads a dataset, summarizes schema and statistics, inspects categorical distributions,
assesses class imbalance, quantifies missingness, analyzes transaction amount, and visualizes
correlations including a numeric target representation.

In [None]:
# Imports
import os, sys, warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Ensure project root is on path for module imports
sys.path.append(os.getcwd())
from eda import eda_fraud as eda

# Plot theme and wide DataFrame rendering for this notebook session
sns.set_theme(context='notebook', style='whitegrid')
for _option, _value in (('display.max_columns', 120), ('display.width', 160)):
    pd.set_option(_option, _value)
# Seed constant; not referenced by the other cells visible in this notebook
RANDOM_STATE = 42
print('Environment ready.')

In [None]:
# Configuration
# Dataset location, e.g. '../data/transactions.csv' (leave None to auto-discover)
DATASET_PATH = None
# Optional column-name hints keyed by role; every hint starts out unset (None)
# and the whole mapping is handed to eda.infer_columns later in the notebook.
COLUMN_HINTS = dict(
    target=None,             # e.g., 'is_fraud' or 'fraudulent'
    amount=None,             # e.g., 'transaction_amount'
    timestamp=None,          # e.g., 'transaction_time'
    merchant_category=None,  # e.g., 'merchant_category' or 'mcc'
    device_type=None,        # e.g., 'device_type'
    device_os=None,          # e.g., 'os'
    user_gender=None,        # e.g., 'gender'
    user_age=None,           # e.g., 'age'
    user_income=None,        # e.g., 'income'
)
PLOT_SAMPLE_N = 300_000
AUTO_FIND_DATA = True
print('Configured. Set DATASET_PATH if auto-discovery fails.')

In [None]:
# Load dataset
df, path = eda.load_dataset(DATASET_PATH, auto_find_data=AUTO_FIND_DATA)
# Normalize column names (strip spaces) before anything is shown, so the
# preview lists the same names the later cells use. Non-string labels
# (e.g. integer headers) have no .strip() and are kept unchanged.
df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]
print(f'Loaded: {path}')
print(f'Rows: {len(df)}  Columns: {df.shape[1]}')
display(df.head(3))

In [None]:
# Schema and statistical summary
print('DataFrame info:')
df.info()  # writes its report directly to stdout

# The leading newline is written as an escape; a raw line break inside a
# single-quoted literal is a syntax error.
print('\nDescriptive statistics (numeric):')
display(df.describe().T)

In [None]:
# Infer columns and feature groups
inferred = eda.infer_columns(df, COLUMN_HINTS)
print('Inferred columns:')
for role, column in inferred.items():
    # Roles that resolved to nothing are left out of the listing
    if column is None:
        continue
    print(f'  {role:>18}: {column}')

features = eda.build_feature_groups(df, target_col=inferred.get('target'))
# One row per feature group with the number of columns it holds
group_sizes = {group: len(members) for group, members in features.items()}
display(pd.Series(group_sizes).to_frame('count'))

# Parse the timestamp column unless it already carries a datetime dtype;
# unparseable values become NaT (errors='coerce') and results are UTC-aware.
ts_col = inferred.get('timestamp')
if ts_col:
    if not pd.api.types.is_datetime64_any_dtype(df[ts_col]):
        df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce', utc=True)

In [None]:
# Target class imbalance
# Delegates to the project helper, passing the inferred target column
# (None when no 'target' entry was inferred).
_y, _pos = eda.class_imbalance(df, inferred.get('target'))
# _y maps target to {0,1} and stores as df['__y__'] when available
# NOTE(review): the helper's return values and side effects are not visible in
# this notebook; later cells only check whether df has a '__y__' column —
# confirm the contract against eda.class_imbalance.

In [None]:
# Missing values: percentage per column and visualization
miss = df.isna().mean().sort_values(ascending=False)
miss_table = miss.to_frame('missing_rate')
display(miss_table.style.format({'missing_rate': '{:.2%}'}))

# Only columns that actually have gaps, capped at the 30 highest rates
top_missing = miss[miss > 0].head(30)
if len(top_missing) > 0:
    # Height grows with the number of bars but never drops below 3 inches
    fig_height = max(3, 0.3 * len(top_missing))
    plt.figure(figsize=(10, fig_height))
    sns.barplot(x=top_missing.values, y=top_missing.index, color='#4C78A8')
    plt.gca().xaxis.set_major_formatter(
        FuncFormatter(lambda rate, _tick: f'{rate:.0%}')
    )
    plt.title('Top missingness per column')
    plt.xlabel('Missing rate')
    plt.ylabel('Column')
    plt.tight_layout()
    plt.show()

# Missingness heatmap (sampled) via helper
eda.missingness(df)

In [None]:
# Categorical distributions: merchant category, device info, user demographics
cat_keys = [
    'merchant_category', 'device_type', 'device_os', 'user_gender', 'user_age', 'user_income', 'country', 'state', 'city'
]
# Keep only roles that resolved to an existing column. dict.fromkeys removes
# repeats (two roles resolving to the same column) while preserving order, so
# no column is tabulated and plotted twice.
cats = list(dict.fromkeys(
    inferred.get(k) for k in cat_keys if inferred.get(k) in df.columns
))
if not cats:
    # Fallback: auto-detected categorical columns with reasonable cardinality
    # (between 2 and 50 distinct non-null values), limited to the first 8.
    cats = [
        c for c in features.get('categorical', [])
        if 2 <= df[c].nunique(dropna=True) <= 50
    ][:8]
for col in cats:
    # Count once and reuse the result for both the table and the bar chart.
    vc = df[col].value_counts(dropna=False).head(20)
    # to_frame() names the column explicitly. Building the table with
    # pd.DataFrame(series, columns=['count']) selects by the Series name and
    # comes back without the counts whenever that name is not 'count'
    # (older pandas names the value_counts result after the source column).
    display(vc.to_frame('count'))
    plt.figure(figsize=(10, 4))
    sns.barplot(x=vc.values, y=vc.index.astype(str), color='#4C78A8')
    plt.title(f'{col} top levels')
    plt.xlabel('Count'); plt.ylabel(col)
    plt.tight_layout(); plt.show()
    if '__y__' in df.columns:
        # Mean of the '__y__' column per level, 20 highest means first
        gr = df.groupby(col)['__y__'].mean().sort_values(ascending=False).head(20)
        plt.figure(figsize=(10, 4))
        sns.barplot(x=gr.values, y=gr.index.astype(str), color='#F58518')
        plt.gca().xaxis.set_major_formatter(FuncFormatter(lambda x, pos: f'{x:.1%}'))
        plt.title(f'Fraud rate by {col}')
        plt.xlabel('Fraud rate'); plt.ylabel(col)
        plt.tight_layout(); plt.show()

In [None]:
# Transaction amount analysis: summary, histogram, box plot
amount_col = inferred.get('amount')
if amount_col and amount_col in df.columns and pd.api.types.is_numeric_dtype(df[amount_col]):
    print(f'Amount column: {amount_col}')
    display(df[amount_col].describe().to_frame('value'))
    try:
        skew_val = df[amount_col].skew(skipna=True)
        print(f'Skewness: {skew_val:.4f}')
    except Exception as exc:
        # Skewness stays best-effort, but say why it is missing rather than
        # silently omitting it from the output.
        print(f'Skewness unavailable: {exc}')
    # Drop missing values once; both plots share the same series.
    amounts = df[amount_col].dropna()
    if amounts.empty:
        # Every value is missing, so there is nothing to draw.
        print('No non-missing amounts to plot.')
    else:
        fig, axes = plt.subplots(1, 2, figsize=(12, 4))
        sns.histplot(amounts, kde=True, ax=axes[0], color='#4C78A8')
        axes[0].set_title(f'{amount_col} distribution')
        sns.boxplot(x=amounts, ax=axes[1], color='#F58518')
        axes[1].set_title(f'{amount_col} box plot')
        plt.tight_layout(); plt.show()
else:
    print('Transaction amount column not found or not numeric.')

In [None]:
# Correlation matrix including numeric target
num_cols = features.get('numeric', [])
cols_for_corr = list(num_cols)
# Add the '__y__' column only when it is not already listed: a duplicated
# label makes df_corr['__y__'] a DataFrame, which breaks the nunique check
# below and would drop the target from the matrix.
if '__y__' in df.columns and '__y__' not in cols_for_corr:
    cols_for_corr.append('__y__')
df_corr = df[cols_for_corr].copy()
# Drop constant columns (a single distinct non-null value, or none): they have
# no variance, so their correlations are undefined and only add blank cells.
keep = []
for c in cols_for_corr:
    try:
        varies = df_corr[c].nunique(dropna=True) > 1
    except TypeError as exc:
        # Values that cannot be hashed cannot be counted; skip the column but
        # report it instead of hiding the failure.
        print(f'Skipping {c!r} in correlation: {exc}')
        continue
    if varies:
        keep.append(c)
df_corr = df_corr[keep]
corr = df_corr.corr(method='pearson')
if len(corr) < 2:
    # A heatmap needs at least two columns to show any pairwise correlation.
    print('Not enough non-constant numeric columns for a correlation heatmap.')
else:
    # Square figure that scales with the matrix, clamped to 4-14 inches so a
    # small matrix still leaves room for the title and tick labels.
    side = min(max(1 + 0.4 * len(corr), 4), 14)
    plt.figure(figsize=(side, side))
    sns.heatmap(corr, cmap='vlag', center=0, square=True)
    plt.title('Correlation heatmap (including target if available)')
    plt.tight_layout(); plt.show()

## Next steps
- **Feature engineering**: time-based aggregations, user/merchant risk features, device/IP reputation, target encoding for high-cardinality categories.
- **Preprocessing**: imputation, log-transform skewed amounts, scaling, outlier capping.
- **Model design**: time-aware splits, class weighting or resampling, calibration, SHAP/feature importance for interpretability.