# Credit Card Fraud EDA

Initial profiling of the `creditcard.csv` dataset to understand class imbalance, feature distributions, and data quality signals.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" theme for the figures drawn after this call.
sns.set_theme(style="whitegrid")

# Location of the raw CSV. The relative path is resolved against the current
# working directory, so the absolute result depends on where the kernel runs.
DATA_PATH = Path("../../data/raw/creditcard_fraud/creditcard.csv").resolve()
# Bare expression: echoes the resolved path as the cell output.
DATA_PATH

In [None]:
# Read the whole CSV at DATA_PATH into memory; `df` is the frame every later
# cell profiles.
df = pd.read_csv(DATA_PATH)
# Preview the first rows as the cell output.
df.head()

In [None]:
# Report the (rows, columns) dimensions first.
df_shape = df.shape
print("Shape:", df_shape)

# Then tally how many columns share each dtype; shown as the cell output.
df_types = df.dtypes.value_counts()
df_types

In [None]:
# Summary statistics with one row per column (describe() output transposed).
numeric_summary = df.describe().transpose()
# Show only the first five rows of the summary.
numeric_summary.head(5)

In [None]:
# Fraction of missing values per column, largest first. The mean of the
# boolean isna() mask is the share of rows that are missing.
missing = df.isna().mean().sort_values(ascending=False)
# Show only the columns that actually contain missing values.
missing.loc[missing.gt(0)]

In [None]:
# Tally transactions per class and plot absolute counts next to proportions.
#
# value_counts() orders rows by frequency, so pairing it with a positional
# colour list only paints "Legit" blue and "Fraud" red while legitimate rows
# are the majority. Sorting by label and looking colours up by name keeps the
# mapping stable for any class balance; labels other than 0/1 are kept and
# drawn in grey so they stand out instead of borrowing a class colour.
class_names = {0: "Legit", 1: "Fraud"}
class_palette = {"Legit": "#2b8cbe", "Fraud": "#de2d26"}

class_counts = df['Class'].value_counts().sort_index().rename(class_names)
class_ratio = class_counts / class_counts.sum()
bar_colors = [class_palette.get(label, "#969696") for label in class_counts.index]

fig, ax = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: raw transaction counts per class.
class_counts.plot(kind='bar', ax=ax[0], color=bar_colors)
ax[0].set_title("Transaction Count by Class")
ax[0].set_ylabel("Count")

# Right panel: the same data as a share of all transactions.
class_ratio.plot(kind='bar', ax=ax[1], color=bar_colors)
ax[1].set_title("Class Proportion")
ax[1].set_ylabel("Ratio")

plt.tight_layout()
plt.show()

# Echo the exact numbers behind the bars as the cell output.
class_counts, class_ratio

In [None]:
# Pairwise Pearson correlation between every column except the target label.
feature_frame = df.drop('Class', axis=1)
corr_matrix = feature_frame.corr(method='pearson')

# Diverging colour map centred on zero: positive and negative correlations
# get different hues.
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(corr_matrix, cmap='coolwarm', center=0)
heatmap_ax.set_title('Correlation Heatmap (Features only)')
plt.show()

In [None]:
# Columns to compare across classes, one per panel of the 3x2 grid.
sample_features = ['V1', 'V2', 'V3', 'V4', 'V5', 'Amount']
fig, axes = plt.subplots(3, 2, figsize=(12, 12))

# common_norm=False normalises each class's density on its own, so the two
# class curves in a panel are not scaled by how many rows each class has.
for panel, column in zip(axes.ravel(), sample_features):
    sns.kdeplot(
        data=df,
        x=column,
        hue='Class',
        common_norm=False,
        ax=panel,
        fill=True,
    )
    panel.set_title(f'Distribution by Class: {column}')

plt.tight_layout()
plt.show()

## Candidate Data Quality Checks

- Class label validity (`Class` must be present and contain only 0 or 1).
- No missing values across numeric features.
- Transaction `Amount` must be non-negative.
- Feature ranges should stay within the bounds observed in this profiling run (values outside them flag drift).
- Class imbalance ratio monitoring to detect ingestion issues.