# Fraud Detection Data Exploration
This notebook explores the dataset to understand feature distributions, correlations, and class imbalance.

Goals:
- Understand basic statistics.
- Visualize class distribution.
- Identify potential issues (missing values, outliers).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

# Configure plotting: apply seaborn's "whitegrid" style to subsequent figures
# and have matplotlib render its output inline in the notebook.
sns.set(style="whitegrid")
%matplotlib inline

In [None]:
# Location of the raw dataset CSV; the relative path is resolved against the
# notebook's current working directory.
DATA_PATH = Path("../data/raw/creditcard.csv")

def load_data(path: "str | Path") -> "pd.DataFrame | None":
    """Read the dataset CSV at ``path`` into a DataFrame.

    Args:
        path: Location of the CSV file, given as a ``pathlib.Path`` or as a
            plain string / ``os.PathLike``.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` (after printing a notice)
        when no regular file exists at ``path``.
    """
    # Coerce first so string paths work too; a bare str has no .exists().
    path = Path(path)
    # is_file() rather than exists(): a directory at this location is not a
    # readable dataset and would otherwise make read_csv raise.
    if not path.is_file():
        print(f"Data file not found at {path}")
        return None
    return pd.read_csv(path)

# Load the dataset once; every later cell checks `df` for None before using it.
df = load_data(DATA_PATH)
if df is None:
    # Loading failed: tell the reader where the file is expected.
    print("Please ensure data exists in data/raw/")
else:
    # Loading succeeded: report the dimensions and preview the first rows.
    print(f"Dataset shape: {df.shape}")
    display(df.head())

## Basic Statistics & Missing Values

In [None]:
if df is not None:
    # DataFrame.info() writes its report to stdout and returns None, so it is
    # called directly; wrapping it in print() would append a stray "None".
    df.info()
    display(df.describe())

    # Per-column count of null entries; only columns with at least one null
    # are reported.
    missing = df.isnull().sum()
    missing_cols = missing[missing > 0]
    if missing_cols.empty:
        # Say so explicitly instead of printing an empty Series repr.
        print("\nMissing values: none")
    else:
        print("\nMissing values:")
        print(missing_cols)

## Class Distribution
Observe the severe class imbalance between legitimate (`Class = 0`) and fraudulent (`Class = 1`) transactions.

In [None]:
if df is not None:
    # Bar chart of the number of rows per class label.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x='Class', ax=ax)
    ax.set_title("Class Distribution (0: Legit, 1: Fraud)")
    plt.show()

    # Share of each class as a fraction of all rows.
    class_share = df['Class'].value_counts(normalize=True)
    print(class_share)

## Correlation Matrix

In [None]:
if df is not None:
    plt.figure(figsize=(12,10))
    # Pairwise correlation between all columns of the frame.
    corr = df.corr()
    # Anchor the diverging colormap to the full correlation range so its
    # midpoint always means "no correlation" and colours are comparable across
    # runs. Cell annotations are left off (the previous annot_kws had no effect
    # without annot=True).
    sns.heatmap(corr, cmap='coolwarm_r', vmin=-1, vmax=1, center=0)
    plt.title("Correlation Matrix")
    plt.show()