# Credit Card Fraud Detection - Exploratory Data Analysis

This notebook provides a comprehensive exploratory data analysis (EDA) of the credit card fraud detection dataset.

## Contents
1. Dataset Overview
2. Class Imbalance Analysis
3. Feature Distributions
4. Correlation Analysis
5. Fraud vs Legitimate Patterns
6. Time Analysis
7. Amount Analysis
8. Key Insights

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot appearance: apply the base style first, then layer the palette and
# the rcParams overrides on top of it.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# Silence all warnings for the rest of the session so cell output stays clean
import warnings
warnings.filterwarnings('ignore')

## 1. Dataset Overview

In [None]:
# Read the raw transactions CSV into the notebook-wide DataFrame `df`
data_path = Path('../data/raw/creditcard.csv')
df = pd.read_csv(data_path)

# Report the frame's dimensions, row count and column names in one call;
# sep="\n" reproduces the line breaks of separate print statements.
print(
    f"Dataset Shape: {df.shape}",
    f"Total Transactions: {df.shape[0]:,}",
    f"\nFeatures: {list(df.columns)}",
    sep="\n",
)

In [None]:
# describe() summary of df; the bare name on the last line makes the notebook
# render it as a rich table.
summary_stats = df.describe()
summary_stats

In [None]:
# Count nulls per column and keep only the columns that actually have some
missing = df.isna().sum()
columns_with_gaps = missing[missing > 0]

print("Missing Values:")
if columns_with_gaps.empty:
    print("✓ No missing values found")
else:
    print(columns_with_gaps)

In [None]:
# How many columns there are of each dtype
dtype_counts = df.dtypes.value_counts()
print("\nData Types:", dtype_counts, sep="\n")

## 2. Class Imbalance Analysis

In [None]:
# Class distribution: absolute counts and percentage share per class label
class_counts = df['Class'].value_counts()
class_pct = df['Class'].value_counts(normalize=True) * 100

# Look the counts up by class label (0 = legitimate, 1 = fraud)
n_legit, n_fraud = class_counts[0], class_counts[1]

print("Class Distribution:")
print(f"  Legitimate (0): {n_legit:,} ({class_pct[0]:.3f}%)")
print(f"  Fraud (1):      {n_fraud:,} ({class_pct[1]:.3f}%)")
print(f"\nImbalance Ratio: {n_legit / n_fraud:.0f}:1")

In [None]:
# Visualize class imbalance
# Order the counts explicitly by class label (0 = legitimate, 1 = fraud) so the
# hard-coded labels and colors below always match the right class, instead of
# relying on value_counts() happening to list the legitimate class first.
ordered_counts = class_counts.reindex([0, 1], fill_value=0)
class_labels = ['Legitimate', 'Fraud']
colors = ['#2ecc71', '#e74c3c']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart
bars = axes[0].bar(class_labels, ordered_counts.values, color=colors)
axes[0].set_ylabel('Count')
axes[0].set_title('Transaction Class Distribution')
# bar_label pads in points, so the annotation sits just above each bar
# regardless of the y-axis scale (no data-unit offset to tune).
axes[0].bar_label(
    bars,
    labels=[f'{count:,}' for count in ordered_counts.values],
    padding=3,
    fontsize=11,
)

# Pie chart
axes[1].pie(ordered_counts.values, labels=class_labels,
            colors=colors, autopct='%1.3f%%', explode=(0, 0.1),
            shadow=True, startangle=90)
axes[1].set_title('Class Percentage Distribution')

plt.tight_layout()
plt.show()

## 3. Feature Distributions

In [None]:
# PCA features (V1-V28) distribution
v_features = ['V' + str(n) for n in range(1, 29)]

# 7 x 4 grid, flattened so each feature gets the next panel in reading order
fig, axes = plt.subplots(7, 4, figsize=(16, 20))
axes = axes.ravel()

for panel, feature in zip(axes, v_features):
    panel.hist(df[feature], bins=50, alpha=0.7, edgecolor='black', linewidth=0.5)
    panel.set_title(feature, fontsize=10)
    panel.tick_params(labelsize=8)

fig.suptitle('Distribution of PCA-Transformed Features (V1-V28)', fontsize=14, y=1.02)
fig.tight_layout()
plt.show()

In [None]:
# Per-feature mean / std / min / max, one row per V-feature
feature_stats = (
    df[v_features]
    .agg(['mean', 'std', 'min', 'max'])
    .T
    .rename(columns={'mean': 'Mean', 'std': 'Std', 'min': 'Min', 'max': 'Max'})
)
feature_stats

## 4. Correlation Analysis

In [None]:
# Correlation with target
# Correlate each numeric feature with Class directly. This avoids building the
# full feature-by-feature matrix just to read one column, and it skips
# non-numeric columns (such as the categorical Amount_Bucket added later in
# this notebook) that can make df.corr() raise if this cell is re-run after
# the later cells have executed.
numeric_features = df.select_dtypes(include='number').drop(columns='Class')
correlations = (
    numeric_features
    .corrwith(df['Class'])
    .rename('Class')   # keep the Series name that df.corr()['Class'] carried
    .sort_values()     # ascending: most negative first, most positive last
)

fig, ax = plt.subplots(figsize=(10, 10))
# Red for negative, green for positive. A separate name is used so the
# two-class `colors` palette defined for the class-imbalance plot is not
# overwritten.
bar_colors = ['#e74c3c' if value < 0 else '#2ecc71' for value in correlations]
correlations.plot(kind='barh', color=bar_colors, ax=ax)
ax.set_xlabel('Correlation with Fraud (Class)')
ax.set_title('Feature Correlation with Fraud')
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.show()

In [None]:
# `correlations` is sorted ascending, so its last five entries are the most
# positive correlations and its first five the most negative.
strongest_positive = correlations.iloc[-5:]
strongest_negative = correlations.iloc[:5]

print("Top 5 Positively Correlated Features:")
print(strongest_positive)
print("\nTop 5 Negatively Correlated Features:")
print(strongest_negative)

In [None]:
# Correlation heatmap for top features
# Five most negative and five most positive correlates, plus Amount and the
# target. dict.fromkeys() removes repeats while keeping first-seen order, so a
# name that appears twice (e.g. Amount ranking among the extremes, or head/tail
# overlapping on a narrow frame) cannot produce duplicate rows/columns.
candidate_features = (
    list(correlations.head(5).index)
    + list(correlations.tail(5).index)
    + ['Amount', 'Class']
)
top_features = list(dict.fromkeys(candidate_features))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df[top_features].corr(), annot=True, cmap='RdYlGn', center=0,
            fmt='.2f', square=True, linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap - Top Features')
plt.tight_layout()
plt.show()

## 5. Fraud vs Legitimate Patterns

In [None]:
# Partition the transactions by target label: 1 -> fraud, 0 -> legit
class_label = df['Class']
fraud = df[class_label.eq(1)]
legit = df[class_label.eq(0)]

for description, subset in (("Fraud", fraud), ("Legitimate", legit)):
    print(f"{description} transactions: {len(subset):,}")

In [None]:
# Overlay per-class density histograms for a hand-picked set of features
important_features = ['V17', 'V14', 'V12', 'V10', 'V16', 'V3', 'V7', 'V11']

fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.ravel()

for panel, feature in zip(axes, important_features):
    # Legitimate is drawn first, fraud on top of it
    for subset, label in ((legit, 'Legitimate'), (fraud, 'Fraud')):
        panel.hist(subset[feature], bins=50, alpha=0.5, label=label, density=True)
    panel.set_title(feature)
    panel.legend()

fig.suptitle('Distribution Comparison: Fraud vs Legitimate', fontsize=14)
fig.tight_layout()
plt.show()

In [None]:
# One box plot per key feature, grouped by Class, on a flattened 2 x 4 grid
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.ravel()

for panel, feature in zip(axes, important_features):
    df.boxplot(column=feature, by='Class', ax=panel)
    # Set the panel title to the feature name and blank out the x-axis label
    panel.set_title(feature)
    panel.set_xlabel('')

fig.suptitle('Feature Distribution by Class', fontsize=14, y=1.02)
fig.tight_layout()
plt.show()

## 6. Time Analysis

In [None]:
# Range of the Time column; the prints below label it in seconds and convert
# it to hours and days.
time_min = df['Time'].min()
time_max = df['Time'].max()
span_seconds = time_max - time_min

print("Time Feature Statistics:")
print(f"  Min: {time_min} seconds")
print(f"  Max: {time_max} seconds ({time_max / 3600:.1f} hours)")
print(f"  Span: {span_seconds / 3600:.1f} hours (~{span_seconds / 86400:.1f} days)")

In [None]:
# Fold Time into a fractional hour in [0, 24) by dividing by 3600 and
# wrapping modulo 24
df['Hour'] = df['Time'].div(3600).mod(24)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
volume_ax, rate_ax = axes

# Left panel: transaction volume by hour
volume_ax.hist(df['Hour'], bins=24, alpha=0.7, edgecolor='black')
volume_ax.set(
    xlabel='Hour of Day',
    ylabel='Number of Transactions',
    title='Transaction Volume by Hour',
)

# Right panel: fraud rate (%) per whole hour, with the overall rate as a
# dashed reference line
hourly_fraud = df['Class'].groupby(df['Hour'].astype(int)).mean().mul(100)
overall_fraud_rate = df['Class'].mean() * 100
rate_ax.bar(hourly_fraud.index, hourly_fraud.values, color='#e74c3c', alpha=0.7)
rate_ax.set(
    xlabel='Hour of Day',
    ylabel='Fraud Rate (%)',
    title='Fraud Rate by Hour',
)
rate_ax.axhline(y=overall_fraud_rate, color='black', linestyle='--', label='Average')
rate_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Density of transactions over elapsed hours, one overlaid histogram per class
fig, ax = plt.subplots(figsize=(12, 5))
for subset, label in ((legit, 'Legitimate'), (fraud, 'Fraud')):
    ax.hist(subset['Time'] / 3600, bins=48, alpha=0.5, label=label, density=True)
ax.set(
    xlabel='Time (hours since first transaction)',
    ylabel='Density',
    title='Transaction Time Distribution: Fraud vs Legitimate',
)
ax.legend()
fig.tight_layout()
plt.show()

## 7. Amount Analysis

In [None]:
# Amount statistics: full summary for all rows, then mean/median per class
print("Transaction Amount Statistics:")

all_amounts = df['Amount']
print("\nAll Transactions:")
for stat_name, stat_value in (
    ("Mean", all_amounts.mean()),
    ("Median", all_amounts.median()),
    ("Std", all_amounts.std()),
    ("Min", all_amounts.min()),
    ("Max", all_amounts.max()),
):
    print(f"  {stat_name}: ${stat_value:.2f}")

for group_name, subset in (("Legitimate", legit), ("Fraud", fraud)):
    group_amounts = subset['Amount']
    print(f"\n{group_name} Transactions:")
    print(f"  Mean: ${group_amounts.mean():.2f}")
    print(f"  Median: ${group_amounts.median():.2f}")

In [None]:
# Amount distribution
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Upper limit of the zoomed-in views (middle histogram and box plot)
amount_view_max = 500

# Overall distribution (counts on a log-scaled y-axis)
axes[0].hist(df['Amount'], bins=100, alpha=0.7, edgecolor='black')
axes[0].set_xlabel('Amount ($)')
axes[0].set_ylabel('Count')
axes[0].set_title('Transaction Amount Distribution')
axes[0].set_yscale('log')

# Fraud vs Legitimate
# Use one shared set of bin edges covering exactly the displayed window.
# Binning each class over its own full amount range and then clipping the
# x-axis left only a few very wide bins in view, with different edges per
# class, so the two histograms were not comparable. With explicit edges,
# density is normalized over the transactions that fall inside the window.
amount_bins = np.linspace(0, amount_view_max, 51)  # 50 equal-width bins
axes[1].hist(legit['Amount'], bins=amount_bins, alpha=0.5, label='Legitimate', density=True)
axes[1].hist(fraud['Amount'], bins=amount_bins, alpha=0.5, label='Fraud', density=True)
axes[1].set_xlabel('Amount ($)')
axes[1].set_ylabel('Density')
axes[1].set_title('Amount Distribution by Class')
axes[1].set_xlim(0, amount_view_max)  # Focus on most common range
axes[1].legend()

# Box plot
df.boxplot(column='Amount', by='Class', ax=axes[2])
axes[2].set_xlabel('Class')
axes[2].set_ylabel('Amount ($)')
axes[2].set_title('Amount by Class')
axes[2].set_ylim(0, amount_view_max)

# Blank out the figure-level title
plt.suptitle('')
plt.tight_layout()
plt.show()

In [None]:
# Fraud rate by amount bucket
# pd.cut builds right-closed intervals by default, so with a left edge of 0 the
# first bucket would be (0, 10] and any transaction with Amount == 0 would get
# a NaN bucket and silently drop out of the groupby below.
# include_lowest=True closes the first interval on the left so $0 amounts are
# counted in '$0-10'.
df['Amount_Bucket'] = pd.cut(
    df['Amount'],
    bins=[0, 10, 50, 100, 250, 500, 1000, float('inf')],
    labels=['$0-10', '$10-50', '$50-100', '$100-250', '$250-500', '$500-1000', '$1000+'],
    include_lowest=True,
)

# Per bucket: 'sum' is the number of fraud rows (Class == 1), 'count' the
# number of rows, so their ratio is the bucket's fraud rate.
fraud_by_amount = df.groupby('Amount_Bucket', observed=True)['Class'].agg(['sum', 'count'])
fraud_by_amount['fraud_rate'] = fraud_by_amount['sum'] / fraud_by_amount['count'] * 100

fig, ax = plt.subplots(figsize=(10, 5))
fraud_by_amount['fraud_rate'].plot(kind='bar', color='#e74c3c', alpha=0.7, ax=ax)
ax.set_xlabel('Amount Bucket')
ax.set_ylabel('Fraud Rate (%)')
ax.set_title('Fraud Rate by Transaction Amount')
# Dashed reference line: overall fraud rate across all transactions
ax.axhline(y=df['Class'].mean() * 100, color='black', linestyle='--', label='Average')
plt.xticks(rotation=45)
ax.legend()
plt.tight_layout()
plt.show()

## 8. Key Insights

In [None]:
# Summary statistics
print("=" * 60)
print("KEY INSIGHTS SUMMARY")
print("=" * 60)

print(f"""
1. DATASET OVERVIEW
   • {len(df):,} transactions over ~48 hours
   • 30 features (Time, V1-V28, Amount) + 1 target (Class)
   • No missing values

2. CLASS IMBALANCE (Critical)
   • Only {class_pct[1]:.3f}% fraud ({class_counts[1]:,} transactions)
   • Imbalance ratio: {class_counts[0] // class_counts[1]}:1
   • Requires: SMOTE, class weights, or threshold tuning

3. MOST PREDICTIVE FEATURES (|correlation| > 0.1)
   • Negative: V17, V14, V12, V10 (fraud has lower values)
   • Positive: V4, V11 (fraud has higher values)

4. AMOUNT PATTERNS
   • Fraud mean: ${fraud['Amount'].mean():.2f} vs Legit: ${legit['Amount'].mean():.2f}
   • Most fraud: small amounts (harder to detect)
   • Large transactions (>$1000): rare but higher fraud rate

5. TIME PATTERNS
   • Fraud slightly more common during certain hours
   • No strong temporal pattern due to anonymization

6. RECOMMENDATIONS
   • Use ROC-AUC and PR-AUC as primary metrics (not accuracy)
   • Optimize for Recall (catch fraud) with acceptable Precision
   • Consider ensemble methods (Random Forest, XGBoost)
   • Feature engineering: time-based, amount normalization
""")

In [None]:
# Remove the helper columns this notebook added to df; errors='ignore' keeps
# the cell safe to run when they are already gone.
temp_columns = ['Hour', 'Amount_Bucket']
df.drop(columns=temp_columns, inplace=True, errors='ignore')
print("EDA Complete! ✓")