# Fraud Detection Data Exploration

This notebook performs comprehensive exploratory data analysis (EDA) on the credit card fraud detection dataset to understand the data characteristics, distributions, and patterns that will inform our preprocessing and modeling strategies.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global figure appearance: apply the matplotlib style sheet first, then the
# seaborn palette, in that order.
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

# Simple start-of-run marker in the notebook output.
print("Starting Fraud Detection Data Exploration...")

## 1. Data Loading and Basic Information

In [None]:
# Load the dataset
# The project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to run from the notebooks directory).
# `Path` comes from the import cell at the top, so no extra import is needed here.
project_root = Path.cwd().parent
data_path = project_root / "data" / "raw" / "creditcard.csv"
df = pd.read_csv(data_path)

print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())

In [None]:
# Basic information about the dataset
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

print("\n\nDescriptive Statistics:")
print(df.describe())

## 2. Target Variable Analysis (Class Distribution)

In [None]:
# Target variable: absolute counts and percentage share of each class.
target = df['Class']
class_counts = target.value_counts()
class_percentages = target.value_counts(normalize=True) * 100

# Look the values up by class label (0 = normal, 1 = fraud) once, then report.
n_normal, n_fraud = class_counts[0], class_counts[1]
pct_normal, pct_fraud = class_percentages[0], class_percentages[1]

print("Class Distribution:")
print(f"Normal transactions: {n_normal} ({pct_normal:.2f}%)")
print(f"Fraudulent transactions: {n_fraud} ({pct_fraud:.2f}%)")
print(f"\nClass imbalance ratio: {n_normal/n_fraud:.1f}:1")

In [None]:
# Visualize class distribution
# value_counts() orders its result by frequency, not by class label. Sort by
# label so position 0 is always class 0 (Normal) and position 1 is class 1
# (Fraud), matching the hard-coded tick and pie labels below.
counts_by_label = class_counts.sort_index()
percentages_by_label = class_percentages.sort_index()
class_names = ['Normal', 'Fraud']

fig, (ax_count, ax_pie) = plt.subplots(1, 2, figsize=(12, 5))

# Subplot 1: Count plot
sns.countplot(x='Class', data=df, ax=ax_count)
ax_count.set_title('Transaction Class Distribution', fontsize=14, fontweight='bold')
ax_count.set_xlabel('Class (0=Normal, 1=Fraud)')
ax_count.set_ylabel('Count')
ax_count.set_xticks([0, 1])
ax_count.set_xticklabels(class_names)

# Add count labels on bars (small vertical offset so the text sits above the bar)
for position, count in enumerate(counts_by_label):
    ax_count.text(position, count + 50, str(count), ha='center', va='bottom', fontweight='bold')

# Subplot 2: Pie chart
colors = ['#2ecc71', '#e74c3c']
ax_pie.pie(percentages_by_label.values, labels=class_names, colors=colors, autopct='%1.1f%%', startangle=90)
ax_pie.set_title('Transaction Class Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 3. Feature Analysis

In [None]:
# Split the frame into the target column and everything else.
y = df['Class']
X = df.drop(columns='Class')

print(f"Number of features: {len(X.columns)}")
print(f"Feature names: {X.columns.tolist()}")

In [None]:
# Analyze Time and Amount features (the only non-PCA features)
# Split by class once instead of re-filtering the whole frame for every plot call.
normal_rows = df[df['Class'] == 0]
fraud_rows = df[df['Class'] == 1]
class_names = ['Normal', 'Fraud']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Time distribution
ax = axes[0, 0]
ax.hist(df['Time'], bins=50, alpha=0.7, color='skyblue')
ax.set_title('Time Distribution', fontweight='bold')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Frequency')

# Amount distribution
ax = axes[0, 1]
ax.hist(df['Amount'], bins=50, alpha=0.7, color='lightgreen')
ax.set_title('Amount Distribution', fontweight='bold')
ax.set_xlabel('Amount ($)')
ax.set_ylabel('Frequency')

# Log scale for Amount to see the distribution better
ax = axes[0, 2]
ax.hist(np.log1p(df['Amount']), bins=50, alpha=0.7, color='lightgreen')
ax.set_title('Amount Distribution (Log Scale)', fontweight='bold')
ax.set_xlabel('Log(Amount + 1)')
ax.set_ylabel('Frequency')

# Time vs Amount scatter plot (fraud drawn last, larger and more opaque, so it stays visible)
ax = axes[1, 0]
ax.scatter(normal_rows['Time'], normal_rows['Amount'], alpha=0.1, color='blue', label='Normal', s=1)
ax.scatter(fraud_rows['Time'], fraud_rows['Amount'], alpha=0.5, color='red', label='Fraud', s=5)
ax.set_title('Time vs Amount', fontweight='bold')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Amount ($)')
ax.legend()

# Amount by class
# Tick labels are set explicitly rather than through boxplot's `labels=` keyword,
# which newer matplotlib releases deprecate; this form works on old and new versions.
ax = axes[1, 1]
ax.boxplot([normal_rows['Amount'], fraud_rows['Amount']])
ax.set_xticklabels(class_names)
ax.set_title('Amount by Class', fontweight='bold')
ax.set_ylabel('Amount ($)')
ax.set_yscale('log')

# Time by class
ax = axes[1, 2]
ax.boxplot([normal_rows['Time'], fraud_rows['Time']])
ax.set_xticklabels(class_names)
ax.set_title('Time by Class', fontweight='bold')
ax.set_ylabel('Time (seconds)')

plt.tight_layout()
plt.show()

## 4. PCA Features Analysis (V1-V28)

In [None]:
# Column names V1..V28, followed by a compact per-feature statistics table.
pca_features = ['V' + str(n) for n in range(1, 29)]

print("PCA Features Statistics:")
pca_summary = df[pca_features].describe().T
print(pca_summary[['mean', 'std', 'min', 'max']].round(3))

In [None]:
# Histograms of the first 12 PCA features on a 4x3 grid, one feature per axes.
fig, axes = plt.subplots(4, 3, figsize=(15, 12))

for ax, feature in zip(axes.flat, pca_features[:12]):
    ax.hist(df[feature], bins=30, alpha=0.7, color='orange')
    ax.set_title(f'{feature} Distribution', fontsize=10, fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Feature Correlations

In [None]:
# Pairwise correlations across every column of the frame, drawn as a heatmap.
correlation_matrix = df.corr()

heatmap_options = dict(
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    cbar_kws={'label': 'Correlation Coefficient'},
)

plt.figure(figsize=(20, 16))
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Correlation of features with the target variable
# Reuse the matrix already computed for the heatmap instead of running the
# full pairwise df.corr() a second time; the 'Class' column holds the same values.
correlations_with_target = correlation_matrix['Class'].sort_values(ascending=False)

print("Features most correlated with fraud (Class=1):")
print(correlations_with_target.head(10))

# The series is sorted in descending order, so the tail holds the most negative values.
print("\nFeatures most negatively correlated with fraud (Class=0):")
print(correlations_with_target.tail(10))

In [None]:
# Bar chart of the ten features with the largest absolute correlation to the target.

# The target's correlation with itself is dropped before ranking.
target_correlations = correlations_with_target.drop('Class')

# Rank by magnitude; the sign is recovered from target_correlations below.
top_features = target_correlations.abs().sort_values(ascending=False).head(10)

# Red marks a negative correlation, blue anything else.
colors = np.where(target_correlations[top_features.index] < 0, 'red', 'blue').tolist()

fig, ax = plt.subplots(figsize=(12, 8))
positions = list(range(len(top_features)))

ax.bar(positions, top_features.values, color=colors, alpha=0.7)
ax.set_xticks(positions)
ax.set_xticklabels(top_features.index, rotation=45)
ax.set_title('Top 10 Features by Absolute Correlation with Fraud', fontsize=14, fontweight='bold')
ax.set_ylabel('Absolute Correlation Coefficient')
ax.set_xlabel('Features')
ax.grid(True, alpha=0.3)

# Annotate each bar with the signed correlation, slightly above the bar top.
for position, (feature, magnitude) in enumerate(top_features.items()):
    signed_value = target_correlations[feature]
    ax.text(position, magnitude + 0.01, f'{signed_value:.3f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

## 6. Missing Values and Data Quality

In [None]:
# Check for missing values
missing_values = df.isnull().sum()
total_missing = missing_values.sum()

# Only columns that actually contain missing entries are listed.
print("Missing values per column:")
print(missing_values[missing_values > 0])

# Status markers restored to the intended emoji (they were stored as mis-decoded bytes).
if total_missing == 0:
    print("\n✅ No missing values found in the dataset")
else:
    print(f"\n⚠️ Found {total_missing} missing values")

In [None]:
# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

# Status markers restored to the intended emoji (they were stored as mis-decoded bytes).
if duplicates > 0:
    print(f"\n⚠️ Found {duplicates} duplicate rows")
    print("Removing duplicates...")
    # NOTE: this rebinds `df`, so every later cell sees the de-duplicated frame,
    # while values computed in earlier cells (class counts, correlations) still
    # reflect the original rows. Re-running this cell afterwards reports 0 duplicates.
    df = df.drop_duplicates()
    print(f"Dataset shape after removing duplicates: {df.shape}")
else:
    print("\n✅ No duplicate rows found")

## 7. Feature Distributions by Class

In [None]:
# Compare feature distributions between normal and fraud transactions
normal_data = df[df['Class'] == 0]
fraud_data = df[df['Class'] == 1]

print(f"Normal transactions: {len(normal_data)}")
print(f"Fraud transactions: {len(fraud_data)}")

# Distribution comparison for top correlated features.
# 'Class' is dropped before ranking: its correlation with itself is 1.0, so it
# would otherwise always take the first slot and waste a panel on the target.
top_correlated_features = (
    correlations_with_target
    .drop('Class')
    .abs()
    .sort_values(ascending=False)
    .head(6)
    .index
    .tolist()
)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for ax, feature in zip(axes.flat, top_correlated_features):
    # density=True normalizes each histogram separately, so the two classes
    # can be compared despite their very different sizes.
    ax.hist(normal_data[feature], bins=30, alpha=0.6, label='Normal', color='blue', density=True)
    ax.hist(fraud_data[feature], bins=30, alpha=0.6, label='Fraud', color='red', density=True)

    ax.set_title(f'{feature} Distribution by Class', fontweight='bold')
    ax.set_xlabel('Value')
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Summary and Insights

In [None]:
# Final summary of the exploration.
# Class counts are recomputed from the current `df` rather than reusing the
# values from the class-distribution cell: `df` may have been de-duplicated
# since then, and the per-class numbers must add up to the total printed below.
summary_counts = df['Class'].value_counts()
summary_percentages = df['Class'].value_counts(normalize=True) * 100
n_normal, n_fraud = summary_counts[0], summary_counts[1]
pct_normal, pct_fraud = summary_percentages[0], summary_percentages[1]

# Per-class transaction amounts, filtered once.
normal_amounts = df.loc[df['Class'] == 0, 'Amount']
fraud_amounts = df.loc[df['Class'] == 1, 'Amount']

time_min, time_max = df['Time'].min(), df['Time'].max()

print("="*60)
print("EXPLORATORY DATA ANALYSIS SUMMARY")
print("="*60)

# Section markers below are restored to the intended emoji/symbols (they were
# stored as mis-decoded bytes).
print("\n📊 DATASET OVERVIEW:")
print(f"   - Total transactions: {len(df):,}")
print(f"   - Features: {len(X.columns)}")
print("   - Target classes: 2 (Normal=0, Fraud=1)")

print("\n🎯 CLASS IMBALANCE:")
print(f"   - Normal transactions: {n_normal:,} ({pct_normal:.2f}%)")
print(f"   - Fraud transactions: {n_fraud:,} ({pct_fraud:.2f}%)")
print(f"   - Imbalance ratio: {n_normal/n_fraud:.1f}:1")
print("   ⚠️  This is a highly imbalanced dataset!")

print("\n💰 AMOUNT ANALYSIS:")
print(f"   - Mean amount (Normal): ${normal_amounts.mean():.2f}")
print(f"   - Mean amount (Fraud): ${fraud_amounts.mean():.2f}")
print(f"   - Max amount: ${df['Amount'].max():.2f}")
print(f"   - Min amount: ${df['Amount'].min():.2f}")

print("\n⏰ TIME ANALYSIS:")
print(f"   - Time range: {time_min:.0f}s to {time_max:.0f}s")
print(f"   - Total duration: {(time_max - time_min)/3600:.1f} hours")

print("\n🔍 FEATURE INSIGHTS:")
print("   - PCA features (V1-V28): All standardized, mean ≈ 0")
# top_features is ranked by absolute correlation, so index[0] is the strongest in magnitude.
print(f"   - Most correlated with fraud: {top_features.index[0]}")
print(f"   - Strongest negative correlation: {target_correlations.idxmin()}")

print("\n✅ DATA QUALITY:")
print(f"   - Missing values: {missing_values.sum()}")
# `duplicates` is the number found by the duplicate check, before any removal.
print(f"   - Duplicate rows: {duplicates}")
print("   - Data type consistency: All numeric")

print("\n" + "="*60)
print("RECOMMENDATIONS FOR PREPROCESSING:")
print("="*60)
print("1. Handle class imbalance using:")
print("   - Undersampling majority class")
print("   - Oversampling minority class (SMOTE)")
print("   - Class weights in models")
print("\n2. Feature scaling:")
print("   - Amount feature needs scaling (wide range)")
print("   - Time feature may benefit from scaling")
print("   - PCA features already standardized")
print("\n3. Feature engineering opportunities:")
print("   - Time-based features (hour of day, day of week)")
print("   - Amount-based features (log transformation)")
print("   - Interaction features between top correlated variables")
print("\n4. Model considerations:")
print("   - Use metrics suitable for imbalanced data (AUC-ROC, Precision-Recall)")
print("   - Focus on recall to minimize false negatives (missed fraud)")
print("   - Consider cost-sensitive learning")