# Credit Card Fraud Detection - Data Exploration

This notebook provides comprehensive exploratory data analysis (EDA) for the Online Payments Fraud Detection dataset from Kaggle.

## Dataset Information
- **Source**: Kaggle - Online Payments Fraud Detection Dataset
- **Objective**: Detect fraudulent transactions using machine learning
- **Challenge**: Highly imbalanced dataset, with fraud cases forming a small minority class

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Notebook-wide configuration
# Suppress every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib's stock style, seaborn's "husl" color cycle,
# and a 12x6 default figure size for any figure that does not set its own.
plt.style.use('default')
sns.set_palette(palette="husl")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully")

Libraries imported successfully


: 

## Load Dataset

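**Note**: Please download the dataset from Kaggle and place it in the `data/raw/` directory. If the file is not found, the next cell falls back to 10,000 randomly generated sample transactions, so the figures and statistics below are then illustrative only.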

In [None]:
# Read the raw transaction log; if the CSV is absent, build a seeded synthetic
# frame with the same column names so the remaining cells can still run.
data_path = "../data/raw/PS_20174392719_1491204439457_log.csv"
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    print("Dataset file not found!")
    print("Please download the dataset from Kaggle and place it in '../data/raw/'")

    # Synthetic stand-in for demonstration
    print("Creating sample data for demonstration...")
    np.random.seed(42)
    n_samples = 10000

    # The random columns are drawn in this exact order: with a single seeded
    # global stream, the order of the calls determines the generated values.
    sample_data = {}
    sample_data['step'] = np.random.randint(1, 744, n_samples)
    sample_data['type'] = np.random.choice(
        ['CASH_OUT', 'PAYMENT', 'CASH_IN', 'TRANSFER', 'DEBIT'], n_samples
    )
    sample_data['amount'] = np.random.exponential(100, n_samples)
    sample_data['nameOrig'] = ['C' + str(i) for i in range(n_samples)]
    sample_data['oldbalanceOrg'] = np.random.exponential(1000, n_samples)
    sample_data['newbalanceOrig'] = np.random.exponential(1000, n_samples)
    sample_data['nameDest'] = ['M' + str(i) for i in range(n_samples)]
    sample_data['oldbalanceDest'] = np.random.exponential(1000, n_samples)
    sample_data['newbalanceDest'] = np.random.exponential(1000, n_samples)
    # Roughly 0.2% of the synthetic rows are labelled as fraud.
    sample_data['isFraud'] = np.random.choice([0, 1], n_samples, p=[0.998, 0.002])

    df = pd.DataFrame(sample_data)
    print(f"Sample data created with shape: {df.shape}")
else:
    print(f"Dataset loaded successfully from {data_path}")
    print(f"Shape: {df.shape}")

## Dataset Overview

In [None]:
# Structural summary of the frame: dimensions, column names, dtypes, null counts
print("Dataset Info:", f"Shape: {df.shape}", sep="\n")
print(f"\nColumn names: {df.columns.tolist()}")

# Each heading is followed on the next line by the corresponding per-column Series
print("\nData types:", df.dtypes, sep="\n")
print("\nMissing values:", df.isna().sum(), sep="\n")

# Kept as the final expression so the notebook renders the describe() table
print("\nBasic statistics:")
df.describe()

## Class Distribution Analysis

In [None]:
# Analyze class distribution
# value_counts() orders classes by frequency and omits classes with no rows.
# Pin both classes to a fixed [legitimate, fraud] order (missing class -> 0) so
# the hard-coded labels and colors below always line up with the right values
# and the lookups by class label cannot raise KeyError.
class_order = [0, 1]
class_labels = ['Legitimate', 'Fraud']
class_colors = ['skyblue', 'red']

fraud_counts = df['isFraud'].value_counts().reindex(class_order, fill_value=0)
fraud_percentages = (
    df['isFraud'].value_counts(normalize=True).reindex(class_order, fill_value=0.0) * 100
)

print("Class Distribution:")
print(f"Legitimate transactions: {fraud_counts[0]:,} ({fraud_percentages[0]:.3f}%)")
print(f"Fraudulent transactions: {fraud_counts[1]:,} ({fraud_percentages[1]:.3f}%)")

# Visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Bar plot of absolute counts per class
fraud_counts.plot(kind='bar', ax=ax1, color=class_colors)
ax1.set_title('Class Distribution (Count)')
ax1.set_xlabel('Is Fraud')
ax1.set_ylabel('Count')
ax1.set_xticklabels(class_labels, rotation=0)

# Pie chart of the class shares
ax2.pie(fraud_counts.values, labels=class_labels,
        autopct='%1.3f%%', colors=class_colors)
ax2.set_title('Class Distribution (Percentage)')

plt.tight_layout()
plt.show()

## Transaction Type Analysis

In [None]:
# Transaction type distribution (sorted by count, most frequent first)
type_counts = df['type'].value_counts()
print("Transaction Type Distribution:")
print(type_counts)

# Transaction type vs fraud: row-normalized crosstab, expressed in percent.
# Make sure both class columns exist so selecting the fraud column (1) below
# cannot raise KeyError when the data contains no fraud rows.
type_fraud = pd.crosstab(df['type'], df['isFraud'], normalize='index') * 100
type_fraud = type_fraud.reindex(columns=[0, 1], fill_value=0.0)
print("\nFraud Rate by Transaction Type:")
print(type_fraud)

# The crosstab is indexed alphabetically while type_counts is ordered by count;
# reorder the fraud rates to match type_counts so both panels show the
# transaction types in the same left-to-right order.
fraud_rate_by_type = type_fraud[1].reindex(type_counts.index)

# Visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Transaction type distribution
type_counts.plot(kind='bar', ax=ax1)
ax1.set_title('Transaction Type Distribution')
ax1.set_xlabel('Transaction Type')
ax1.set_ylabel('Count')
ax1.tick_params(axis='x', rotation=45)

# Fraud rate by transaction type
fraud_rate_by_type.plot(kind='bar', ax=ax2, color='red')
ax2.set_title('Fraud Rate by Transaction Type')
ax2.set_xlabel('Transaction Type')
ax2.set_ylabel('Fraud Rate (%)')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Amount Analysis

In [None]:
# Amount statistics by fraud class
amount_stats = df.groupby('isFraud')['amount'].describe()
print("Amount Statistics by Fraud Class:")
print(amount_stats)

# Row masks for the two classes, reused by every per-class panel below
legit_mask = df['isFraud'] == 0
fraud_mask = df['isFraud'] == 1

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Amount distribution over all transactions
df['amount'].hist(bins=50, ax=axes[0,0], alpha=0.7)
axes[0,0].set_title('Amount Distribution (All Transactions)')
axes[0,0].set_xlabel('Amount')
axes[0,0].set_ylabel('Frequency')

# Amount by fraud class (overlaid histograms)
df.loc[legit_mask, 'amount'].hist(bins=50, ax=axes[0,1], alpha=0.7, label='Legitimate', color='blue')
df.loc[fraud_mask, 'amount'].hist(bins=50, ax=axes[0,1], alpha=0.7, label='Fraud', color='red')
axes[0,1].set_title('Amount Distribution by Fraud Class')
axes[0,1].set_xlabel('Amount')
axes[0,1].set_ylabel('Frequency')
axes[0,1].legend()

# Box plot
df.boxplot(column='amount', by='isFraud', ax=axes[1,0])
# A grouped pandas boxplot also sets a figure-level "Boxplot grouped by isFraud"
# suptitle, which would sit above the whole 2x2 grid; clear it and keep only
# the per-axes title set below.
fig.suptitle('')
axes[1,0].set_title('Amount Distribution by Fraud Class (Box Plot)')
axes[1,0].set_xlabel('Is Fraud')
axes[1,0].set_ylabel('Amount')

# Log scale amount; log1p keeps zero amounts finite (log(0 + 1) = 0).
# The column is stored on df, as before, so it stays available after this cell.
df['log_amount'] = np.log1p(df['amount'])
df.loc[legit_mask, 'log_amount'].hist(bins=50, ax=axes[1,1], alpha=0.7, label='Legitimate', color='blue')
df.loc[fraud_mask, 'log_amount'].hist(bins=50, ax=axes[1,1], alpha=0.7, label='Fraud', color='red')
axes[1,1].set_title('Log Amount Distribution by Fraud Class')
axes[1,1].set_xlabel('Log(Amount + 1)')
axes[1,1].set_ylabel('Frequency')
axes[1,1].legend()

plt.tight_layout()
plt.show()

## Balance Analysis

In [None]:
# Balance analysis
balance_cols = ['oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']

print("Balance Statistics:")
for col in balance_cols:
    print(f"\n{col}:")
    print(df.groupby('isFraud')[col].describe())

# Create balance difference features:
#   origin      = old - new  (positive when the origin balance decreased)
#   destination = new - old  (positive when the destination balance increased)
df['balance_diff_orig'] = df['oldbalanceOrg'] - df['newbalanceOrig']
df['balance_diff_dest'] = df['newbalanceDest'] - df['oldbalanceDest']

# Row masks for the two classes, reused by both histogram panels
legit_mask = df['isFraud'] == 0
fraud_mask = df['isFraud'] == 1

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Origin balance difference
df.loc[legit_mask, 'balance_diff_orig'].hist(bins=50, ax=axes[0,0], alpha=0.7, label='Legitimate', color='blue')
df.loc[fraud_mask, 'balance_diff_orig'].hist(bins=50, ax=axes[0,0], alpha=0.7, label='Fraud', color='red')
axes[0,0].set_title('Origin Balance Difference')
axes[0,0].set_xlabel('Balance Difference')
axes[0,0].set_ylabel('Frequency')
axes[0,0].legend()

# Destination balance difference
df.loc[legit_mask, 'balance_diff_dest'].hist(bins=50, ax=axes[0,1], alpha=0.7, label='Legitimate', color='blue')
df.loc[fraud_mask, 'balance_diff_dest'].hist(bins=50, ax=axes[0,1], alpha=0.7, label='Fraud', color='red')
axes[0,1].set_title('Destination Balance Difference')
axes[0,1].set_xlabel('Balance Difference')
axes[0,1].set_ylabel('Frequency')
axes[0,1].legend()

# Correlation heatmap of numerical features (target included on purpose here,
# so its row/column shows each feature's correlation with fraud)
numerical_cols = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
                  'oldbalanceDest', 'newbalanceDest', 'isFraud']
correlation_matrix = df[numerical_cols].corr()

sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[1,0])
axes[1,0].set_title('Correlation Matrix')

# Feature importance visualization (absolute correlation with target).
# The target itself is excluded: its self-correlation is always 1.0 and would
# dominate the axis, flattening the bars of the actual features.
feature_corr = (
    df[numerical_cols]
    .drop(columns='isFraud')
    .corrwith(df['isFraud'])
    .abs()
    .sort_values(ascending=True)
)
feature_corr.plot(kind='barh', ax=axes[1,1])
axes[1,1].set_title('Feature Correlation with Fraud')
axes[1,1].set_xlabel('Absolute Correlation')

plt.tight_layout()
plt.show()

## Key Insights and Summary

In [None]:
# Summary statistics
total_transactions = len(df)
fraud_transactions = df['isFraud'].sum()
fraud_rate = fraud_transactions / total_transactions * 100

# Columns that must not be counted as dataset features: the target itself and
# the helper columns that earlier analysis cells add to df. Without this
# filter the reported count would depend on which cells have been executed.
non_feature_cols = {'isFraud', 'log_amount', 'balance_diff_orig', 'balance_diff_dest'}
feature_count = sum(1 for col in df.columns if col not in non_feature_cols)

print("=" * 50)
print("KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS")
print("=" * 50)

print("\n1. DATASET OVERVIEW:")
print(f"   - Total transactions: {total_transactions:,}")
print(f"   - Fraudulent transactions: {fraud_transactions:,}")
print(f"   - Fraud rate: {fraud_rate:.3f}%")
print(f"   - Features: {feature_count} (raw columns, excluding target)")

print("\n2. CLASS IMBALANCE:")
print("   - Highly imbalanced dataset")
# Report the measured share instead of a fixed "less than 1%" statement so the
# summary stays correct for whatever data was actually loaded.
print(f"   - Fraud cases represent {fraud_rate:.3f}% of all transactions")
print("   - Will require special handling (SMOTE, class weights, etc.)")

print("\n3. TRANSACTION TYPES:")
transaction_types = df['type'].unique()
print(f"   - Types available: {', '.join(transaction_types)}")
# Mean of the 0/1 target per type, scaled to percent, is the per-type fraud rate
fraud_by_type = df.groupby('type')['isFraud'].mean() * 100
highest_fraud_type = fraud_by_type.idxmax()
print(f"   - Highest fraud rate in: {highest_fraud_type} ({fraud_by_type[highest_fraud_type]:.3f}%)")

print("\n4. AMOUNT PATTERNS:")
fraud_amount_mean = df.loc[df['isFraud'] == 1, 'amount'].mean()
legit_amount_mean = df.loc[df['isFraud'] == 0, 'amount'].mean()
print(f"   - Average fraud amount: ${fraud_amount_mean:,.2f}")
print(f"   - Average legitimate amount: ${legit_amount_mean:,.2f}")

print("\n5. RECOMMENDATIONS FOR MODELING:")
print("   - Use SMOTE or other resampling techniques for class imbalance")
print("   - Focus on transaction type as important feature")
print("   - Consider balance difference features")
print("   - Use ensemble methods for better performance")
print("   - Prioritize recall (catching fraud) over precision")

print("\n" + "=" * 50)
print("EDA COMPLETED - Ready for preprocessing phase")
print("=" * 50)