# EDA for creditcard.csv

This notebook performs exploratory data analysis on the credit card fraud detection dataset.


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Make the project's `src` folder (parent of the working directory + 'src') importable
parent_dir = os.path.dirname(os.getcwd())
src_dir = os.path.join(parent_dir, 'src')
sys.path.append(src_dir)

# Must come after the path tweak above, otherwise the module cannot be found
from data_cleaning import clean_creditcard_data

# Global plotting defaults for the notebook
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Read the raw CSV and show a quick overview of its shape and columns
print("Loading creditcard.csv...")
cc_df = pd.read_csv('../data/raw/creditcard.csv')
print(f"Shape: {cc_df.shape}")
print(f"\nColumns: {cc_df.columns.tolist()}")
print("\nFirst few rows:")
cc_df.head()


## 1. Data Cleaning


In [None]:
# Run the project's cleaning routine on the raw frame
cc_df_clean = clean_creditcard_data(cc_df)

# Display basic info.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
print("Data Info:")
cc_df_clean.info()
print("\nBasic Statistics:")
cc_df_clean.describe()


## 2. Class Distribution Analysis


In [None]:
# Class distribution
# Counts and percentages are indexed explicitly by class label (0, 1) instead of
# by value_counts() frequency order, so each bar colour is always tied to its
# label and a class with no rows shows up as 0 rather than raising a KeyError.
class_labels = [0, 1]
class_colors = ['green', 'red']  # 0 = legitimate, 1 = fraud

class_counts = (
    cc_df_clean['Class'].value_counts().reindex(class_labels, fill_value=0)
)
class_pct = (
    cc_df_clean['Class'].value_counts(normalize=True).reindex(class_labels, fill_value=0)
    * 100
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: absolute counts; right panel: share of all rows
panels = [
    (class_counts, 'Count', 'Class Distribution'),
    (class_pct, 'Percentage', 'Class Distribution (%)'),
]
for ax, (values, ylabel, title) in zip(axes, panels):
    ax.bar(values.index, values.values, color=class_colors)
    ax.set_xlabel('Class (0=Legitimate, 1=Fraud)')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(class_labels)

plt.tight_layout()
plt.show()

print("Class distribution:")
print(cc_df_clean['Class'].value_counts())

n_legit = class_counts.loc[0]
n_fraud = class_counts.loc[1]
if n_fraud > 0:
    print(f"\nClass imbalance ratio: {n_legit / n_fraud:.2f}:1")
else:
    # Avoid a division by zero when the cleaned data contains no fraud rows
    print("\nClass imbalance ratio: undefined (no fraud rows)")
print(f"Fraud percentage: {class_pct.loc[1]:.4f}%")


## 3. Univariate Analysis


In [None]:
# Distribution of Amount: raw values next to a log1p-transformed view
amount = cc_df_clean['Amount']

# (values to plot, x-axis label, panel title) for each of the two panels
amount_panels = [
    (amount, 'Amount', 'Distribution of Transaction Amount'),
    (
        np.log1p(amount),  # log(1 + x) keeps zero amounts finite
        'Log(Amount + 1)',
        'Distribution of Transaction Amount (Log Scale)',
    ),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (values, xlabel, title) in zip(axes, amount_panels):
    ax.hist(values, bins=50, edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)

plt.tight_layout()
plt.show()

print("Amount Statistics:")
print(amount.describe())


In [None]:
# Distribution of Time, shown in the original seconds and converted to hours
# The hours column is stored on the cleaned frame (3600 seconds per hour)
cc_df_clean['Time_hours'] = cc_df_clean['Time'] / 3600

# (column to plot, x-axis label, panel title) for each of the two panels
time_panels = [
    ('Time', 'Time (seconds)', 'Distribution of Time'),
    ('Time_hours', 'Time (hours)', 'Distribution of Time (in hours)'),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (column, xlabel, title) in zip(axes, time_panels):
    ax.hist(cc_df_clean[column], bins=50, edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)

plt.tight_layout()
plt.show()


## 4. Bivariate Analysis


In [None]:
# Amount by class: box plot (left) and violin plot (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot
cc_df_clean.boxplot(column='Amount', by='Class', ax=axes[0])
# When `by` is used, pandas puts a figure-level "Boxplot grouped by Class"
# suptitle on the whole figure; blank it so it does not sit above both panels
# (including the violin plot it does not describe).
fig.suptitle('')
axes[0].set_title('Transaction Amount by Class')
axes[0].set_xlabel('Class')
axes[0].set_ylabel('Amount')

# Violin plot
sns.violinplot(data=cc_df_clean, x='Class', y='Amount', ax=axes[1])
axes[1].set_title('Amount Distribution by Class')

plt.tight_layout()
plt.show()

# Statistical summary
print("Amount Statistics by Class:")
print(cc_df_clean.groupby('Class')['Amount'].describe())


In [None]:
# Correlation analysis of PCA features
# Work on a reproducible random sample (at most 10,000 rows).
# NOTE(review): with a heavily imbalanced target, a sample of this size may
# contain very few fraud rows, which makes correlations with Class noisy —
# consider computing them on the full frame.
sample_size = min(10000, len(cc_df_clean))
cc_sample = cc_df_clean.sample(n=sample_size, random_state=42)

# Correlation matrix for V1-V28 plus Amount and the target
v_cols = [f'V{i}' for i in range(1, 29)]
corr_matrix = cc_sample[v_cols + ['Amount', 'Class']].corr()

# Rank features by the magnitude of their correlation with Class while keeping
# the signed values. A plain descending sort would list only the most positive
# correlations and push strongly negative ones to the bottom.
signed_corr = corr_matrix['Class'].drop('Class')
abs_order = signed_corr.abs().sort_values(ascending=False).index
class_corr = signed_corr.reindex(abs_order)

fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Top correlations with Class (strongest first); colour encodes the sign
top_corr = class_corr.head(15)
bar_colors = ['tab:red' if value > 0 else 'tab:blue' for value in top_corr.values]
axes[0].barh(range(len(top_corr)), top_corr.values, color=bar_colors)
axes[0].set_yticks(range(len(top_corr)))
axes[0].set_yticklabels(top_corr.index)
axes[0].set_xlabel('Correlation with Class')
axes[0].set_title('Top 15 Features Correlated with Class')
axes[0].invert_yaxis()  # strongest feature at the top

# Pairwise correlations among the top features and the target
top_features = list(top_corr.index) + ['Class']
corr_subset = cc_sample[top_features].corr()
sns.heatmap(corr_subset, annot=False, cmap='coolwarm', center=0, ax=axes[1])
axes[1].set_title('Correlation Heatmap of Top Features')

plt.tight_layout()
plt.show()

print("Top 10 Features Correlated with Class:")
print(class_corr.head(10))


## 5. Summary and Key Insights

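Key findings from the EDA:
1. Extreme class imbalance, which is typical for fraud detection (see the imbalance ratio in Section 2)
2. Distribution of transaction amounts, examined on raw and log scales (Section 3) and compared across classes (Section 4)
3. Important PCA features for fraud detection, identified by their correlation with `Class` (Section 4)
4. Time patterns in transactions, viewed in seconds and in hours (Section 3)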
