# EDA - Credit Card Fraud Data

## Task 1: Data Analysis and Preprocessing

**Objective**: Prepare clean, feature-rich datasets ready for modeling by exploring the data, engineering meaningful features, and handling class imbalance.

This notebook focuses on:
- Data Cleaning (missing values, duplicates, data types)
- Exploratory Data Analysis
- Feature Engineering
- Handling Class Imbalance

**Note**: This dataset contains PCA-transformed features (V1-V28) for privacy protection. The original features have been anonymized.


In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Pandas display configuration: show every column, cap printed rows at 100
display_options = {
    'display.max_columns': None,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" style and 12x6 figures
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")


Libraries imported successfully!


## 1. Load the Data


In [2]:
# Locations of the raw input file and of the directory that receives processed output
data_dir = Path('../data')
raw_data_path = data_dir.joinpath('raw', 'creditcard.csv')
processed_data_path = data_dir.joinpath('processed')

# Make sure the output directory is present (no error when it already exists)
processed_data_path.mkdir(parents=True, exist_ok=True)

# Read the raw CSV into a DataFrame
print("Loading credit card fraud dataset...")
df = pd.read_csv(raw_data_path)

# Report what was loaded; the trailing expression renders the preview table
load_report = (
    f"\nDataset loaded successfully!",
    f"Shape: {df.shape}",
    f"\nFirst few rows:",
)
print(*load_report, sep="\n")
df.head()


Loading credit card fraud dataset...

Dataset loaded successfully!
Shape: (284807, 31)

First few rows:


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## 2. Initial Data Exploration


In [3]:
# Overview of the frame: dimensions, column names, dtypes, then pandas' own info report
info_rule = "=" * 120
print(info_rule, "DATASET INFORMATION", info_rule, sep="\n")
print(f"\nShape: {df.shape[0]} rows × {df.shape[1]} columns")
print(f"\nColumn Names:", df.columns.tolist(), sep="\n")
print(f"\nData Types:", df.dtypes, sep="\n")
print(f"\nDataset Info:")
df.info()


DATASET INFORMATION

Shape: 284807 rows × 31 columns

Column Names:
['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']

Data Types:
Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   C

In [7]:
# Descriptive statistics for every column; the trailing expression renders the table
summary_rule = "=" * 120
print(summary_rule, "STATISTICAL SUMMARY", summary_rule, sep="\n")
df.describe(include='all')


STATISTICAL SUMMARY


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0
mean,94811.0776,0.005917,-0.004135,0.001613,-0.002966,0.001828,-0.001139,0.001801,-0.000854,-0.001596,-0.001441,0.000202,-0.000715,0.000603,0.000252,0.001043,0.001162,0.00017,0.001515,-0.000264,0.000187,-0.000371,-1.5e-05,0.000198,0.000214,-0.000232,0.000149,0.001763,0.000547,88.472687,0.001667
std,47481.047891,1.948026,1.646703,1.508682,1.414184,1.377008,1.331931,1.227664,1.179054,1.095492,1.076407,1.01872,0.994674,0.99543,0.952215,0.914894,0.873696,0.842507,0.837378,0.813379,0.769984,0.723909,0.72455,0.623702,0.605627,0.52122,0.482053,0.395744,0.328027,250.399437,0.040796
min,0.0,-56.40751,-72.715728,-48.325589,-5.683171,-113.743307,-26.160506,-43.557242,-73.216718,-13.434066,-24.588262,-4.797473,-18.683715,-5.791881,-19.214325,-4.498945,-14.129855,-25.162799,-9.498746,-7.213527,-54.49772,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-15.430084,0.0,0.0
25%,54204.75,-0.915951,-0.600321,-0.889682,-0.850134,-0.68983,-0.769031,-0.552509,-0.208828,-0.644221,-0.535578,-0.761649,-0.406198,-0.647862,-0.425732,-0.581452,-0.46686,-0.483928,-0.498014,-0.456289,-0.211469,-0.228305,-0.5427,-0.161703,-0.354453,-0.317485,-0.326763,-0.070641,-0.052818,5.6,0.0
50%,84692.5,0.020384,0.063949,0.179963,-0.022248,-0.053468,-0.275168,0.040859,0.021898,-0.052596,-0.093237,-0.032306,0.139072,-0.012927,0.050209,0.049299,0.067119,-0.065867,-0.002142,0.003367,-0.062353,-0.029441,0.006675,-0.011159,0.041016,0.016278,-0.052172,0.001479,0.011288,22.0,0.0
75%,139298.0,1.316068,0.800283,1.02696,0.739647,0.612218,0.396792,0.570474,0.325704,0.595977,0.453619,0.739579,0.616976,0.663178,0.492336,0.650104,0.523512,0.398972,0.501956,0.458508,0.133207,0.186194,0.528245,0.147748,0.439738,0.350667,0.240261,0.091208,0.078276,77.51,0.0
max,172792.0,2.45493,22.057729,9.382558,16.875344,34.801666,73.301626,120.589494,20.007208,15.594995,23.745136,12.018913,7.848392,7.126883,10.526766,8.877742,17.315112,9.253526,5.041069,5.591971,39.420904,27.202839,10.50309,22.528412,4.584549,7.519589,3.517346,31.612198,33.847808,25691.16,1.0


## 3. Data Cleaning

### 3.1 Check for Missing Values


In [8]:
# Count null entries per column and report only the columns that have at least one
missing_rule = "=" * 120
print(missing_rule, "MISSING VALUES ANALYSIS", missing_rule, sep="\n")

missing_count = df.isnull().sum()
missing_percent = missing_count.div(len(df)).mul(100)

# One row per affected column, largest null count first
missing_df = (
    pd.DataFrame({
        'Column': missing_count.index,
        'Missing Count': missing_count.values,
        'Missing Percentage': missing_percent.values,
    })
    .loc[lambda frame: frame['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

if missing_df.empty:
    print("\n✓ No missing values found in the dataset!")
else:
    print("\nColumns with missing values:")
    print(missing_df)

    # Bar chart of the missing percentage for each affected column
    plt.figure(figsize=(12, 6))
    sns.barplot(data=missing_df, x='Column', y='Missing Percentage')
    plt.title('Missing Values by Column')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


MISSING VALUES ANALYSIS

✓ No missing values found in the dataset!


### 3.2 Handle Missing Values

**Strategy**: 
- For numerical columns (V1-V28, Time, Amount): Use median imputation (robust to outliers)
- For Class column: Drop rows with missing values (critical target variable)
- **Justification**: 
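  - Median imputation is robust to the extreme values seen in the PCA-transformed features and leaves each feature's median unchanged (note that it reduces variance when many values are missing)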
  - Missing target values cannot be imputed and must be removed


In [9]:
# Record the frame's shape before the cleaning steps below remove any rows.
# NOTE(review): the recorded outputs show 284,807 rows at load time but 283,726 here, so this
# cell was last executed on a frame that had already been deduplicated. After a fresh
# Restart & Run All, this value is the raw shape and "Rows removed" in the summary is non-zero.
original_shape = df.shape
print(f"Original dataset shape: {original_shape}")

# Handle missing values based on column type
if df.isnull().sum().sum() > 0:
    print("\nHandling missing values...")

    # The target variable cannot be imputed, so rows without a Class label are removed
    missing_class_count = df['Class'].isnull().sum()
    if missing_class_count > 0:
        df = df.dropna(subset=['Class'])
        print(f"  - Dropped {missing_class_count} rows with missing Class (target variable)")

    # Remaining numeric columns are filled with their own median
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    for col in numerical_cols:
        if df[col].isnull().sum() > 0:
            median_val = df[col].median()
            # Assign the filled column back. `df[col].fillna(..., inplace=True)` operates on a
            # column selection; under pandas Copy-on-Write it fills a temporary and leaves
            # `df` untouched.
            df[col] = df[col].fillna(median_val)
            print(f"  - Filled {col} with median: {median_val:.4f}")

    # Verify no missing values remain
    remaining_missing = df.isnull().sum().sum()
    print(f"\n✓ Missing values handled. Remaining missing: {remaining_missing}")
else:
    print("\n✓ No missing values to handle!")


Original dataset shape: (283726, 31)

✓ No missing values to handle!


### 3.3 Remove Duplicates


In [10]:
# Check for duplicate rows
print("=" * 120)
print("DUPLICATE ROWS ANALYSIS")
print("=" * 120)

# Exact duplicates: rows identical in every column, Class included.
# `duplicate_mask` flags every member of a duplicate group; `duplicate_count` counts only the
# repeats that drop_duplicates() will remove (all but the first of each group).
duplicate_mask = df.duplicated(keep=False)
duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")

if duplicate_count > 0:
    duplicate_percent = (duplicate_count / len(df)) * 100
    print(f"Percentage of duplicates: {duplicate_percent:.2f}%")

# Label conflicts: rows that share every feature value but disagree on Class.
# These have to be searched for on the feature columns only. An exact-duplicate mask compares
# Class as well, so it never pairs a row with a differently labelled twin.
feature_cols = df.columns.drop('Class').tolist()
same_features = df.duplicated(subset=feature_cols, keep=False)
labels_per_group = df.loc[same_features].groupby(feature_cols)['Class'].nunique()
conflicting = labels_per_group[labels_per_group > 1]

if len(conflicting) > 0:
    print(f"\n⚠ Warning: Found {len(conflicting)} groups of duplicates with conflicting Class labels!")
    print("This indicates data quality issues. Exact-duplicate removal keeps one row per label, so these conflicts remain.")
elif duplicate_count > 0:
    print("\n✓ No conflicting labels in duplicate rows")

if duplicate_count > 0:
    # Show some duplicate examples
    print("\nSample duplicate rows:")
    print(df[duplicate_mask].head(10))

    # Remove duplicates (keep first occurrence)
    print(f"\nRemoving {duplicate_count} duplicate rows...")
    df = df.drop_duplicates()
    print(f"✓ Duplicates removed. New shape: {df.shape}")
else:
    print("\n✓ No duplicate rows found!")


DUPLICATE ROWS ANALYSIS

Number of duplicate rows: 0

✓ No duplicate rows found!


### 3.4 Correct Data Types

**Strategy**:
- Ensure `Time` is numeric (seconds elapsed)
- Ensure all PCA features (V1-V28) are float64
- Ensure `Amount` is numeric (float64 for precision)
- Ensure `Class` is integer (binary: 0 or 1)


In [12]:
# Show the column dtypes as they stand before any conversion is attempted
dtype_rule = "=" * 120
print(dtype_rule, "DATA TYPE CORRECTION", dtype_rule, sep="\n")
print("\nOriginal data types:", df.dtypes, sep="\n")


DATA TYPE CORRECTION

Original data types:
Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object


In [13]:
# Make sure Time holds numeric values; entries that cannot be parsed become NaN
print("\nConverting Time column...")
time_already_numeric = df['Time'].dtype in ['int64', 'float64']
if time_already_numeric:
    print("  ✓ Time is already numeric")
else:
    df['Time'] = pd.to_numeric(df['Time'], errors='coerce')
    print("  ✓ Time converted to numeric")

# Any row whose Time is null at this point is discarded
invalid_time_rows = df['Time'].isnull().sum()
if invalid_time_rows == 0:
    print("  ✓ Time conversion successful!")
else:
    print(f"  ⚠ Warning: {invalid_time_rows} rows have invalid Time values")
    df = df.dropna(subset=['Time'])
    print(f"  - Dropped rows with invalid Time")



Converting Time column...
  ✓ Time is already numeric
  ✓ Time conversion successful!


In [14]:
# Convert PCA features (V1-V28) to float64
print("\nConverting PCA features (V1-V28)...")
v_columns = [f'V{i}' for i in range(1, 29)]
present_v_columns = [col for col in v_columns if col in df.columns]
missing_v_columns = [col for col in v_columns if col not in df.columns]

for col in present_v_columns:
    if df[col].dtype != 'float64':
        # errors='coerce' turns unparseable entries into NaN; those rows are handled below
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')
        print(f"  ✓ {col} converted to float64")

# This cell runs after the missing-value step, so NaN created by coercion would otherwise go
# unnoticed. Rows with a null PCA feature are dropped, mirroring the Time and Amount cells.
invalid_v_rows = df[present_v_columns].isnull().any(axis=1)
invalid_v_count = invalid_v_rows.sum()
if invalid_v_count > 0:
    print(f"  ⚠ Warning: {invalid_v_count} rows have invalid PCA feature values")
    df = df[~invalid_v_rows]
    print("  - Dropped rows with invalid PCA feature values")

# Only claim that all features were verified when every expected column is present
if missing_v_columns:
    print(f"  ⚠ Warning: {len(missing_v_columns)} expected PCA columns are missing: {missing_v_columns}")
else:
    print(f"  ✓ All {len(v_columns)} PCA features verified/converted")



Converting PCA features (V1-V28)...
  ✓ All 28 PCA features verified/converted


In [15]:
# Make sure Amount holds numeric values; entries that cannot be parsed become NaN
print("\nConverting Amount column...")
amount_already_numeric = df['Amount'].dtype in ['int64', 'float64']
if amount_already_numeric:
    print("  ✓ Amount is already numeric")
else:
    df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')
    print("  ✓ Amount converted to numeric")

# Any row whose Amount is null at this point is discarded
invalid_amount_rows = df['Amount'].isnull().sum()
if invalid_amount_rows == 0:
    print("  ✓ Amount conversion successful!")
else:
    print(f"  ⚠ Warning: {invalid_amount_rows} rows have invalid Amount values")
    df = df.dropna(subset=['Amount'])
    print(f"  - Dropped rows with invalid Amount")



Converting Amount column...
  ✓ Amount is already numeric
  ✓ Amount conversion successful!


In [16]:
# Convert Class to integer (binary: 0 or 1)
print("\nConverting Class column...")
class_already_int = df['Class'].dtype == 'int64'
if class_already_int:
    class_values = df['Class']
else:
    # Handle string representations: strip whitespace and double quotes, then parse.
    # Unparseable entries become NaN and are rejected by the validity check below.
    class_values = pd.to_numeric(
        df['Class'].astype(str).str.strip().str.replace('"', ''),
        errors='coerce',
    )
    print("  ✓ Class converted to integer")

# Validate BEFORE casting to an integer dtype. Casting first would raise on a non-integral
# value such as 0.5, whereas the intent is to drop every row that is not exactly 0 or 1.
valid_class = class_values.isin([0, 1])
invalid_class_count = (~valid_class).sum()
if invalid_class_count > 0:
    if class_already_int:
        print(f"  ⚠ Warning: {invalid_class_count} rows have invalid Class values")
    else:
        print(f"  ⚠ Warning: {invalid_class_count} rows have invalid Class values (not 0 or 1)")
    print(f"  - Dropped rows with invalid Class values")
elif class_already_int:
    print("  ✓ Class is already integer with valid values (0 or 1)")
else:
    print("  ✓ All Class values are valid (0 or 1)")

# Keep the valid rows and store Class as non-nullable int64. `assign` builds a new frame, so
# nothing is written onto a filtered slice of the old one.
df = df.loc[valid_class].assign(Class=class_values.loc[valid_class].astype('int64'))



Converting Class column...
  ✓ Class is already integer with valid values (0 or 1)


In [17]:
# Column dtypes after all of the conversion cells have run
final_rule = "=" * 120
print("\n" + final_rule, "FINAL DATA TYPES", final_rule, df.dtypes, sep="\n")



FINAL DATA TYPES
Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object


### 3.5 Data Cleaning Summary


In [18]:
# Summary of data cleaning: shape before/after, remaining nulls and duplicates, class balance.
# `original_shape` is the value captured in the missing-value handling cell.
print("=" * 120)
print("DATA CLEANING SUMMARY")
print("=" * 120)
print(f"\nOriginal dataset shape: {original_shape}")
print(f"Final dataset shape: {df.shape}")
print(f"Rows removed: {original_shape[0] - df.shape[0]}")
print(f"Columns: {df.shape[1]}")

print(f"\n✓ Missing values: {df.isnull().sum().sum()}")
print(f"✓ Duplicate rows: {df.duplicated().sum()}")
print(f"\n✓ Data types corrected:")
print(f"  - Time: numeric (float64)")
print(f"  - V1-V28: float64 (PCA features)")
print(f"  - Amount: numeric (float64)")
print(f"  - Class: integer (binary: 0 or 1)")

# Check Class distribution.
# Reindexing onto both labels makes an absent class appear as a zero count instead of raising
# KeyError on the lookups below.
print(f"\n✓ Class distribution:")
class_dist = df['Class'].value_counts().reindex([0, 1], fill_value=0)
total_rows = max(len(df), 1)  # keeps the percentages defined for an empty frame
print(f"  - Class 0 (Normal): {class_dist[0]:,} ({class_dist[0]/total_rows*100:.2f}%)")
print(f"  - Class 1 (Fraud): {class_dist[1]:,} ({class_dist[1]/total_rows*100:.2f}%)")

print(f"\n✓ Dataset is ready for further analysis!")
df.head()


DATA CLEANING SUMMARY

Original dataset shape: (283726, 31)
Final dataset shape: (283726, 31)
Rows removed: 0
Columns: 31

✓ Missing values: 0
✓ Duplicate rows: 0

✓ Data types corrected:
  - Time: numeric (float64)
  - V1-V28: float64 (PCA features)
  - Amount: numeric (float64)
  - Class: integer (binary: 0 or 1)

✓ Class distribution:
  - Class 0 (Normal): 283,253 (99.83%)
  - Class 1 (Fraud): 473 (0.17%)

✓ Dataset is ready for further analysis!


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## 4. Save Cleaned Dataset


In [20]:
# Write the cleaned frame to the processed directory (row index omitted), then report the result
cleaned_file_path = processed_data_path.joinpath('creditcard_cleaned.csv')
df.to_csv(cleaned_file_path, index=False)

# Size on disk, converted from bytes to mebibytes
bytes_per_mb = 1024 * 1024
size_mb = cleaned_file_path.stat().st_size / bytes_per_mb
print(f"✓ Cleaned dataset saved to: {cleaned_file_path}")
print(f"  Shape: {df.shape}")
print(f"  File size: {size_mb:.2f} MB")


✓ Cleaned dataset saved to: ../data/processed/creditcard_cleaned.csv
  Shape: (283726, 31)
  File size: 143.29 MB
