# EDA - Credit Card Data Analysis

This notebook explores the `creditcard.csv` bank transaction dataset, whose features are largely PCA-transformed. It focuses on the class distribution and on how transaction `Amount` and `Time` differ between the two classes.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add src to path
# The path is built relative to the current working directory, so this only
# resolves to the intended `src` folder when the kernel is started from the
# notebook's own directory.
# NOTE(review): presumably `data_loader` and `preprocessing` live in ../src — confirm.
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

# Project-local helpers; these imports depend on the sys.path entry added above.
from data_loader import load_data
from preprocessing import clean_creditcard_data

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to every plot created after this point.
sns.set(style="whitegrid")

## 1. Data Loading and Cleaning

In [None]:
# Read the raw CSV and pass it straight through the project's cleaning step.
raw_csv_path = '../data/raw/creditcard.csv'
cc_df = clean_creditcard_data(load_data(raw_csv_path))

# Report the dimensions of the cleaned frame, then preview its first rows
# (the trailing expression is what the cell displays).
print(f"Credit Card Data Shape: {cc_df.shape}")
cc_df.head()

## 2. Univariate Analysis

### 2.1 Class Distribution

In [None]:
# Tabulate each class in absolute terms and as a percentage of all rows.
counts = cc_df['Class'].value_counts()
percent = cc_df['Class'].value_counts(normalize=True).mul(100)
class_summary = pd.concat(
    [counts, percent],
    axis=1,
    keys=['Count', 'Percentage'],
)

print("Class Distribution:")
print(class_summary)

# Bar chart of the raw row count per class.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=cc_df, x='Class', palette='mako', ax=ax)
ax.set_title('Absolute Class Distribution (Credit Card)')
plt.show()

### 2.2 Transaction Amount Distribution

In [None]:
# Histogram of transaction amounts with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=cc_df, x='Amount', bins=100, color='teal', kde=True, ax=ax)
# Amount is highly skewed, so the count axis is drawn on a log scale to keep
# the sparsely populated bins visible.
ax.set_yscale('log')
ax.set_title('Distribution of Transaction Amount')
plt.show()

## 3. Bivariate Analysis

### 3.1 Fraud vs Amount

In [None]:
# Compare the Amount distribution of the two classes, using only transactions
# below 500 (presumably so the boxes are not dwarfed by rare large amounts).
below_500 = cc_df.loc[cc_df['Amount'] < 500]

fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=below_500, x='Class', y='Amount', palette='magma', ax=ax)
ax.set_title('Amount Distribution ( < 500) by Class')
plt.show()

### 3.2 Time vs Fraud

In [None]:
# Overlay one density curve of `Time` per class. Each curve is estimated from
# its own subset of rows, so the two densities are normalized independently
# and the rarer class is not flattened by the more common one.
fig, ax = plt.subplots(figsize=(12, 4))
for class_label, class_value in (('Normal', 0), ('Fraud', 1)):
    # `fill=True` replaces the deprecated `shade=True`, which seaborn has
    # scheduled to become an error; the rendered plot is the same.
    sns.kdeplot(
        cc_df.loc[cc_df['Class'] == class_value, 'Time'],
        label=class_label,
        fill=True,
        ax=ax,
    )
ax.set_title('Distribution of Time by Class')
ax.legend()
plt.show()