In [1]:
# eda_modeling.ipynb

# 1. Imports and Data Loading
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load data
df = pd.read_csv('../data/creditcard.csv')  # adjust path as needed

# 2. Data Overview
print("Dataset shape:", df.shape)
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()
print(df.describe())

# 3. Check for Missing Values
print("Missing values per column:\n", df.isnull().sum())

# 4. Drop 'Time' Feature (not useful for modeling)
df = df.drop('Time', axis=1)

# 5. Class Distribution
class_counts = df['Class'].value_counts()
print(class_counts)

fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Class', ax=ax)
ax.set_title('Class Distribution (0: Non-Fraud, 1: Fraud)')
plt.show()

# Split the frame by label so the two groups can be counted separately.
fraud = df.loc[df['Class'] == 1]
non_fraud = df.loc[df['Class'] == 0]
n_fraud, n_non_fraud = len(fraud), len(non_fraud)
print(f"Fraudulent transactions: {n_fraud}")
print(f"Non-fraudulent transactions: {n_non_fraud}")
print(f"Fraud percentage: {100 * n_fraud / len(df):.4f}%")

# 6. Amount Feature Analysis
# Histogram (100 bins, KDE overlay) of the 'Amount' column.
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(df['Amount'], bins=100, kde=True, ax=ax)
ax.set_title('Transaction Amount Distribution')
plt.show()

# Box plot of 'Amount', one box per value of 'Class'.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(data=df, x='Class', y='Amount', ax=ax)
ax.set_title('Amount by Transaction Class')
plt.show()

# 7. Correlation Analysis
# Pairwise correlations of every remaining column, drawn as a heatmap whose
# colour scale is capped at 0.8.
corr = df.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', vmax=0.8, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# 8. Train-Test Split
# The split happens BEFORE scaling so the scaler is fitted on training rows
# only; fitting it on the full frame would leak test-set statistics into the
# features the models are trained on.
X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 9. Feature Scaling (fitted on the training split only)
scaler = StandardScaler()
scaler.fit(X_train[['Amount']])


def _with_scaled_amount(frame):
    """Return a copy of `frame` with 'Amount' replaced by 'Amount_scaled'.

    Uses the already-fitted module-level `scaler`, so every frame passed in
    is transformed with the statistics learned from the training split.
    """
    scaled = scaler.transform(frame[['Amount']]).ravel()
    return frame.assign(Amount_scaled=scaled).drop(columns='Amount')


X_train = _with_scaled_amount(X_train)
X_test = _with_scaled_amount(X_test)
# Keep `X` and `df` in the same column layout as before this change
# ('Amount' dropped, 'Amount_scaled' appended as the last column).
X = _with_scaled_amount(X)
df = _with_scaled_amount(df)

# 10. Handle Class Imbalance (Optional: Undersample for demonstration)
# Counter is used for the class-count printout below; without this import the
# print raises NameError.
from collections import Counter

from imblearn.under_sampling import RandomUnderSampler

# Resample the TRAINING split only; the test split is left untouched.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)
print("Resampled dataset shape:", Counter(y_res))

# 11. Model Training
# Both models are fitted on the resampled training data (X_res, y_res) and
# then applied to X_test. Column index 1 of predict_proba is kept as the score.

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_res, y_res)
lr_proba = lr.predict_proba(X_test)
y_pred_lr = lr.predict(X_test)
y_proba_lr = lr_proba[:, 1]

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_res, y_res)
rf_proba = rf.predict_proba(X_test)
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf_proba[:, 1]

# 12. Evaluation

def plot_conf_matrix(y_true, y_pred, title):
    """Draw the confusion matrix of `y_pred` against `y_true` as a heatmap.

    Parameters
    ----------
    y_true
        Labels passed to `confusion_matrix` as its first argument.
    y_pred
        Labels passed to `confusion_matrix` as its second argument.
    title
        Text used as the plot title.

    The plot is shown with `plt.show()`; nothing is returned.
    """
    counts = confusion_matrix(y_true, y_pred)
    # fmt='d' renders each annotated cell as an integer.
    ax = sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(title)
    plt.show()

# Print the classification report and draw the confusion matrix for each
# model's test-set predictions, in the same order as the models were trained.
for report_heading, matrix_title, predictions in (
    ("Logistic Regression Classification Report:",
     "Logistic Regression Confusion Matrix",
     y_pred_lr),
    ("Random Forest Classification Report:",
     "Random Forest Confusion Matrix",
     y_pred_rf),
):
    print(report_heading)
    print(classification_report(y_test, predictions))
    plot_conf_matrix(y_test, predictions, matrix_title)

# ROC Curve
# Compute the AUC values up front so the legend labels stay short.
auc_lr = roc_auc_score(y_test, y_proba_lr)
auc_rf = roc_auc_score(y_test, y_proba_rf)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_proba_rf)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC={auc_lr:.3f})')
ax.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC={auc_rf:.3f})')
ax.plot([0, 1], [0, 1], 'k--')  # dashed black diagonal from (0, 0) to (1, 1)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()


ModuleNotFoundError: No module named 'seaborn'