# 🧪 Credit Card Fraud Detection Project

In [None]:
# Standard Libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing & Modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve

# Imbalanced Techniques
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Anomaly Detection
from sklearn.ensemble import IsolationForest

# Dimensionality Reduction
from sklearn.manifold import TSNE

# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Deep Learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the transactions table. The 'Class' column is the label that is
# plotted below (0: Legit, 1: Fraud).
df = pd.read_csv('creditcard.csv')

# A bare expression is only rendered when it is the LAST statement of a
# notebook cell (and never when run as a script), so the intermediate
# summaries are printed explicitly instead of being silently discarded.
print(df.head())

# Overview
df.info()  # prints its own report; nothing to wrap
print(df.describe())

# Class Distribution
sns.countplot(x='Class', data=df)
plt.title('Class Distribution (0: Legit, 1: Fraud)')
plt.show()

# Check for missing values (count of nulls per column)
print(df.isnull().sum())

In [None]:
# Separate features and labels
X = df.drop('Class', axis=1)
y = df['Class']

# Train-test split on the RAW features, stratified on the label so both
# parts keep the same class ratio. Splitting happens before scaling so the
# scaler never sees test rows. Same seed/stratification as before, so the
# row assignment to train/test is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize: learn mean/std from the training rows only, then apply the
# same transformation to the test rows (avoids train/test leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later cell that still refers to X_scaled keeps working; it is
# the full feature matrix scaled with the training-set statistics.
X_scaled = scaler.transform(X)

In [None]:
# Three alternative ways of rebalancing the training split. Each sampler is
# seeded with the same value and is applied independently to the same
# (X_train, y_train) pair; the test split is never resampled here.
rus = RandomUnderSampler(random_state=42)   # random undersampling
ros = RandomOverSampler(random_state=42)    # random oversampling
smote = SMOTE(random_state=42)              # SMOTE oversampling

# Resampled training sets, one per technique
X_rus, y_rus = rus.fit_resample(X_train, y_train)
X_ros, y_ros = ros.fit_resample(X_train, y_train)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Unsupervised anomaly-detection baseline.
# The forest is fitted on the training split and only *scored* on the test
# split, so the report below is an out-of-sample evaluation (previously the
# model was fitted directly on X_test and X_train was never used).
iso = IsolationForest(contamination=0.01, random_state=42)
iso.fit(X_train)

# IsolationForest.predict returns -1 for outliers and +1 for inliers;
# map that to the dataset's convention (1 = fraud/outlier, 0 = legit).
y_pred_iso = np.where(iso.predict(X_test) == -1, 1, 0)

print(classification_report(y_test, y_pred_iso))

In [None]:
# 2-D t-SNE embedding of the leading 5000 training rows, coloured by label.
# One slice object is shared by features and labels so they stay in step.
first_rows = slice(None, 5000)

tsne = TSNE(n_components=2, random_state=42)
X_embedded = tsne.fit_transform(X_train[first_rows])

plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    hue=y_train[first_rows],
    palette='coolwarm',
)
plt.title('t-SNE visualization')
plt.show()

In [None]:
# Candidate classifiers, each trained on the SMOTE-balanced training set
# and evaluated on the untouched test split.
# Seeds are fixed, as for every other estimator in this file, so that the
# printed metrics are repeatable from run to run.
models = {
    # max_iter is raised above the library default: warnings are silenced
    # globally in this file, so a non-converged fit would go unnoticed.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

for name, model in models.items():
    model.fit(X_smote, y_smote)
    preds = model.predict(X_test)
    # Probability assigned to the positive class (column 1), used for the
    # threshold-independent ROC-AUC score.
    scores = model.predict_proba(X_test)[:, 1]
    print(f"\n{name}")
    print(classification_report(y_test, preds))
    print(f"ROC-AUC: {roc_auc_score(y_test, scores):.4f}")

In [None]:
# Hold out a validation set explicitly.
# Keras' `validation_split` takes the LAST fraction of the arrays, before
# any shuffling. Resampler output is not shuffled, so that tail is not a
# representative mix of both classes (NOTE(review): SMOTE is understood to
# append its synthetic minority rows at the end - confirm for the installed
# imblearn version). A stratified, shuffled split avoids the problem.
X_nn_train, X_nn_val, y_nn_train, y_nn_val = train_test_split(
    X_smote, y_smote, test_size=0.2, stratify=y_smote, random_state=42
)

# Small fully-connected binary classifier: two hidden ReLU layers with
# dropout, and a sigmoid output giving the fraud probability.
model = Sequential([
    Dense(64, input_dim=X_smote.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(
    X_nn_train,
    y_nn_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_nn_val, y_nn_val),
    verbose=1,
)

# Evaluate on test set: threshold the predicted probabilities at 0.5
y_pred_nn = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred_nn))

### ✅ Conclusion
- Compare the models on F1-score, the precision–recall curve, and ROC-AUC; with classes this imbalanced, plain accuracy is not a reliable guide.
- Logistic Regression offers good interpretability, while XGBoost/LightGBM can deliver better predictive performance.
- The neural network can capture nonlinear patterns but may need tuning (e.g. architecture, number of epochs, decision threshold).
- Consider ensembling, feature selection, or model saving for deployment in future iterations.