In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Record the NumPy version in the output so the run can be reproduced later.
print(f"NumPy: {np.__version__}")



In [None]:
# Read the CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="creditcard.csv")

# Preview the first five rows as the cell output.
df.head(n=5)


In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape



In [None]:
# Print per-column dtypes and non-null counts plus the frame's memory usage.
df.info()


Check imbalance

In [None]:
# Number of rows for each value of the 'Class' label; shows how skewed the target is.
df['Class'].value_counts()


In [None]:
# Bar chart of how many rows carry each 'Class' label.
# `sns` and `plt` come from the import cell at the top of the notebook, so
# they are not imported a second time here.
fig, ax = plt.subplots()

# Draw onto an explicit Axes instead of relying on pyplot's implicit
# "current axes" state, so the figure is unambiguous if cells are re-run.
sns.countplot(x='Class', data=df, ax=ax)
ax.set_title("Class Distribution (0 = Legit, 1 = Fraud)")
plt.show()


In [None]:
# The mean of the 'Class' column, scaled to a percentage.
# With 0/1 labels (see the plot title above) this is the share of fraud rows.
fraud_percentage = 100 * df['Class'].mean()
fraud_percentage


In [None]:
# Separate the target label from the predictor columns.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions, and the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# A notebook cell renders only its final expression, so two bare
# value_counts() calls would display the test proportions and silently drop
# the training ones. Showing both side by side in one table lets the reader
# confirm that the stratified split preserved the class ratio.
pd.DataFrame({
    "train": y_train.value_counts(normalize=True),
    "test": y_test.value_counts(normalize=True),
})


In [None]:
# StandardScaler was used without being imported anywhere in the notebook,
# which raises NameError on a fresh kernel; import it where it is used.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Learn the per-feature mean and variance from the training split only, then
# reuse those statistics on the test split so no test information leaks into
# the preprocessing step.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Baseline model: Gaussian Naive Bayes trained on the standardized features.
# fit() returns the estimator itself, so construction and training can be chained.
nb_model = GaussianNB().fit(X_train_scaled, y_train)

# Hard labels feed the classification report; column 1 of predict_proba is
# the probability of class 1, which is what ROC-AUC is computed from.
y_pred_nb = nb_model.predict(X_test_scaled)
y_prob_nb = nb_model.predict_proba(X_test_scaled)[:, 1]

print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))

roc_auc_nb = roc_auc_score(y_test, y_prob_nb)
print("Naive Bayes ROC-AUC:", roc_auc_nb)


“The Naive Bayes classifier achieved a high ROC-AUC of 0.96, indicating good separability between fraudulent and legitimate transactions. However, despite a high recall of 85% for fraud detection, the precision was extremely low (6%), resulting in a large number of false positives. This makes Naive Bayes unsuitable for real-world deployment, but useful as a baseline model.”

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# class_weight='balanced' reweights samples inversely to class frequency;
# max_iter is raised to 1000 to give the solver more iterations to converge.
log_model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(
    X_train_scaled, y_train
)

# Predicted labels for the report, class-1 probabilities for ROC-AUC.
y_pred_log = log_model.predict(X_test_scaled)
y_prob_log = log_model.predict_proba(X_test_scaled)[:, 1]

print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log))

roc_auc_log = roc_auc_score(y_test, y_prob_log)
print("Logistic Regression ROC-AUC:", roc_auc_log)


With `class_weight='balanced'`, the model penalizes misclassified fraud cases more heavily, which is why recall improved.

“Logistic Regression with class balancing achieved a high ROC-AUC of 0.97 and improved fraud recall to 92%. However, precision remained low due to the extreme class imbalance. This highlights the inherent trade-off between fraud detection sensitivity and false-positive rate in real-world financial systems.”

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees, class-balanced sample weights, all CPU cores (n_jobs=-1), and a
# fixed seed so the forest is reproducible.
rf_model = RandomForestClassifier(
    n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1
)

# Tree splits are invariant to feature scale, so the unscaled frames are used
# for both fitting and prediction.
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

roc_auc_rf = roc_auc_score(y_test, y_prob_rf)
print("Random Forest ROC-AUC:", roc_auc_rf)


“The Random Forest classifier achieved the best overall performance, with a fraud precision of 96% and an F1-score of 0.84. Although its recall (74%) was lower than that of Logistic Regression (92%), the model significantly reduced false positives, making it more suitable for real-world deployment. This highlights the importance of balancing fraud detection sensitivity and operational cost.”

In [None]:
!pip install shap
import shap

# Create SHAP explainer for Random Forest
explainer = shap.TreeExplainer(rf_model)

# Compute SHAP values on test data
shap_values = explainer.shap_values(X_test)
