In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import HistGradientBoostingClassifier


In [8]:
# Load dataset
# A forward slash is used as the separator: the previous 'dataset\c...' literal
# contained the invalid escape sequence "\c" and only resolved on Windows.
df = pd.read_csv('dataset/cleaned_creditcard.csv')

# Split the data into features (X) and target (y)
X = df.drop(columns=['Class'])
y = df['Class']

# Split into train and test sets.
# stratify=y keeps the class ratio of `Class` the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale the features.
# The scaler is fitted on the training split only and then reused on the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the *scaled* training data.
# Only the training split is resampled; the test split keeps its original
# class distribution.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Print class distributions
print("Class distribution before SMOTE:\n", pd.Series(y_train).value_counts())
print("Class distribution after SMOTE:\n", pd.Series(y_train_smote).value_counts())


Class distribution before SMOTE:
 Class
0    226602
1       378
Name: count, dtype: int64
Class distribution after SMOTE:
 Class
0    226602
1    226602
Name: count, dtype: int64


In [17]:
from sklearn.metrics import classification_report, accuracy_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Candidate classifiers. Each one is fitted on the SMOTE-resampled, scaled
# training data and evaluated on the untouched, scaled test split.
models = {
    # The saga solver shuffles the data, so it is seeded like the other models
    # to make the reported metrics reproducible across runs.
    # NOTE(review): a previous run printed "The max_iter was reached which means
    # the coef_ did not converge" at max_iter=500 -- consider raising max_iter.
    "Logistic Regression": LogisticRegression(max_iter=500, solver='saga', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42),
    "Gradient Boosting": HistGradientBoostingClassifier(max_iter=50, random_state=42)
}

results = {}  # Maps model name -> dict of metrics, confusion matrix and scores

for name, model in models.items():
    model.fit(X_train_smote, y_train_smote)
    y_pred = model.predict(X_test_scaled)
    # Positive-class probability; None when the estimator has no predict_proba.
    y_prob = model.predict_proba(X_test_scaled)[:, 1] if hasattr(model, "predict_proba") else None

    # Compute metrics
    acc = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # AUC needs continuous scores, so it is None whenever y_prob is None.
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None

    # Compute confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Store metrics and confusion matrix
    results[name] = {
        "Accuracy": acc,
        "Recall": recall,
        "F1-score": f1,
        "AUC-ROC": auc,
        "Confusion Matrix": conf_matrix,
        "Probabilities": y_prob  # Store predicted probabilities (may be None)
    }

    # Print classification report
    print(f"\n{name} Performance:")
    print(classification_report(y_test, y_pred))






The max_iter was reached which means the coef_ did not converge




Logistic Regression Performance:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56651
           1       0.05      0.87      0.10        95

    accuracy                           0.97     56746
   macro avg       0.53      0.92      0.54     56746
weighted avg       1.00      0.97      0.99     56746


Random Forest Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56651
           1       0.91      0.76      0.83        95

    accuracy                           1.00     56746
   macro avg       0.96      0.88      0.91     56746
weighted avg       1.00      1.00      1.00     56746


Gradient Boosting Performance:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56651
           1       0.32      0.85      0.46        95

    accuracy                           1.00     56746
   macro avg       0.66      0.92 

In [14]:
# Visualizing Model Performance with Plotly
import plotly.graph_objects as go
import plotly.express as px

# Pull the two headline metrics out of `results`, keeping the model order.
model_names = list(results)
accuracies = [scores["Accuracy"] for scores in results.values()]
auc_rocs = [scores["AUC-ROC"] for scores in results.values()]

# Grouped bar chart: one group per model, one bar per metric.
fig = go.Figure(
    data=[
        go.Bar(x=model_names, y=accuracies, name='Accuracy', marker_color='blue'),
        go.Bar(x=model_names, y=auc_rocs, name='AUC-ROC', marker_color='orange'),
    ]
)

layout_options = dict(
    title='Model Performance Comparison',
    xaxis_title='Models',
    yaxis_title='Score',
    barmode='group',
    template='plotly_dark',
)
fig.update_layout(**layout_options)

fig.show()


In [15]:
# Visualizing Confusion Matrices with Plotly
# Axis labels are shared by every heatmap, so they are defined once up front.
actual_labels = ["Actual 0", "Actual 1"]
predicted_labels = ["Predicted 0", "Predicted 1"]
heatmap_layout = dict(
    xaxis_title="Predicted",
    yaxis_title="Actual",
    template="plotly_dark",
)

# Render one annotated heatmap per model stored in `results`.
for model_name, model_result in results.items():
    conf_matrix = model_result["Confusion Matrix"]

    # Wrap the raw matrix in a labelled DataFrame so Plotly shows readable ticks.
    conf_df = pd.DataFrame(
        conf_matrix,
        index=actual_labels,
        columns=predicted_labels,
    )

    fig = px.imshow(
        conf_df,
        text_auto=True,
        color_continuous_scale='Blues',
        title=f'Confusion Matrix: {model_name}',
    )
    fig.update_layout(**heatmap_layout)
    fig.show()


In [18]:
# Precision-Recall Curves (Plotly)
import numpy as np
from sklearn.metrics import precision_recall_curve

fig = go.Figure()

# Loop through models and plot PR curves
for model_name, model_result in results.items():
    y_true = y_test
    y_scores = model_result["Probabilities"]  # Predicted probabilities

    # The training loop stores None for estimators without predict_proba;
    # a PR curve cannot be computed without scores, so skip those models.
    if y_scores is None:
        print(f"Skipping {model_name}: no predicted probabilities available.")
        continue

    # Distinct names keep the curve arrays from overwriting the scalar `recall`
    # bound by the training loop.
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_scores)

    fig.add_trace(go.Scatter(
        x=pr_recall, y=pr_precision, mode='lines',
        name=model_name
    ))

fig.update_layout(
    title="Precision-Recall Curve",
    xaxis_title="Recall",
    yaxis_title="Precision",
    template="plotly_dark"
)

fig.show()
