<a href="https://colab.research.google.com/github/kavya22115/Predictive_Analytics/blob/Creditcard_Fraud_Detection/Creditcardfrauddetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
pip install imbalanced-learn



In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    precision_recall_curve, auc
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

try:
    df = pd.read_csv('creditcard.csv')
except FileNotFoundError:
    print("Error: 'creditcard.csv' not found. Please ensure the file is in the correct directory.")
    exit()

X = df.drop('Class', axis=1)
y = df['Class']

# Check the class distribution of the loaded file
print("Class distribution of the loaded 'creditcard.csv':")
print(y.value_counts())

scaler = StandardScaler()
X[['Time', 'Amount']] = scaler.fit_transform(X[['Time', 'Amount']])

# Correcting train_test_split based on class distribution
# If Class 1 has 1 instance, stratify=y will cause an error.
# If Class 1 has >1 instance, stratify=y is recommended for imbalanced datasets.
if y.value_counts()[1] >= 2 and len(y) >= 2 / 0.2: # Check if enough instances for stratification and test_size
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print("\nUsed stratify=y in train_test_split.")
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("\nSkipped stratify=y due to insufficient minority class samples for stratified splitting.")
    print("Warning: With very few (e.g., 1) minority class samples, building a robust fraud detection model is severely limited.")
    print("Consider acquiring a dataset with more fraud instances for meaningful analysis.")


print("\n--- Logistic Regression (Baseline) ---")
lr_model = LogisticRegression(solver='liblinear', random_state=42, n_jobs=-1)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
y_prob_lr = lr_model.predict_proba(X_test)[:, 1]

print("Confusion Matrix for Logistic Regression:")
print(confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report for Logistic Regression:")
print(classification_report(y_test, y_pred_lr))
print(f"ROC AUC Score for Logistic Regression: {roc_auc_score(y_test, y_prob_lr):.4f}")
precision_lr, recall_lr, _ = precision_recall_curve(y_test, y_prob_lr)
pr_auc_lr = auc(recall_lr, precision_lr)
print(f"Precision-Recall AUC for Logistic Regression: {pr_auc_lr:.4f}")

print("\n--- Logistic Regression with SMOTE Oversampling ---")
# SMOTE requires at least 2 samples to create synthetic samples.
# It also needs more than 1 sample of the minority class to be able to work properly.
# Check if SMOTE can be applied. If y_train has only one fraud instance, SMOTE will fail.
# Here, we'll only apply SMOTE if there are enough fraud instances in y_train
if y_train.value_counts().get(1, 0) > 1: # Check if class 1 has more than 1 instance in training set
    pipeline_lr_smote = ImbPipeline([
        ('smote', SMOTE(random_state=42, sampling_strategy='auto')),
        ('classifier', LogisticRegression(solver='liblinear', random_state=42, n_jobs=-1))
    ])

    pipeline_lr_smote.fit(X_train, y_train)
    y_pred_lr_smote = pipeline_lr_smote.predict(X_test)
    y_prob_lr_smote = pipeline_lr_smote.predict_proba(X_test)[:, 1]

    print("Confusion Matrix for Logistic Regression with SMOTE:")
    print(confusion_matrix(y_test, y_pred_lr_smote))
    print("\nClassification Report for Logistic Regression with SMOTE:")
    print(classification_report(y_test, y_pred_lr_smote))
    print(f"ROC AUC Score for LR with SMOTE: {roc_auc_score(y_test, y_prob_lr_smote):.4f}")
    precision_lr_smote, recall_lr_smote, _ = precision_recall_curve(y_test, y_prob_lr_smote)
    pr_auc_lr_smote = auc(recall_lr_smote, precision_lr_smote)
    print(f"Precision-Recall AUC for LR with SMOTE: {pr_auc_lr_smote:.4f}")
else:
    print("SMOTE Oversampling skipped: Not enough minority class samples in the training set to apply SMOTE effectively.")

print("\n--- Random Forest Classifier ---")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report for Random Forest Classifier:")
print(classification_report(y_test, y_pred_rf))
print(f"ROC AUC Score for Random Forest: {roc_auc_score(y_test, y_prob_rf):.4f}")
precision_rf, recall_rf, _ = precision_recall_curve(y_test, y_prob_rf)
pr_auc_rf = auc(recall_rf, precision_rf)
print(f"Precision-Recall AUC for Random Forest: {pr_auc_rf:.4f}")

print("\n--- Gradient Boosting Classifier ---")
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
y_prob_gb = gb_model.predict_proba(X_test)[:, 1]

print("Confusion Matrix for Gradient Boosting Classifier:")
print(confusion_matrix(y_test, y_pred_gb))
print("\nClassification Report for Gradient Boosting Classifier:")
print(classification_report(y_test, y_pred_gb))
print(f"ROC AUC Score for Gradient Boosting: {roc_auc_score(y_test, y_prob_gb):.4f}")
precision_gb, recall_gb, _ = precision_recall_curve(y_test, y_prob_gb)
pr_auc_gb = auc(recall_gb, precision_gb)
print(f"Precision-Recall AUC for Gradient Boosting: {pr_auc_gb:.4f}")

# Plotting PR curves - only if there are enough fraud instances in y_test
if y_test.value_counts().get(1, 0) > 0: # Ensure there is at least one fraud case in test set for plotting PR curve
    plt.figure(figsize=(10, 7))
    plt.plot(recall_lr, precision_lr, label=f'LR (AUC = {pr_auc_lr:.2f})')
    # Plot SMOTE curve only if it was run
    if y_train.value_counts().get(1, 0) > 1:
        plt.plot(recall_lr_smote, precision_lr_smote, label=f'LR+SMOTE (AUC = {pr_auc_lr_smote:.2f})')
    plt.plot(recall_rf, precision_rf, label=f'Random Forest (AUC = {pr_auc_rf:.2f})')
    plt.plot(recall_gb, precision_gb, label=f'Gradient Boosting (AUC = {pr_auc_gb:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve for Fraud Detection')
    plt.legend()
    plt.grid(True)
    plt.show()
else:
    print("\nPrecision-Recall Curve not plotted: No fraud instances in the test set to evaluate PR curve.")

print("\n--- Example Prediction for a New Transaction ---")
new_transaction = pd.DataFrame([[0.0, -0.5, 0.5, 1.0, -0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 100.0]],
                                  columns=X.columns)
new_transaction[['Time', 'Amount']] = scaler.transform(new_transaction[['Time', 'Amount']])
predicted_class = gb_model.predict(new_transaction)[0]
predicted_prob = gb_model.predict_proba(new_transaction)[0, 1]

if predicted_class == 1:
    print(f"Predicted: FRAUD (Probability: {predicted_prob:.4f})")
else:
    print(f"Predicted: NON-FRAUD (Probability: {predicted_prob:.4f})")

Class distribution of the loaded 'creditcard.csv':
Class
0    47
1     1
Name: count, dtype: int64

Skipped stratify=y due to insufficient minority class samples for stratified splitting.
Consider acquiring a dataset with more fraud instances for meaningful analysis.

--- Logistic Regression (Baseline) ---
Confusion Matrix for Logistic Regression:
[[10]]

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

ROC AUC Score for Logistic Regression: nan
Precision-Recall AUC for Logistic Regression: 0.5000

--- Logistic Regression with SMOTE Oversampling ---
SMOTE Oversampling skipped: Not enough minority class samples in the training set to apply SMOTE effectively.

--- Random Forest Classifier ---
Confusion Matrix for Random F



Confusion Matrix for Gradient Boosting Classifier:
[[10]]

Classification Report for Gradient Boosting Classifier:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

ROC AUC Score for Gradient Boosting: nan
Precision-Recall AUC for Gradient Boosting: 0.5000

Precision-Recall Curve not plotted: No fraud instances in the test set to evaluate PR curve.

--- Example Prediction for a New Transaction ---
Predicted: NON-FRAUD (Probability: 0.0000)


