In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE



In [3]:
#Loading the CSV
df = pd.read_csv('creditcard 2.csv') 
print("Class distribution before cleaning:")
print(df['Class'].value_counts(dropna=False))

Class distribution before cleaning:
Class
0    284315
1       492
Name: count, dtype: int64


In [4]:
# Handle NaNs in Class column
df = df.copy()
df = df.dropna(subset=['Class'])

print("Class distribution after removing NaNs:")
print(df['Class'].value_counts())

Class distribution after removing NaNs:
Class
0    284315
1       492
Name: count, dtype: int64


In [5]:
# Normalize Amount column
scaler = StandardScaler()
if 'Amount' in df.columns:
    df['norm_Amount'] = scaler.fit_transform(df[['Amount']])
    df = df.drop(['Amount'], axis=1)

In [6]:
# Separate into features and label
X = df.drop('Class', axis=1)
y = df['Class']

print("Features shape :", X.shape)
print("Target shape :", y.shape)

Features shape : (284807, 30)
Target shape : (284807,)


In [7]:
# Handle Imbalanced Classes with SMOTE 
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X, y)

print("Class distribution after SMOTE:")
print(pd.Series(y_smote).value_counts())

Class distribution after SMOTE:
Class
0    284315
1    284315
Name: count, dtype: int64


In [22]:
# Split into Train and Test
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote,  test_size=0.3, random_state=42)


print("Training set size :", X_train.shape)
print("Test set size :", X_test.shape)

Training set size : (398041, 30)
Test set size : (170589, 30)


In [9]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_test)

print("Precision :", precision_score(y_test, y_pred))
print("Recall :", recall_score(y_test, y_pred))
print("F1-score :", f1_score(y_test, y_pred))
print("AUC :", roc_auc_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Precision : 0.9998361691670372
Recall : 1.0
F1-score : 0.9999180778728335
AUC : 0.9999177911660736
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85149
           1       1.00      1.00      1.00     85440

    accuracy                           1.00    170589
   macro avg       1.00      1.00      1.00    170589
weighted avg       1.00      1.00      1.00    170589

Confusion Matrix:
[[85135    14]
 [    0 85440]]


In [20]:
model1=  LogisticRegression(max_iter=999, random_state=42, class_weight='balanced')
model1.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
y_pred = model1.predict(X_test)

print("Precision :", precision_score(y_test, y_pred))
print("Recall :", recall_score(y_test, y_pred))
print("F1-score :", f1_score(y_test, y_pred))
print("AUC :", roc_auc_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Precision : 0.9906684053740414
Recall : 0.9691830524344569
F1-score : 0.9798079596750815
AUC : 0.9800113197556142
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     85149
           1       0.99      0.97      0.98     85440

    accuracy                           0.98    170589
   macro avg       0.98      0.98      0.98    170589
weighted avg       0.98      0.98      0.98    170589

Confusion Matrix:
[[84369   780]
 [ 2633 82807]]
