In [None]:
!pip install imbalanced-learn




In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek




In [None]:
data = pd.read_csv("sample_data/Creditcard_data.csv")
data.head()


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
print(data['Class'].value_counts())

Class
0    763
1      9
Name: count, dtype: int64


In [None]:
X = data.drop('Class', axis=1)
y = data['Class']


In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
sampling_methods = {
    "Sampling1_Under": RandomUnderSampler(random_state=42),
    "Sampling2_Over": RandomOverSampler(random_state=42),
    "Sampling3_SMOTE": SMOTE(random_state=42),
    "Sampling4_SMOTETomek": SMOTETomek(random_state=42),
    "Sampling5_Stratif": None   # handled separately
}


In [None]:
model_bank = {
    "M1_LogReg": LogisticRegression(max_iter=1000),
    "M2_DecTree": DecisionTreeClassifier(random_state=42),
    "M3_RandForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "M4_KNN": KNeighborsClassifier(n_neighbors=5),
    "M5_SVM": SVC(kernel='rbf')
}


In [None]:
accuracy_table = pd.DataFrame(
    index=model_bank.keys(),
    columns=sampling_methods.keys()
)


In [None]:
for samp_name, sampler in sampling_methods.items():

    if sampler is not None:
        X_res, y_res = sampler.fit_resample(X_scaled, y)
    else:
        X_res, y_res = X_scaled, y  # stratified handled in split

    X_train, X_test, y_train, y_test = train_test_split(
        X_res, y_res,
        test_size=0.3,
        random_state=42,
        stratify=y_res
    )

    for model_name, model in model_bank.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        acc = accuracy_score(y_test, predictions)
        accuracy_table.loc[model_name, samp_name] = round(acc * 100, 2)


In [None]:
accuracy_table


Unnamed: 0,Sampling1_Under,Sampling2_Over,Sampling3_SMOTE,Sampling4_SMOTETomek,Sampling5_Stratif
M1_LogReg,50.0,91.7,91.7,91.7,98.71
M2_DecTree,66.67,99.13,98.47,98.47,98.28
M3_RandForest,50.0,100.0,99.13,99.13,98.71
M4_KNN,33.33,98.25,94.1,94.1,98.71
M5_SVM,50.0,97.82,98.25,98.25,98.71


In [None]:
best_sampling = accuracy_table.astype(float).idxmax(axis=1)
best_sampling


Unnamed: 0,0
M1_LogReg,Sampling5_Stratif
M2_DecTree,Sampling2_Over
M3_RandForest,Sampling2_Over
M4_KNN,Sampling5_Stratif
M5_SVM,Sampling5_Stratif


In [None]:
for model in accuracy_table.index:
    best = accuracy_table.loc[model].astype(float).idxmax()
    score = accuracy_table.loc[model, best]
    print(f"{model} performs best with {best} (Accuracy = {score}%)")


M1_LogReg performs best with Sampling5_Stratif (Accuracy = 98.71%)
M2_DecTree performs best with Sampling2_Over (Accuracy = 99.13%)
M3_RandForest performs best with Sampling2_Over (Accuracy = 100.0%)
M4_KNN performs best with Sampling5_Stratif (Accuracy = 98.71%)
M5_SVM performs best with Sampling5_Stratif (Accuracy = 98.71%)
