In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./Creditcard_data.csv')

In [3]:
df['Class'].value_counts()

0    763
1      9
Name: Class, dtype: int64

In [4]:
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [5]:
# Train test split

In [6]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =train_test_split(X,y,test_size=0.2)

In [7]:
# Unbalanced data set so sampling must be applied

In [8]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

In [9]:
sampling_tech = {
    'Random-Under(U)' : RandomUnderSampler(replacement=True),
    'Random-Over(O)' : RandomOverSampler(),
    'Tomek(U)' : TomekLinks(sampling_strategy='majority'),
    'SMOTE(O)' : SMOTE(),
    'NearMiss(U)' : NearMiss()
}

In [10]:
# Sample size
z = 1.96 # 95% confidence
e = 0.05
p = 0.05    # 5% frauds

n = (z**2 * p * (1-p) )//(e**2)


In [11]:
# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [12]:
models = {
    'KNN' : KNeighborsClassifier(n_neighbors=3),
    'Random Forest' : RandomForestClassifier(n_estimators=100,criterion='entropy'),
    'XGB': XGBClassifier(),
    'SVC' : SVC(),
    'Naive Bayes' : GaussianNB(),
}

In [13]:
def get_random_n(X,y,n):
    X = pd.DataFrame(X)
    y = pd.DataFrame(y)
    df = pd.concat((X,y),axis=1).sample(int(n),replace=True)
    return df.iloc[:,:-1].values,df.iloc[:,-1].values

In [14]:
from sklearn.metrics import accuracy_score

final_df = {}

for name,sampler in sampling_tech.items():
    final_df[name]=[]
    X_sampled,y_sampled = sampler.fit_resample(X_train,y_train)

    if name!='Tomek(U)':    # not using sample size for Tomek links
        X_sampled,y_sampled = get_random_n(X_sampled,y_sampled,n)
    
    for model_name,model in models.items():
        model.fit(X_sampled,y_sampled)
        y_pred = model.predict(X_test)
        ac = accuracy_score(y_pred=y_pred,y_true=y_test)
        final_df[name].append(ac)
# print(final_df)
pd.DataFrame(final_df,index=models.keys())
    

Unnamed: 0,Random-Under(U),Random-Over(O),Tomek(U),SMOTE(O),NearMiss(U)
KNN,0.619355,0.8,0.980645,0.548387,0.116129
Random Forest,0.619355,0.948387,0.980645,0.903226,0.496774
XGB,0.490323,0.903226,0.980645,0.83871,0.212903
SVC,0.690323,0.76129,0.980645,0.116129,0.380645
Naive Bayes,0.890323,0.83871,0.883871,0.903226,0.354839
