Importing modules for sampling

In [50]:
import pandas as pd
import numpy as np
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import AgglomerativeClustering
from imblearn.over_sampling import RandomOverSampler

Converting the data-set into balanced class data-set

In [51]:
data=pd.read_csv('/content/Creditcard_data.csv')
x,y=RandomOverSampler(random_state=42).fit_resample(data.iloc[:,:-1],data.iloc[:,-1])
data=pd.concat([x,y],axis=1)

Creating five samples using different techniques

In [52]:
c=0.9
p=np.sum(data.iloc[:,-1])/len(data.iloc[:,-1])
E=1-c
Z=norm.ppf(1-E/2)
n=int(np.ceil(Z**2*p*(1-p)/(E**2)))
Random=data.sample(n=n)
indices=np.arange(0,len(data),int(len(data)/n))
Systematic=data.iloc[indices]
Convenience=pd.concat([data[data['Class']==0].sample(int(n/2)),data[data['Class']==1].sample(int(n/2))]) 
S=data['Class'].nunique()
n=int(np.ceil(Z**2*p*(1-p)/((E/S)**2)))
Stratified=data.groupby('Class').apply(lambda x: x.sample(n=int(n/S),replace=True)).reset_index(drop=True)
clustering=AgglomerativeClustering().fit(data.drop('Class', axis=1))
labels=clustering.labels_
cluster_sizes={}
for label in set(labels):
    cluster_sizes[label]=sum(labels == label)
C=sum(cluster_sizes.values())/len(cluster_sizes)
c=0.07
E=1-c
Z=norm.ppf(1-E/2)
n3=int(np.ceil(Z**2*p*(1-p)/((E/C)**2)))
Cluster=data.sample(n3,replace=False)
samples=[Random,Systematic,Convenience,Stratified,Cluster]

Applying five different ML models on each sample

In [53]:
classifiers=[RandomForestClassifier(),DecisionTreeClassifier(),GradientBoostingClassifier(),AdaBoostClassifier(),KNeighborsClassifier()]
results=pd.DataFrame(index=[str(clf) for clf in classifiers],columns=[sample_name for sample_name, sample in zip(['Random','Systematic','Convenience','Stratified','Cluster'],samples)])
for i,sample in enumerate(samples):
    sample_name=['Random','Systematic','Convenience','Stratified','Cluster'][i]
    x_train,x_test,y_train,y_test=train_test_split(sample.iloc[:,:-1],sample.iloc[:,-1])
    for clf in classifiers:
        clf.fit(x_train,y_train)
        accuracy = accuracy_score(y_test,clf.predict(x_test))
        results.loc[str(clf),sample_name]=accuracy
results=results.astype('float64')
print(results)

                                Random  Systematic  Convenience  Stratified  \
RandomForestClassifier()      0.882353    0.944444     0.941176    1.000000   
DecisionTreeClassifier()      0.705882    0.888889     0.941176    0.926471   
GradientBoostingClassifier()  0.823529    0.888889     0.941176    1.000000   
AdaBoostClassifier()          0.823529    0.888889     0.941176    0.955882   
KNeighborsClassifier()        0.823529    0.555556     0.823529    0.955882   

                               Cluster  
RandomForestClassifier()      1.000000  
DecisionTreeClassifier()      0.993846  
GradientBoostingClassifier()  0.996923  
AdaBoostClassifier()          1.000000  
KNeighborsClassifier()        0.975385  


Determining which sampling technique gives higher accuracy on which model

In [54]:
for clf in classifiers:
    max_value=results.loc[str(clf)].max()
    max_indices=np.where(results.loc[str(clf)]==max_value)[0]
    if len(max_indices)==1:
        print(results.loc[str(clf)].idxmax(),'sampling technique gives higher accuracy on',str(clf))
    else:
        print(', '.join(results.columns[max_indices]),'sampling techniques give higher accuracy on ',str(clf))

Stratified, Cluster sampling techniques give higher accuracy on  RandomForestClassifier()
Cluster sampling technique gives higher accuracy on DecisionTreeClassifier()
Stratified sampling technique gives higher accuracy on GradientBoostingClassifier()
Cluster sampling technique gives higher accuracy on AdaBoostClassifier()
Cluster sampling technique gives higher accuracy on KNeighborsClassifier()


Calculating Topsis score to rank models

In [55]:
df=results.copy(deep=False)
weight=[1,1,1,1,1]
impact=['+','+','+','+','+']
c=len(df.axes[1])
for j in range(1,c):
    t=0
    for i in range(len(df)):
        t+=df.iloc[i,j]**2
    t**=0.5
    for i in range(len(df)):
        df.iloc[i,j]=(df.iloc[i,j]/t)*weight[j-1]
best=df.max().values[1:]
worst=df.min().values[1:]
for i in range(0,c-1):
    if impact[i]=='-':
        t=best[i]
        best[i]=worst[i]
        worst[i]=t
top=np.zeros(len(df))
for i in range(len(df)):
    t1=0
    t2=0
    for j in range(1,c):
        t1+=(df.iloc[i,j]-best[j-1])**2
        t2+=(df.iloc[i,j]-worst[j-1])**2
    top[i]=t2**0.5/(t2**0.5+t1**0.5)
df['Topsis Score']=top
df['Rank']=df['Topsis Score'].rank(method='max',ascending=False)
df.drop(df.columns[[0,1,2,3,4]],axis=1,inplace=True)
print(df)

                              Topsis Score  Rank
RandomForestClassifier()          1.000000   1.0
DecisionTreeClassifier()          0.804861   4.0
GradientBoostingClassifier()      0.865129   2.0
AdaBoostClassifier()              0.838902   3.0
KNeighborsClassifier()            0.059475   5.0
