In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, balanced_accuracy_score, precision_score, \
recall_score 
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from statistics import mean
from scipy.stats import wilcoxon
import json
import random

## Import Sampling Strategies

In [2]:
#Combiner
from imblearn.combine import SMOTEENN

#OverSampling
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE

#UnderSamling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import EditedNearestNeighbours

Using TensorFlow backend.


In [3]:
sampling = {
    "nearMiss" : NearMiss(version=3, n_neighbors_ver3=1),
    "enn" : EditedNearestNeighbours(),
    "randomUnderSampler" : RandomUnderSampler(random_state=0),
    "adasyn" : ADASYN(),
    "smote" : SMOTE(random_state=0),
    "smoteenn" : SMOTEENN(random_state=0)
}

<h1>Nomralization Methods<h1>

In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
nomralization = {
    'MinMax' : MinMaxScaler(),
    'z_score' : StandardScaler()
}

<h2>Import Algorithms And There Tools</h2>

In [5]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
#DECISION TREE
from sklearn.tree import DecisionTreeClassifier, plot_tree
#SVM
from sklearn.svm import SVC
#MLP
from sklearn.neural_network import MLPClassifier

<h2>Functions</h2>
<p>On a essayé d'implimenter une fonction, pour chaque algorithme, soit pour obtenir les resultats ou soit pour les afficher</p>
<h2>Functions for each algorithm :</h2>

<h2>KNeighborsClassifier</h2>
<p>la fonction <strong>getDataKNN</strong> retourne la resultat d'un seul technique de sampling en passant le nom de la technique au parametres</p>
<p>la fonction <strong>getAllDataKNN</strong> retourne les resultats avec tout les technique de sampling</p>

In [6]:
def getDataKNN(x_train,x_test,y_train,y_test,sampling="null",samplingName="null"): 
    tmp={}
    if sampling == "null":
        
        grid_param = {
            'n_neighbors' : list(range(2,11)),
            'p' : list(range(1,4))
        }
    
        gs = GridSearchCV(KNeighborsClassifier(), grid_param, cv=10, scoring='balanced_accuracy')
        gs_results = gs.fit(x_train,y_train)
        
        y_pred = gs.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        
        tmp["sampling"] = "Without Sampling"
        tmp["params"] = gs_results.best_params_   
        tmp["accuracy"] = balanced_accuracy_score(y_test, y_pred)
        tmp["precision_score"] = precision_score(y_test, y_pred)
        tmp["recall_score"] = recall_score(y_test, y_pred)
        tmp["specificity_score"] = cm[0][0]/(cm[0][0]+cm[1][0])
        tmp["best_score"] = gs_results.best_score_
        tmp["confusion_matrix"] = cm
        tmp["classification_report"] = classification_report(y_test,y_pred)
        
    else:
        x, y = sampling.fit_resample(x_train, y_train)

        grid_param = {
            'n_neighbors' : list(range(2,11)),
            'p' : list(range(1,4))
        }

        gs = GridSearchCV(KNeighborsClassifier(), grid_param, scoring='accuracy', cv=10)
        gs_results = gs.fit(x,y)
        y_pred = gs.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        
        tmp["sampling"] = samplingName
        tmp["params"] = gs_results.best_params_
        tmp["accuracy"] = accuracy_score(y_test, y_pred)
        tmp["precision_score"] = precision_score(y_test, y_pred)
        tmp["recall_score"] = recall_score(y_test, y_pred)
        tmp["specificity_score"] = cm[0][0]/(cm[0][0]+cm[1][0])
        tmp["best_score"] = gs_results.best_score_
        tmp["confusion_matrix"] = cm
        tmp["classification_report"] = classification_report(y_test,y_pred)
        
    return tmp

In [7]:
def getAllDataKNN(x_train,x_test,y_train,y_test):
    data = {}
    data["KNN"]=[]
    keys = list(sampling.keys())
    keys.insert(0,"withoutChange")
    for key in keys:
        if(key == "withoutChange"):
            data["KNN"].append(getDataKNN(x_train,x_test,y_train,y_test))
        else :
            data["KNN"].append(getDataKNN(x_train,x_test,y_train,y_test,sampling=sampling[key],samplingName=key))
    return data

### DecisionTreeClassifier
<p>la fonction <strong>getDataDT</strong> retourne la resultat d'un seul technique de sampling en passant le nom de la technique au parametres</p>
<p>la fonction <strong>getAllDataKDT</strong> retourne les resultats avec tout les technique de sampling</p>

In [8]:
def getDataDT(x_train,x_test,y_train,y_test,sampling="null",samplingName="null"):
    tmp={}
    if sampling == "null":
        
        grid_param = {
            'max_leaf_nodes': list(range(2, 6)), 
            'min_samples_split': list(range(20, 70,10)),
            'criterion' : ["gini","entropy"]
        }

        gs = GridSearchCV(DecisionTreeClassifier(), grid_param, cv=10, scoring='balanced_accuracy')
        gs_results = gs.fit(x_train,y_train)
        
        y_pred = gs.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        
        tmp["x"]=x_train
        tmp["y"]=y_train
        tmp["sampling"] = "Without Sampling"
        tmp["params"] = gs_results.best_params_
        tmp["accuracy"] = balanced_accuracy_score(y_test, y_pred)
        tmp["precision_score"] = precision_score(y_test, y_pred)
        tmp["recall_score"] = recall_score(y_test, y_pred)
        tmp["specificity_score"] = cm[0][0]/(cm[0][0]+cm[1][0])
        tmp["best_score"] = gs_results.best_score_
        tmp["confusion_matrix"] = cm
        tmp["classification_report"] = classification_report(y_test,y_pred)
        
    else:
        x, y = sampling.fit_resample(x_train, y_train)

        grid_param = {
            'max_leaf_nodes': list(range(2, 6)), 
            'min_samples_split': list(range(20, 70,10)),
            'criterion' : ["gini","entropy"]
        }

        gs = GridSearchCV(DecisionTreeClassifier(), grid_param, cv=10, scoring='accuracy')
        gs_results = gs.fit(x_train,y_train)
        
        y_pred = gs.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        
        tmp["x"]=x
        tmp["y"]=y
        tmp["sampling"] = samplingName
        tmp["params"] = gs_results.best_params_
        tmp["accuracy"] = accuracy_score(y_test, y_pred)
        tmp["precision_score"] = precision_score(y_test, y_pred)
        tmp["recall_score"] = recall_score(y_test, y_pred)
        tmp["specificity_score"] = cm[0][0]/(cm[0][0]+cm[1][0])
        tmp["best_score"] = gs_results.best_score_
        tmp["confusion_matrix"] = cm
        tmp["classification_report"] = classification_report(y_test,y_pred)
        
    return tmp

In [9]:
def getAllDataDT(x_train,x_test,y_train,y_test):
    data = {}
    data["DT"]=[]
    keys = list(sampling.keys())
    keys.insert(0,"withoutChange")
    for key in keys:
        if(key == "withoutChange"):
            data["DT"].append(getDataDT(x_train,x_test,y_train,y_test))
        else :
            data["DT"].append(getDataDT(x_train,x_test,y_train,y_test,sampling=sampling[key],samplingName=key))
    return data

### SVM
<p>la fonction <strong>getDataSVM</strong> retourne la resultat d'un seul technique de sampling en passant le nom de la technique au parametres</p>
<p>la fonction <strong>getAllDataKSVM</strong> retourne les resultats avec tout les technique de sampling</p>

In [10]:
def getDataSVM(x_train,x_test,y_train,y_test,sampling="null",samplingName="null"):
    tmp={}
    if sampling == "null":
        
        grid_param = [
            {'kernel' : ['linear'], 'C' : [0.001, 0.01, 0.1, 1, 10]},
            {'kernel' : ['rbf'], 'gamma' : [0.001, 0.01, 0.1, 1], 'C' : [0.001, 0.01, 0.1, 1, 10]},
            {'kernel' : ['poly'], 'coef0' : [0.0, 0.25, 0.5, 0.75, 1], 'C' : [0.001, 0.01, 0.1, 1, 10]}
        ]

        gs = GridSearchCV(SVC(), grid_param, cv=10, scoring='balanced_accuracy')
        gs_results = gs.fit(x_train,y_train)
        
        y_pred = gs.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        
        tmp["x"]=x_train
        tmp["y"]=y_train
        tmp["sampling"] = "Without Sampling"
        tmp["params"] = gs_results.best_params_
        tmp["accuracy"] = balanced_accuracy_score(y_test, y_pred)
        tmp["precision_score"] = precision_score(y_test, y_pred)
        tmp["recall_score"] = recall_score(y_test, y_pred)
        tmp["specificity_score"] = cm[0][0]/(cm[0][0]+cm[1][0])
        tmp["best_score"] = gs_results.best_score_
        tmp["confusion_matrix"] = cm
        tmp["classification_report"] = classification_report(y_test,y_pred)
        
    else:
        x, y = sampling.fit_resample(x_train, y_train)

        grid_param = [
            {'kernel' : ['linear'], 'C' : [0.001, 0.01, 0.1, 1, 10]},
            {'kernel' : ['rbf'], 'gamma' : [0.001, 0.01, 0.1, 1], 'C' : [0.001, 0.01, 0.1, 1, 10]},
            {'kernel' : ['poly'], 'coef0' : [0.0, 0.25, 0.5, 0.75, 1], 'C' : [0.001, 0.01, 0.1, 1, 10]}
        ]

        gs = GridSearchCV(SVC(), grid_param, cv=10, scoring='accuracy')
        gs_results = gs.fit(x_train,y_train)
        
        y_pred = gs.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        
        tmp["x"]=x
        tmp["y"]=y
        tmp["sampling"] = samplingName
        tmp["params"] = gs_results.best_params_
        tmp["accuracy"] = accuracy_score(y_test, y_pred)
        tmp["precision_score"] = precision_score(y_test, y_pred)
        tmp["recall_score"] = recall_score(y_test, y_pred)
        tmp["specificity_score"] = cm[1][1]/(cm[1][1]+cm[1][0])
        tmp["best_score"] = gs_results.best_score_
        tmp["confusion_matrix"] = cm
        tmp["classification_report"] = classification_report(y_test,y_pred)
        
    return tmp

In [11]:
def getAllDataSVM(x_train,x_test,y_train,y_test):
    data = {}
    data["SVM"]=[]
    keys = list(sampling.keys())
    keys.insert(0,"withoutChange")
    for key in keys:
        if(key == "withoutChange"):
            data["SVM"].append(getDataSVM(x_train,x_test,y_train,y_test))
        else :
            data["SVM"].append(getDataSVM(x_train,x_test,y_train,y_test,sampling=sampling[key],samplingName=key))
    return data

<h2>Multi-layer Perceptron</h2>

In [12]:
def getDataMLP(x_train,x_test,y_train,y_test,sampling="null",samplingName="null"):
    tmp={}
    if sampling == "null":
        
        grid_param = {
            'learning_rate_init': [0.001,0.01, 0.1, 0.2],
            'momentum': [0.0,0.2,0.4,0.6,0.8,0.9],
            'max_iter' : [100,200,500,1000]
        }

        gs = GridSearchCV(MLPClassifier(), grid_param, cv=10, scoring='balanced_accuracy')
        gs_results = gs.fit(x_train,y_train)
        
        y_pred = gs.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        
        tmp["sampling"] = "Without Sampling"
        tmp["params"] = gs_results.best_params_
        tmp["accuracy"] = balanced_accuracy_score(y_test, y_pred)
        tmp["precision_score"] = precision_score(y_test, y_pred)
        tmp["recall_score"] = recall_score(y_test, y_pred)
        tmp["specificity_score"] = cm[0][0]/(cm[0][0]+cm[1][0])
        tmp["best_score"] = gs_results.best_score_
        tmp["confusion_matrix"] = cm
        tmp["classification_report"] = classification_report(y_test,y_pred)
        
    else:
        x, y = sampling.fit_resample(x_train, y_train)

        grid_param = {
            'learning_rate_init': [0.001,0.01, 0.1, 0.2],
            'momentum': [0.0,0.2,0.4,0.6,0.8,0.9],
            'max_iter' : [100,200,500,1000]
        }

        gs = GridSearchCV(MLPClassifier(), grid_param, scoring='accuracy', cv=10)
        gs_results = gs.fit(x,y)
        y_pred = gs.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        
        tmp["sampling"] = samplingName
        tmp["params"] = gs_results.best_params_
        tmp["accuracy"] = accuracy_score(y_test, y_pred)
        tmp["precision_score"] = precision_score(y_test, y_pred)
        tmp["recall_score"] = recall_score(y_test, y_pred)
        tmp["specificity_score"] = cm[0][0]/(cm[0][0]+cm[1][0])
        tmp["best_score"] = gs_results.best_score_
        tmp["confusion_matrix"] = cm
        tmp["classification_report"] = classification_report(y_test,y_pred)
        
    return tmp

In [13]:
def getAllDataMLP(x_train,x_test,y_train,y_test):
    data = {}
    data["MLP"]=[]
    keys = list(sampling.keys())
    keys.insert(0,"withoutChange")
    for key in keys:
        if(key == "withoutChange"):
            data["MLP"].append(getDataMLP(x_train,x_test,y_train,y_test))
        else :
            data["MLP"].append(getDataMLP(x_train,x_test,y_train,y_test,sampling=sampling[key],samplingName=key))
    return data

<h2>Global functions</h2>
<p>Ce sont des fonctions qui s'aplique pour tous les algorithmes quel que soit la stratégie de sampling</p>

<p>la fonction <strong>getAllData</strong> retourne tout les resultats de tout les algorithmes</p>
<p>la fonction <strong>getScores</strong> retourne une liste des valeurs des accuracy</p>
<p>la fonction <strong>dispalyData</strong> permit de visualiser les resultats</p>
<p>La fonction <strong>TestWilcoxon</strong> permit de connaitre parmi les deux meilleures accuracy on specifiant la valeur de <strong>chosen significance level</strong></p>

In [14]:
def getAllData(x_train,x_test,y_train,y_test):
    data = {}
    data.update(getAllDataKNN(x_train,x_test,y_train,y_test))
    data.update(getAllDataDT(x_train,x_test,y_train,y_test))
    data.update(getAllDataSVM(x_train,x_test,y_train,y_test))
    data.update(getAllDataMLP(x_train,x_test,y_train,y_test))
    return data

In [15]:
def dispalyData(data):
    for algorithm in list(data.keys()):
        print("Algorithm",algorithm) 
        for res in data[algorithm]:
            if(res["sampling"] == "Without Sampling"):
                print("\n\tSampling : Without Sampling\n")
                print("\tbest score for the model : ", res["best_score"])
                print("\tBest Parameters :")
                for param in res["params"]:
                    print("\t\t",param," : ",res["params"][param])
                print("\n\tTest the model with X_test :")
                print(res["classification_report"])
                
                plt.figure()
                print("\n\nbalanced_accuracy_score :",res["balanced_accuracy"])
                heat_map = sns.heatmap(res["confusion_matrix"], annot=True)
                
                if (algorithm == "DT"):
                    ax = plt.subplots(figsize=(18, 10))
                    plot_tree(DecisionTreeClassifier(max_leaf_nodes=res["params"]['max_leaf_nodes'],
                                         min_samples_split=res["params"]['min_samples_split'],
                                        criterion=res["params"]['criterion']).fit(res["x"], res["y"]))
                plt.show()
            else:
                print("\n\tTechnique de sampling: "+res["sampling"]+"\n")
                print("\tbest score for the model : ", res["best_score"])
                print("\tBest Parameters :")
                for param in res["params"]:
                    print("\t\t",param," : ",res["params"][param])
                print("\n\tTest the model with X_test :")
                print(res["classification_report"])
                
                plt.figure()
                print("\n\naccuracy_score :",res["accuracy"])
                heat_map = sns.heatmap(res["confusion_matrix"], annot=True)
                if (algorithm == "DT"):
                    ax = plt.subplots(figsize=(20, 12))
                    plot_tree(DecisionTreeClassifier(max_leaf_nodes=res["params"]['max_leaf_nodes'],
                                         min_samples_split=res["params"]['min_samples_split'],
                                        criterion=res["params"]['criterion']).fit(res["x"], res["y"]))
                plt.show()

In [16]:
def getScore(allScores, scoreName):
    score = {}
    for key in allScores:
        score[key] = {}
        for e in range(len(allScores[key])) :
            score[key][e] = allScores[key][e][scoreName]
    return score

In [17]:
def sortData(data):
    data_score={}
    for key in list(data.keys()):
        data_score[key] = sorted(data[key].items(), key=lambda x: x[1], reverse=True)
    return data_score  

In [18]:
def getScoreSorted(data,scoreName) :
    return sortData(getScore(data,scoreName))

In [19]:
def getAllScores(data):
    allScores = {}
    for algorithm in list(data.keys()):
        allScores[algorithm]=[]
        for res in data[algorithm]:
            allScores[algorithm].append(dict(sampling = res["sampling"],
                                         accuracy = res["accuracy"], 
                                         precision = res["precision_score"], 
                                         recall = res["recall_score"], 
                                         specificity = res["specificity_score"],
                                         params = res["params"]))
    return allScores

In [20]:
def TestWilcoxon(scores,alpha):
    wilcoxonResultat = {}
    for key in scores:
        if(scores[key][0][1] == scores[key][1][1]):
            wilcoxonResultat[key]=random.choice( [scores[key][0][0],scores[key][1][0]] )
        else:
            stat, p = wilcoxon([scores[key][0][1]] , [scores[key][1][1]])
            if p < alpha:
                wilcoxonResultat[key]=random.choice( [scores[key][0][0],scores[key][1][0]] )
            else:
                wilcoxonResultat[key]=scores[key][0][0]
    return wilcoxonResultat

In [21]:
raw_data = pd.read_csv('data.csv')
dataset = raw_data.copy()
dataset['diagnosis']=dataset['diagnosis'].map({'B':0,'M':1})
raw_x = dataset.iloc[:, 2:31].values
raw_y = dataset.iloc[:, 1].values
X_train, X_test, y_train, y_test = train_test_split(raw_x, raw_y, stratify=raw_y, test_size = 0.25, random_state = 0)

def bourdacount():
    resultat = {}
    
    bourdaCount = {}
    bourdaCount['KNN']=[]
    bourdaCount['DT']=[]
    bourdaCount['SVM']=[]
    bourdaCount['MLP']=[]
    
    for normalizer in nomralization:
        
        x_train = nomralization[normalizer].fit(X_train).transform(X_train)
        x_test = nomralization[normalizer].fit(X_test).transform(X_test)
        
        data = getAllData(x_train,x_test,y_train,y_test)
        scores = getAllScores(data)
        score = getScoreSorted(scores,'accuracy')
        wilcoxon = TestWilcoxon(score,0.05)
        
        for key in wilcoxon:
            scores[key][wilcoxon[key]]['normalizer'] = normalizer
            bourdaCount[key].append(scores[key][wilcoxon[key]])
    
    print(bourdaCount)
    sortedBourdaCount = getScoreSorted(bourdaCount,'accuracy')
    index = {}
    for key in sortedBourdaCount:
        resultat[key] = bourdaCount[key][sortedBourdaCount[key][0][0]]
    return resultat

In [22]:
bourda = bourdacount()

  "The number of the samples to be selected is larger"
  "The number of the samples to be selected is larger"
  "The number of the samples to be selected is larger"




















  "The number of the samples to be selected is larger"
























































































































  "The number of the samples to be selected is larger"
  "The number of the samples to be selected is larger"
  "The number of the samples to be selected is larger"






  "The number of the samples to be selected is larger"












































{'KNN': [{'sampling': 'Without Sampling', 'accuracy': 0.9350104821802935, 'precision': 0.8387096774193549, 'recall': 0.9811320754716981, 'specificity': 0.9876543209876543, 'params': {'n_neighbors': 5, 'p': 3}, 'normalizer': 'MinMax'}, {'sampling': 'adasyn', 'accuracy': 0.951048951048951, 'precision': 0.9107142857142857, 'recall': 0.9622641509433962, 'specificity': 0.9770114942528736, 'params': {'n_neighbors': 2, 'p': 1}, 'normalizer': 'z_score'}], 'DT': [{'sampling': 'adasyn', 'accuracy': 0.8951048951048951, 'precision': 0.796875, 'recall': 0.9622641509433962, 'specificity': 0.9746835443037974, 'params': {'criterion': 'gini', 'max_leaf_nodes': 5, 'min_samples_split': 50}, 'normalizer': 'MinMax'}, {'sampling': 'nearMiss', 'accuracy': 0.951048951048951, 'precision': 0.9423076923076923, 'recall': 0.9245283018867925, 'specificity': 0.9560439560439561, 'params': {'criterion': 'gini', 'max_leaf_nodes': 5, 'min_samples_split': 30}, 'normalizer': 'z_score'}], 'SVM': [{'sampling': 'Without Samp

In [23]:
bourda
df1 = pd.DataFrame(bourda, index=['accuracy', 'precision', 'recall', 'specificity'])
df2 = pd.DataFrame(bourda)

In [24]:
df1

Unnamed: 0,KNN,DT,SVM,MLP
accuracy,0.951049,0.951049,0.972028,0.965035
precision,0.910714,0.942308,0.980392,0.944444
recall,0.962264,0.924528,0.943396,0.962264
specificity,0.977011,0.956044,0.943396,0.977528


In [25]:
pd.set_option("max_colwidth", None)
df2

Unnamed: 0,KNN,DT,SVM,MLP
sampling,adasyn,nearMiss,enn,smote
accuracy,0.951049,0.951049,0.972028,0.965035
precision,0.910714,0.942308,0.980392,0.944444
recall,0.962264,0.924528,0.943396,0.962264
specificity,0.977011,0.956044,0.943396,0.977528
params,"{'n_neighbors': 2, 'p': 1}","{'criterion': 'gini', 'max_leaf_nodes': 5, 'min_samples_split': 30}","{'C': 1, 'coef0': 0.75, 'kernel': 'poly'}","{'learning_rate_init': 0.001, 'max_iter': 200, 'momentum': 0.0}"
normalizer,z_score,z_score,z_score,z_score


In [26]:
bourda

{'KNN': {'sampling': 'adasyn',
  'accuracy': 0.951048951048951,
  'precision': 0.9107142857142857,
  'recall': 0.9622641509433962,
  'specificity': 0.9770114942528736,
  'params': {'n_neighbors': 2, 'p': 1},
  'normalizer': 'z_score'},
 'DT': {'sampling': 'nearMiss',
  'accuracy': 0.951048951048951,
  'precision': 0.9423076923076923,
  'recall': 0.9245283018867925,
  'specificity': 0.9560439560439561,
  'params': {'criterion': 'gini',
   'max_leaf_nodes': 5,
   'min_samples_split': 30},
  'normalizer': 'z_score'},
 'SVM': {'sampling': 'enn',
  'accuracy': 0.972027972027972,
  'precision': 0.9803921568627451,
  'recall': 0.9433962264150944,
  'specificity': 0.9433962264150944,
  'params': {'C': 1, 'coef0': 0.75, 'kernel': 'poly'},
  'normalizer': 'z_score'},
 'MLP': {'sampling': 'smote',
  'accuracy': 0.965034965034965,
  'precision': 0.9444444444444444,
  'recall': 0.9622641509433962,
  'specificity': 0.9775280898876404,
  'params': {'learning_rate_init': 0.001, 'max_iter': 200, 'momen