#### Import modules and read data

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.model_selection import cross_validate, KFold, RepeatedKFold

In [2]:
# Load the normalized brain-tumour feature table from <cwd>/Data.
# os.path.join is used instead of string concatenation so the path separator
# is chosen by the standard library rather than hard-coded as '/'.
normal_data = pd.read_csv(os.path.join(os.getcwd(), 'Data', 'brain_tumour_normalized.csv'))
normal_data.head()

Unnamed: 0,Image,Mean,Variance,Standard Deviation,Entropy,Skewness,Kurtosis,Contrast,Energy,ASM,Homogeneity,Dissimilarity,Correlation,Coarseness,Class
0,Image1,0.194705,0.212023,0.443074,0.274801,0.068211,0.010937,0.028236,0.47541,0.246092,0.603108,0.139694,0.981764,0.0,0
1,Image2,0.261489,0.276124,0.510114,0.674843,0.052278,0.007693,0.017951,0.797096,0.648383,0.7738,0.093527,0.997417,0.0,0
2,Image3,0.219003,0.392326,0.6142,0.001487,0.090618,0.016478,0.02328,0.012719,0.001173,0.23076,0.195261,0.972855,0.0,1
3,Image4,0.1773,0.329007,0.55975,0.001513,0.108202,0.021559,0.043805,0.012908,0.001192,0.196137,0.258588,0.941475,0.0,1
4,Image5,0.218223,0.24984,0.483677,0.370574,0.068403,0.011067,0.050836,0.56486,0.338854,0.560862,0.226679,0.960995,0.0,0


In [3]:
# Feature matrix: the columns at positions 1..13 (the first column is skipped).
X = normal_data[normal_data.columns[1 : 14]]
# Target vector: the last column of the table.
Y = normal_data[normal_data.columns[-1]]
X.shape, Y.shape

((3762, 13), (3762,))

#### Feature selection using ANOVA F-value

In [5]:
k = 6  # number of top-scoring features to keep

# Score every column of X against Y with the ANOVA F-test and keep the k best.
# NOTE(review): the selector is fitted on all rows of X; if its output is later
# cross-validated, the selection step has already seen the test folds - confirm intent.
feature_selection = SelectKBest(f_classif, k = k)
fit = feature_selection.fit(X, Y)

In [6]:
# Pair each feature name with its F-score, sort descending, and show the top k rows.
(
    pd.DataFrame({'scores': fit.scores_, 'cols': X.columns})
    .sort_values(by = 'scores', ascending = False)
    .head(k)
)

Unnamed: 0,scores,cols
7,10913.518208,Energy
9,9587.758475,Homogeneity
3,5772.609943,Entropy
8,5086.027179,ASM
10,1685.25463,Dissimilarity
4,727.528516,Skewness


In [7]:
# `fit` is the value returned by feature_selection.fit(X, Y) above; use it to
# project X onto the k selected features.
X_fs = fit.transform(X)
X_fs.shape

(3762, 6)

#### Cross-validation without Feature Selection

In [8]:
def crossVal(model, cv_selected, features = None, target = None):
    """Cross-validate `model` and print the mean and std. dev. of its test metrics.

    Accuracy, recall (reported as "sensitivity") and precision are computed on
    every test split via `cross_validate`. Means are printed as percentages
    (multiplied by 100); standard deviations are printed unscaled.

    Parameters
    ----------
    model
        Estimator forwarded to `cross_validate`.
    cv_selected
        Value forwarded as the `cv` argument of `cross_validate`.
    features, target
        Optional data to evaluate on. When omitted, the notebook-level `X` and
        `Y` are used, so existing two-argument calls behave as before.

    Returns
    -------
    None
        Results are only printed.
    """
    # Fall back to the notebook globals so older calls keep working unchanged.
    if features is None:
        features = X
    if target is None:
        target = Y

    scoring = ['accuracy', 'precision', 'recall']  # select metrics
    scores = cross_validate(model, features, target, scoring = scoring, cv = cv_selected)  # calc scores

    # (label used in the printout, key in the dict returned by cross_validate)
    report = [
        ('accuracy', 'test_accuracy'),
        ('sensitivity', 'test_recall'),
        ('precision', 'test_precision'),
    ]
    for position, (label, key) in enumerate(report):
        if position:
            print('-' * 10)  # separator between metric groups
        fold_scores = scores[key]
        print('Test ' + label + ':', fold_scores.mean() * 100)
        print('Test std. dev. of ' + label + ':', fold_scores.std())

In [9]:
# Seed the classifier so repeated runs of the notebook give the same scores;
# 42 matches the seed used by the CV splitters below.
adaBoost = AdaBoostClassifier(random_state = 42)

Without repeated folds:

In [10]:
# One pass of shuffled 10-fold CV; the fixed seed keeps the fold assignment repeatable.
cv = KFold(10, shuffle = True, random_state = 42)

crossVal(adaBoost, cv)

Test accuracy: 98.61786782549805
Test std. dev. of accuracy: 0.006268334181714131
----------
Test sensitivity: 97.67718381081835
Test std. dev. of sensitivity: 0.014020642062510967
----------
Test precision: 99.21004019232453
Test std. dev. of precision: 0.006596021313716727


With repeated folds:

In [11]:
# 10-fold CV repeated 3 times with a fixed seed.
cvr = RepeatedKFold(
    random_state = 42,
    n_repeats = 3,
    n_splits = 10,
)

crossVal(adaBoost, cvr)

Test accuracy: 98.75058788118216
Test std. dev. of accuracy: 0.0051450117361733485
----------
Test sensitivity: 97.9049444089611
Test std. dev. of sensitivity: 0.010808683837078729
----------
Test precision: 99.29754164092037
Test std. dev. of precision: 0.005643862714722419
