In [79]:
import os

#data wrangling
import numpy as np
import pandas as pd

import operator

###Machine learning
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.svm import SVC, LinearSVC #support vector machines
from sklearn import svm
from sklearn.naive_bayes import GaussianNB #naive bayes

In [80]:
data = pd.read_csv(os.getcwd() + '/Data/brain_tumour.csv')

# normalise using min-max normalisation
min_max_scaler = preprocessing.MinMaxScaler()
scaled_data = min_max_scaler.fit_transform(data[data.columns[2 :]].values)
normal_data = pd.DataFrame(scaled_data, columns = data.columns[2 :])

normal_data['Class'] = data['Class']
normal_data.insert(loc = 0, column = 'Image', value = data["Image"])  

#saving normalised data to a csv file
normal_data.to_csv(os.getcwd() + '/Data/brain_tumour_normalized.csv', index=False)
normal_data.head()

Unnamed: 0,Image,Mean,Variance,Standard Deviation,Entropy,Skewness,Kurtosis,Contrast,Energy,ASM,Homogeneity,Dissimilarity,Correlation,Coarseness,Class
0,Image1,0.194705,0.212023,0.443074,0.274801,0.068211,0.010937,0.028236,0.47541,0.246092,0.603108,0.139694,0.981764,0.0,0
1,Image2,0.261489,0.276124,0.510114,0.674843,0.052278,0.007693,0.017951,0.797096,0.648383,0.7738,0.093527,0.997417,0.0,0
2,Image3,0.219003,0.392326,0.6142,0.001487,0.090618,0.016478,0.02328,0.012719,0.001173,0.23076,0.195261,0.972855,0.0,1
3,Image4,0.1773,0.329007,0.55975,0.001513,0.108202,0.021559,0.043805,0.012908,0.001192,0.196137,0.258588,0.941475,0.0,1
4,Image5,0.218223,0.24984,0.483677,0.370574,0.068403,0.011067,0.050836,0.56486,0.338854,0.560862,0.226679,0.960995,0.0,0


## Feature Importance

In [81]:
#feature importance using External Trees Classifier(similar to Random Forest)
from sklearn.ensemble import ExtraTreesClassifier
X = normal_data[normal_data.columns[1 :14]].values
Y = normal_data["Class"].values
cols = list(normal_data.columns[1:14])

# feature extraction
model = ExtraTreesClassifier(n_estimators=10)
model.fit(X, Y)
scores = model.feature_importances_
scores_dict = {}
for i in range(1, len(scores)+1):
    scores_dict[normal_data.columns[i]] = scores[i-1]

#displaying features in the order of importance
scores_dict_sorted = sorted(scores_dict.items(), key=operator.itemgetter(1), reverse = True)
scores_dict_sorted


[('Energy', 0.48933094362997076),
 ('Entropy', 0.21912098926935974),
 ('ASM', 0.08432792867894454),
 ('Homogeneity', 0.07835452464974792),
 ('Dissimilarity', 0.0370245689424418),
 ('Kurtosis', 0.016994894148999802),
 ('Variance', 0.016074644520222683),
 ('Skewness', 0.014011222800679507),
 ('Standard Deviation', 0.013931838826707713),
 ('Mean', 0.013269090157434164),
 ('Contrast', 0.008831217156668866),
 ('Correlation', 0.004945135358730837),
 ('Coarseness', 0.003783001860091708)]

## Model 1: Logistic Regression

In [82]:
#datset split
from sklearn.model_selection import train_test_split
X = normal_data[normal_data.columns[:14]]
Y = normal_data[normal_data.columns[-1]]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
X_test

Unnamed: 0,Image,Mean,Variance,Standard Deviation,Entropy,Skewness,Kurtosis,Contrast,Energy,ASM,Homogeneity,Dissimilarity,Correlation,Coarseness
3202,Image3203,0.411857,0.201923,0.431632,0.168981,0.009725,0.000958,0.016720,0.355911,0.145127,0.591544,0.101937,0.941657,0.02832
1135,Image1136,0.215528,0.428803,0.643585,0.008348,0.096743,0.018110,0.041743,0.052148,0.006699,0.266099,0.201322,0.965428,0.00000
1501,Image1502,0.415565,0.517418,0.710152,0.036227,0.040976,0.005653,0.056216,0.141729,0.029880,0.392557,0.189193,0.929059,0.00000
1301,Image1302,0.078619,0.114534,0.317399,0.004659,0.149218,0.034841,0.007935,0.033094,0.003671,0.273041,0.115757,0.921572,0.00000
1220,Image1221,0.364182,0.214880,0.446262,0.288207,0.017398,0.002006,0.032085,0.487971,0.258230,0.690250,0.097081,0.916158,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2594,Image2595,0.477424,0.415984,0.633406,0.157974,0.022431,0.002541,0.046307,0.344875,0.137127,0.512013,0.196497,0.946570,0.00000
586,Image587,0.312698,0.313537,0.545670,0.282230,0.042706,0.005750,0.081340,0.482596,0.253001,0.591055,0.228128,0.905751,0.00000
2818,Image2819,0.117340,0.223880,0.456166,0.002399,0.144301,0.033872,0.062582,0.019269,0.001893,0.191114,0.336584,0.890343,0.00000
876,Image877,0.229312,0.252771,0.486691,0.279192,0.060537,0.009141,0.059268,0.479745,0.250249,0.633857,0.162044,0.944360,0.00000


In [83]:
def logistic_regression(X_train, y_train, X_test):
    predictions = ['Mean', "Variance", 'Standard Deviation', 'Entropy', 'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
    x_train = X_train[predictions]
    x_test = X_test[predictions]
    logisticRegr = LogisticRegression()
    logisticRegr.fit(x_train, y_train)
    pred1=logisticRegr.predict(x_test)
    pred = list(pred1)
    df_res = pd.DataFrame()
    df_res['Image'] = X_test['Image']
    df_res['Class'] = pred
    return df_res

x = logistic_regression(X_train, Y_train, X_test)
print(x)

          Image  Class
3202  Image3203      0
1135  Image1136      1
1501  Image1502      1
1301  Image1302      1
1220  Image1221      0
...         ...    ...
2594  Image2595      0
586    Image587      0
2818  Image2819      1
876    Image877      0
434    Image435      0

[753 rows x 2 columns]


In [84]:
#measuring the accuracy of the model
pred = x["Class"]
print('Accuracy Score :')
print(metrics.accuracy_score(Y_test, pred))

Accuracy Score :
0.9933598937583001


## Model 2 : SVM(Linear and Radial Basis Function(RBF))

In [85]:
def linear_svm(svm_type, x_train, y_train, x_test):
    predictions = ['Mean', "Variance", 'Standard Deviation', 'Entropy', 'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
    #linear SVM
    if svm_type == "linear":
        svc = LinearSVC()
    #svm with rbf kernel
    if svm_type == "rbf":
        svc = svm.SVC(kernel='rbf', C=1,gamma='auto')
    svc.fit(x_train[predictions], y_train)
    svm_pred = svc.predict(X_test[predictions])
    df_res = pd.DataFrame()
    df_res['Image'] = X_test['Image']
    df_res['Class'] = svm_pred
    return df_res, svm_pred

In [86]:
#linear SVM
x, svm_pred = linear_svm("linear",X_train, Y_train, X_test)
x

Unnamed: 0,Image,Class
3202,Image3203,0
1135,Image1136,1
1501,Image1502,1
1301,Image1302,1
1220,Image1221,0
...,...,...
2594,Image2595,0
586,Image587,0
2818,Image2819,1
876,Image877,0


In [87]:
#measuring accuracy of linear svm
print('Accuracy Score :')
print(metrics.accuracy_score(Y_test, svm_pred))

Accuracy Score :
0.9933598937583001


In [88]:
#svm with rbf kernel
y, rbf_svm_pred = linear_svm("rbf",X_train, Y_train, X_test)
y

AttributeError: 'function' object has no attribute 'SVC'

In [None]:
#measuring the accuracy of kernel svm
print('Accuracy Score :')
print(metrics.accuracy_score(Y_test, rbf_svm_pred))

## Model 3 : Naive Bayes

In [None]:
def naive_bayes(X_train, Y_train, X_test):
    predictions = ['Mean', "Variance", 'Standard Deviation', 'Entropy', 'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
    model = GaussianNB()
    model.fit(X_train[predictions], Y_train)
    predicted= model.predict(X_test[predictions])
    predicted = list(predicted)
    df_res = pd.DataFrame()
    df_res['Image'] = X_test['Image']
    df_res['Class'] = predicted
    return df_res, predicted

In [None]:
#navive bayes
z, nb_pred = naive_bayes(X_train, Y_train, X_test)
z

In [None]:
#measuring the accuracy of naive bayes model
print('Accuracy Score :')
print(metrics.accuracy_score(Y_test, nb_pred))

## Model 4 : Random Forest