In this Jupyter Notebook I developed the Data Mining of the Cardiotocography dataset available at https://archive.ics.uci.edu/ml/datasets/Cardiotocography

In this first code block, I import the dataset that is given in the xlsx format (MS Office Excel).
Since all the features in the dataset are already in a numeric format, there is no need to map categorical features into numeric ones.
However, I converted the last feature (NSP), which corresponds to the fetal state, into binary by mapping the suspect(S=2) and pathologic(P=3) states into 1 and the normal(N=1) into 0.

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

data = pd.read_excel('Cardiotocography/CTG.xlsx','Data',header=1)
data.head()

mappings = {}
mappings['NSP'] = {1:0, 2:1, 3:1}

for k in mappings:
    data[k] = data[k].map(mappings[k])

data.head()






Unnamed: 0,b,e,AC,FM,UC,DL,DS,DP,DR,LB,...,C,D,E,AD,DE,LD,FS,SUSP,CLASS,NSP
0,240,357,0,0,0,0,0,0,0,120,...,-1,-1,-1,-1,-1,-1,1,-1,9,1
1,5,632,4,0,4,2,0,0,0,132,...,-1,-1,-1,1,-1,-1,-1,-1,6,0
2,177,779,2,0,5,2,0,0,0,133,...,-1,-1,-1,1,-1,-1,-1,-1,6,0
3,411,1192,2,0,6,2,0,0,0,134,...,-1,-1,-1,1,-1,-1,-1,-1,6,0
4,533,1147,4,0,5,0,0,0,0,132,...,-1,-1,-1,-1,-1,-1,-1,-1,2,0


In [25]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
y=data.as_matrix(columns=[data.columns[-1]])
X=data.as_matrix(columns=data.columns[:-1])

sc = StandardScaler()
X = sc.fit_transform(X)
X

array([[-0.71423878, -1.44609109, -0.76473985, ...,  5.45999947,
        -0.31957073,  1.48376361],
       [-0.97713923, -1.15061462,  0.35885166, ..., -0.1831502 ,
        -0.31957073,  0.49241181],
       [-0.78471847, -0.99266902, -0.20294409, ..., -0.1831502 ,
        -0.31957073,  0.49241181],
       ...,
       [ 0.78037828,  0.95962458, -0.48384197, ..., -0.1831502 ,
        -0.31957073,  0.16196121],
       [ 0.78037828,  1.4463549 , -0.48384197, ..., -0.1831502 ,
        -0.31957073,  0.16196121],
       [ 2.1452232 ,  1.83960721, -0.48384197, ..., -0.1831502 ,
        -0.31957073, -1.15984119]])

Now, the pre-processing phase is completed by doing the dimension reduction with the PCA procedure.
By not limiting the number of components within the PCA() function arguments, I noticed that the explained variance ratio after 15 principal components was not relevant, so Ilimited the ammount of components by 15.
The lowest explained varience ration found in the dataset after the dimension reduction was about 0.03.

In [26]:
pca = PCA(n_components=15)
X = pca.fit_transform(X)

plt.bar(range(15), pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.step(range(15), np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.show()


print("Lowest explained variance ratio: " + str(pca.explained_variance_ratio_[-1]))


Lowest explained variance ratio: 0.023393460206440925


In [27]:
import time

from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import train_test_split

from numpy import linalg as LA
from matplotlib.colors import ListedColormap
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *
from sklearn.neural_network import MLPClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


start = time.time()
svm_class=SVC(C=4, max_iter=120000, tol=1e-03, kernel="poly", degree=3)
svm_class = svm_class.fit(X_train,y_train)
prediction_svm = svm_class.predict(X_test)
end = time.time()
print("SVM classifier:")
print("Accuracy (Score): {}".format(accuracy_score(y_test, prediction_svm)))
print("Precision: {}".format(precision_score(y_test, prediction_svm)))
print("Recall: {}".format(recall_score(y_test, prediction_svm)))
print("Time (s): {}".format(end-start))

start = time.time()
mlp_class = MLPClassifier(activation='tanh', hidden_layer_sizes=(10,), solver='lbfgs')
prediction_mlp = mlp_class.fit(X_train,y_train).predict(X_test)
end = time.time()
print("\nMLP classifier:")
print("Accuracy (Score): {}".format(accuracy_score(y_test, prediction_mlp)))
print("Precision: {}".format(precision_score(y_test, prediction_mlp)))
print("Recall: {}".format(recall_score(y_test, prediction_mlp)))
print("Time (s): {}".format(end-start))

start = time.time()
svm_linear_class=LinearSVC(C=1, max_iter=30000, tol=1e-03)
svm_linear_class = svm_linear_class.fit(X_train,y_train)
prediction_svm_linear = svm_linear_class.predict(X_test)
end = time.time()
print("\nLinear SVM classifier:")
print("Accuracy (Score): {}".format(accuracy_score(y_test, prediction_svm_linear)))
print("Precision: {}".format(precision_score(y_test, prediction_svm_linear)))
print("Recall: {}".format(recall_score(y_test, prediction_svm_linear)))
print("Time (s): {}".format(end-start))

start = time.time()
bagging_class=BaggingClassifier(n_estimators=15, max_features=0.5, random_state=1512)
bagging_class = bagging_class.fit(X_train,y_train)
prediction_bagging = bagging_class.predict(X_test)
end = time.time()
print("\nBagging classifier:")
print("Accuracy (Score): {}".format(accuracy_score(y_test, prediction_svm_linear)))
print("Precision: {}".format(precision_score(y_test, prediction_svm_linear)))
print("Recall: {}".format(recall_score(y_test, prediction_svm_linear)))
print("Time (s): {}".format(end-start))

SVM classifier:
Accuracy (Score): 0.9812206572769953
Precision: 0.9893617021276596
Recall: 0.93
Time (s): 0.009947776794433594

MLP classifier:
Accuracy (Score): 0.9788732394366197
Precision: 0.9690721649484536
Recall: 0.94
Time (s): 0.036878347396850586

Linear SVM classifier:
Accuracy (Score): 0.9765258215962441
Precision: 0.9787234042553191
Recall: 0.92
Time (s): 0.02921605110168457

Bagging classifier:
Accuracy (Score): 0.9765258215962441
Precision: 0.9787234042553191
Recall: 0.92
Time (s): 0.07081031799316406


Now, I determine wich are the best hyperparameters for the Linear and Non-Linear SVM (using the polynomial kernel function) and the respectives scores.
I varied the C hyperparameter which determines how much the SVM optimization will avoid misclassifying each training example.
For this I made several trainings, for several C values to see which was the best case scenario.
I also set a for each training a tolerance of stopping criteria of 1*10^-8, and noticed the optimal C value was much lower for the non-linear SVM with the polynomial function

In [29]:
C=[3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 150]

highest_score = 0
best_C = None
for c in C:
    test = LinearSVC(C=c, tol=1e-8).fit(X_train, y_train)
    score = test.score(X_test, y_test)
    if  score > highest_score:
        best_C = c
        highest_score = score
            
print("Best hyperparameters for Linear SVM: C={}, with a score of {}".format(best_C, highest_score))


highest_score = 0
best_C = None
for c in C:
    test = SVC(C=c, tol=1e-8, kernel='poly').fit(X_train, y_train)
    score = test.score(X_test, y_test)
    if score > highest_score:
        best_C = c
        highest_score = score
            
print("Best hyperparameters for SVM with polynomial function: C={} with a score of {}".format(best_C, highest_score))



Best hyperparameters for Linear SVM: C=30, with a score of 0.9859154929577465
Best hyperparameters for SVM with polynomial function: C=3 with a score of 0.9835680751173709


In the following steps, I will use nested cross-validation techinques to determine thescore for diferent classifiers.
For this case I estimated the model's performance with the Grid Search Cross Validation, in wich I passed different parameter grids to determine the best way to train the model.
After defining the parameter grids for each case, I created the Grid Search CV object passing a Classifier as the

In [6]:
grid_mlp_params = [{'hidden_layer_sizes': [(d, ) for d in [5, 10, 25, 50]], 'activation': ['relu', 'tanh', 'logistic', 'identity']}]
grid_search_mlp = GridSearchCV(estimator=MLPClassifier(solver="lbfgs"), param_grid=grid_mlp_params, cv=3)
grid_search_mlp.fit(X, y)
print("Best MLP Score: {}".format(grid_search_mlp.best_score_))
print("Best MLP Params: {}".format(grid_search_mlp.best_estimator_.get_params()))

Best MLP Score: 0.9313264346190028
Best MLP Params: {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_iter': 200, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'lbfgs', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}


In [32]:
import random

from sklearn.tree import DecisionTreeClassifier

random_state = random.randint(1,99999)
grid_bagging_params = [{'n_estimators':[2,5,10,15,20],'max_features':[0.25,0.5,1],'bootstrap':[True,False],'n_jobs':[None,2,3],'random_state':[random_state,None]}]
grid_search_bagging = GridSearchCV(estimator=BaggingClassifier(), param_grid=grid_bagging_params)
grid_search_bagging.fit(X, y)
print("Best Bagging Score: {}".format(grid_search_bagging.best_score_))
print("Best Bagging Params: {}".format(grid_search_bagging.best_estimator_.get_params()))

Random state:  15977
Best Bagging Score: 0.9586077140169332
Best Bagging Params: {'base_estimator': None, 'bootstrap': False, 'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 15, 'n_jobs': None, 'oob_score': False, 'random_state': 15977, 'verbose': 0, 'warm_start': False}


At the end, we get 