In [1]:
import numpy as np
import pandas as pd
import csv
from IPython.display import HTML, display
import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
def readData():    
    patientdata = pd.read_csv('../mfcc_features_patient.csv', delimiter=' ')
    healthydata = pd.read_csv('../mfcc_features_healthy.csv', delimiter=' ')
    
    return patientdata, healthydata

In [10]:
def createSamplesAndLabels(patient_df, healthy_df):
    print(len(healthy_df), len(patient_df))
    listofzeros = [0] * len(healthy_df)
    listofones = [1] * len(patient_df)

    patient_df['Label'] = listofones
    healthy_df['Label'] = listofzeros

    patient_samples = patient_df.to_numpy()
    healthy_samples = healthy_df.to_numpy()

    samples = np.vstack((patient_samples, healthy_samples))
    print(samples[0])
    np.random.shuffle(samples)

    labels = samples[:,-1]
    labels = labels.astype(int)
    
    samples = np.delete(samples, -1, 1)
    print(len(samples), len(labels))

    return samples, labels

In [11]:
from sklearn.svm import SVC

def svm(training_samples, training_labels, test_samples):
    clf = SVC(kernel="linear")
    clf.fit(training_samples, training_labels)
    return clf.predict(test_samples)

In [12]:
from sklearn.cluster import KMeans

def kMeans(training_samples, training_labels, test_samples):
    kmeans = KMeans(n_clusters=2, random_state=0)
    kmeans.fit(training_samples, training_labels)
    return kmeans.predict(test_samples)

In [13]:
from sklearn.ensemble import RandomForestClassifier

def rf(training_samples, training_labels, test_samples):
    clf = RandomForestClassifier(random_state=1, n_estimators=300)
    clf.fit(training_samples, training_labels)
    return clf.predict(test_samples)

In [14]:
def calculate_cm_recall_precision(testlabels, y_true):
    y_pred = list(testlabels)
    labels = [0, 1]
    recall = recall_score(y_true, y_pred, labels, average=None)
    precision = precision_score(y_true, y_pred,labels, average=None)
    f1 = f1_score(y_true, y_pred,labels, average=None)
    return confusion_matrix(y_true, y_pred), recall, precision, f1;

In [15]:
from sklearn.model_selection import StratifiedKFold

def kFoldCrossValidation(n, X, y, model):
    skf = StratifiedKFold(n_splits=n, shuffle=True)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        print("Predicting labels:\n")
        if(model == "SVM"):
            predicted_test_labels = svm(X_train, y_train, X_test)
        elif(model == "kMeans"):
            predicted_test_labels = kMeans(X_train, y_train, X_test)
        elif(model == "RandomForestClassifier"):
            predicted_test_labels = rf(X_train, y_train, X_test)
        
        print("Calculating scores:\n")
        cm, recall, precision, f1 = calculate_cm_recall_precision(predicted_test_labels, y_test)
        answer = [[model],
                  ["Metric", "Healthy", "Patient"],
                  ["Recall", recall[0], recall[1]],
                  ["Precision", precision[0], precision[1]],
                  ["F1-Score", f1[0], f1[1]],
                 ]

        display(HTML(tabulate.tabulate(answer, tablefmt='html')))

In [16]:
patient_df, healthy_df = readData();
samples, labels = createSamplesAndLabels(patient_df, healthy_df)

1082 2172
[-1.00000005e-08  1.35332341e+01 -1.98957086e-01 -1.44259898e+00
 -9.99999873e-09  4.03264231e+00 -4.97080609e-01  5.26596341e-01
 -9.99999247e-09  1.99606224e+00  1.19709851e-01  6.22993356e-01
 -1.00000002e-08  2.49621912e+00  3.72091438e-02 -6.33984995e-01
 -9.99999920e-09  2.36262275e+00 -4.94927839e-01  4.38764617e-02
 -1.00000014e-08  1.58849947e+00  3.85578125e-01  1.66772290e-01
 -9.99999953e-09  1.24230210e+00 -7.52247284e-01  3.77686634e-01
 -9.99999971e-09  1.30999791e+00 -7.17744493e-01 -9.80582563e-02
 -1.00000000e-08  1.26126912e+00 -3.62055444e-01 -6.57563611e-01
 -1.00000003e-08  1.09333905e+00 -3.92464545e-01 -4.43685263e-01
 -9.99999999e-09  7.66703651e-01  1.25992222e-01 -6.46560540e-01
 -9.99999989e-09  8.44180658e-01  1.85652766e-01  3.70511788e-01
  1.00000000e+00]
3254 3254


In [10]:
samples = samples.astype(np.float)
labels = labels.astype(np.float)
print(type(samples), type(samples[0][0]))

<class 'numpy.ndarray'> <class 'numpy.float64'>


In [12]:
kFoldCrossValidation(5, samples[0:2000], labels[0:2000], "SVM")

Predicting labels:

Calculating scores:



0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.9142857142857143,0.9628378378378378
Precision,0.897196261682243,0.9693877551020408
F1-Score,0.9056603773584906,0.9661016949152542


Predicting labels:

Calculating scores:



0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.9326923076923077,0.9256756756756757
Precision,0.8151260504201681,0.9750889679715302
F1-Score,0.8699551569506726,0.949740034662045


Predicting labels:

Calculating scores:



0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.7884615384615384,0.9459459459459459
Precision,0.8367346938775511,0.9271523178807947
F1-Score,0.8118811881188118,0.9364548494983277


Predicting labels:

Calculating scores:



0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.8461538461538461,0.956081081081081
Precision,0.8712871287128713,0.9464882943143813
F1-Score,0.8585365853658536,0.9512605042016806


Predicting labels:

Calculating scores:



0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.8269230769230769,0.9491525423728814
Precision,0.8514851485148515,0.9395973154362416
F1-Score,0.8390243902439023,0.9443507588532882


In [13]:
kFoldCrossValidation(5, samples[0:2000], labels[0:2000], "kMeans")

Predicting labels:

Calculating scores:



0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,1.0,0.5304054054054054
Precision,0.430327868852459,1.0
F1-Score,0.6017191977077364,0.6931567328918322


Predicting labels:

Calculating scores:



0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,1.0,0.47635135135135137
Precision,0.4015444015444015,1.0
F1-Score,0.5730027548209365,0.6453089244851259


Predicting labels:

Calculating scores:



0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,1.0,0.4797297297297297
Precision,0.40310077519379844,1.0
F1-Score,0.574585635359116,0.6484018264840182


Predicting labels:

Calculating scores:



0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.0,0.5236486486486487
Precision,0.0,0.5984555984555985
F1-Score,0.0,0.5585585585585586


Predicting labels:

Calculating scores:



0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.0,0.4915254237288136
Precision,0.0,0.5823293172690763
F1-Score,0.0,0.5330882352941175


In [14]:
kFoldCrossValidation(5, samples[0:2000], labels[0:2000], "RandomForestClassifier")

Predicting labels:

Calculating scores:



0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.9904761904761905,0.9797297297297297
Precision,0.9454545454545454,0.9965635738831615
F1-Score,0.9674418604651163,0.9880749574105622


Predicting labels:

Calculating scores:



0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.9903846153846154,0.9594594594594594
Precision,0.8956521739130435,0.9964912280701754
F1-Score,0.9406392694063926,0.9776247848537005


Predicting labels:

Calculating scores:



0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,1.0,0.9594594594594594
Precision,0.896551724137931,1.0
F1-Score,0.9454545454545454,0.9793103448275862


Predicting labels:

Calculating scores:



0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.9807692307692307,0.9594594594594594
Precision,0.8947368421052632,0.993006993006993
F1-Score,0.9357798165137614,0.9759450171821306


Predicting labels:

Calculating scores:



0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.9807692307692307,0.9728813559322034
Precision,0.9272727272727272,0.9930795847750865
F1-Score,0.9532710280373831,0.9828767123287672
