In [17]:
import numpy as np
import pandas as pd
import csv
from IPython.display import HTML, display
import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [18]:
def readData(patient_path='../mfcc_features_patient.csv',
             healthy_path='../mfcc_features_healthy.csv',
             delimiter=' '):
    """Load the patient and healthy feature tables from delimited text files.

    Parameters
    ----------
    patient_path : str or path-like, optional
        Location of the patient feature file. Defaults to the path that was
        previously hard-coded in this function.
    healthy_path : str or path-like, optional
        Location of the healthy feature file. Defaults to the path that was
        previously hard-coded in this function.
    delimiter : str, optional
        Field separator handed to ``pandas.read_csv``. Defaults to a single
        space, as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(patientdata, healthydata)`` in that order.
    """
    patientdata = pd.read_csv(patient_path, delimiter=delimiter)
    healthydata = pd.read_csv(healthy_path, delimiter=delimiter)

    return patientdata, healthydata

In [26]:
def createSamplesAndLabels(patient_df, healthy_df, random_state=None):
    """Build a shuffled feature matrix and matching label vector.

    Patient rows are labelled 1 and healthy rows are labelled 0. The two
    groups are stacked, shuffled row-wise, and then split back into features
    and labels so that each label stays attached to its own row.

    The input DataFrames are NOT modified: the label column is added to
    copies (the previous implementation wrote a ``Label`` column into the
    caller's frames as a side effect).

    Parameters
    ----------
    patient_df : pandas.DataFrame
        One row per patient sample.
    healthy_df : pandas.DataFrame
        One row per healthy sample; must have the same number of columns as
        ``patient_df`` so the rows can be stacked.
    random_state : int or None, optional
        Seed for the shuffle. ``None`` (default) uses NumPy's global random
        state exactly as before; an integer gives a reproducible order.

    Returns
    -------
    samples : numpy.ndarray
        Shuffled rows with the label column removed.
    labels : numpy.ndarray of int
        Label (0 or 1) for the corresponding row of ``samples``.
    """
    # Kept from the original: report group sizes (healthy first, then patient).
    print(len(healthy_df), len(patient_df))

    # assign() returns a new frame, so the caller's DataFrames stay untouched.
    patient_samples = patient_df.assign(Label=1).to_numpy()
    healthy_samples = healthy_df.assign(Label=0).to_numpy()

    samples = np.vstack((patient_samples, healthy_samples))

    # Shuffle whole rows so the label (last column) travels with its features.
    if random_state is None:
        np.random.shuffle(samples)
    else:
        np.random.RandomState(random_state).shuffle(samples)

    labels = samples[:, -1].astype(int)
    samples = np.delete(samples, -1, 1)

    return samples, labels

In [27]:
from sklearn.svm import SVC

def svm(training_samples, training_labels, test_samples):
    """Train a linear-kernel SVC and return its predictions for ``test_samples``."""
    classifier = SVC(kernel="linear").fit(training_samples, training_labels)
    predictions = classifier.predict(test_samples)
    return predictions

In [28]:
from sklearn.cluster import KMeans

def kMeans(training_samples, training_labels, test_samples):
    """Cluster with 2-means and return class-label predictions for the test set.

    KMeans is unsupervised: it ignores any ``y`` passed to ``fit`` and numbers
    its clusters arbitrarily, so cluster index 0/1 need not coincide with
    class label 0/1. The previous implementation returned the raw cluster
    indices, which could be inverted relative to the true labels. Here the
    two clusters are matched to the two training classes by picking whichever
    of the two possible one-to-one assignments agrees with more training
    labels.

    Parameters
    ----------
    training_samples : array-like
        Feature rows used to fit the clustering.
    training_labels : array-like
        Class label of each training row; used only to align clusters with
        classes, never to fit the clustering.
    test_samples : array-like
        Feature rows to predict.

    Returns
    -------
    numpy.ndarray
        Predicted class label per test row. If the training labels do not
        contain exactly two distinct values, no alignment is possible and the
        raw cluster indices are returned (the old behaviour).
    """
    kmeans = KMeans(n_clusters=2, random_state=0)
    kmeans.fit(training_samples)
    test_clusters = kmeans.predict(test_samples)

    train_labels = np.asarray(training_labels)
    classes = np.unique(train_labels)  # sorted distinct label values
    if classes.size != 2:
        return test_clusters

    # Candidate 1: cluster 0 -> smaller label, cluster 1 -> larger label.
    # Candidate 2: the swapped assignment. Keep the one that matches more
    # of the training labels.
    mapping = classes
    agreement = np.mean(mapping[kmeans.labels_] == train_labels)
    if agreement < 0.5:
        mapping = classes[::-1]

    return mapping[test_clusters]

In [29]:
from sklearn.ensemble import RandomForestClassifier

def rf(training_samples, training_labels, test_samples):
    """Train a 300-tree random forest (seeded with 1) and predict ``test_samples``."""
    forest = RandomForestClassifier(n_estimators=300, random_state=1)
    fitted_forest = forest.fit(training_samples, training_labels)
    predictions = fitted_forest.predict(test_samples)
    return predictions

In [30]:
def calculate_cm_recall_precision(testlabels, y_true):
    """Compute the confusion matrix and per-class recall, precision and F1.

    Parameters
    ----------
    testlabels : iterable
        PREDICTED labels for the test samples (despite the name).
    y_true : array-like
        Ground-truth labels for the same samples.

    Returns
    -------
    tuple
        ``(confusion_matrix, recall, precision, f1)``. The three score arrays
        hold one value per class in the order ``[0, 1]``; the confusion
        matrix uses the same class order for its rows and columns.
    """
    y_pred = list(testlabels)
    labels = [0, 1]

    # `labels` must be passed by keyword: scikit-learn made every argument
    # after y_pred keyword-only, so the old positional form raises TypeError
    # on current releases.
    recall = recall_score(y_true, y_pred, labels=labels, average=None)
    precision = precision_score(y_true, y_pred, labels=labels, average=None)
    f1 = f1_score(y_true, y_pred, labels=labels, average=None)

    # Fix the label order here too, so the matrix is always 2x2 and lines up
    # with the score arrays even if a class is absent from a fold.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return cm, recall, precision, f1

In [36]:
from sklearn.model_selection import StratifiedKFold

def kFoldCrossValidation(n, X, y, model, random_state=None):
    """Run stratified k-fold cross-validation and display per-fold scores.

    For every fold the chosen model is trained on the training split, used to
    predict the test split, and an HTML table with per-class recall,
    precision and F1 is displayed.

    Parameters
    ----------
    n : int
        Number of folds.
    X : numpy.ndarray
        Feature matrix, indexed by row.
    y : numpy.ndarray
        Label vector aligned with the rows of ``X``.
    model : str
        One of ``"SVM"``, ``"kMeans"`` or ``"RandomForestClassifier"``.
    random_state : int or None, optional
        Seed for the shuffled fold assignment. ``None`` (default) keeps the
        previous unseeded behaviour; an integer makes the folds reproducible.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. Previously an unknown
        name left the prediction variable unbound (or silently reused the
        predictions of an earlier call path).
    """
    predictors = {
        "SVM": svm,
        "kMeans": kMeans,
        "RandomForestClassifier": rf,
    }
    if model not in predictors:
        raise ValueError(
            "Unknown model %r; expected one of %s" % (model, sorted(predictors))
        )
    predict = predictors[model]

    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=random_state)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        predicted_test_labels = predict(X_train, y_train, X_test)

        # The confusion matrix is computed by the helper but not shown here.
        cm, recall, precision, f1 = calculate_cm_recall_precision(predicted_test_labels, y_test)
        answer = [[model],
                  ["Metric", "Healthy", "Patient"],
                  ["Recall", recall[0], recall[1]],
                  ["Precision", precision[0], precision[1]],
                  ["F1-Score", f1[0], f1[1]],
                 ]

        display(HTML(tabulate.tabulate(answer, tablefmt='html')))

In [37]:
# Load both feature tables, then merge them into one shuffled sample matrix
# with a matching 0/1 label vector.
patient_df, healthy_df = readData()
samples, labels = createSamplesAndLabels(patient_df, healthy_df)

1083 2173


In [38]:
# Cast features and labels to 64-bit floats. The builtin `float` is used
# because the `np.float` alias was deprecated in NumPy 1.20 and removed in
# 1.24, where `np.float` raises AttributeError.
samples = samples.astype(float)
labels = labels.astype(float)
print(type(samples), type(samples[0][0]))

<class 'numpy.ndarray'> <class 'numpy.float64'>


In [39]:
# 5-fold evaluation of the linear SVM on the full data set.
kFoldCrossValidation(n=5, X=samples, y=labels, model="SVM")

0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.5345622119815668,0.8827586206896552
Precision,0.6946107784431138,0.7917525773195876
F1-Score,0.6041666666666666,0.8347826086956521


0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.576036866359447,0.8827586206896552
Precision,0.7102272727272727,0.8067226890756303
F1-Score,0.6361323155216285,0.8430296377607025


0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.5622119815668203,0.896551724137931
Precision,0.7305389221556886,0.8041237113402062
F1-Score,0.6354166666666667,0.8478260869565218


0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.5,0.8732718894009217
Precision,0.6625766871165644,0.7782340862422998
F1-Score,0.5699208443271767,0.8230184581976113


0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.6111111111111112,0.868663594470046
Precision,0.6984126984126984,0.8177874186550976
F1-Score,0.6518518518518519,0.8424581005586592


In [40]:
# 5-fold evaluation of k-means, restricted to the first 2000 shuffled samples.
kFoldCrossValidation(n=5, X=samples[:2000], y=labels[:2000], model="kMeans")

0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.7596899224806202,0.6309963099630996
Precision,0.494949494949495,0.8465346534653465
F1-Score,0.599388379204893,0.7230443974630021


0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.7751937984496124,0.6125461254612546
Precision,0.4878048780487805,0.8512820512820513
F1-Score,0.5988023952095809,0.7124463519313305


0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.7906976744186046,0.6051660516605166
Precision,0.4880382775119617,0.8586387434554974
F1-Score,0.6035502958579881,0.7099567099567099


0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.689922480620155,0.6273062730627307
Precision,0.46842105263157896,0.8095238095238095
F1-Score,0.5579937304075235,0.7068607068607068


0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.7829457364341085,0.6531365313653137
Precision,0.517948717948718,0.8634146341463415
F1-Score,0.6234567901234569,0.7436974789915966


In [41]:
# 5-fold evaluation of the random forest, restricted to the first 2000 shuffled samples.
kFoldCrossValidation(n=5, X=samples[:2000], y=labels[:2000], model="RandomForestClassifier")

0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.4186046511627907,0.959409594095941
Precision,0.8307692307692308,0.7761194029850746
F1-Score,0.5567010309278351,0.8580858085808581


0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.43410852713178294,0.966789667896679
Precision,0.8615384615384616,0.7820895522388059
F1-Score,0.577319587628866,0.8646864686468646


0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.3875968992248062,0.955719557195572
Precision,0.8064516129032258,0.7662721893491125
F1-Score,0.5235602094240838,0.8505747126436782


0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.4186046511627907,0.966789667896679
Precision,0.8571428571428571,0.7774480712166172
F1-Score,0.5625000000000001,0.8618421052631579


0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.4108527131782946,0.915129151291513
Precision,0.6973684210526315,0.7654320987654321
F1-Score,0.5170731707317073,0.8336134453781513
