In [1]:
import numpy as np
import pandas as pd
import csv
from IPython.display import HTML, display
import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
def readData(patient_path='../mfcc_features_patient.csv',
             healthy_path='../mfcc_features_healthy.csv',
             delimiter=' '):
    """Load the patient and healthy feature tables from delimited text files.

    Parameters
    ----------
    patient_path : str or path-like, optional
        Location of the patient feature file. Defaults to the path the
        notebook originally hard-coded.
    healthy_path : str or path-like, optional
        Location of the healthy feature file. Defaults to the path the
        notebook originally hard-coded.
    delimiter : str, optional
        Field separator handed to ``pandas.read_csv`` (a single space by
        default, as in the original code).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(patientdata, healthydata)`` in that order.
    """
    patientdata = pd.read_csv(patient_path, delimiter=delimiter)
    healthydata = pd.read_csv(healthy_path, delimiter=delimiter)

    return patientdata, healthydata

In [3]:
def createSamplesAndLabels(patient_df, healthy_df, seed=None):
    """Merge both cohorts into one shuffled feature matrix plus a label vector.

    Patient rows are labelled 1 and healthy rows 0. The label is attached to
    *copies* of the input frames, so the caller's DataFrames are left
    untouched (the previous version added a ``Label`` column to them in
    place).

    Parameters
    ----------
    patient_df : pandas.DataFrame
        Feature rows for the patient cohort.
    healthy_df : pandas.DataFrame
        Feature rows for the healthy cohort.
    seed : int or None, optional
        When given, the row shuffle uses a dedicated generator seeded with
        this value so the ordering is reproducible. When ``None`` (default)
        the global NumPy random state is used, as before.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(samples, labels)``: ``samples`` holds the feature columns only,
        ``labels`` holds the matching integer labels, in the same row order.
    """
    print(len(healthy_df), len(patient_df))

    # assign() returns a new frame, so the inputs are not mutated.
    patient_samples = patient_df.assign(Label=1).to_numpy()
    healthy_samples = healthy_df.assign(Label=0).to_numpy()

    samples = np.vstack((patient_samples, healthy_samples))

    # Shuffle rows while features and label are still in the same array, so
    # they stay aligned.
    if seed is None:
        np.random.shuffle(samples)
    else:
        np.random.default_rng(seed).shuffle(samples)

    # The label is the last column; split it off from the features.
    labels = samples[:, -1].astype(int)
    samples = np.delete(samples, -1, 1)
    print(len(samples), len(labels))

    return samples, labels

In [4]:
from sklearn.svm import SVC

def svm(training_samples, training_labels, test_samples):
    """Train a linear-kernel SVC and return its predictions for ``test_samples``."""
    # fit() returns the estimator itself, so it can be chained.
    classifier = SVC(kernel="linear").fit(training_samples, training_labels)
    return classifier.predict(test_samples)

In [5]:
from sklearn.cluster import KMeans

def kMeans(training_samples, training_labels, test_samples):
    """Cluster with 2-means and return class-label predictions for the test set.

    KMeans is unsupervised: it ignores the labels passed to ``fit`` and the
    cluster ids it returns (0/1) are arbitrary. Returning them directly, as
    the previous version did, scores them as if cluster 0 meant "healthy" and
    cluster 1 meant "patient". Here each cluster is mapped to the majority
    training label among the training rows assigned to it, and test rows get
    the label of their nearest cluster.

    Parameters
    ----------
    training_samples : array-like
        Feature rows used to fit the clusters.
    training_labels : array-like
        Class label of each training row; only used to name the clusters.
    test_samples : array-like
        Feature rows to predict.

    Returns
    -------
    numpy.ndarray
        One predicted class label per test row, with the dtype of
        ``training_labels``. Both clusters may map to the same label when one
        class dominates both of them.
    """
    n_clusters = 2
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(training_samples)
    train_clusters = kmeans.labels_

    # Encode labels as 0..k-1 so np.bincount can count them whatever their dtype.
    classes, encoded = np.unique(np.asarray(training_labels), return_inverse=True)
    overall_majority = classes[np.bincount(encoded).argmax()]

    cluster_to_label = np.empty(n_clusters, dtype=classes.dtype)
    for cluster in range(n_clusters):
        members = encoded[train_clusters == cluster]
        if members.size:
            cluster_to_label[cluster] = classes[np.bincount(members).argmax()]
        else:
            # A cluster with no training rows falls back to the overall majority.
            cluster_to_label[cluster] = overall_majority

    return cluster_to_label[kmeans.predict(test_samples)]

In [6]:
from sklearn.ensemble import RandomForestClassifier

def rf(training_samples, training_labels, test_samples):
    """Train a 300-tree random forest (fixed seed) and predict ``test_samples``."""
    forest = RandomForestClassifier(n_estimators=300, random_state=1)
    fitted = forest.fit(training_samples, training_labels)
    return fitted.predict(test_samples)

In [7]:
def calculate_cm_recall_precision(testlabels, y_true):
    """Compute the confusion matrix and per-class recall, precision and F1.

    Parameters
    ----------
    testlabels : iterable
        The *predicted* labels (despite the parameter name).
    y_true : array-like
        The ground-truth labels, in the same order.

    Returns
    -------
    tuple
        ``(confusion_matrix, recall, precision, f1)``. The three score arrays
        hold one value per class in the order ``[0, 1]`` and the confusion
        matrix uses the same label order.
    """
    y_pred = list(testlabels)
    labels = [0, 1]
    # `labels` must be passed by keyword: the metric functions accept only
    # y_true and y_pred positionally in current scikit-learn.
    recall = recall_score(y_true, y_pred, labels=labels, average=None)
    precision = precision_score(y_true, y_pred, labels=labels, average=None)
    f1 = f1_score(y_true, y_pred, labels=labels, average=None)
    # Pin the matrix to the same label order so it is always 2x2 and aligned
    # with the score arrays.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    return cm, recall, precision, f1

In [8]:
from sklearn.model_selection import StratifiedKFold

def kFoldCrossValidation(n, X, y, model, random_state=None):
    """Run stratified k-fold cross-validation and display per-fold scores.

    For every fold the chosen model is trained on the training split, used to
    predict the test split, and a small HTML table with per-class recall,
    precision and F1 is displayed.

    Parameters
    ----------
    n : int
        Number of folds.
    X : numpy.ndarray
        Feature matrix, indexed by row.
    y : numpy.ndarray
        Label vector aligned with ``X``.
    model : str
        One of ``"SVM"``, ``"kMeans"`` or ``"RandomForestClassifier"``.
    random_state : int or None, optional
        Seed for the shuffled fold assignment. ``None`` (default) keeps the
        previous unseeded behaviour.

    Raises
    ------
    ValueError
        If ``model`` is not a supported name. Previously an unknown name only
        failed inside the loop with an unbound-variable error.
    """
    predictors = {
        "SVM": svm,
        "kMeans": kMeans,
        "RandomForestClassifier": rf,
    }
    if model not in predictors:
        raise ValueError(
            "Unknown model %r; expected one of %s" % (model, sorted(predictors))
        )
    predict = predictors[model]

    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=random_state)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        print("Predicting labels:\n")
        predicted_test_labels = predict(X_train, y_train, X_test)

        print("Calculating scores:\n")
        # The confusion matrix is returned by the helper but not shown here.
        _, recall, precision, f1 = calculate_cm_recall_precision(predicted_test_labels, y_test)
        answer = [[model],
                  ["Metric", "Healthy", "Patient"],
                  ["Recall", recall[0], recall[1]],
                  ["Precision", precision[0], precision[1]],
                  ["F1-Score", f1[0], f1[1]],
                 ]

        display(HTML(tabulate.tabulate(answer, tablefmt='html')))

In [9]:
# Read both cohorts from disk, then merge them into one shuffled feature
# matrix with a matching label vector (1 = patient, 0 = healthy).
patient_df, healthy_df = readData()
samples, labels = createSamplesAndLabels(patient_df=patient_df, healthy_df=healthy_df)

55932 144654
200586 200586


In [10]:
# Cast features and labels to 64-bit floats. `np.float` was only an alias for
# the builtin `float` (i.e. float64) and no longer exists in NumPy >= 1.24,
# so the explicit dtype is used instead; the result is identical.
samples = samples.astype(np.float64)
labels = labels.astype(np.float64)
print(type(samples), type(samples[0][0]))

<class 'numpy.ndarray'> <class 'numpy.float64'>


In [None]:
# 5-fold stratified cross-validation of the linear-kernel SVM.
# NOTE(review): the saved output shows no completed fold for this cell;
# presumably the fit is very slow on this many rows - confirm before re-running.
kFoldCrossValidation(n=5, X=samples, y=labels, model="SVM")

Predicting labels:



In [None]:
# 5-fold stratified cross-validation of the 2-cluster k-means model.
kFoldCrossValidation(n=5, X=samples, y=labels, model="kMeans")

In [None]:
# 5-fold stratified cross-validation of the random forest classifier.
kFoldCrossValidation(n=5, X=samples, y=labels, model="RandomForestClassifier")