In [2]:
import numpy as np
import pandas as pd
import csv
from IPython.display import HTML, display
import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [3]:
def readData():    
    patientdata = pd.read_csv('./patient_output.csv', delimiter='\t')
    healthydata = pd.read_csv('./healthy_output.csv', delimiter='\t')

    columns = patientdata.columns.values.tolist()
    
    return patientdata, healthydata, columns

In [11]:
def createSamplesAndLabels(patient_df, healthy_df):
    print(len(healthy_df), len(patient_df))
    listofzeros = [0] * len(healthy_df)
    listofones = [1] * len(patient_df)

    patient_df['Label'] = listofones
    healthy_df['Label'] = listofzeros

    patient_samples = patient_df.to_numpy()
    healthy_samples = healthy_df.to_numpy()

    samples = np.vstack((patient_samples, healthy_samples))
    
    np.random.shuffle(samples)

    labels = samples[:,-1]
    labels = labels.astype(int)
    
    samples = np.delete(samples, -1, 1)
    print(len(samples))

    return samples, labels

In [5]:
from sklearn.svm import SVC

def svm(training_samples, training_labels, test_samples):
    clf = SVC(kernel="linear")
    clf.fit(training_samples, training_labels)
    return clf.predict(test_samples)

In [6]:
from sklearn.cluster import KMeans

def kMeans(training_samples, training_labels, test_samples):
    kmeans = KMeans(n_clusters=2, random_state=0)
    kmeans.fit(training_samples, training_labels)
    return kmeans.predict(test_samples)

In [7]:
from sklearn.ensemble import RandomForestClassifier

def rf(training_samples, training_labels, test_samples):
    clf = RandomForestClassifier(random_state=1, n_estimators=300)
    clf.fit(training_samples, training_labels)
    return clf.predict(test_samples)

In [8]:
def calculate_cm_recall_precision(testlabels, y_true):
    y_pred = list(testlabels)
    labels = [0, 1]
    recall = recall_score(y_true, y_pred, labels, average=None)
    precision = precision_score(y_true, y_pred,labels, average=None)
    f1 = f1_score(y_true, y_pred,labels, average=None)
    return confusion_matrix(y_true, y_pred), recall, precision, f1;

In [9]:
from sklearn.model_selection import StratifiedKFold

def kFoldCrossValidation(n, X, y, model):
    skf = StratifiedKFold(n_splits=n, shuffle=True)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        if(model == "SVM"):
            predicted_test_labels = svm(X_train, y_train, X_test)
        elif(model == "kMeans"):
            predicted_test_labels = kMeans(X_train, y_train, X_test)
        elif(model == "RandomForestClassifier"):
            predicted_test_labels = rf(X_train, y_train, X_test)
        
        cm, recall, precision, f1 = calculate_cm_recall_precision(predicted_test_labels, y_test)
        answer = [[model],
                  ["Metric", "Healthy", "Patient"],
                  ["Recall", recall[0], recall[1]],
                  ["Precision", precision[0], precision[1]],
                  ["F1-Score", f1[0], f1[1]],
                 ]

        display(HTML(tabulate.tabulate(answer, tablefmt='html')))
        
                
#         print("cm", cm, "\nrecall", recall, "\nprecision", precision, "\nf1:", f1)
#         print("Mean Recall:", np.mean(recall))
#         print("Mean Precision:", np.mean(precision))
#         print("meanf1:", np.mean(f1))
#         print('\n')

In [12]:
patient_df, healthy_df, features = readData();
samples, labels = createSamplesAndLabels(patient_df, healthy_df)

1082 2168
3250


In [23]:
kFoldCrossValidation(5, samples, labels, "SVM")

0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.9308755760368663,0.9792626728110599
Precision,0.957345971563981,0.9659090909090909
F1-Score,0.9439252336448597,0.9725400457665904


0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.9631336405529954,0.9654377880184332
Precision,0.9330357142857143,0.9812646370023419
F1-Score,0.9478458049886621,0.9732868757259


0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.9212962962962963,0.9746543778801844
Precision,0.9476190476190476,0.9613636363636363
F1-Score,0.9342723004694835,0.9679633867276888


0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.9212962962962963,0.9722863741339491
Precision,0.943127962085308,0.9611872146118722
F1-Score,0.9320843091334895,0.9667049368541907


0,1,2
SVM,,
Metric,Healthy,Patient
Recall,0.9444444444444444,0.976905311778291
Precision,0.9532710280373832,0.9724137931034482
F1-Score,0.9488372093023255,0.9746543778801843


In [24]:
kFoldCrossValidation(5, samples, labels, "kMeans")

0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.12442396313364056,0.6728110599078341
Precision,0.15976331360946747,0.6058091286307054
F1-Score,0.13989637305699484,0.6375545851528384


0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.9032258064516129,0.31336405529953915
Precision,0.3967611336032389,0.8662420382165605
F1-Score,0.5513361462728551,0.46023688663282575


0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.9120370370370371,0.31336405529953915
Precision,0.397979797979798,0.8774193548387097
F1-Score,0.5541490857946554,0.4617996604414262


0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.8935185185185185,0.3233256351039261
Precision,0.39711934156378603,0.8588957055214724
F1-Score,0.5498575498575499,0.46979865771812085


0,1,2
kMeans,,
Metric,Healthy,Patient
Recall,0.8842592592592593,0.3394919168591224
Precision,0.40041928721174,0.8546511627906976
F1-Score,0.5512265512265513,0.4859504132231405


In [25]:
kFoldCrossValidation(5, samples, labels, "RandomForestClassifier")

0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.8847926267281107,0.9654377880184332
Precision,0.927536231884058,0.9436936936936937
F1-Score,0.9056603773584906,0.9544419134396355


0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.8894009216589862,0.9723502304147466
Precision,0.9414634146341463,0.9461883408071748
F1-Score,0.9146919431279621,0.9590909090909091


0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.9074074074074074,0.9815668202764977
Precision,0.9607843137254902,0.9551569506726457
F1-Score,0.9333333333333333,0.9681818181818181


0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.8935185185185185,0.976905311778291
Precision,0.9507389162561576,0.9484304932735426
F1-Score,0.9212410501193318,0.9624573378839589


0,1,2
RandomForestClassifier,,
Metric,Healthy,Patient
Recall,0.8888888888888888,0.976905311778291
Precision,0.9504950495049505,0.9463087248322147
F1-Score,0.9186602870813396,0.9613636363636363
