In [117]:
import numpy as np
import pandas as pd
import csv
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [118]:
def readData():    
    patientdata = pd.read_csv('./patient_output.csv', delimiter='\t')
    healthydata = pd.read_csv('./healthy_output.csv', delimiter='\t')

    columns = patientdata.columns.values.tolist()
    
    return patientdata, healthydata, columns

In [130]:
def createSamplesAndLabels(patient_df, healthy_df):
    listofzeros = [0] * len(healthy_df)
    listofones = [1] * len(patient_df)

    patient_df['Label'] = listofones
    healthy_df['Label'] = listofzeros

    patient_samples = patient_df.to_numpy()
    healthy_samples = healthy_df.to_numpy()

    samples = np.vstack((patient_samples, healthy_samples))

    np.random.shuffle(samples)

    labels = samples[:,-1]
    labels = labels.astype(int)

    return samples, labels

In [131]:
from sklearn.svm import SVC

def svm(training_samples, training_labels, test_samples):
    clf = SVC(kernel="linear")
    clf.fit(training_samples, training_labels)
    return clf.predict(test_samples)

In [132]:
from sklearn.cluster import KMeans

def kMeans(training_samples, training_labels, test_samples):
    kmeans = KMeans(n_clusters=2, random_state=0)
    kmeans.fit(training_samples, training_labels)
    return kmeans.predict(test_samples)

In [133]:
from sklearn.ensemble import RandomForestClassifier

def rf(training_samples, training_labels, test_samples):
    clf = RandomForestClassifier(random_state=1, n_estimators=300)
    clf.fit(training_samples, training_labels)
    return clf.predict(test_samples)

In [150]:
def calculate_cm_recall_precision(testlabels, y_true):
    y_pred = list(testlabels)
    labels = [0, 1]
    recall = recall_score(y_true, y_pred, labels, average=None)
    precision = precision_score(y_true, y_pred,labels, average=None)
    f1 = f1_score(y_true, y_pred,labels, average=None)
    return confusion_matrix(y_true, y_pred), recall, precision, f1;

In [164]:
from sklearn.model_selection import StratifiedKFold

def kFoldCrossValidation(n, X, y):
    skf = StratifiedKFold(n_splits=n, shuffle=True)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
#         predicted_test_labels = svm(X_train, y_train, X_test)
#         predicted_test_labels = kMeans(X_train, y_train, X_test)
        predicted_test_labels = rf(X_train, y_train, X_test)
        
        cm, recall, precision, f1 = calculate_cm_recall_precision(predicted_test_labels, y_test)
        print("cm", cm, "\nrecall", recall, "\nprecision", precision, "\nf1:", f1)
        print("Mean Recall:", np.mean(recall))
        print("Mean Precision:", np.mean(precision))
        print("meanf1:", np.mean(f1))
        print('\n')

In [158]:
patient_df, healthy_df, features = readData();
samples, labels = createSamplesAndLabels(patient_df, healthy_df)

In [165]:
kFoldCrossValidation(5, samples, labels)

cm [[210   7]
 [  4 430]] 
recall [0.96774194 0.99078341] 
precision [0.98130841 0.98398169] 
f1: [0.97447796 0.98737084]
Mean Recall: 0.9792626728110599
Mean Precision: 0.9826450522893988
meanf1: 0.9809243981768829


cm [[212   5]
 [  2 432]] 
recall [0.97695853 0.99539171] 
precision [0.99065421 0.98855835] 
f1: [0.9837587  0.99196326]
Mean Recall: 0.9861751152073732
Mean Precision: 0.9896062790051113
meanf1: 0.9878609806580163


cm [[207   9]
 [  3 431]] 
recall [0.95833333 0.99308756] 
precision [0.98571429 0.97954545] 
f1: [0.97183099 0.98627002]
Mean Recall: 0.9757104454685099
Mean Precision: 0.9826298701298701
meanf1: 0.979050504399394


cm [[201  15]
 [  2 431]] 
recall [0.93055556 0.99538106] 
precision [0.99014778 0.96636771] 
f1: [0.95942721 0.98065984]
Mean Recall: 0.9629683089556069
Mean Precision: 0.9782577481278579
meanf1: 0.9700435241826659


cm [[205  11]
 [  3 430]] 
recall [0.94907407 0.99307159] 
precision [0.98557692 0.97505669] 
f1: [0.96698113 0.98398169]
Mean Re