In [2]:
import numpy as np
import pandas as pd
import csv
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [3]:
def readData():
    """Load the patient and healthy tables from the working directory.

    Both files are read as tab-delimited text: ``./patient_output.csv``
    first, then ``./healthy_output.csv``.

    Returns:
        tuple: ``(patient_frame, healthy_frame, column_names)`` where
        ``column_names`` is the list of column labels of the patient table.
    """
    patient_frame, healthy_frame = (
        pd.read_csv(path, delimiter='\t')
        for path in ('./patient_output.csv', './healthy_output.csv')
    )

    # Only the patient table's header is reported back to the caller.
    column_names = patient_frame.columns.tolist()

    return patient_frame, healthy_frame, column_names

In [31]:
def createSamplesAndLabels(patient_df, healthy_df, random_state=None):
    """Stack patient and healthy rows into one shuffled sample matrix.

    Patient rows are labelled ``1`` and healthy rows ``0``. The input frames
    are NOT modified: labels are built as a separate vector instead of being
    written into a ``'Label'`` column of the caller's DataFrames.

    Args:
        patient_df: DataFrame with one row per patient sample.
        healthy_df: DataFrame with one row per healthy sample. Rows are
            stacked positionally, so it is assumed to share the column
            order of ``patient_df`` (not checked here).
        random_state: Optional seed for the shuffle. ``None`` (default) uses
            NumPy's global random state, so ``np.random.seed`` still applies.

    Returns:
        tuple: ``(samples, labels)`` - a 2-D array of feature rows and a 1-D
        integer array of the matching labels, shuffled with the same
        permutation.
    """
    # A 'Label' column can be left over from an earlier in-place version of
    # this function; it must never be treated as a feature.
    patient_features = patient_df.drop(columns='Label', errors='ignore').to_numpy()
    healthy_features = healthy_df.drop(columns='Label', errors='ignore').to_numpy()

    samples = np.vstack((patient_features, healthy_features))
    labels = np.concatenate((
        np.ones(len(patient_features), dtype=int),
        np.zeros(len(healthy_features), dtype=int),
    ))

    # One shared permutation keeps every row paired with its label.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    order = rng.permutation(len(labels))

    return samples[order], labels[order]

In [5]:
from sklearn.svm import SVC

def svm(training_samples, training_labels, test_samples):
    """Train a linear-kernel SVC and predict labels for the test samples.

    Args:
        training_samples: Feature rows used to fit the classifier.
        training_labels: Class label for each training row.
        test_samples: Feature rows to classify.

    Returns:
        The classifier's predictions for ``test_samples``.
    """
    # fit() hands back the fitted estimator, so the calls can be chained.
    return (
        SVC(kernel="linear")
        .fit(training_samples, training_labels)
        .predict(test_samples)
    )

In [6]:
from sklearn.cluster import KMeans

def kMeans(training_samples, training_labels, test_samples):
    """Cluster with 2-means and predict class labels for the test samples.

    KMeans is unsupervised, so the cluster ids it assigns (0 / 1) carry no
    relation to the class labels. Returning them directly can produce
    predictions that are inverted relative to the true classes. Each cluster
    is therefore named after the majority training label among its members,
    and test samples receive the label of the cluster they fall into.

    Args:
        training_samples: Feature rows used to fit the clustering.
        training_labels: Class label for each training row; used only to
            name the clusters, not to fit them.
        test_samples: Feature rows to classify.

    Returns:
        numpy.ndarray: Predicted class label for each test row.
    """
    n_clusters = 2
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(training_samples)

    train_clusters = np.asarray(kmeans.labels_)
    training_labels = np.asarray(training_labels)

    # Fallback label for a cluster that received no training point.
    classes, class_counts = np.unique(training_labels, return_counts=True)
    overall_majority = classes[np.argmax(class_counts)]

    cluster_to_label = {}
    for cluster_id in range(n_clusters):
        members = training_labels[train_clusters == cluster_id]
        if members.size == 0:
            cluster_to_label[cluster_id] = overall_majority
            continue
        values, counts = np.unique(members, return_counts=True)
        # Ties resolve to the smallest label value (argmax picks the first).
        cluster_to_label[cluster_id] = values[np.argmax(counts)]

    test_clusters = np.asarray(kmeans.predict(test_samples)).tolist()
    return np.array(
        [cluster_to_label[cluster_id] for cluster_id in test_clusters],
        dtype=training_labels.dtype,
    )

In [7]:
from sklearn.ensemble import RandomForestClassifier

def rf(training_samples, training_labels, test_samples):
    """Train a 300-tree random forest and predict labels for the test samples.

    The forest is seeded with ``random_state=1``.

    Args:
        training_samples: Feature rows used to fit the forest.
        training_labels: Class label for each training row.
        test_samples: Feature rows to classify.

    Returns:
        The forest's predictions for ``test_samples``.
    """
    forest = RandomForestClassifier(n_estimators=300, random_state=1)
    fitted_forest = forest.fit(training_samples, training_labels)
    return fitted_forest.predict(test_samples)

In [8]:
def calculate_cm_recall_precision(testlabels, y_true):
    """Compute the confusion matrix and per-class recall, precision and F1.

    Args:
        testlabels: The PREDICTED labels (despite the name); converted to a
            list and used as ``y_pred``.
        y_true: The ground-truth labels.

    Returns:
        tuple: ``(confusion_matrix, recall, precision, f1)``. With
        ``average=None`` each score holds one value per entry of
        ``labels`` (class 0 first, then class 1).
    """
    y_pred = list(testlabels)
    labels = [0, 1]

    # `labels` must be passed by keyword: it is keyword-only in current
    # scikit-learn releases, and a positional third argument raises TypeError.
    recall = recall_score(y_true, y_pred, labels=labels, average=None)
    precision = precision_score(y_true, y_pred, labels=labels, average=None)
    f1 = f1_score(y_true, y_pred, labels=labels, average=None)

    # Pin the label order so the matrix layout matches the score arrays even
    # when a fold happens to contain a single class.
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return cm, recall, precision, f1

In [32]:
from sklearn.model_selection import StratifiedKFold

def kFoldCrossValidation(n, X, y, classifier=None, random_state=None):
    """Run stratified k-fold cross-validation and print per-fold metrics.

    For every fold the classifier is trained on the training split, its
    predictions for the test split are scored with
    ``calculate_cm_recall_precision``, and the confusion matrix, per-class
    recall / precision / F1 and their means are printed.

    Args:
        n: Number of folds.
        X: Sample matrix; indexed with the fold's index arrays.
        y: Label vector aligned with ``X``.
        classifier: Callable ``(X_train, y_train, X_test) -> predictions``.
            Defaults to ``svm``; pass ``kMeans`` or ``rf`` to evaluate those
            instead of editing this function.
        random_state: Optional seed for the shuffled fold assignment, so a
            run can be reproduced. ``None`` (default) keeps it unseeded.

    Returns:
        None. Results are only printed.
    """
    if classifier is None:
        # Resolved at call time so this definition does not require `svm`
        # to exist when the cell is executed.
        classifier = svm

    skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=random_state)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        predicted_test_labels = classifier(X_train, y_train, X_test)

        cm, recall, precision, f1 = calculate_cm_recall_precision(predicted_test_labels, y_test)
        print("cm", cm, "\nrecall", recall, "\nprecision", precision, "\nf1:", f1)
        print("Mean Recall:", np.mean(recall))
        print("Mean Precision:", np.mean(precision))
        print("meanf1:", np.mean(f1))
        print('\n')

In [33]:
# Load both cohorts, then build the shuffled sample matrix and label vector.
patient_df, healthy_df, features = readData()
samples, labels = createSamplesAndLabels(patient_df=patient_df, healthy_df=healthy_df)

In [34]:
# Evaluate with 5 stratified folds; per-fold metrics are printed below.
kFoldCrossValidation(n=5, X=samples, y=labels)

cm [[201  16]
 [  7 427]] 
recall [0.92626728 0.98387097] 
precision [0.96634615 0.96388262] 
f1: [0.94588235 0.97377423]
Mean Recall: 0.9550691244239631
Mean Precision: 0.9651143861781559
meanf1: 0.9598282916359246


cm [[207  10]
 [  8 426]] 
recall [0.95391705 0.98156682] 
precision [0.9627907  0.97706422] 
f1: [0.95833333 0.97931034]
Mean Recall: 0.967741935483871
Mean Precision: 0.9699274589289524
meanf1: 0.9688218390804597


cm [[203  13]
 [ 13 421]] 
recall [0.93981481 0.97004608] 
precision [0.93981481 0.97004608] 
f1: [0.93981481 0.97004608]
Mean Recall: 0.9549304488820618
Mean Precision: 0.9549304488820618
meanf1: 0.9549304488820618


cm [[203  13]
 [ 20 413]] 
recall [0.93981481 0.95381062] 
precision [0.9103139  0.96948357] 
f1: [0.92482916 0.96158324]
Mean Recall: 0.9468127191856983
Mean Precision: 0.9398987347102044
meanf1: 0.9432061967483514


cm [[205  11]
 [ 10 423]] 
recall [0.94907407 0.97690531] 
precision [0.95348837 0.97465438] 
f1: [0.9512761  0.97577855]
Mean Re