In [10]:
import numpy as np
import pandas as pd
import csv
from IPython.display import HTML, display
import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [11]:
def readData(patient_path='../mfcc_features_patient.csv',
             healthy_path='../mfcc_features_healthy.csv',
             delimiter=' '):
    """Load the patient and healthy MFCC feature tables from CSV files.

    Parameters
    ----------
    patient_path : str or path-like, optional
        Location of the patient feature file.
    healthy_path : str or path-like, optional
        Location of the healthy feature file.
    delimiter : str, optional
        Field separator used by both files (a single space by default).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(patientdata, healthydata)``, in that order.
    """
    patientdata = pd.read_csv(patient_path, delimiter=delimiter)
    healthydata = pd.read_csv(healthy_path, delimiter=delimiter)

    return patientdata, healthydata

In [20]:
def createSamplesAndLabels(patient_df, healthy_df, random_state=None):
    """Merge patient and healthy frames into one shuffled sample matrix.

    Each row of the inputs is one frame. The last column of each input is
    read as the audio number the frame belongs to; every column before it is
    kept as a feature.

    Parameters
    ----------
    patient_df : pandas.DataFrame
        Frames labelled 1 (patient).
    healthy_df : pandas.DataFrame
        Frames labelled 0 (healthy).
    random_state : int or None, optional
        Seed for the row shuffle. With ``None`` (the default) NumPy's global
        random state is used, exactly as before, so ``np.random.seed`` still
        controls the order.

    Returns
    -------
    samples : numpy.ndarray
        Feature matrix with the audio-number and label columns removed.
    labels : numpy.ndarray of int
        Class of each row: 1 = patient, 0 = healthy.
    audio_num : numpy.ndarray of int
        Audio number of each row. The first digit represents
        healthy(0)/patient(1); the rest of the digits represent the audio
        number within each of healthy and patient.

    Notes
    -----
    Prints ``len(healthy_df)`` and ``len(patient_df)`` as a side effect.
    The input frames are left untouched.
    """
    print(len(healthy_df), len(patient_df))

    # assign() returns new frames, so the caller's DataFrames do not
    # silently gain a 'Label' column.
    patient_samples = patient_df.assign(Label=1).to_numpy()
    healthy_samples = healthy_df.assign(Label=0).to_numpy()

    samples = np.vstack((patient_samples, healthy_samples))

    # Shuffle whole rows so features, audio number and label stay aligned.
    if random_state is None:
        np.random.shuffle(samples)
    else:
        np.random.RandomState(random_state).shuffle(samples)

    # actual label of frame 1-patient; 0-healthy (appended last above)
    labels = samples[:, -1].astype(int)
    samples = np.delete(samples, -1, 1)

    # With the label column gone, the audio number is now the last column.
    audio_num = samples[:, -1].astype(int)
    samples = np.delete(samples, -1, 1)

    return samples, labels, audio_num

In [13]:
from sklearn.svm import SVC

def svm(training_samples, training_labels, test_samples):
    """Train a linear-kernel SVC and return its predictions for the test set.

    Prints "Training complete" once fitting has finished.
    """
    classifier = SVC(kernel="linear").fit(training_samples, training_labels)
    print("Training complete\n")
    predictions = classifier.predict(test_samples)
    return predictions

In [14]:
from sklearn.cluster import KMeans

def kMeans(training_samples, training_labels, test_samples):
    """Cluster with 2-means and return a class label for each test sample.

    KMeans is unsupervised: the cluster indices it produces are arbitrary and
    need not coincide with the class labels. After fitting, each cluster is
    therefore mapped to the most frequent training label among its members,
    and the test predictions are translated through that mapping. If both
    clusters are dominated by the same class, every prediction is that class.

    Parameters
    ----------
    training_samples : array-like
        Samples used to fit the clusters.
    training_labels : array-like
        Class label of each training sample; used only to name the clusters.
    test_samples : array-like
        Samples to classify.

    Returns
    -------
    numpy.ndarray
        One label per test sample, taken from the values in
        ``training_labels``.
    """
    kmeans = KMeans(n_clusters=2, random_state=0)
    kmeans.fit(training_samples)

    train_clusters = np.asarray(kmeans.labels_)
    training_labels = np.asarray(training_labels)

    cluster_to_label = {}
    for cluster in range(kmeans.n_clusters):
        members = training_labels[train_clusters == cluster]
        if members.size:
            values, counts = np.unique(members, return_counts=True)
            cluster_to_label[cluster] = values[np.argmax(counts)]
        else:
            # No training sample landed here; fall back to the raw cluster id.
            cluster_to_label[cluster] = cluster

    test_clusters = kmeans.predict(test_samples)
    return np.array([cluster_to_label[int(c)] for c in test_clusters])

In [15]:
from sklearn.ensemble import RandomForestClassifier

def rf(training_samples, training_labels, test_samples):
    """Fit a 300-tree random forest (seeded with 1) and predict the test set."""
    forest = RandomForestClassifier(n_estimators=300, random_state=1)
    forest.fit(training_samples, training_labels)
    predicted = forest.predict(test_samples)
    return predicted

In [16]:
def calculate_cm_recall_precision(testlabels, y_true):
    """Compute the confusion matrix and per-class recall, precision and F1.

    Parameters
    ----------
    testlabels : iterable
        Predicted labels (despite the name, these are the predictions).
    y_true : array-like
        Ground-truth labels, in the same order as ``testlabels``.

    Returns
    -------
    tuple
        ``(confusion_matrix, recall, precision, f1)``. The three scores are
        computed with ``average=None``, i.e. one value per class in the order
        ``[0, 1]``.
    """
    y_pred = list(testlabels)
    labels = [0, 1]

    # `labels` must be passed by keyword: scikit-learn releases that make it
    # keyword-only reject the positional form with a TypeError.
    recall = recall_score(y_true, y_pred, labels=labels, average=None)
    precision = precision_score(y_true, y_pred, labels=labels, average=None)
    f1 = f1_score(y_true, y_pred, labels=labels, average=None)

    # Use the same label order for the matrix so its rows/columns line up
    # with the per-class scores even when a class is absent from the data.
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    return cm, recall, precision, f1

In [19]:
# Read both feature tables, then combine them into a shuffled sample matrix
# with per-frame labels and audio numbers.
patient_df, healthy_df = readData()
samples, labels, audio_num = createSamplesAndLabels(
    patient_df=patient_df,
    healthy_df=healthy_df,
)

55933 144655


In [15]:
# `np.float` was a deprecated alias of the builtin float and no longer exists
# in newer NumPy releases (AttributeError); np.float64 is the dtype it gave.
samples = samples.astype(np.float64)
labels = labels.astype(np.float64)
# print(type(samples), type(samples[0][0]))

<class 'numpy.ndarray'> <class 'numpy.float64'>


In [17]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the frames for testing. random_state pins the split so it
# is repeatable for a given ordering of `samples`.
# To iterate faster, reduce the sample size, e.g.:
# train_test_split(samples[0:2000], labels[0:2000], test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    samples, labels, test_size=0.33, random_state=0
)

In [None]:
# Fit the linear SVM on the training split and classify the held-out frames.
predicted_test_labels = svm(
    training_samples=X_train,
    training_labels=y_train,
    test_samples=X_test,
)

In [None]:
# Score the SVM predictions against the true test labels and report the
# confusion matrix, the per-class metrics and the mean F1 across classes.
cm, recall, precision, f1 = calculate_cm_recall_precision(
    testlabels=predicted_test_labels,
    y_true=y_test,
)
print(
    "cm", cm,
    "\nrecall", recall,
    "\nprecision", precision,
    "\nf1:", f1,
)
print("meanf1:", np.mean(f1))