In [635]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

### Data Pre-processing
Features *ChestPain* and *Thal* have strings as entries which are enumerated to non-negative integers.  

Binary classifier (target variable) *AHD* takes value 0 for *No* and 1 for *Yes*.  

All samples where any entry is *NA* are dropped; and the target variable column is dropped from the dataset.

In [636]:
readdata = pd.read_csv("../dataset/heart.csv")

mapping = {value: index for index, value in enumerate(readdata['ChestPain'].unique())}
readdata['ChestPain'] = readdata['ChestPain'].map(mapping)

mapping = {value: index for index, value in enumerate(readdata['Thal'].unique())}
readdata['Thal'] = readdata['Thal'].map(mapping)

readdata.loc[readdata['AHD']=='No','AHD'] = 0
readdata.loc[readdata['AHD']=='Yes','AHD'] = 1

readdata.dropna(inplace=True)

data = readdata.to_numpy()
data = data[:,1:].astype('float64')

k = 10

### Fisher Discriminant Analysis (FDA)
Data is split into 2 classes *class0* and *class1*.  

*&mu;<sub>i</sub>* represents the centroids of respective classes.  

*S<sub>i</sub>* represents the within-class scatter of i<sup>th</sup> class; *n<sub>i</sub>* represents the number of samples in the i<sup>th</sup> class.  

**&omega;** is calculated as:  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;**&omega;** = (*n<sub>0</sub> S<sub>0</sub> + n<sub>1</sub> S<sub>1</sub>)<sup>-1</sup> **.** (&mu;<sub>0</sub> - &mu;<sub>1</sub>)  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;*Y* = **&omega;**<sup>T</sup> **.** *X<sup>(i)</sup>*

In [637]:
# fda
def fda(tmp,labels):

    f = tmp.shape[1]
    class0 = tmp[labels==0]
    class1 = tmp[labels==1]
    
    cent = np.zeros((2,f))
    cent[0] = np.mean(class0,axis=0)
    cent[1] = np.mean(class1,axis=0)

    s0 = np.zeros((f,f))
    s1 = np.zeros((f,f))

    n0 = class0.shape[0]
    n1 = class1.shape[1]
    
    for x in class0:
        row = x.reshape(-1,1)
        mean = cent[0].reshape(-1,1)
        s0 += (row - mean).dot((row - mean).T)
    s0 *= (1.0/(n0-1))

    for x in class1:
        row = x.reshape(-1,1)
        mean = cent[1].reshape(-1,1)
        s1 += (row - mean).dot((row - mean).T)
    s1 *= (1.0/(n1-1))

    m = cent[0] - cent[1]
    sigma = ((n0 * s0) + (n1 * s1))
    omega = np.linalg.inv(sigma)@(m.reshape(-1,1))
    y = np.zeros((tmp.shape[0],1))
    for i in range(tmp.shape[0]):
        y[i] = omega.T @ (tmp[i].reshape(-1,1))

    return np.column_stack((y,labels.astype('int')))

In [638]:
# pca followed by fda
def pcafda(inp,c):
    labels = inp[:,-1]
    tmp = inp[:,:-1]

    tmp = (tmp - np.mean(tmp,axis=0))/np.std(tmp,axis=0)
    cov = np.cov(tmp.T)

    evals,evecs = np.linalg.eig(cov)
    sel = np.argsort(evals)[::-1]
    evecs = evecs[:, sel]
    evecs = evecs[:,:c]
    tmp = tmp.dot(evecs)

    return fda(tmp,labels)

### Logistic Regression
Define &sigma;(x) = 1 / (1 + e<sup>-x</sup>)  

*p<sub>i</sub>* = &sigma;(**&Sigma;**<sub>j=0</sub><sup>m</sup> (*&Theta;<sub>j</sub>.x<sub>i</sub><sup>(j)</sup>*))

**&Theta;** = **&Theta;** - &alpha; x (1/N)  **&Sigma;**<sub>i=1</sub><sup>n</sup> (*p<sub>i</sub> - y<sub>i</sub>).x<sub>i</sub><sup>(j)</sup>*

In [639]:
def logistic_regression(inp):
    def sigmoid(x):
        return 1/(1 + np.exp(-x))
    
    # train-test split
    traindata = inp[:int(0.9*len(inp))]
    testdata = inp[int(0.9*len(inp)):]
    Y = traindata[:,-1]
    X = traindata[:,:-1]
    Y_test = testdata[:,-1]
    X_test = testdata[:,:-1]
    n,m = X.shape
    theta = np.zeros((m,1))
    
    # max iterations
    max_iter = 1000
    
    # learning rate
    alpha = 0.02

    for i in range(max_iter):
        p = np.zeros((n,))
        for j in range(n):
            p[j] = sigmoid(X[j].dot(theta))
        newtheta = theta - alpha * ((X.T).dot(p-Y))/n
        if(np.abs(newtheta-theta)<=0.001):
            break
        theta = newtheta
    
    probs = sigmoid(X_test.dot(theta))
    pred = (probs>=0.5).astype('int')
    return 100*accuracy_score(Y_test,pred)

In [640]:
fda_only = fda(data[:,:-1],data[:,-1])
pca_fda = pcafda(data,k)

In [641]:
acc1 = logistic_regression(fda_only)
acc2 = logistic_regression(pca_fda)

print("Accuracy with FDA as pre-processing step: ", acc1, "\nAccuracy with PCA + FDA as pre-processing steps: ", acc2)

Accuracy with FDA as pre-processing step:  60.0 
Accuracy with PCA + FDA as pre-processing steps:  60.0
