In [1]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

### Data Pre-processing
Target variable *Species* takes values 0,1,2 corresponding to *Iris-setosa, Iris-versicolor, Iris-virginica*  

*Species* and *ID* columns are dropped from the dataset

In [2]:
data = pd.read_csv("../dataset/iris.csv")

data.loc[data['Species']=='Iris-setosa', 'Species'] = 0
data.loc[data['Species']=='Iris-versicolor', 'Species'] = 1
data.loc[data['Species']=='Iris-virginica', 'Species'] = 2

data = data.to_numpy()
data = data[:,1:]

### kNN Algorithm
Dataset split into a training set and a testing set (*80:20* split).  
Each feature is scaled such that all values lie in the range *(0,1)* to maintain consistency of data.  

Algorithm uses *k=5* to learn from the training data, and returns the accuracy of the model w.r.t the testing data.

In [3]:
def knn(inp):
    # train-test split
    traindata, testdata, trainlabel, testlabel = train_test_split(inp[:,:-1],inp[:,-1],test_size=0.2,random_state=21)
    trainlabel = trainlabel.astype('int')
    testlabel = testlabel.astype('int')
    
    # scaling
    traindata = StandardScaler().fit_transform(traindata)
    testdata = StandardScaler().fit_transform(testdata)

    # training
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(traindata,trainlabel)

    # testing
    pred = knn.predict(testdata)
    acc = accuracy_score(testlabel,pred)

    return acc*100

### Linear Discriminant Analysis (LDA)
**m**<sub>j</sub> represents centroids of j<sup>th</sup> class (*c=3* classes) and **m** represent global mean.  
S<sub>&omega;</sub> represents within-class scatter and S<sub>b</sub> represents between-class scatter i.e.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;**S**<sub>&omega;</sub> = &Sigma; **S**<sub>j</sub>  for *0<=j<=2*
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;**S**<sub>b</sub> = &Sigma; n<sub>*j*</sub> (**m**<sub>*j*</sub> - **m**)(**m**<sub>*j*</sub> - **m**)<sup>T</sup>  

Maximizer of the objective function for LDA is given by the largest eigenvector *v<sub>1</sub>* of **S**<sub>&omega;</sub><sup>-1</sup>**S**<sub>b</sub> i.e.  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; **S**<sub>&omega;</sub><sup>-1</sup>**S**<sub>b</sub> = &lambda;<sub>1</sub>**v**<sub>1</sub>

In [4]:
def lda(inp):
    c = 3
    label = inp[:,-1]
    tmp = inp[:,:-1]
    #global mean
    glob = np.mean(tmp,axis=0).reshape(-1,1)
    d = tmp.shape[1]
    cent = np.zeros((c,d))

    # single-class means
    for i in range(c):
        cent[i] = np.mean(tmp[label==i], axis=0)

    s_w = np.zeros((tmp.shape[1],tmp.shape[1]))
    s_b = np.zeros((tmp.shape[1],tmp.shape[1]))

    for j in range(c):
        # within class scatter
        s_j = np.zeros((tmp.shape[1],tmp.shape[1]))
        for row in tmp[label==j]:
            row = row.reshape(-1,1)
            mean = cent[j].reshape(-1,1)
            t = (row-mean).dot((row-mean).T).astype('float64')
            s_j += t
        s_w += s_j
        
        # between class scatter
        n_j = tmp[label==j].shape[0]
        col = cent[i].reshape(-1,1)
        x = (float(n_j)/tmp.shape[0]) * ((col-glob).dot((col-glob).T)).astype('float64')
        s_b += x

    # eigenvector problem
    evals,evecs = np.linalg.eig(np.linalg.inv(s_w).dot(s_b))

    sel = np.argsort(evals)[::-1]
    evals = np.sort(evals)[::-1]
    evecs = evecs[:,sel]

    vk = evecs[:,0]

    # project data onto the eigenvector
    y = tmp.dot(vk)

    return np.column_stack((y,label))

Accuracy of kNN on data without using LDA as pre-processing step

In [5]:
print("Accuracy without using LDA: ",knn(data))

Accuracy without using LDA:  90.0


Accuracy of kNN on data after using LDA as pre-processing step

In [6]:
newdata = lda(data)
print("Accuracy after using LDA: ",knn(newdata))

Accuracy after using LDA:  93.33333333333333
