In [11]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [14]:
def data(data, label):
    """Split a 150-row table into two 75-row halves by fixed row blocks.

    Rows 0-24, 50-74 and 100-124 go to the training split; rows 25-49,
    75-99 and 125-149 go to the test split. Presumably the rows are ordered
    as three classes of 50 samples each -- confirm against the input file.

    Parameters
    ----------
    data : 2-D array of features, one sample per row.
    label : 1-D array of labels aligned with the rows of ``data``.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``.
    """
    train_rows = (slice(0, 25), slice(50, 75), slice(100, 125))
    test_rows = (slice(25, 50), slice(75, 100), slice(125, 150))

    x_train = np.concatenate([data[rows, :] for rows in train_rows], axis=0)
    x_test = np.concatenate([data[rows, :] for rows in test_rows], axis=0)

    y_train = np.concatenate([label[rows] for rows in train_rows], axis=0)
    y_test = np.concatenate([label[rows] for rows in test_rows], axis=0)

    return x_train, y_train, x_test, y_test

# Expected shapes of the four arrays returned for the 150-row table:
# print(x_train.shape)  -> (75, 4)
# print(y_train.shape)  -> (75,)
# print(x_test.shape)   -> (75, 4)
# print(y_test.shape)   -> (75,)

In [15]:
# Load the numeric table from 'iris.txt' (path is relative to the notebook's
# working directory; np.loadtxt splits on whitespace by default).
iris_data = np.loadtxt('iris.txt')
# Columns 0-3 are passed as the features and column 4 as the labels; the
# helper returns the fixed train/test halves.
x_train, y_train, x_test, y_test = data(iris_data[:, :4], iris_data[:, 4])

In [16]:
def within_class_scatter(data, class_size=25, n_classes=3):
    """Compute the within-class scatter matrix of class-blocked samples.

    The columns of ``data`` are treated as samples and are assumed to be
    grouped into consecutive blocks of ``class_size`` columns, one block per
    class. For every class the (biased) covariance of its samples is
    computed and the per-class covariances are summed, each weighted by the
    class prior ``n_class / n_total``.

    Parameters
    ----------
    data : 2-D array of shape (n_features, n_samples).
    class_size : int, optional
        Number of consecutive columns per class block (default 25).
    n_classes : int, optional
        Number of class blocks to read (default 3).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_features, n_features). Class blocks that fall
        entirely outside ``data`` contribute nothing; if no block contains
        any sample the result is all zeros.
    """
    n_features, n_samples = data.shape[0], data.shape[1]
    scatter = np.zeros((n_features, n_features))

    for class_index in range(n_classes):
        start = class_index * class_size
        class_data = data[:, start:start + class_size]
        class_count = class_data.shape[1]
        if class_count == 0:
            # The slice ran past the available columns: the class has prior
            # zero, and dividing by its sample count would fail.
            continue

        prior_probability = class_count / n_samples
        centered = class_data - class_data.mean(axis=1, keepdims=True)
        # centered . centered^T equals the sum of the per-sample outer
        # products, computed in a single matrix product.
        class_scatter = centered.dot(centered.T)
        scatter = scatter + class_scatter * prior_probability / class_count

    return scatter

# Expected result of within_class_scatter(x_train.T) on the training split:
# array([[0.315232  , 0.11170667, 0.205712  , 0.04266133],
#        [0.11170667, 0.12501333, 0.06198933, 0.041328  ],
#        [0.205712  , 0.06198933, 0.209152  , 0.044144  ],
#        [0.04266133, 0.041328  , 0.044144  , 0.03794133]])

In [17]:
def between_class_scatter(data, class_size=25, n_classes=3):
    """Compute the between-class scatter matrix of class-blocked samples.

    The columns of ``data`` are treated as samples and are assumed to be
    grouped into consecutive blocks of ``class_size`` columns, one block per
    class. The result is the sum over classes of
    ``prior * (class_mean - overall_mean)(class_mean - overall_mean)^T``
    where ``prior = n_class / n_total``.

    Parameters
    ----------
    data : 2-D array of shape (n_features, n_samples).
    class_size : int, optional
        Number of consecutive columns per class block (default 25).
    n_classes : int, optional
        Number of class blocks to read (default 3).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_features, n_features). Class blocks that fall
        entirely outside ``data`` contribute nothing.
    """
    n_features, n_samples = data.shape[0], data.shape[1]
    scatter = np.zeros((n_features, n_features))
    if n_samples == 0:
        # No samples at all: there is no overall mean to compare against.
        return scatter

    overall_mean = data.mean(axis=1, keepdims=True)
    for class_index in range(n_classes):
        start = class_index * class_size
        class_data = data[:, start:start + class_size]
        class_count = class_data.shape[1]
        if class_count == 0:
            # An empty slice has no mean and carries prior zero.
            continue

        # Each class is weighted by its own prior inside the loop; applying
        # a single prior after the loop is only right for equal-sized classes.
        prior_probability = class_count / n_samples
        mean_offset = class_data.mean(axis=1, keepdims=True) - overall_mean
        scatter = scatter + prior_probability * mean_offset.dot(mean_offset.T)

    return scatter

# Expected result of between_class_scatter(x_train.T) on the training split:
# array([[ 0.409184  , -0.16238933,  1.114     ,  0.472608  ],
#        [-0.16238933,  0.09149156, -0.45703467, -0.184064  ],
#        [ 1.114     , -0.45703467,  3.04109867,  1.28474133],
#        [ 0.472608  , -0.184064  ,  1.28474133,  0.54631467]])

In [18]:
def fisher_score(within_class_scatter, between_class_scatter):
    """Return the per-feature Fisher score from two scatter matrices.

    The score of feature ``i`` is ``between[i, i] / within[i, i]``, i.e. the
    ratio of the diagonal entries of the two matrices.

    Parameters
    ----------
    within_class_scatter : 2-D square array (n_features, n_features).
    between_class_scatter : 2-D square array of the same shape.

    Returns
    -------
    list
        One score per feature, in feature order.
    """
    # Only the diagonals are needed. Dividing the full matrices would also
    # divide the off-diagonal entries, which can be zero in the within-class
    # matrix and trigger divide-by-zero warnings for values never used.
    within_diagonal = np.diagonal(np.asarray(within_class_scatter))
    between_diagonal = np.diagonal(np.asarray(between_class_scatter))

    return list(between_diagonal / within_diagonal)

# Expected Fisher scores (one per feature) for the training split:
# [1.2980408080397958,
#  0.7318543799772459,
#  14.540136678906569,
#  14.398931684003369]

In [19]:
def knn_model(x_train, y_train, x_test, y_test):
    """Fit a 3-nearest-neighbour classifier and score it on held-out data.

    The classifier is trained on ``(x_train, y_train)`` and the value of its
    ``score`` method on ``(x_test, y_test)`` is returned.
    """
    classifier = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
    return classifier.score(x_test, y_test)


def get_model_accuracy(x_train, y_train, x_test, y_test):
    """Average the KNN accuracy over both directions of a two-way split.

    The model is first trained on the training half and scored on the test
    half, then the two halves swap roles; the mean of both scores is
    returned.
    """
    forward_accuracy = knn_model(x_train, y_train, x_test, y_test)
    # Second fold: the former test half is used for fitting.
    swapped_accuracy = knn_model(x_test, y_test, x_train, y_train)

    return (forward_accuracy + swapped_accuracy) / 2

In [20]:
within_class_scatter = within_class_scatter(x_train.T)
between_class_scatter = between_class_scatter(x_train.T)
fisher_score = fisher_score(within_class_scatter, between_class_scatter)

In [21]:
feature_rank = [sorted(fisher_score).index(x) for x in fisher_score][::-1]
for rank_index in range(len(feature_rank)):
    rank_index += 1
    selected_features = feature_rank[:rank_index]
    accuracy = get_model_accuracy(x_train[:, selected_features], y_train, x_test[:, selected_features], y_test)
    print("When use top-{} features, accuracy is: {}".format(rank_index, accuracy))

When use top-1 features, accuracy is: 0.9199999999999999
When use top-2 features, accuracy is: 0.9533333333333334
When use top-3 features, accuracy is: 0.9533333333333334
When use top-4 features, accuracy is: 0.94
