In [1]:
import time
import numpy
numpy.random.seed(1)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

We will use a training set and a test set that are derived from the popular covertype data set (https://archive.ics.uci.edu/ml/datasets/covertype). Each instance is described by 54 features. The first 10 features take much larger values than the remaining ones, which lie between 0 and 1, so we rescale the first 10 features to the range [0, 1] as well.

In [2]:
# Read the comma-separated training and test files.
data_train = numpy.genfromtxt("covtype.train.csv", delimiter=',')
data_test = numpy.genfromtxt("covtype.test.csv", delimiter=',')

# The last column holds the label; every column before it is a feature.
Xtrain = data_train[:, :-1]
ytrain = data_train[:, -1]
Xtest = data_test[:, :-1]
ytest = data_test[:, -1]

print("Number of training instances: {}".format(Xtrain.shape[0]))
print("Number of features: {}".format(Xtrain.shape[1]))
print("Number of test instances: {}".format(Xtest.shape[0]))

# Min-max scale the first 10 feature columns. The scaler is fitted on the
# training data only and then reused unchanged for the test data.
# NOTE: Xtrain / Xtest are slices (views) of data_train / data_test, so the
# in-place assignments below also rewrite those columns in the parent arrays.
scaler = preprocessing.MinMaxScaler()
Xtrain[:, :10] = scaler.fit_transform(Xtrain[:, :10])
Xtest[:, :10] = scaler.transform(Xtest[:, :10])

Number of training instances: 200000
Number of features: 54
Number of test instances: 25000


In [3]:
# Create the nearest neighbor classifier: 10 neighbours, k-d tree search.
# The estimator is only constructed here; it is fitted later, inside the
# feature-selection loop.
model = KNeighborsClassifier(
    n_neighbors=10,
    algorithm='kd_tree',
)

In [7]:
# Greedy forward feature selection on a random subsample of the training data.
#
# In each round every not-yet-selected feature is tried as an addition to the
# current selection; the candidate with the lowest validation error is kept.
N_SELECT = 5        # number of features to select
N_SAMPLE = 10000    # number of rows drawn (without replacement) from data_train
N_FIT = 8000        # first N_FIT sampled rows are used for fitting, the rest for validation

# indices of the selected features, in the order they were chosen
feats = []

# Draw the subsample and split it into a fitting part and a validation part.
# data_train is sampled directly; its first 10 columns were already rescaled
# in place by the preprocessing cell, because Xtrain is a slice view of it.
data_fs = data_train[numpy.random.choice(len(data_train), size=N_SAMPLE, replace=False), :]
Xtrain_fs, ytrain_fs = data_fs[:N_FIT, :-1], data_fs[:N_FIT, -1]
Xval_fs, yval_fs = data_fs[N_FIT:, :-1], data_fs[N_FIT:, -1]

# Derive the feature count from the data instead of hard-coding it.
n_features = Xtrain_fs.shape[1]

# feature selection loop
for _ in range(N_SELECT):
    best_err = float("inf")
    best_feat = None
    for f in range(n_features):
        # Skip features that are already selected; otherwise the same column
        # could be picked again and would simply be duplicated in the model.
        if f in feats:
            continue
        candidate = feats + [f]
        model.fit(Xtrain_fs[:, candidate], ytrain_fs)
        pred = model.predict(Xval_fs[:, candidate])
        err = 1 - accuracy_score(yval_fs, pred)
        # Strict '<' keeps the lowest-index feature when errors tie.
        if err < best_err:
            best_err = err
            best_feat = f
    if best_feat is None:
        # No unselected feature is left to add.
        break
    feats.append(best_feat)
    print(feats)

(2000, 54)
Feature 0: 0.3395
Feature 1: 0.5585
Feature 2: 0.5525
Feature 3: 0.5585
Feature 4: 0.5509999999999999
Feature 5: 0.5569999999999999
Feature 6: 0.5449999999999999
Feature 7: 0.5389999999999999
Feature 8: 0.5545
Feature 9: 0.5589999999999999
Feature 10: 0.516
Feature 11: 0.49750000000000005
Feature 12: 0.516
Feature 13: 0.5085
Feature 14: 0.511
Feature 15: 0.508
Feature 16: 0.5125
Feature 17: 0.5095000000000001
Feature 18: 0.5155000000000001
Feature 19: 0.617
Feature 20: 0.516
Feature 21: 0.516
Feature 22: 0.516
Feature 23: 0.606
Feature 24: 0.516
Feature 25: 0.516
Feature 26: 0.516
Feature 27: 0.5155000000000001
Feature 28: 0.516
Feature 29: 0.516
Feature 30: 0.5175000000000001
Feature 31: 0.516
Feature 32: 0.515
Feature 33: 0.516
Feature 34: 0.514
Feature 35: 0.483
Feature 36: 0.624
Feature 37: 0.624
Feature 38: 0.5165
Feature 39: 0.516
Feature 40: 0.5155000000000001
Feature 41: 0.516
Feature 42: 0.624
Feature 43: 0.599
Feature 44: 0.516
Feature 45: 0.516
Feature 46: 0.624
F

In [None]:
# apply the model
#
# Restrict the full (rescaled) training and test sets to the selected features
# and refit. After the selection loop, `model` is still fitted on the last
# candidate subset it tried, not on the final selection in `feats`.
Xtrain_new = Xtrain[:, feats]
Xtest_new = Xtest[:, feats]
model.fit(Xtrain_new, ytrain)

# Time the prediction step only. time.clock() was removed in Python 3.8;
# time.perf_counter() is the replacement for measuring elapsed time.
start = time.perf_counter()

preds = model.predict(Xtest_new)

elapsed = time.perf_counter() - start

print(elapsed)

In [None]:
# Score the predictions against the true test labels and report the accuracy.
acc = accuracy_score(y_true=ytest, y_pred=preds)
print("Accuracy on test set: {}".format(acc))