In [1]:
import numpy as np
from random import randint
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.neighbors import KNeighborsClassifier

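## Wrong Way

Feature selection is done once on the full data set, before cross-validation. The labels of the held-out samples therefore leak into the choice of features, and the cross-validated error comes out far too low for data that is pure noise.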

In [2]:
# Simulate a pure-noise problem: the binary labels are drawn independently of
# every feature, so an honest error estimate should sit near 50%.
# A fixed seed makes the data, and everything derived from it, reproducible.
rng = np.random.default_rng(0)
X_label = rng.integers(0, 2, size=50)     # 50 class labels in {0, 1}
X_feat = rng.normal(0, 1, (50, 5000))     # 50 samples x 5000 noise features

# Pearson correlation of every feature column with the labels, computed in a
# single vectorized pass instead of 5000 separate np.corrcoef calls.
label_centered = X_label - X_label.mean()
feat_centered = X_feat - X_feat.mean(axis=0)
X_feat_cor = (label_centered @ feat_centered) / (
    np.linalg.norm(label_centered) * np.linalg.norm(feat_centered, axis=0)
)

# The mistake being demonstrated: the ranking uses the labels of ALL 50
# samples, including those that will later serve as held-out test folds.
# argsort is ascending, so the last 100 indices are the features with the
# largest (signed) correlation.
top_100 = np.argsort(X_feat_cor)[-100:]

# Keep only the selected columns; rows (samples) are untouched.
new_feats = X_feat[:, top_100]

print (new_feats.shape, X_label.shape)

(50, 100) (50,)


In [3]:
# Repeated K-fold evaluation of a 1-nearest-neighbour classifier on the
# features that were already selected using the full data set.
N_SPLITS, N_REPEATS = 5, 50

clf = KNeighborsClassifier(n_neighbors=1)
err = []

# random_state pins the fold assignments so the printed errors are reproducible.
kf = RepeatedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=0)
for train_index, test_index in kf.split(new_feats):
    X_train, X_test = new_feats[train_index], new_feats[test_index]
    y_train, y_test = X_label[train_index], X_label[test_index]
    clf.fit(X_train, y_train)
    # score() is the accuracy on the held-out fold; store the error rate.
    err.append(1 - clf.score(X_test, y_test))

print ("Error each K-Fold:", np.around(np.array(err),3))
# Average over the folds actually evaluated rather than a hard-coded count,
# so the figure stays correct if N_SPLITS or N_REPEATS is changed.
print ("Average Error:", round(float(np.mean(err)),3))

Error each K-Fold: [0.  0.  0.  0.  0.2 0.  0.  0.  0.1 0.  0.  0.1 0.  0.1 0.  0.  0.2 0.1
 0.  0.  0.2 0.  0.  0.  0.  0.  0.1 0.  0.  0.1 0.  0.  0.1 0.  0.  0.
 0.  0.  0.1 0.1 0.1 0.  0.2 0.  0.  0.  0.  0.2 0.  0.  0.  0.  0.2 0.
 0.  0.1 0.  0.1 0.  0.  0.1 0.1 0.  0.  0.  0.1 0.  0.  0.  0.1 0.1 0.
 0.  0.  0.2 0.  0.  0.  0.1 0.  0.  0.  0.  0.  0.2 0.  0.  0.  0.2 0.
 0.  0.  0.1 0.1 0.  0.  0.  0.  0.1 0.1 0.  0.  0.1 0.  0.1 0.  0.1 0.1
 0.  0.  0.1 0.1 0.  0.  0.  0.  0.  0.1 0.  0.  0.1 0.  0.  0.  0.1 0.
 0.  0.1 0.1 0.  0.1 0.1 0.  0.  0.  0.  0.1 0.  0.  0.2 0.  0.  0.1 0.
 0.1 0.1 0.1 0.  0.  0.  0.  0.1 0.  0.  0.1 0.1 0.  0.  0.1 0.  0.  0.
 0.  0.1 0.1 0.  0.1 0.  0.1 0.  0.  0.  0.  0.  0.2 0.1 0.1 0.  0.  0.
 0.  0.1 0.  0.  0.1 0.  0.  0.1 0.  0.1 0.1 0.1 0.  0.  0.1 0.  0.  0.
 0.  0.1 0.  0.1 0.  0.  0.1 0.  0.1 0.  0.  0.1 0.  0.1 0.  0.  0.  0.
 0.  0.  0.2 0.  0.  0.1 0.  0.  0.1 0.2 0.  0.  0.  0.  0.1 0.1 0.  0.
 0.  0.  0.  0.1 0.1 0.  0.  0.1 0.1 0.  0.

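## Right Way

Feature selection is repeated inside every fold, using only that fold's training samples. The held-out samples play no part in choosing the features, and the cross-validated error comes out close to the 50% expected for random labels.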

In [4]:
# Simulate a pure-noise problem: the binary labels are drawn independently of
# every feature. A fixed seed makes the data reproducible across runs.
rng = np.random.default_rng(0)
X_label = rng.integers(0, 2, size=50)     # 50 class labels in {0, 1}
X_feat = rng.normal(0, 1, (50, 5000))     # 50 samples x 5000 noise features

print (X_feat.shape, X_label.shape)

(50, 5000) (50,)


In [5]:
# Repeated K-fold evaluation of a 1-nearest-neighbour classifier where the
# feature selection is redone inside every fold from the training part only.
N_SPLITS, N_REPEATS = 5, 10

clf = KNeighborsClassifier(n_neighbors=1)
err = []

# random_state pins the fold assignments so the printed errors are reproducible.
kf = RepeatedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=0)
for train_index, test_index in kf.split(X_feat):
    X_train, X_test = X_feat[train_index], X_feat[test_index]
    y_train, y_test = X_label[train_index], X_label[test_index]

    # Pearson correlation of every feature column with the TRAINING labels,
    # computed in one vectorized pass instead of 5000 np.corrcoef calls per fold.
    y_centered = y_train - y_train.mean()
    X_centered = X_train - X_train.mean(axis=0)
    X_feat_cor = (y_centered @ X_centered) / (
        np.linalg.norm(y_centered) * np.linalg.norm(X_centered, axis=0)
    )

    # argsort is ascending, so the last 100 indices are the features with the
    # largest (signed) correlation on this fold's training data.
    top_100 = np.argsort(X_feat_cor)[-100:]

    # Apply the columns chosen from the training data to both parts.
    new_feats_train = X_train[:, top_100]
    new_feats_test = X_test[:, top_100]

    clf.fit(new_feats_train, y_train)
    # score() is the accuracy on the held-out fold; store the error rate.
    err.append(1 - clf.score(new_feats_test, y_test))

print ("Error each K-Fold:", np.around(np.array(err),3))
# Average over the folds actually evaluated rather than a hard-coded count,
# so the figure stays correct if N_SPLITS or N_REPEATS is changed.
print ("Average Error:", round(float(np.mean(err)),3))

Error each K-Fold: [0.2 0.4 0.4 0.5 0.7 0.4 0.5 0.5 0.5 0.5 0.2 0.4 0.6 0.4 0.3 0.4 0.6 0.3
 0.5 0.6 0.6 0.3 0.9 0.5 0.4 0.6 0.3 0.5 0.5 0.2 0.6 0.4 0.6 0.5 0.5 0.5
 0.5 0.4 0.5 0.7 0.4 0.2 0.7 0.4 0.7 0.6 0.6 0.4 0.6 0. ]
Average Error: 0.47
