In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score

In [4]:
# Training split: space-separated feature rows plus one label per row.
# NOTE(review): the [:, :10000] slice presumably discards a trailing empty column
# produced by a trailing delimiter in the .data file -- confirm against the raw file.
train = pd.read_csv('arcene/arcene_train.data', sep=' ', header=None).values[:, :10000]
y = pd.read_csv('arcene/arcene_train.labels', header=None).values.ravel()

train.shape

(100, 10000)

In [5]:
# Validation split, read exactly like the training split.
# NOTE(review): the [:, :10000] slice presumably discards a trailing empty column
# produced by a trailing delimiter in the .data file -- confirm against the raw file.
valid = pd.read_csv('arcene/arcene_valid.data', sep=' ', header=None).values[:, :10000]
y_valid = pd.read_csv('arcene/arcene_valid.labels', header=None).values.ravel()

valid.shape

(100, 10000)

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost

def test(train, valid):
    """Fit four reference classifiers and print their validation accuracy.

    Every model is trained on ``train`` with the notebook-level labels ``y`` and
    scored on ``valid`` against the notebook-level labels ``y_valid``.

    Parameters
    ----------
    train : feature matrix passed to each estimator's ``fit``.
    valid : feature matrix passed to each estimator's ``predict``.

    Returns
    -------
    None. The four accuracies are printed (knn, svm, dt, xgb).
    """
    # The tuple order is the order in which the scores appear in the printout.
    classifiers = (
        KNeighborsClassifier(n_neighbors=1),
        SVC(random_state=42),
        DecisionTreeClassifier(random_state=42),
        xgboost.XGBClassifier(),
    )

    scores = []
    for clf in classifiers:
        clf.fit(train, y)
        scores.append(accuracy_score(y_valid, clf.predict(valid)))

    print("Accuracy on the knn: {}\non the svm: {}\non the dt: {}\nxgb: {}".format(*scores))

## Filters

In [7]:
from sklearn.ensemble import ExtraTreesClassifier

def etree(train, k = 10):
    """Return the column indices of the ``k`` most important features.

    Importance comes from an ExtraTreesClassifier fitted on ``train`` with the
    notebook-level labels ``y``. Features with zero importance are never
    returned, so the result may contain fewer than ``k`` indices.

    Parameters
    ----------
    train : feature matrix passed to the classifier's ``fit``.
    k : maximum number of indices to return.

    Returns
    -------
    list of int, ordered from most to least important.
    """
    forest = ExtraTreesClassifier(n_estimators = 10, max_depth=5, random_state=42)
    forest.fit(train, y)
    weights = forest.feature_importances_

    # Only features with a non-zero importance are eligible.
    used = [idx for idx in range(weights.size) if weights[idx] != 0]
    # sorted() is stable, so features with equal importance keep ascending index order.
    ranked = sorted(used, key=lambda idx: abs(weights[idx]), reverse=True)

    # max(k, 0) mirrors range(k): a non-positive k yields an empty list.
    return ranked[:max(k, 0)]

In [8]:
from sklearn.linear_model import Lasso

def lassofilter(train, k = 10):
    """Return the column indices of the ``k`` largest-magnitude Lasso coefficients.

    A Lasso model (alpha=0.001) is fitted on ``train`` with the notebook-level
    labels ``y``. Features whose coefficient is exactly zero are never returned,
    so the result may contain fewer than ``k`` indices.

    Parameters
    ----------
    train : feature matrix passed to the model's ``fit``.
    k : maximum number of indices to return.

    Returns
    -------
    list of int, ordered by decreasing absolute coefficient.
    """
    lasso = Lasso(alpha=0.001)
    lasso.fit(train, y)
    weights = lasso.coef_

    # Only features the model kept (non-zero coefficient) are eligible.
    kept = [idx for idx in range(weights.size) if weights[idx] != 0]
    # sorted() is stable, so equal magnitudes keep ascending index order.
    ranked = sorted(kept, key=lambda idx: abs(weights[idx]), reverse=True)

    # max(k, 0) mirrors range(k): a non-positive k yields an empty list.
    return ranked[:max(k, 0)]

In [9]:
def corrfilter(train, k = 10, threshold = 0.40, target = None):
    """Return the column indices of the ``k`` features most correlated with the target.

    For every column the absolute Pearson correlation with the target is computed
    (via ``np.corrcoef``); columns whose absolute correlation does not exceed
    ``threshold`` are discarded, the rest are ranked in decreasing order.

    Parameters
    ----------
    train : 2-D array indexed as ``train[:, column]``.
    k : maximum number of indices to return.
    threshold : minimum absolute correlation (exclusive) a feature must reach.
    target : 1-D label vector; when omitted the notebook-level ``y`` is used.

    Returns
    -------
    list of int, ordered by decreasing absolute correlation; may be shorter
    than ``k`` when few features pass the threshold.
    """
    if target is None:
        target = y  # fall back to the notebook's training labels

    strength = {}
    for col in range(train.shape[1]):
        # Computed once per column; a NaN result (e.g. constant column) fails the
        # comparison below and is therefore dropped.
        r = abs(np.corrcoef(train[:, col], target)[0][1])
        if r > threshold:
            strength[col] = r

    # sorted() is stable, so equally correlated features keep ascending index order.
    ranked = sorted(strength.items(), key=lambda item: item[1], reverse=True)

    # max(k, 0) mirrors range(k): a non-positive k yields an empty list.
    return [col for col, _ in ranked[:max(k, 0)]]

## Normalize

In [10]:
from sklearn import preprocessing

# Learn the per-feature scaling from the training split only, then apply that
# same fitted scaler to both splits.
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(train)

train_std, valid_std = (std_scaler.transform(split) for split in (train, valid))

### TESTS

In [11]:
# Baseline: score every classifier on the full standardised feature set.
tr, te = train_std, valid_std
test(tr, te)

Accuracy on the knn: 0.8
on the svm: 0.72
on the dt: 0.57
xgb: 0.72


In [12]:
K = 12  # maximum number of features each filter may return

# 1) ExtraTrees-importance filter
sortlist = etree(tr, k=K)
et_train, et_test = tr[:, sortlist], te[:, sortlist]
test(et_train, et_test)

# 2) Lasso-coefficient filter
sortlist = lassofilter(tr, k=K)
lasso_train, lasso_test = tr[:, sortlist], te[:, sortlist]
test(lasso_train, lasso_test)

# 3) Correlation filter
corr_lst = corrfilter(tr, k=K)
corr_train, corr_test = tr[:, corr_lst], te[:, corr_lst]
test(corr_train, corr_test)

Accuracy on the knn: 0.83
on the svm: 0.82
on the dt: 0.7
xgb: 0.71
Accuracy on the knn: 0.63
on the svm: 0.68
on the dt: 0.59
xgb: 0.64
Accuracy on the knn: 0.59
on the svm: 0.71
on the dt: 0.62
xgb: 0.66


## Wrapper

In [13]:
from sklearn.ensemble import RandomForestClassifier
import time
import math

def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    Parameters
    ----------
    since : timestamp in seconds, as returned by ``time.time()``.

    Returns
    -------
    str, e.g. ``'2m 5s'``; both parts are truncated to whole numbers by ``%d``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [14]:
# Greedy forward selection (wrapper): each round adds the single feature that,
# together with the features already chosen, gives the highest RandomForest
# accuracy on the validation split.
# NOTE(review): candidates are ranked with (valid_std, y_valid), so any accuracy
# later reported on that same validation split is optimistically biased.
features = list(range(train_std.shape[1]))  # candidate pool, shrinks by one per round
good_features = []                          # selected features, in selection order

start = time.time()
topacc = 0

for i in range(20):
    topacc = 0
    topi = -1  # sentinel: no candidate scored above zero this round

    for feat in features:
        locfeatures = good_features + [feat]

        # Indexing columns with a list keeps the result 2-D even for a single
        # feature, so the first round needs no special reshape.
        model = RandomForestClassifier(random_state=42)
        model.fit(train_std[:, locfeatures], y)
        pred = model.predict(valid_std[:, locfeatures])

        accu = accuracy_score(y_valid, pred)
        if accu > topacc:  # strict '>' keeps the earliest candidate on ties
            topacc = accu
            topi = feat

    if topi == -1:
        # No candidate beat zero accuracy (or the pool is empty). Without this
        # guard -1 would be appended to good_features and features.index(-1)
        # would raise ValueError.
        print('THE END. toplist: {}'.format(good_features))
        break

    good_features.append(topi)
    features.remove(topi)
    print('iter: {}, time: {}, topaccuracy: {}, features: {}'\
          .format(i, timeSince(start), topacc, good_features))

iter: 0, time: 2m 0s, topaccuracy: 0.77, features: [3340]
iter: 1, time: 4m 1s, topaccuracy: 0.82, features: [3340, 4519]
iter: 2, time: 6m 2s, topaccuracy: 0.85, features: [3340, 4519, 4287]
iter: 3, time: 8m 5s, topaccuracy: 0.88, features: [3340, 4519, 4287, 7579]
iter: 4, time: 10m 7s, topaccuracy: 0.86, features: [3340, 4519, 4287, 7579, 546]
iter: 5, time: 12m 13s, topaccuracy: 0.86, features: [3340, 4519, 4287, 7579, 546, 519]
iter: 6, time: 14m 17s, topaccuracy: 0.87, features: [3340, 4519, 4287, 7579, 546, 519, 1484]
iter: 7, time: 16m 20s, topaccuracy: 0.89, features: [3340, 4519, 4287, 7579, 546, 519, 1484, 4748]
iter: 8, time: 18m 23s, topaccuracy: 0.87, features: [3340, 4519, 4287, 7579, 546, 519, 1484, 4748, 521]
iter: 9, time: 20m 26s, topaccuracy: 0.87, features: [3340, 4519, 4287, 7579, 546, 519, 1484, 4748, 521, 212]
iter: 10, time: 22m 29s, topaccuracy: 0.89, features: [3340, 4519, 4287, 7579, 546, 519, 1484, 4748, 521, 212, 1070]
iter: 11, time: 24m 32s, topaccuracy

### Test: all features vs. wrapper-selected features

In [98]:
# Reference score: RandomForest trained on every standardised feature.
model = RandomForestClassifier(random_state=42).fit(train_std, y)
pred = model.predict(valid_std)
print(accuracy_score(y_valid, pred))

0.76


In [99]:
# Same model restricted to the columns chosen by the wrapper (good_features).
model = RandomForestClassifier(random_state=42).fit(train_std[:, good_features], y)
pred = model.predict(valid_std[:, good_features])
print(accuracy_score(y_valid, pred))

0.88
