In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import accuracy_score

In [2]:
# Training matrix: space-separated file, no header row; only the first 10000 columns are kept.
# NOTE(review): the slice presumably drops a spurious trailing column produced by a
# trailing delimiter on each line — confirm against the raw file.
train = (
    pd.read_csv('arcene/arcene_train.data', sep=' ', header=None)
    .values[:, :10000]
)
# Training labels flattened to a 1-D vector.
y = pd.read_csv('arcene/arcene_train.labels', header=None).values.ravel()

train.shape

(100, 10000)

In [3]:
# Validation matrix, read the same way as the training matrix (first 10000 columns only).
valid = (
    pd.read_csv('arcene/arcene_valid.data', sep=' ', header=None)
    .values[:, :10000]
)
# Validation labels flattened to a 1-D vector.
y_valid = pd.read_csv('arcene/arcene_valid.labels', header=None).values.ravel()

valid.shape

(100, 10000)

In [4]:
def selekt(inp, lst):
    """Return the columns of a 2-D array picked by index.

    Parameters
    ----------
    inp : 2-D array supporting ``inp[:, lst]`` indexing.
    lst : sequence of column indices; the output keeps this order.

    Returns
    -------
    Array with every row of ``inp`` and only the requested columns.
    """
    picked_columns = inp[:, lst]
    return picked_columns

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost

def test(train, valid):
    """Fit four classifiers on ``train`` and print their accuracy on ``valid``.

    The models are 1-nearest-neighbour, SVC, a decision tree and XGBoost.
    Targets are not passed in: fitting uses the module-level ``y`` and scoring
    uses the module-level ``y_valid``.

    Parameters
    ----------
    train : feature matrix used for fitting (rows aligned with ``y``).
    valid : feature matrix used for prediction (rows aligned with ``y_valid``).

    Returns
    -------
    None. The four accuracies are printed in the order knn, svm, dt, xgb.
    """
    # Order matters: it must match the placeholders in the report string below.
    classifiers = (
        KNeighborsClassifier(n_neighbors=1),
        SVC(random_state=42),
        DecisionTreeClassifier(random_state=42),
        xgboost.XGBClassifier(),
    )

    for clf in classifiers:
        clf.fit(train, y)

    predictions = [clf.predict(valid) for clf in classifiers]
    scores = [accuracy_score(y_valid, labels) for labels in predictions]

    report = "Accuracy on the knn: {}\non the svm: {}\non the dt: {}\nxgb: {}"
    print(report.format(*scores))

## Filters

In [22]:
from sklearn.ensemble import ExtraTreesClassifier

def etree(train, k = 10):
    """Pick up to ``k`` column indices ranked by extra-trees feature importance.

    An ``ExtraTreesClassifier`` (10 trees, depth 5, seed 42) is fitted on
    ``train`` against the module-level ``y``. Features with zero importance are
    discarded, so fewer than ``k`` indices may be returned.

    Parameters
    ----------
    train : feature matrix, rows aligned with ``y``.
    k : maximum number of indices to return.

    Returns
    -------
    list of int column indices, most important first.
    """
    forest = ExtraTreesClassifier(n_estimators=10, max_depth=5, random_state=42)
    forest.fit(train, y)
    importances = forest.feature_importances_

    # (column, importance) pairs in column order; the stable sort keeps that order on ties.
    scored = [(col, weight) for col, weight in enumerate(importances) if weight != 0]
    scored.sort(key=lambda pair: abs(pair[1]), reverse=True)

    n_keep = min(k, len(scored))
    return [scored[pos][0] for pos in range(n_keep)]

In [23]:
from sklearn.linear_model import Lasso

def lassofilter(train, k = 10):
    """Pick up to ``k`` column indices ranked by absolute Lasso coefficient.

    A ``Lasso(alpha=0.001)`` model is fitted on ``train`` against the
    module-level ``y``. Columns whose coefficient is exactly zero are dropped,
    so fewer than ``k`` indices may be returned.

    Parameters
    ----------
    train : feature matrix, rows aligned with ``y``.
    k : maximum number of indices to return.

    Returns
    -------
    list of int column indices, largest ``|coefficient|`` first.
    """
    model = Lasso(alpha=0.001)
    model.fit(train, y)
    weights = model.coef_

    surviving = (
        (col, weights[col]) for col in range(weights.size) if weights[col] != 0
    )
    # sorted() is stable, so equal magnitudes stay in ascending column order.
    ranked = sorted(surviving, key=lambda pair: abs(pair[1]), reverse=True)

    n_keep = len(ranked) if len(ranked) < k else k
    return [ranked[pos][0] for pos in range(n_keep)]

In [18]:
def corrfilter(train, k = 10, labels = None, threshold = 0.40):
    """Pick up to ``k`` column indices ranked by absolute Pearson correlation with the target.

    Every column of ``train`` is correlated with the target. Columns whose
    absolute correlation exceeds ``threshold`` are kept and ranked, strongest
    first. Columns with an undefined correlation (NaN, e.g. a constant column)
    never pass the threshold and are skipped.

    Parameters
    ----------
    train : 2-D feature matrix; all ``train.shape[1]`` columns are scanned
        (the column count is no longer hard-coded to 10000).
    k : maximum number of indices to return.
    labels : optional 1-D target aligned with the rows of ``train``. When
        omitted, the module-level ``y`` is used, as before.
    threshold : minimum absolute correlation (exclusive) for a column to qualify.

    Returns
    -------
    list of int column indices, strongest correlation first; may be shorter
    than ``k`` when few columns pass the threshold.
    """
    target = y if labels is None else labels

    strong = {}
    for col in range(train.shape[1]):
        r = np.corrcoef(train[:, col], target)[0][1]
        if abs(r) > threshold:
            strong[col] = abs(r)

    # Stable sort: columns with equal correlation stay in ascending index order.
    ranked = sorted(strong.items(), key=lambda item: item[1], reverse=True)
    return [col for col, _ in ranked[:max(k, 0)]]

In [24]:
# Evaluate the four classifiers on the standardized data with all features.
# NOTE(review): train_std / valid_std are assigned further down, in the cell that fits
# the scalers; on a fresh top-to-bottom run this cell fails with NameError until that
# cell has been executed.
tr = train_std
te = valid_std
test(tr, te)

Accuracy on the knn: 0.8
on the svm: 0.72
on the dt: 0.57
xgb: 0.72


In [25]:
# Compare the three feature filters on the current tr / te pair.
# K is the maximum number of columns requested from each filter.
K = 12

# Extra-trees importance: rank on tr, keep the same columns in te, evaluate.
sortlist = etree(tr, k = K)
et_train = selekt(tr, sortlist)
et_test = selekt(te, sortlist)
test(et_train, et_test)

# Lasso coefficients: `sortlist` is rebound here to the Lasso ranking.
sortlist = lassofilter(tr, k = K)
lasso_train = selekt(tr, sortlist)
lasso_test = selekt(te, sortlist)
test(lasso_train, lasso_test)

# Correlation with the target.
corr_lst = corrfilter(tr, k = K)
corr_train = selekt(tr, corr_lst)
corr_test = selekt(te, corr_lst)
test(corr_train, corr_test)

Accuracy on the knn: 0.83
on the svm: 0.82
on the dt: 0.7
xgb: 0.71
Accuracy on the knn: 0.63
on the svm: 0.68
on the dt: 0.59
xgb: 0.64
Accuracy on the knn: 0.59
on the svm: 0.71
on the dt: 0.62
xgb: 0.66


## Normalize

In [11]:
from sklearn import preprocessing


# Fit each preprocessing object on the training matrix only; the validation matrix is
# transformed below with the already-fitted objects and is never passed to fit().
min_max_scaler = preprocessing.MinMaxScaler().fit(train)
std_scaler = preprocessing.StandardScaler().fit(train)
normalizer = preprocessing.Normalizer().fit(train)



# Min-max scaled copies.
train_minmax = min_max_scaler.transform(train)
valid_minmax = min_max_scaler.transform(valid)

# Standardized copies (used by other cells as train_std / valid_std).
train_std = std_scaler.transform(train)
valid_std = std_scaler.transform(valid)

# Normalizer copies.
train_norm = normalizer.transform(train)
valid_norm = normalizer.transform(valid)

In [127]:
# All-feature accuracy for each preprocessing variant; the printed blocks appear in
# this order: min-max, standardized, normalized.
test(train_minmax, valid_minmax)
test(train_std, valid_std)
test(train_norm, valid_norm)

Accuracy on the knn: 0.84
on the svm: 0.84
on the dt: 0.57
xgb: 0.71
Accuracy on the knn: 0.8
on the svm: 0.83
on the dt: 0.57
xgb: 0.72
Accuracy on the knn: 0.88
on the svm: 0.72
on the dt: 0.73
xgb: 0.7


## Baseline

In [26]:
# Baseline: unscaled data, all columns.
test(train, valid)

Accuracy on the knn: 0.88
on the svm: 0.56
on the dt: 0.57
xgb: 0.72


In [30]:
# Compare the three feature filters on the unscaled data.
# K is the maximum number of columns requested from each filter.
K = 14

# Extra-trees importance: rank on train, keep the same columns in valid, evaluate.
sortlist = etree(train, k = K)
et_train = selekt(train, sortlist)
et_test = selekt(valid, sortlist)
test(et_train, et_test)

# Lasso coefficients: `sortlist` is rebound here to the Lasso ranking.
sortlist = lassofilter(train, k = K)
lasso_train = selekt(train, sortlist)
lasso_test = selekt(valid, sortlist)
test(lasso_train, lasso_test)

# Correlation with the target.
corr_lst = corrfilter(train, k = K)
corr_train = selekt(train, corr_lst)
corr_test = selekt(valid, corr_lst)
test(corr_train, corr_test)

Accuracy on the knn: 0.78
on the svm: 0.56
on the dt: 0.69
xgb: 0.72
Accuracy on the knn: 0.65
on the svm: 0.56
on the dt: 0.62
xgb: 0.76
Accuracy on the knn: 0.55
on the svm: 0.6
on the dt: 0.64
xgb: 0.64


### TESTS

In [31]:
# Point tr / te at the min-max scaled data; the filter-comparison cell that follows
# reads these two names.
tr = train_minmax
te = valid_minmax
test(tr, te)

Accuracy on the knn: 0.84
on the svm: 0.56
on the dt: 0.57
xgb: 0.71


In [34]:
# Compare the three feature filters on whatever tr / te currently point to.
# K is the maximum number of columns requested from each filter.
K = 12

# Extra-trees importance: rank on tr, keep the same columns in te, evaluate.
sortlist = etree(tr, k = K)
et_train = selekt(tr, sortlist)
et_test = selekt(te, sortlist)
test(et_train, et_test)

# Lasso coefficients: `sortlist` is rebound here to the Lasso ranking.
sortlist = lassofilter(tr, k = K)
lasso_train = selekt(tr, sortlist)
lasso_test = selekt(te, sortlist)
test(lasso_train, lasso_test)

# Correlation with the target.
corr_lst = corrfilter(tr, k = K)
corr_train = selekt(tr, corr_lst)
corr_test = selekt(te, corr_lst)
test(corr_train, corr_test)

Accuracy on the knn: 0.83
on the svm: 0.74
on the dt: 0.7
xgb: 0.71
Accuracy on the knn: 0.59
on the svm: 0.63
on the dt: 0.59
xgb: 0.54
Accuracy on the knn: 0.6
on the svm: 0.67
on the dt: 0.62
xgb: 0.66


In [35]:
# Point tr / te at the standardized data; the filter-comparison cell that follows
# reads these two names.
# NOTE(review): the same three statements also appear in an earlier cell of this notebook.
tr = train_std
te = valid_std
test(tr, te)

Accuracy on the knn: 0.8
on the svm: 0.72
on the dt: 0.57
xgb: 0.72


In [36]:
# Compare the three feature filters on whatever tr / te currently point to.
# K is the maximum number of columns requested from each filter.
K = 12

# Extra-trees importance: rank on tr, keep the same columns in te, evaluate.
sortlist = etree(tr, k = K)
et_train = selekt(tr, sortlist)
et_test = selekt(te, sortlist)
test(et_train, et_test)

# Lasso coefficients: `sortlist` is rebound here to the Lasso ranking.
sortlist = lassofilter(tr, k = K)
lasso_train = selekt(tr, sortlist)
lasso_test = selekt(te, sortlist)
test(lasso_train, lasso_test)

# Correlation with the target.
corr_lst = corrfilter(tr, k = K)
corr_train = selekt(tr, corr_lst)
corr_test = selekt(te, corr_lst)
test(corr_train, corr_test)

Accuracy on the knn: 0.83
on the svm: 0.82
on the dt: 0.7
xgb: 0.71
Accuracy on the knn: 0.63
on the svm: 0.68
on the dt: 0.59
xgb: 0.64
Accuracy on the knn: 0.59
on the svm: 0.71
on the dt: 0.62
xgb: 0.66


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest (3 trees, depth 5, seed 42) trained on the unscaled data.
# fit() returns the estimator itself, so the fitted model is bound directly.
rf_model = RandomForestClassifier(
    n_estimators=3,
    max_depth=5,
    random_state=42,
).fit(train, y)

predicted_labels = rf_model.predict(valid)

# Accuracy on the validation split, rounded to five decimals for display.
print(
    "Accuracy on the test set is: {}".format(
        round(accuracy_score(y_valid, predicted_labels), 5)
    )
)

Accuracy on the test set is: 0.69


In [25]:
# Rank the features the random forest actually used (non-zero importance).
imp = rf_model.feature_importances_

# column index -> importance, zero-importance columns left out
ar = {col: weight for col, weight in enumerate(imp) if weight != 0}

# (index, importance) pairs, most important first
sortdict = list(ar.items())
sortdict.sort(key=lambda pair: abs(pair[1]), reverse=True)

print(len(ar))

sortdict

22


[(456, 0.1183067296957202),
 (4112, 0.11256248061568201),
 (1660, 0.06455915699613182),
 (6500, 0.06384320109810306),
 (2355, 0.06300138840429964),
 (5480, 0.060541878049175046),
 (2743, 0.05928297244823855),
 (6057, 0.056755302853742196),
 (8000, 0.04620620346261021),
 (7113, 0.04446222933617891),
 (28, 0.039298158472094565),
 (2100, 0.03873933958649659),
 (4413, 0.036291376059507635),
 (8122, 0.03366653534669369),
 (2439, 0.03175873524012779),
 (6264, 0.029441898085737397),
 (9999, 0.025321838551359378),
 (7308, 0.02242515604171079),
 (3926, 0.0171760084052807),
 (9537, 0.012671735360810987),
 (4933, 0.012267457535723917),
 (2637, 0.01142021835457494)]

In [36]:
# Hand-entered column indices to evaluate as a feature subset.
# NOTE(review): the origin of these indices is not shown in the notebook — confirm.
lst = [
    8852, 7896, 361, 792,
    2378, 174, 2182, 4296,
]

# Same indices shifted by one; not referenced by the cells that follow.
lst1 = [column + 1 for column in lst]

In [37]:
# Restrict the unscaled train / valid matrices to the hand-entered columns in `lst`.
# This rebinds lasso_train / lasso_test, which earlier cells also assign.
lasso_train = selekt(train, lst)
lasso_test = selekt(valid, lst)

In [38]:
# Accuracy of the four classifiers on the current lasso_train / lasso_test subset.
test(lasso_train, lasso_test)

Accuracy on the knn: 0.47
on the svm: 0.52
on the dt: 0.43
xgb: 0.49
