In [1]:
import pandas as pd
import numpy as np

### functions

In [2]:
def cv(clf,X,y,clf_string,cv_=5):
    '''Cross-validate ``clf`` on ``(X, y)`` and print a short report.

    The estimator, data and fold setting are handed straight to
    ``cross_val_score``, which must already be imported in the notebook
    by the time this function is called. The report shows the mean of the
    returned scores and twice their standard deviation, labelled as
    accuracy.

    Parameters
    ----------
    clf : estimator forwarded to ``cross_val_score``.
    X, y : data and target forwarded to ``cross_val_score``.
    clf_string : label printed after ``Clf:`` in the report.
    cv_ : forwarded as the ``cv`` argument of ``cross_val_score``
        (defaults to 5).

    Returns
    -------
    None; the report is written with ``print``.
    '''
    fold_scores = cross_val_score(clf, X, y, cv=cv_)
    centre = fold_scores.mean()
    spread = fold_scores.std() * 2  # reported as the "+/-" half-width
    template = 'Clf: {}\nAccuracy: {:0.2f} +/- {:0.2f}'
    print(template.format(clf_string, centre, spread))

### import data

In [3]:
# Read the feature table (the first CSV column becomes the index) and
# discard every row that contains a missing value.
df = pd.read_csv('features.csv', index_col=0).dropna()

# Feature matrix: every column except 'file' and 'label'.
X = df.drop(['file', 'label'], axis='columns').values

# Targets: the raw 'label' column, plus a binary version that is 0 where
# the label equals 0 and 1 everywhere else.
y = df['label'].values
y_bin = (y != 0).astype(int)

# Sanity check: shapes, container types and the distinct target values.
print('X (data)', X.shape, type(X))
print('y (target)', y.shape, type(y), np.unique(y))
print('y_bin (binary target)', y_bin.shape, type(y_bin), np.unique(y_bin))

X (data) (1005, 45) <class 'numpy.ndarray'>
y (target) (1005,) <class 'numpy.ndarray'> [0. 1. 2. 3.]
y_bin (binary target) (1005,) <class 'numpy.ndarray'> [0 1]


### preprocess data to have mean=0 and variance=1

In [4]:
from sklearn import preprocessing
# Standardise the whole feature matrix X in a single call.
# NOTE(review): the scaling is computed from all rows of X, and X_scaled is
# later passed to cross-validation, so held-out folds contribute to the
# scaling statistics — consider fitting the scaler inside each fold.
X_scaled = preprocessing.scale(X)

### preprocess data to be on range [0,1]

In [5]:
# Fit a MinMaxScaler on the full feature matrix X and keep the transformed
# copy in X_01; the fitted scaler stays available as min_max_scaler.
# NOTE(review): X_01 is not referenced by any later cell visible here.
min_max_scaler = preprocessing.MinMaxScaler()
X_01 = min_max_scaler.fit_transform(X)

### testing models

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any convergence or deprecation warnings the
# estimators below may raise — consider narrowing the filter.
warnings.filterwarnings('ignore')
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [8]:
# Baseline: support-vector classifier with library defaults, scored on the
# standardised features against the original target y.
clf = svm.SVC()
cv(clf, X_scaled, y, clf_string='default svm.SVC()')

Clf: default svm.SVC()
Accuracy: 0.81 +/- 0.05


In [3]:
# Same evaluation with a linear kernel instead of the default one.
# Requires the cell that defines cv() to have been run in the current kernel.
clf = svm.SVC(kernel='linear')
cv(clf, X_scaled, y, clf_string='svc with linear kernel')

NameError: name 'cv' is not defined

In [None]:
# Multinomial logistic regression (lbfgs solver, fixed seed) scored on the
# standardised features against the original target y.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version before upgrading.
clf = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)
cv(clf, X_scaled, y, clf_string='logistic regression')

In [None]:
# Candidate models, each stored as an [estimator, label] pair.
clfs = [
    [svm.SVC(), 'default svm'],
    [svm.SVC(kernel='linear'), 'svc kernel linear'],
    [
        LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial'),
        'logistic reg',
    ],
    [AdaBoostClassifier(n_estimators=100), 'adaboost'],
    [KNeighborsClassifier(n_neighbors=3), 'knn 3'],
    [
        MLPClassifier(solver='lbfgs', alpha=1e-5,
                      hidden_layer_sizes=(5, 2), random_state=1),
        'NN',
    ],
]

# Score every candidate on the binary target (y_bin), not the original y.
for clf, clf_str in clfs:
    cv(clf, X_scaled, y_bin, clf_string=clf_str)

### ensemble

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Re-pack the candidates as (label, estimator) tuples — the reverse of the
# [estimator, label] order used in clfs.
estimators = [(label, model) for model, label in clfs]

In [None]:
# Hard-voting ensemble over all candidates, scored on the binary target.
eclf = VotingClassifier(estimators=estimators, voting='hard')
cv(eclf, X_scaled, y_bin, clf_string='ensemble')