In [1]:
import pandas as pd
import numpy as np

### functions

In [2]:
def cv(clf,X,y,clf_string,cv_=5):
    '''5 fold cross validation'''
    scores = cross_val_score(clf, X, y, cv=cv_)
    print('Clf: {}\nAccuracy: {:0.2f} +/- {:0.2f}'.format(clf_string, scores.mean(), scores.std() * 2))

### import data

In [20]:
#df = pd.read_csv('../new_output/features.csv',index_col=0)
df_regional = pd.read_csv('../csvs/regional_features.csv',index_col=0)
#df.dropna(inplace=True)
#df_regional.dropna(inplace=True)
#df_regional.drop(['file','label'],axis=1,inplace=True)

In [4]:
#df = pd.concat([df, df_regional], axis=1)

In [25]:
features_regional = df_regional.iloc[:,2:].values
features_shape = np.load('features_shape.npy')
features_stats = np.load('features_stats.npy')
features_histogram = np.load('features_histogram.npy')

In [28]:
features = np.concatenate((features_regional,features_shape,
                           features_stats,features_histogram),axis=1)
features.shape

(1472, 93)

In [30]:
#X = df.drop(['file','label'],axis=1).values
X = features[:1005]
y = df_regional.label.values[:1005]
y_bin = (y!=0).astype(int)
print('X (data)',X.shape,type(X))
print('y (target)',y.shape,type(y),np.unique(y))
print('y_bin (binary target)',y_bin.shape,type(y_bin),np.unique(y_bin))

X (data) (1005, 93) <class 'numpy.ndarray'>
y (target) (1005,) <class 'numpy.ndarray'> [0. 1. 2. 3.]
y_bin (binary target) (1005,) <class 'numpy.ndarray'> [0 1]


### preprocess data to have mean=0 and variance=1

In [31]:
from sklearn import preprocessing
X_scaled = preprocessing.scale(X)

### preprocess data to be on range [0,1]

In [32]:
min_max_scaler = preprocessing.MinMaxScaler()
X_01 = min_max_scaler.fit_transform(X)

### testing models

In [33]:
import warnings
warnings.filterwarnings('ignore')
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [34]:
clfs = []
clfs.append([svm.SVC(),'default svm'])
#clfs.append([svm.SVC(kernel='linear'),'svc kernel linear'])
clfs.append([LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial'),'logistic reg'])
clfs.append([AdaBoostClassifier(n_estimators=100), 'adaboost'])
clfs.append([KNeighborsClassifier(n_neighbors=3), 'knn 3'])
clfs.append([MLPClassifier(solver='lbfgs', alpha=1e-5,
                           hidden_layer_sizes=(5, 2), random_state=1),'NN'])

In [36]:
for clf, clf_str in clfs:
    cv(clf,X_scaled,y_bin,clf_str)

Clf: default svm
Accuracy: 0.79 +/- 0.07
Clf: logistic reg
Accuracy: 0.78 +/- 0.08
Clf: adaboost
Accuracy: 0.84 +/- 0.08
Clf: knn 3
Accuracy: 0.82 +/- 0.08
Clf: NN
Accuracy: 0.82 +/- 0.07


### ensemble

In [37]:
from sklearn.ensemble import VotingClassifier

In [38]:
estimators = []
for clf, clf_str in clfs:
    estimators.append((clf_str, clf))

In [39]:
eclf = VotingClassifier(estimators=estimators, voting='hard')
cv(eclf,X_scaled,y_bin,'ensemble')

Clf: ensemble
Accuracy: 0.84 +/- 0.09
