In [1]:
import pandas as pd
import numpy as np

### functions

In [2]:
def cv(clf,X,y,clf_string,cv_=5):
    '''5 fold cross validation'''
    scores = cross_val_score(clf, X, y, cv=cv_)
    print('Clf: {}\nAccuracy: {:0.2f} +/- {:0.2f}'.format(clf_string, scores.mean(), scores.std() * 2))

### import data

In [3]:
df = pd.read_csv('../csvs/features.csv',index_col=0)
df_regional = pd.read_csv('../csvs/regional_features.csv',index_col=0)
df.dropna(inplace=True)
df_regional.dropna(inplace=True)
df_regional.drop(['file','label'],axis=1,inplace=True)

In [4]:
df = pd.concat([df, df_regional], axis=1)

In [5]:
X = df.drop(['file','label'],axis=1).values
y = df.label.values
y_bin = (y!=0).astype(int)
print('X (data)',X.shape,type(X))
print('y (target)',y.shape,type(y),np.unique(y))
print('y_bin (binary target)',y_bin.shape,type(y_bin),np.unique(y_bin))

X (data) (1005, 119) <class 'numpy.ndarray'>
y (target) (1005,) <class 'numpy.ndarray'> [0. 1. 2. 3.]
y_bin (binary target) (1005,) <class 'numpy.ndarray'> [0 1]


### preprocess data to have mean=0 and variance=1

In [6]:
from sklearn import preprocessing
X_scaled = preprocessing.scale(X)

### preprocess data to be on range [0,1]

In [7]:
min_max_scaler = preprocessing.MinMaxScaler()
X_01 = min_max_scaler.fit_transform(X)

### testing models

In [8]:
import warnings
warnings.filterwarnings('ignore')
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [9]:
clfs = []
clfs.append([svm.SVC(),'default svm'])
#clfs.append([svm.SVC(kernel='linear'),'svc kernel linear'])
clfs.append([LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial'),'logistic reg'])
clfs.append([AdaBoostClassifier(n_estimators=100), 'adaboost'])
clfs.append([KNeighborsClassifier(n_neighbors=3), 'knn 3'])
clfs.append([MLPClassifier(solver='lbfgs', alpha=1e-5,
                           hidden_layer_sizes=(5, 2), random_state=1),'NN'])

In [10]:
for clf, clf_str in clfs:
    cv(clf,X,y,clf_str)

Clf: default svm
Accuracy: 0.70 +/- 0.00
Clf: logistic reg
Accuracy: 0.69 +/- 0.06
Clf: adaboost
Accuracy: 0.66 +/- 0.13
Clf: knn 3
Accuracy: 0.72 +/- 0.05
Clf: NN
Accuracy: 0.08 +/- 0.00


### ensemble

In [11]:
from sklearn.ensemble import VotingClassifier

In [12]:
estimators = []
for clf, clf_str in clfs:
    estimators.append((clf_str, clf))

In [13]:
eclf = VotingClassifier(estimators=estimators, voting='hard')
cv(eclf,X_scaled,y_bin,'ensemble')

Clf: ensemble
Accuracy: 0.87 +/- 0.05


In [14]:
ada_clf = AdaBoostClassifier(n_estimators=100)
cv(ada_clf,X,y==1,'1')
cv(ada_clf,X,y==2,'2')
cv(ada_clf,X,y==3,'3')

Clf: 1
Accuracy: 0.94 +/- 0.03
Clf: 2
Accuracy: 0.94 +/- 0.03
Clf: 3
Accuracy: 0.96 +/- 0.03


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [16]:
yhat1 = ada_clf.fit(X_train,y_train==1).predict(X_test)
yhat2 = ada_clf.fit(X_train,y_train==2).predict(X_test)
yhat3 = ada_clf.fit(X_train,y_train==3).predict(X_test)

In [17]:
print(sum(yhat1==(y_test==1))/len(yhat1))
print(sum(yhat2==(y_test==2))/len(yhat1))
print(sum(yhat3==(y_test==3))/len(yhat1))

0.9608433734939759
0.9397590361445783
0.963855421686747


In [18]:
sum(np.logical_or.reduce((yhat1, yhat2, yhat3)) == (y_test!=0))/len(y_test)

0.8704819277108434

In [19]:
np.any(np.stack([yhat1, yhat2, yhat3]),axis=0) == np.logical_or.reduce((yhat1, yhat2, yhat3));