In [1]:
import numpy as np

from scipy.sparse import load_npz

In [2]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

In [3]:
from imblearn.over_sampling import RandomOverSampler

### Reading data

In [4]:
# Load the sparse feature matrix from its .npz file.
# NOTE(review): despite the name, this matrix is later passed to predict() as the
# full data set that gets split into folds, not as a held-out test set.
Xtest = load_npz('features_inga_duplicated.npz')
# Bare expression so the notebook displays the matrix summary.
Xtest

<1719x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 27033 stored elements in Compressed Sparse Row format>

In [5]:
# Load the label array that accompanies the feature matrix.
# NOTE(review): presumably one row per sample and one column per label - confirm
# against the file that produced it.
Ytest = np.load('classes_inga_duplicated.npy')
# Display (rows, columns) as the cell output.
Ytest.shape

(1719, 71)

In [6]:
def _multilabel_scores(Ytrue, Ypred):
    """Score one fold of 0/1 label matrices.

    Returns a dict holding ``accuracy_score``, the Hamming score
    (``1 - hamming_loss``) and micro-averaged precision, recall and F1.
    """
    return {
        'accuracy': metrics.accuracy_score(Ytrue, Ypred),
        'hamming': 1 - metrics.hamming_loss(Ytrue, Ypred),
        'precision': metrics.precision_score(Ytrue, Ypred, average='micro'),
        'recall': metrics.recall_score(Ytrue, Ypred, average='micro'),
        'f1': metrics.f1_score(Ytrue, Ypred, average='micro'),
    }


def predict(clf, X, Y, n_splits=5, random_state=None):
    """Cross-validate ``clf`` on a multi-label problem, one label column at a time.

    For every fold produced by ``KFold(n_splits)`` (no shuffling, so folds follow
    the row order of ``X``) and for every column of ``Y``:

    * if the training part of the column holds a single distinct value, that
      value is used as the prediction for both the training and held-out rows;
    * otherwise the training rows are oversampled with ``RandomOverSampler``,
      ``clf`` is refit on the resampled data, and it predicts the (original,
      not resampled) training rows and the held-out rows.

    Before scoring, true labels and predictions are mapped to 1 where the value
    equals 1 and to 0 otherwise. The mean of each score over the folds is
    printed as two columns: training value first, held-out value second.

    Parameters
    ----------
    clf : estimator with ``fit(X, y)`` and ``predict(X)``
        Refit in place many times; on return it holds whatever fit happened last.
    X : matrix whose rows can be selected with an integer index array
    Y : 2-D array, indexed as ``Y[rows]`` and ``Y[:, column]``
        The number of label columns is taken from ``Y.shape[1]``.
    n_splits : int, default 5
        Number of folds.
    random_state : optional
        Passed to every ``RandomOverSampler`` so the oversampling can be made
        repeatable. The default ``None`` leaves the sampler unseeded. It does
        not seed ``clf``; configure that on the estimator itself.

    Returns
    -------
    None
        The results are only printed.
    """
    # Derive the label count from the data instead of assuming a fixed width.
    n_labels = Y.shape[1]

    train_scores = []
    heldout_scores = []

    kf = KFold(n_splits=n_splits)
    for train_idx, test_idx in kf.split(X):
        # Local names deliberately differ from the notebook-level Xtest/Ytest.
        X_fit = X[train_idx]
        X_held = X[test_idx]
        Y_fit = Y[train_idx]
        Y_held = Y[test_idx]

        fit_pred_cols = []
        held_pred_cols = []

        for i in range(n_labels):
            y_fit = Y_fit[:, i]
            y_held = Y_held[:, i]

            if np.unique(y_fit).size == 1:
                # Only one class in this fold's training rows: nothing to learn
                # (and nothing to oversample), so predict that constant.
                y_fit_pred = np.full(y_fit.shape, y_fit[0])
                y_held_pred = np.full(y_held.shape, y_fit[0])
            else:
                sampler = RandomOverSampler(random_state=random_state)
                # Prefer the current imbalanced-learn method name and fall back
                # to the legacy alias on old releases.
                resample = getattr(sampler, 'fit_resample', None)
                if resample is None:
                    resample = sampler.fit_sample
                X_resampled, y_resampled = resample(X_fit, y_fit)
                clf.fit(X_resampled, y_resampled)
                y_fit_pred = clf.predict(X_fit)
                y_held_pred = clf.predict(X_held)

            fit_pred_cols.append(y_fit_pred)
            held_pred_cols.append(y_held_pred)

        # Each list entry is one label column; transpose to (rows, labels).
        Y_fit_pred = np.array(fit_pred_cols).T
        Y_held_pred = np.array(held_pred_cols).T

        # Collapse every value to 1 (equals 1) or 0 (anything else) for scoring.
        Y_fit_bin = (Y_fit == 1).astype('int')
        Y_held_bin = (Y_held == 1).astype('int')
        Y_fit_pred = (Y_fit_pred == 1).astype('int')
        Y_held_pred = (Y_held_pred == 1).astype('int')

        train_scores.append(_multilabel_scores(Y_fit_bin, Y_fit_pred))
        heldout_scores.append(_multilabel_scores(Y_held_bin, Y_held_pred))

    # Templates are kept verbatim so the printed table is unchanged.
    report = [
        ('Accuracy: \t \t {} \t {}', 'accuracy'),
        ('Hamming: \t \t {} \t {}', 'hamming'),
        ('Precision: \t \t {} \t {}', 'precision'),
        ('Recall: \t \t {} \t {}', 'recall'),
        ('F1: \t \t \t {} \t {}', 'f1'),
    ]
    for template, key in report:
        train_mean = np.array([fold[key] for fold in train_scores]).mean()
        heldout_mean = np.array([fold[key] for fold in heldout_scores]).mean()
        print(template.format(train_mean, heldout_mean))

In [7]:
# Single decision tree with no depth limit (max_depth=None), default 5 folds.
# Printed columns: mean score on the training folds, then on the held-out folds.
predict(DecisionTreeClassifier(max_depth=None), Xtest, Ytest)

Accuracy: 	 	 1.0 	 0.7162790697674419
Hamming: 	 	 1.0 	 0.9875532263347526
Precision: 	 	 1.0 	 0.8955109105469514
Recall: 	 	 1.0 	 0.9242205064285904
F1: 	 	 	 1.0 	 0.9094103352823113


In [8]:
# Random forest of 100 trees; n_jobs=-1 asks scikit-learn to use all available cores.
# Same evaluation as the decision-tree cell, default 5 folds.
predict(RandomForestClassifier(n_estimators=100, n_jobs=-1), Xtest, Ytest)

Accuracy: 	 	 1.0 	 0.7802325581395348
Hamming: 	 	 1.0 	 0.9919259744513592
Precision: 	 	 1.0 	 0.9566481536292155
Recall: 	 	 1.0 	 0.9189151551121558
F1: 	 	 	 1.0 	 0.9369462725652877


In [9]:
# Extra-trees ensemble of 100 trees; n_jobs=-1 asks scikit-learn to use all available cores.
# Same evaluation as the cells above, default 5 folds.
predict(ExtraTreesClassifier(n_estimators=100, n_jobs=-1), Xtest, Ytest)

Accuracy: 	 	 1.0 	 0.7593023255813953
Hamming: 	 	 1.0 	 0.9911644284310513
Precision: 	 	 1.0 	 0.952932212355764
Recall: 	 	 1.0 	 0.9104105497633931
F1: 	 	 	 1.0 	 0.9307031714644479
