In [1]:
import numpy as np

from scipy.sparse import load_npz

In [2]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

In [3]:
from imblearn.over_sampling import RandomOverSampler

### Reading data

In [4]:
# Load the feature matrix stored in SciPy's sparse .npz format.
Xtrain = load_npz('features_silico_duplicated.npz')
# Bare expression: the notebook displays the matrix summary (shape, dtype, stored elements).
Xtrain

<151627x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 1901453 stored elements in Compressed Sparse Row format>

In [5]:
# Load the label array from a NumPy .npy file; predict() below treats each column as one label.
Ytrain = np.load('classes_silico_duplicated.npy')
# Display the shape so the row count can be checked against the feature matrix.
Ytrain.shape

(151627, 71)

In [8]:
def _fit_predict_one_label(clf, X_fit, y_fit, X_eval, random_state=None):
    """Fit ``clf`` on one label column and predict the training and evaluation rows.

    Returns a ``(train_predictions, eval_predictions)`` pair of 1-D arrays.
    """
    if np.unique(y_fit).size == 1:
        # Only one class is present in this fold, so there is nothing to
        # learn (and nothing to oversample): predict that class everywhere.
        constant = y_fit[0]
        return np.full(y_fit.shape, constant), np.full(X_eval.shape[0], constant)

    sampler = RandomOverSampler(random_state=random_state)
    # ``fit_sample`` was removed from newer imbalanced-learn releases in
    # favour of ``fit_resample``; fall back to the old name for old installs.
    resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
    X_balanced, y_balanced = resample(X_fit, y_fit)
    clf.fit(X_balanced, y_balanced)
    # Training predictions are made on the original, un-resampled rows so
    # they line up with the true labels of the fold.
    return clf.predict(X_fit), clf.predict(X_eval)


def _multilabel_scores(Y_true, Y_pred):
    """Score a predicted label matrix against the true one; returns a dict of floats."""
    # Everything that is not exactly 1 is treated as the negative class.
    true_bin = (Y_true == 1).astype('int')
    pred_bin = (Y_pred == 1).astype('int')
    return {
        'accuracy': metrics.accuracy_score(true_bin, pred_bin),
        'hamming': 1 - metrics.hamming_loss(true_bin, pred_bin),
        'precision': metrics.precision_score(true_bin, pred_bin, average='micro'),
        'recall': metrics.recall_score(true_bin, pred_bin, average='micro'),
        'f1': metrics.f1_score(true_bin, pred_bin, average='micro'),
    }


def predict(clf, X, Y, n_splits=5, random_state=None):
    """Cross-validate a one-classifier-per-label model and print mean scores.

    For every K-fold split, each column of ``Y`` is handled as its own
    classification problem: the training fold is balanced with
    ``RandomOverSampler``, ``clf`` is refit on the balanced data, and
    predictions are made for the training fold and for the test fold.  The
    per-label predictions are stacked back into a label matrix and scored
    with ``accuracy_score``, ``1 - hamming_loss`` and micro-averaged
    precision, recall and F1.  The mean of every score over the folds is
    printed as a ``train`` / ``test`` pair, one metric per line.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.  The same object is
        refit for every label of every fold, so on return it holds the state
        of the last fit that was performed.
    X : matrix
        Feature matrix supporting row selection with an integer index array
        (``X[idx]``) and exposing ``shape``.
    Y : numpy.ndarray
        2-D label matrix with one column per label.  Entries equal to 1 count
        as positive when scoring; every other value counts as negative.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``RandomOverSampler``.  ``None`` keeps the
        unseeded behaviour; pass an int for reproducible oversampling.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Take the number of labels from the data instead of assuming a fixed width.
    n_labels = Y.shape[1]

    train_scores = []
    test_scores = []

    for fit_idx, eval_idx in KFold(n_splits=n_splits).split(X):
        X_fit, X_eval = X[fit_idx], X[eval_idx]
        Y_fit, Y_eval = Y[fit_idx], Y[eval_idx]

        fit_columns = []
        eval_columns = []
        for label in range(n_labels):
            fit_pred, eval_pred = _fit_predict_one_label(
                clf, X_fit, Y_fit[:, label], X_eval, random_state
            )
            fit_columns.append(fit_pred)
            eval_columns.append(eval_pred)

        # Each list holds one row per label; transpose to (samples, labels).
        train_scores.append(_multilabel_scores(Y_fit, np.array(fit_columns).T))
        test_scores.append(_multilabel_scores(Y_eval, np.array(eval_columns).T))

    report = (
        ('Accuracy: \t \t {} \t {}', 'accuracy'),
        ('Hamming: \t \t {} \t {}', 'hamming'),
        ('Precision: \t \t {} \t {}', 'precision'),
        ('Recall: \t \t {} \t {}', 'recall'),
        ('F1: \t \t \t {} \t {}', 'f1'),
    )
    for template, key in report:
        train_mean = np.array([fold[key] for fold in train_scores]).mean()
        test_mean = np.array([fold[key] for fold in test_scores]).mean()
        print(template.format(train_mean, test_mean))

In [9]:
# Cross-validate a single decision tree with no depth limit, using predict's default
# n_splits=5. Despite their names, Xtrain/Ytrain are the full data set; predict()
# does the train/test splitting and prints each metric as a (train, test) pair.
predict(DecisionTreeClassifier(max_depth=None), Xtrain, Ytrain)

Accuracy: 	 	 0.9977081937428938 	 0.8373464397515125
Hamming: 	 	 0.9999190471395547 	 0.9955078457044892
Precision: 	 	 0.9982527385626531 	 0.9296732689634304
Recall: 	 	 0.999930741145163 	 0.9487659144858931
F1: 	 	 	 0.9990909917311595 	 0.9390310200742491


In [10]:
# Same evaluation with a 100-tree random forest; n_jobs=-1 uses all available cores.
# Each printed line shows the mean train score followed by the mean test score.
predict(RandomForestClassifier(n_estimators=100, n_jobs=-1), Xtrain, Ytrain)

Accuracy: 	 	 0.9976933547324176 	 0.8984890762480523
Hamming: 	 	 0.9999180717993428 	 0.9976257839579323
Precision: 	 	 0.9982304706348435 	 0.9779178680897733
Recall: 	 	 0.9999330139821151 	 0.9553299358882452
F1: 	 	 	 0.9990809816531556 	 0.9663599559615929


In [11]:
# Same evaluation with a 100-tree extra-trees ensemble; n_jobs=-1 uses all available cores.
# Each printed line shows the mean train score followed by the mean test score.
predict(ExtraTreesClassifier(n_estimators=100, n_jobs=-1), Xtrain, Ytrain)

Accuracy: 	 	 0.9977098425339179 	 0.8984956725539057
Hamming: 	 	 0.9999191400286156 	 0.9976239262540577
Precision: 	 	 0.9982545019270297 	 0.9813139089960365
Recall: 	 	 0.9999316777841836 	 0.951774805109953
F1: 	 	 	 0.9990923459378864 	 0.9660919207501053
