In [1]:
import numpy as np

from scipy.sparse import load_npz

In [2]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

### Reading data

In [3]:
# Load the feature matrix saved in SciPy's sparse .npz format. The bare name on
# the last line makes the notebook display the matrix repr (shape, dtype,
# number of stored elements, storage format).
Xtrain = load_npz('features_silico_duplicated.npz')
Xtrain

<151627x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 1901453 stored elements in Compressed Sparse Row format>

In [4]:
# Load the target array and display its shape as a sanity check against the
# feature matrix (presumably one row per sample and one column per class —
# TODO confirm against the code that produced the file).
Ytrain = np.load('classes_silico_duplicated.npy')
Ytrain.shape

(151627, 71)

In [7]:
def predict(clf, X, Y, n_splits=5, random_state=None):
    """Cross-validate ``clf`` with shuffled K-fold and print mean train/test scores.

    For every fold the estimator is re-fitted on the training part and scored
    on both the training part and the held-out part. Targets and predictions
    are reduced to 0/1 with ``== 1`` before scoring, so every value other than
    1 is treated as negative. Precision, recall and F1 are micro-averaged;
    the "Hamming" line reports ``1 - hamming_loss``.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, Y)`` and ``predict(X)``. The same object is
        fitted in every fold, so after the call it holds the last fold's fit.
    X : indexable
        Feature matrix; must support row selection with an integer index array.
    Y : indexable
        Targets aligned with the rows of ``X``; must support the same indexing
        and element-wise ``== 1``.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``KFold`` to make the shuffled fold assignment
        reproducible. ``None`` keeps the previous behaviour (a different
        split on every call).

    Returns
    -------
    None
        Results are printed, one line per metric: the mean over folds of the
        training score followed by the mean test score.
    """
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    # (report label, scoring callable) pairs, listed in the order they are printed.
    scorers = (
        ('Accuracy: \t \t', metrics.accuracy_score),
        ('Hamming: \t \t', lambda t, p: 1 - metrics.hamming_loss(t, p)),
        ('Precision: \t \t', lambda t, p: metrics.precision_score(t, p, average='micro')),
        ('Recall: \t \t', lambda t, p: metrics.recall_score(t, p, average='micro')),
        ('F1: \t \t \t', lambda t, p: metrics.f1_score(t, p, average='micro')),
    )
    train_scores = {label: [] for label, _ in scorers}
    test_scores = {label: [] for label, _ in scorers}

    def binarize(values):
        # Anything that is not exactly 1 is mapped to 0.
        return (values == 1).astype('int')

    for train_index, test_index in kf.split(X):
        Xtrain, Ytrain = X[train_index], Y[train_index]
        Xtest, Ytest = X[test_index], Y[test_index]

        # Fit on the raw targets; binarization is applied only for scoring.
        clf.fit(Xtrain, Ytrain)
        Ytrainpred = binarize(clf.predict(Xtrain))
        Ytestpred = binarize(clf.predict(Xtest))
        Ytrain = binarize(Ytrain)
        Ytest = binarize(Ytest)

        for label, score in scorers:
            train_scores[label].append(score(Ytrain, Ytrainpred))
            test_scores[label].append(score(Ytest, Ytestpred))

    for label, _ in scorers:
        print('{} {} \t {}'.format(label, np.mean(train_scores[label]), np.mean(test_scores[label])))

In [8]:
# Single decision tree with no depth limit (max_depth=None), evaluated with
# predict()'s default number of folds.
predict(DecisionTreeClassifier(max_depth=None), Xtrain, Ytrain)

Accuracy: 	 	 0.9985078519389537 	 0.8796256620894388
Hamming: 	 	 0.9999525104150541 	 0.9959242086218396
Precision: 	 	 0.99963409266191 	 0.9549896759867714
Recall: 	 	 0.9993121480165785 	 0.9545635384315521
F1: 	 	 	 0.9994730923292945 	 0.954776252280569


In [9]:
# Random forest of 100 trees; n_jobs=-1 asks scikit-learn to use all available
# cores. Evaluated with predict()'s default number of folds.
predict(RandomForestClassifier(n_estimators=100, n_jobs=-1), Xtrain, Ytrain)

Accuracy: 	 	 0.9986166711271031 	 0.8940294324074651
Hamming: 	 	 0.9999524639627703 	 0.9974571588638487
Precision: 	 	 0.9995733364876473 	 0.9883763601572465
Recall: 	 	 0.9993719763028395 	 0.9548136073568843
F1: 	 	 	 0.9994726445580246 	 0.9713050996383373


In [10]:
# Extra-trees ensemble of 100 trees; n_jobs=-1 asks scikit-learn to use all
# available cores. Evaluated with predict()'s default number of folds.
predict(ExtraTreesClassifier(n_estimators=100, n_jobs=-1), Xtrain, Ytrain)

Accuracy: 	 	 0.9985408270118488 	 0.9000837668427382
Hamming: 	 	 0.999952742629187 	 0.9976564063999884
Precision: 	 	 0.9996454165438597 	 0.9893474283510386
Recall: 	 	 0.9993059861373575 	 0.9583233364425123
F1: 	 	 	 0.9994756705711904 	 0.9735880314486873
