In [1]:
import sys

import numpy as np
from scipy.sparse import load_npz

# Always print arrays in full. NaN is not a valid threshold (current NumPy raises
# ValueError for it); sys.maxsize is the documented value for "never summarise".
np.set_printoptions(threshold=sys.maxsize)

In [2]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [3]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

### Reading data

In [4]:
# Feature matrix used for fitting, loaded from a SciPy .npz file.
# The recorded output below shows a 151627 x 1676 float64 CSR matrix.
Xtrain = load_npz('features_silico_duplicated.npz')
Xtrain

<151627x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 1901453 stored elements in Compressed Sparse Row format>

In [5]:
# Label matrix paired with Xtrain; each column is later treated as a separate
# target by predict(). The recorded output below shows shape (151627, 71).
Ytrain = np.load('classes_silico_duplicated.npy')
Ytrain.shape

(151627, 71)

In [6]:
# Feature matrix used for evaluation, loaded from a SciPy .npz file.
# The recorded output below shows an 842 x 1676 float64 CSR matrix
# (same number of columns as Xtrain).
Xtest = load_npz('features_inga_dropped.npz')
Xtest

<842x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 14100 stored elements in Compressed Sparse Row format>

In [7]:
# Label matrix paired with Xtest. The recorded output below shows shape (842, 71),
# i.e. the same number of label columns as Ytrain.
Ytest = np.load('classes_inga.npy')
Ytest.shape

(842, 71)

### Training Model

In [8]:
# Baseline forest with default (uniform) class weights.
# random_state is fixed so that repeated runs build the same trees.
rfc = RandomForestClassifier(n_estimators=10, random_state=0)

In [9]:
# Forest that weights class 0 ten times more heavily than class 1.
# random_state is fixed so that repeated runs build the same trees.
rfc1 = RandomForestClassifier(n_estimators=10, class_weight={0:10, 1:1}, random_state=0)

In [10]:
# Forest that weights class 1 ten times more heavily than class 0.
# random_state is fixed so that repeated runs build the same trees.
rfc2 = RandomForestClassifier(n_estimators=10, class_weight={0:1, 1:10}, random_state=0)

In [11]:
def predicti(clf, Xtrain, Xtest, ytrain, ytest, threshold, random_state=None):
    """Fit ``clf`` on one label column and return thresholded predictions.

    If ``ytrain`` contains a single distinct value, no model is fitted and that
    value is predicted for every train and test sample. Otherwise the training
    data is randomly over-sampled, ``clf`` is fitted on the resampled data, and
    a sample is predicted as 1 when column 1 of ``predict_proba`` is at least
    ``threshold`` (0 otherwise).

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``.
    Xtrain, Xtest : feature matrices for fitting and evaluation.
    ytrain, ytest : 1-D label arrays; ``ytest`` is only used for its shape.
        Assumes binary 0/1 labels so that ``predict_proba`` column 1 is the
        positive class -- TODO confirm against the data.
    threshold : probability cut-off applied to ``predict_proba`` column 1.
    random_state : optional seed forwarded to ``RandomOverSampler``; the
        default ``None`` keeps the sampler unseeded.

    Returns
    -------
    (ytrainpred, ytestpred) : predicted labels for ``Xtrain`` and ``Xtest``.
    """
    if np.unique(ytrain).size == 1:
        # Only one class is present, so a classifier cannot be trained on it;
        # predict that single value everywhere.
        constant = ytrain[0]
        return np.full(ytrain.shape, constant), np.full(ytest.shape, constant)

    ros = RandomOverSampler(random_state=random_state)
    # imbalanced-learn renamed fit_sample to fit_resample and later removed the
    # old name; prefer the new method and fall back for old installations.
    resample = getattr(ros, 'fit_resample', None)
    if resample is None:
        resample = ros.fit_sample
    Xtrain_resampled, ytrain_resampled = resample(Xtrain, ytrain)
    clf.fit(Xtrain_resampled, ytrain_resampled)

    # Predictions are made on the original (not resampled) training data.
    ytrainpred = (clf.predict_proba(Xtrain)[:, 1] >= threshold).astype('int')
    ytestpred = (clf.predict_proba(Xtest)[:, 1] >= threshold).astype('int')
    return ytrainpred, ytestpred

In [12]:
def predict(clf, Xtrain, Xtest, Ytrain, Ytest, threshold=0.5):
    accuracytrain = []
    accuracytest = []
    hammingtrain = []
    hammingtest = []
    f1train = []
    f1test = []
    precisiontrain = []
    precisiontest = []
    recalltrain = []
    recalltest = []

    Ytrainpred_arr = []
    Ytestpred_arr = []

    for i in range(71):

        ytrain = Ytrain[:,i]
        ytest = Ytest[:,i]

        ytrainpred, ytestpred = predicti(clf, Xtrain, Xtest, ytrain, ytest, threshold=threshold)

        Ytrainpred_arr.append(ytrainpred)
        Ytestpred_arr.append(ytestpred)

    Ytrainpred = np.array(Ytrainpred_arr).T
    Ytestpred = np.array(Ytestpred_arr).T

    print('Accuracy: \t \t {} \t {}'.format(metrics.accuracy_score(Ytrain, Ytrainpred), 
                                            metrics.accuracy_score(Ytest, Ytestpred)))
    print('Hamming: \t \t {} \t {}'.format(1 - metrics.hamming_loss(Ytrain, Ytrainpred), 
                                           1 - metrics.hamming_loss(Ytest, Ytestpred)))
    print('Precision: \t \t {} \t {}'.format(metrics.precision_score(Ytrain, Ytrainpred, average='micro'), 
                                             metrics.precision_score(Ytest, Ytestpred, average='micro')))
    print('Recall: \t \t {} \t {}'.format(metrics.recall_score(Ytrain, Ytrainpred, average='micro'), 
                                          metrics.recall_score(Ytest, Ytestpred, average='micro')))
    print('F1: \t \t \t {} \t {}'.format(metrics.f1_score(Ytrain, Ytrainpred, average='micro'), 
                                         metrics.f1_score(Ytest, Ytestpred, average='micro')))

In [13]:
# Sweep the decision threshold from 0.5 down to 0.1 for the baseline forest and
# print the train/test scores at each setting.
for t in (0.5, 0.4, 0.3, 0.2, 0.1):
    print('threshold = {}'.format(t))
    predict(clf=rfc, Xtrain=Xtrain, Xtest=Xtest, Ytrain=Ytrain, Ytest=Ytest, threshold=t)

threshold = 0.5
Accuracy: 	 	 0.993299346422471 	 0.0332541567695962
Hamming: 	 	 0.9998548142183975 	 0.9466060018065638
Precision: 	 	 0.996968899885332 	 0.7363420427553444
Recall: 	 	 0.9998186445742785 	 0.31093279839518556
F1: 	 	 	 0.9983917387022988 	 0.4372355430183357
threshold = 0.4
Accuracy: 	 	 0.9857874916736465 	 0.04275534441805225
Hamming: 	 	 0.9997384240812587 	 0.9484627479843432
Precision: 	 	 0.9943192414178944 	 0.671585319712448
Recall: 	 	 0.9999093222871392 	 0.4450852557673019
F1: 	 	 	 0.9971064469922873 	 0.5353641984617704
threshold = 0.3
Accuracy: 	 	 0.9649666616103992 	 0.020190023752969122
Hamming: 	 	 0.9993958488013163 	 0.9375062727911412
Precision: 	 	 0.9868071302915365 	 0.5307017543859649
Recall: 	 	 0.9999649654291219 	 0.5461384152457373
F1: 	 	 	 0.9933424775318853 	 0.5383094414236282
threshold = 0.2
Accuracy: 	 	 0.9066591042492432 	 0.0
Hamming: 	 	 0.9982482959248496 	 0.8973771369308488
Precision: 	 	 0.9625994131918751 	 0.3586942214031

In [14]:
# Sweep the decision threshold from 0.5 down to 0.1 for the forest that
# up-weights class 0 (rfc1) and print the train/test scores at each setting.
for t in (0.5, 0.4, 0.3, 0.2, 0.1):
    print('threshold = {}'.format(t))
    predict(clf=rfc1, Xtrain=Xtrain, Xtest=Xtest, Ytrain=Ytrain, Ytest=Ytest, threshold=t)

threshold = 0.5
Accuracy: 	 	 0.9936027224702725 	 0.014251781472684086
Hamming: 	 	 0.9998689333731022 	 0.9413870395771302
Precision: 	 	 0.9975913576844123 	 0.5676355505869201
Recall: 	 	 0.9995053942934866 	 0.5092778335005015
F1: 	 	 	 0.9985474587734622 	 0.5368754956383821
threshold = 0.4
Accuracy: 	 	 0.9848180073469764 	 0.0035629453681710215
Hamming: 	 	 0.9997308071688522 	 0.9267505269144558
Precision: 	 	 0.9943101812092504 	 0.4611872146118721
Recall: 	 	 0.9997485754325224 	 0.5824974924774323
F1: 	 	 	 0.9970219622576363 	 0.5147922437673129
threshold = 0.3
Accuracy: 	 	 0.9622626577060813 	 0.0011876484560570072
Hamming: 	 	 0.9993526553346207 	 0.8963233080191362
Precision: 	 	 0.9859972441600988 	 0.35789609053497945
Recall: 	 	 0.9998371922882727 	 0.6978435305917753
F1: 	 	 	 0.9928689905973819 	 0.47313838830329824
threshold = 0.2
Accuracy: 	 	 0.8995693379147514 	 0.0
Hamming: 	 	 0.998083138970474 	 0.8410725636479208
Precision: 	 	 0.9593123152514558 	 0.26610

In [15]:
# Sweep the decision threshold from 0.5 down to 0.1 for the forest that
# up-weights class 1 (rfc2) and print the train/test scores at each setting.
for t in (0.5, 0.4, 0.3, 0.2, 0.1):
    print('threshold = {}'.format(t))
    predict(clf=rfc2, Xtrain=Xtrain, Xtest=Xtest, Ytrain=Ytrain, Ytest=Ytest, threshold=t)

threshold = 0.5
Accuracy: 	 	 0.99170991973725 	 0.0332541567695962
Hamming: 	 	 0.9997996380480381 	 0.9452510789200763
Precision: 	 	 0.9956720350669819 	 0.8183437221727515
Recall: 	 	 0.9999010788586973 	 0.23044132397191575
F1: 	 	 	 0.9977820758412842 	 0.3596165134024653
threshold = 0.4
Accuracy: 	 	 0.9845212264306489 	 0.0332541567695962
Hamming: 	 	 0.9996872421454539 	 0.9480612893513097
Precision: 	 	 0.9931311146588071 	 0.8430458430458431
Recall: 	 	 0.9999773305717848 	 0.27206619859578735
F1: 	 	 	 0.9965424644311291 	 0.41137440758293836
threshold = 0.3
Accuracy: 	 	 0.9651315398972479 	 0.057007125890736345
Hamming: 	 	 0.9993680749377851 	 0.9482620186678264
Precision: 	 	 0.9861817368859622 	 0.7040583675330597
Recall: 	 	 0.9999917565715581 	 0.38716148445336007
F1: 	 	 	 0.9930387356551191 	 0.4995955347031225
threshold = 0.2
Accuracy: 	 	 0.9115131210140675 	 0.015439429928741092
Hamming: 	 	 0.998326229943253 	 0.9377237295507008
Precision: 	 	 0.964195018817535