In [1]:
import numpy as np
from scipy.sparse import load_npz

np.set_printoptions(threshold=np.nan)

In [2]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [3]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

### Reading data

In [4]:
Xtrain = load_npz('features_silico_duplicated.npz')
Xtrain

<151627x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 1901453 stored elements in Compressed Sparse Row format>

In [5]:
Ytrain = np.load('classes_silico_duplicated.npy')
Ytrain.shape

(151627, 71)

In [6]:
Xtest = load_npz('features_inga_dropped.npz')
Xtest

<842x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 14100 stored elements in Compressed Sparse Row format>

In [7]:
Ytest = np.load('classes_inga.npy')
Ytest.shape

(842, 71)

### Training Model

In [8]:
rfc = RandomForestClassifier(n_estimators=10)

In [9]:
rfc1 = RandomForestClassifier(n_estimators=10, class_weight={0:10, 1:1})

In [10]:
rfc2 = RandomForestClassifier(n_estimators=10, class_weight={0:1, 1:10})

In [11]:
def predicti(clf, Xtrain, Xtest, ytrain, ytest, threshold):
    
    if np.unique(ytrain).size == 1:
        ytrainpred = np.full(ytrain.shape, ytrain[0])
        ytestpred = np.full(ytest.shape, ytrain[0])
    else:
        ros = RandomOverSampler()
        Xtrain_resampled, ytrain_resampled = ros.fit_sample(Xtrain, ytrain)
        clf.fit(Xtrain_resampled, ytrain_resampled)
#         ytrainpred = clf.predict(Xtrain)
#         ytestpred = clf.predict(Xtest)
        ytrainprob = clf.predict_proba(Xtrain)
        ytestprob = clf.predict_proba(Xtest)
        ytrainpred = (ytrainprob[:,1]>=threshold).astype('int')
        ytestpred = (ytestprob[:,1]>=threshold).astype('int')
    
    return ytrainpred, ytestpred

In [12]:
def predict(clf, Xtrain, Xtest, Ytrain, Ytest, threshold=0.5):
    accuracytrain = []
    accuracytest = []
    hammingtrain = []
    hammingtest = []
    f1train = []
    f1test = []
    precisiontrain = []
    precisiontest = []
    recalltrain = []
    recalltest = []

    Ytrainpred_arr = []
    Ytestpred_arr = []

    for i in range(71):

        ytrain = Ytrain[:,i]
        ytest = Ytest[:,i]

        ytrainpred, ytestpred = predicti(clf, Xtrain, Xtest, ytrain, ytest, threshold)

        print(metrics.confusion_matrix(ytrain, ytrainpred))
        print(metrics.confusion_matrix(ytest, ytestpred))
        
        Ytrainpred_arr.append(ytrainpred)
        Ytestpred_arr.append(ytestpred)

    Ytrainpred = np.array(Ytrainpred_arr).T
    Ytestpred = np.array(Ytestpred_arr).T

    print('Accuracy: \t \t {} \t {}'.format(metrics.accuracy_score(Ytrain, Ytrainpred), 
                                            metrics.accuracy_score(Ytest, Ytestpred)))
    print('Hamming: \t \t {} \t {}'.format(1 - metrics.hamming_loss(Ytrain, Ytrainpred), 
                                           1 - metrics.hamming_loss(Ytest, Ytestpred)))
    print('Precision: \t \t {} \t {}'.format(metrics.precision_score(Ytrain, Ytrainpred, average='micro'), 
                                             metrics.precision_score(Ytest, Ytestpred, average='micro')))
    print('Recall: \t \t {} \t {}'.format(metrics.recall_score(Ytrain, Ytrainpred, average='micro'), 
                                          metrics.recall_score(Ytest, Ytestpred, average='micro')))
    print('F1: \t \t \t {} \t {}'.format(metrics.f1_score(Ytrain, Ytrainpred, average='micro'), 
                                         metrics.f1_score(Ytest, Ytestpred, average='micro')))

In [13]:
for t in [0.5, 0.4, 0.3, 0.2, 0.1]:
    print('threshold = {}'.format(t))
    predict(rfc, Xtrain, Xtest, Ytrain, Ytest, threshold=t)

threshold = 0.5
[[73239   100]
 [   66 78222]]
[[ 52  37]
 [ 17 736]]
[[123943     24]
 [     2  27658]]
[[242  12]
 [363 225]]
[[145201     11]
 [     0   6415]]
[[789  14]
 [ 33   6]]
[[137949      5]
 [     0  13673]]
[[634   6]
 [200   2]]
[[148634      0]
 [     0   2993]]
[[821   6]
 [ 15   0]]
[[143778      0]
 [     0   7849]]
[[491   1]
 [323  27]]
[[148077      0]
 [     0   3550]]
[[773   0]
 [ 69   0]]
[[147531      0]
 [     0   4096]]
[[839   0]
 [  3   0]]
[[147723      0]
 [     0   3904]]
[[842]]
[[147459      0]
 [     0   4168]]
[[842]]
[[148956      7]
 [     0   2664]]
[[833   0]
 [  9   0]]
[[147772      1]
 [     0   3854]]
[[774   0]
 [ 68   0]]
[[148609      1]
 [     0   3017]]
[[729   6]
 [103   4]]
[[145052      4]
 [     0   6571]]
[[824   3]
 [ 14   1]]
[[147280      3]
 [     0   4344]]
[[498  17]
 [211 116]]
[[116145     75]
 [     0  35407]]
[[798  29]
 [ 10   5]]
[[148246     48]
 [     0   3333]]
[[832   1]
 [  9   0]]
[[147531      0]
 [     0   4096

In [14]:
for t in [0.5, 0.4, 0.3, 0.2, 0.1]:
    print('threshold = {}'.format(t))
    predict(rfc1, Xtrain, Xtest, Ytrain, Ytest, threshold=t)

threshold = 0.5
[[73238   101]
 [  101 78187]]
[[  6  83]
 [  6 747]]
[[123926     41]
 [     0  27660]]
[[158  96]
 [148 440]]
[[145191     21]
 [     0   6415]]
[[712  91]
 [ 22  17]]
[[137947      7]
 [     0  13673]]
[[613  27]
 [183  19]]
[[148631      3]
 [     0   2993]]
[[822   5]
 [ 15   0]]
[[143778      0]
 [     0   7849]]
[[464  28]
 [214 136]]
[[148077      0]
 [     0   3550]]
[[771   2]
 [ 69   0]]
[[147531      0]
 [     0   4096]]
[[839   0]
 [  3   0]]
[[147723      0]
 [     0   3904]]
[[842]]
[[147458      1]
 [     0   4168]]
[[842]]
[[148957      6]
 [     0   2664]]
[[824   9]
 [  9   0]]
[[147771      2]
 [     0   3854]]
[[768   6]
 [ 68   0]]
[[148606      4]
 [     0   3017]]
[[729   6]
 [ 96  11]]
[[145054      2]
 [     0   6571]]
[[814  13]
 [  7   8]]
[[147282      1]
 [     0   4344]]
[[484  31]
 [193 134]]
[[116195     25]
 [    31  35376]]
[[750  77]
 [ 11   4]]
[[148265     29]
 [     4   3329]]
[[828   5]
 [  9   0]]
[[147531      0]
 [     0   4096

In [15]:
for t in [0.5, 0.4, 0.3, 0.2, 0.1]:
    print('threshold = {}'.format(t))
    predict(rfc2, Xtrain, Xtest, Ytrain, Ytest, threshold=t)

threshold = 0.5
[[73211   128]
 [   46 78242]]
[[ 81   8]
 [ 61 692]]
[[123941     26]
 [     0  27660]]
[[251   3]
 [555  33]]
[[145194     18]
 [     0   6415]]
[[801   2]
 [ 38   1]]
[[137939     15]
 [     0  13673]]
[[639   1]
 [201   1]]
[[148632      2]
 [     0   2993]]
[[824   3]
 [ 15   0]]
[[143778      0]
 [     0   7849]]
[[492   0]
 [348   2]]
[[148075      2]
 [     0   3550]]
[[773   0]
 [ 69   0]]
[[147531      0]
 [     0   4096]]
[[839   0]
 [  3   0]]
[[147723      0]
 [     0   3904]]
[[842]]
[[147459      0]
 [     0   4168]]
[[842]]
[[148956      7]
 [     0   2664]]
[[833   0]
 [  9   0]]
[[147772      1]
 [     0   3854]]
[[774   0]
 [ 68   0]]
[[148606      4]
 [     0   3017]]
[[731   4]
 [103   4]]
[[145055      1]
 [     0   6571]]
[[826   1]
 [ 14   1]]
[[147278      5]
 [     0   4344]]
[[507   8]
 [240  87]]
[[116141     79]
 [     2  35405]]
[[824   3]
 [ 15   0]]
[[148240     54]
 [     0   3333]]
[[833   0]
 [  9   0]]
[[147531      0]
 [     0   4096