In [1]:
import numpy as np

from scipy.sparse import load_npz

In [2]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

### Reading data

In [3]:
# Training features: a SciPy sparse matrix read from an .npz archive.
# The bare name on the last line displays the matrix summary (shape, dtype, nnz).
Xtrain = load_npz('features_silico_duplicated.npz')
Xtrain

<151627x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 1901453 stored elements in Compressed Sparse Row format>

In [4]:
# Training targets read from a NumPy .npy file; the shape is displayed so the
# row count can be compared against Xtrain.
Ytrain = np.load('classes_silico_duplicated.npy')
Ytrain.shape

(151627, 71)

In [5]:
# Test features: a SciPy sparse matrix read from an .npz archive.
# It must have the same number of columns as Xtrain for the fitted model to accept it.
Xtest = load_npz('features_inga_dropped.npz')
Xtest

<842x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 14100 stored elements in Compressed Sparse Row format>

In [6]:
# Test targets read from a NumPy .npy file; the shape is displayed so the
# row count can be compared against Xtest and the label count against Ytrain.
Ytest = np.load('classes_inga.npy')
Ytest.shape

(842, 71)

### Training Model

In [7]:
def _binarize_probabilities(probabilities, threshold):
    """Convert per-label class probabilities into a 0/1 indicator matrix.

    Parameters
    ----------
    probabilities : iterable of 2-D arrays
        One array per label, as produced by ``predict_proba`` of a classifier
        fitted on a 2-D target: each array has one row per sample and one
        column per class of that label.
    threshold : float
        A label is predicted as 1 when the probability of *not* being in the
        label's first class exceeds ``threshold``.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per sample and one column per label.
    """
    # Column 0 is the probability of the label's first class.
    # NOTE(review): this assumes the first class is the negative one (0) for
    # every label — confirm against clf.classes_.
    # ``p[:, 0] < 1 - threshold`` is the same test as ``1 - p[:, 0] > threshold``.
    per_label = [(p[:, 0] < (1 - threshold)).astype('int') for p in probabilities]
    # per_label is (labels, samples); transpose to the (samples, labels)
    # layout that the metric functions receive alongside Ytrain / Ytest.
    return np.array(per_label).T


def predict(clf, Xtrain, Xtest, Ytrain, Ytest, threshold=0.5):
    """Fit ``clf`` and print train/test metrics at a given decision threshold.

    The classifier is fitted on the training data, class probabilities are
    thresholded into 0/1 predictions for both the training and the test set,
    and a per-label classification report plus a summary table (accuracy,
    1 - Hamming loss, micro-averaged precision, recall and F1) is printed.
    In every summary row the first number is the training score and the
    second is the test score.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, Y)`` and ``predict_proba(X)``;
        ``predict_proba`` must return one probability array per label.
    Xtrain, Xtest : feature matrices
        Passed unchanged to ``clf.fit`` / ``clf.predict_proba``.
    Ytrain, Ytest : label indicator arrays
        Ground truth the thresholded predictions are compared against.
    threshold : float, default 0.5
        Probability above which a label is predicted as present.

    Returns
    -------
    None
        Results are only printed.
    """
    clf.fit(Xtrain, Ytrain)
    Ytrainpred = _binarize_probabilities(clf.predict_proba(Xtrain), threshold)
    Ytestpred = _binarize_probabilities(clf.predict_proba(Xtest), threshold)

    print(metrics.classification_report(Ytrain, Ytrainpred))
    print(metrics.classification_report(Ytest, Ytestpred))

    print('Accuracy: \t \t {} \t {}'.format(metrics.accuracy_score(Ytrain, Ytrainpred),
                                        metrics.accuracy_score(Ytest, Ytestpred)))
    # Reported as 1 - loss so that, like the other rows, higher is better.
    print('Hamming: \t \t {} \t {}'.format(1 - metrics.hamming_loss(Ytrain, Ytrainpred),
                                           1 - metrics.hamming_loss(Ytest, Ytestpred)))

    # Each label is paired with the metric it names. Previously the three
    # rows were rotated: "Precision" showed F1, "Recall" showed precision
    # and "F1" showed recall.
    micro_scores = (
        ('Precision: \t \t', metrics.precision_score),
        ('Recall: \t \t', metrics.recall_score),
        ('F1: \t \t \t', metrics.f1_score),
    )
    for label, score in micro_scores:
        print('{} {} \t {}'.format(label,
                                   score(Ytrain, Ytrainpred, average='micro'),
                                   score(Ytest, Ytestpred, average='micro')))

In [8]:
# Baseline run: fit a 100-tree random forest and report train/test metrics at
# the default decision threshold (0.5).
# NOTE(review): no random_state is passed, so scores may differ between runs.
predict(RandomForestClassifier(n_estimators=100), Xtrain, Xtest, Ytrain, Ytest)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00     78288
          1       1.00      1.00      1.00     27660
          2       1.00      1.00      1.00      6415
          3       1.00      1.00      1.00     13673
          4       1.00      1.00      1.00      2993
          5       1.00      1.00      1.00      7849
          6       1.00      1.00      1.00      3550
          7       1.00      1.00      1.00      4096
          8       1.00      1.00      1.00      3904
          9       1.00      1.00      1.00      4168
         10       1.00      1.00      1.00      2664
         11       1.00      1.00      1.00      3854
         12       1.00      1.00      1.00      3017
         13       1.00      1.00      1.00      6571
         14       1.00      1.00      1.00      4344
         15       1.00      1.00      1.00     35407
         16       1.00      1.00      1.00      3333
         17       1.00      1.00      1.00   

In [9]:
# Same experiment with the decision threshold lowered to 0.4.
# A new forest is constructed and fitted for this run (predict() calls clf.fit).
predict(RandomForestClassifier(n_estimators=100), Xtrain, Xtest, Ytrain, Ytest, threshold=0.4)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00     78288
          1       1.00      1.00      1.00     27660
          2       1.00      1.00      1.00      6415
          3       1.00      1.00      1.00     13673
          4       1.00      1.00      1.00      2993
          5       1.00      1.00      1.00      7849
          6       1.00      1.00      1.00      3550
          7       1.00      1.00      1.00      4096
          8       1.00      1.00      1.00      3904
          9       1.00      1.00      1.00      4168
         10       1.00      1.00      1.00      2664
         11       1.00      1.00      1.00      3854
         12       1.00      1.00      1.00      3017
         13       1.00      1.00      1.00      6571
         14       1.00      1.00      1.00      4344
         15       1.00      1.00      1.00     35407
         16       1.00      1.00      1.00      3333
         17       1.00      1.00      1.00   

In [10]:
# Same experiment with the decision threshold lowered to 0.3.
# A new forest is constructed and fitted for this run (predict() calls clf.fit).
predict(RandomForestClassifier(n_estimators=100), Xtrain, Xtest, Ytrain, Ytest, threshold=0.3)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

          0       1.00      1.00      1.00     78288
          1       1.00      1.00      1.00     27660
          2       1.00      1.00      1.00      6415
          3       1.00      1.00      1.00     13673
          4       1.00      1.00      1.00      2993
          5       1.00      1.00      1.00      7849
          6       1.00      1.00      1.00      3550
          7       1.00      1.00      1.00      4096
          8       1.00      1.00      1.00      3904
          9       1.00      1.00      1.00      4168
         10       1.00      1.00      1.00      2664
         11       1.00      1.00      1.00      3854
         12       1.00      1.00      1.00      3017
         13       1.00      1.00      1.00      6571
         14       1.00      1.00      1.00      4344
         15       1.00      1.00      1.00     35407
         16       0.99      1.00      1.00      3333
         17       1.00      1.00      1.00   

In [11]:
# Same experiment with the decision threshold lowered to 0.2.
# A new forest is constructed and fitted for this run (predict() calls clf.fit).
predict(RandomForestClassifier(n_estimators=100), Xtrain, Xtest, Ytrain, Ytest, threshold=0.2)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

          0       0.99      1.00      1.00     78288
          1       0.99      1.00      1.00     27660
          2       0.99      1.00      1.00      6415
          3       1.00      1.00      1.00     13673
          4       1.00      1.00      1.00      2993
          5       1.00      1.00      1.00      7849
          6       1.00      1.00      1.00      3550
          7       1.00      1.00      1.00      4096
          8       1.00      1.00      1.00      3904
          9       1.00      1.00      1.00      4168
         10       0.99      1.00      0.99      2664
         11       1.00      1.00      1.00      3854
         12       1.00      1.00      1.00      3017
         13       1.00      1.00      1.00      6571
         14       1.00      1.00      1.00      4344
         15       0.99      1.00      1.00     35407
         16       0.99      1.00      0.99      3333
         17       1.00      1.00      1.00   

In [12]:
# Same experiment with the decision threshold lowered to 0.1.
# A new forest is constructed and fitted for this run (predict() calls clf.fit).
predict(RandomForestClassifier(n_estimators=100), Xtrain, Xtest, Ytrain, Ytest, threshold=0.1)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

          0       0.97      1.00      0.98     78288
          1       0.97      1.00      0.99     27660
          2       0.95      1.00      0.97      6415
          3       0.97      1.00      0.98     13673
          4       0.97      1.00      0.99      2993
          5       1.00      1.00      1.00      7849
          6       1.00      1.00      1.00      3550
          7       1.00      1.00      1.00      4096
          8       1.00      1.00      1.00      3904
          9       1.00      1.00      1.00      4168
         10       0.97      1.00      0.98      2664
         11       0.99      1.00      0.99      3854
         12       0.98      1.00      0.99      3017
         13       0.99      1.00      1.00      6571
         14       0.99      1.00      1.00      4344
         15       0.98      1.00      0.99     35407
         16       0.97      1.00      0.98      3333
         17       1.00      1.00      1.00   