In [1]:
import numpy as np

from scipy.sparse import load_npz

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

### Reading data

In [16]:
Xtrain = load_npz('features_silico_duplicated2.npz')
Xtrain

<380324x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 5001622 stored elements in Compressed Sparse Row format>

In [17]:
Ytrain = np.load('classes_silico_duplicated2.npy')
Ytrain.shape

(380324, 71)

In [18]:
Xtest = load_npz('features_inga_dropped.npz')
Xtest

<842x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 14100 stored elements in Compressed Sparse Row format>

In [19]:
Ytest = np.load('classes_inga.npy')
Ytest.shape

(842, 71)

### Training Model

In [7]:
clf = RandomForestClassifier(n_estimators=100)

In [8]:
clf.fit(Xtrain, Ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
Ytrainpred = clf.predict(Xtrain)
Ytestpred = clf.predict(Xtest)

In [10]:
Ytrain = (Ytrain==1).astype('int')
Ytest = (Ytest==1).astype('int')
Ytrainpred = (Ytrainpred==1).astype('int')
Ytestpred = (Ytestpred==1).astype('int')

In [11]:
print(metrics.classification_report(Ytest, Ytestpred))

             precision    recall  f1-score   support

          0       0.98      0.97      0.97       753
          1       0.97      0.48      0.64       588
          2       0.00      0.00      0.00        39
          3       0.00      0.00      0.00       202
          4       0.00      0.00      0.00        15
          5       1.00      0.02      0.04       350
          6       0.00      0.00      0.00        69
          7       0.00      0.00      0.00         3
          8       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         0
         10       0.00      0.00      0.00         9
         11       0.00      0.00      0.00        68
         12       1.00      0.03      0.05       107
         13       0.67      0.13      0.22        15
         14       0.91      0.32      0.47       327
         15       0.14      0.20      0.17        15
         16       0.00      0.00      0.00         9
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [12]:
print('Accuracy: \t \t {} \t {}'.format(metrics.accuracy_score(Ytrain, Ytrainpred), 
                                        metrics.accuracy_score(Ytest, Ytestpred)))
print('Hamming: \t \t {} \t {}'.format(1 - metrics.hamming_loss(Ytrain, Ytrainpred), 
                                       1 - metrics.hamming_loss(Ytest, Ytestpred)))
print('Precision: \t \t {} \t {}'.format(metrics.f1_score(Ytrain, Ytrainpred, average='micro'), 
                                         metrics.f1_score(Ytest, Ytestpred, average='micro')))
print('Recall: \t \t {} \t {}'.format(metrics.precision_score(Ytrain, Ytrainpred, average='micro'), 
                                      metrics.precision_score(Ytest, Ytestpred, average='micro')))
print('F1: \t \t \t {} \t {}'.format(metrics.recall_score(Ytrain, Ytrainpred, average='micro'), 
                                     metrics.recall_score(Ytest, Ytestpred, average='micro')))

Accuracy: 	 	 0.9985095002868882 	 0.017814726840855107
Hamming: 	 	 0.9999507687368846 	 0.9514904151751363
Precision: 	 	 0.999453826716736 	 0.4476190476190476
Recall: 	 	 0.9995382899827477 	 0.9310618066561014
F1: 	 	 	 0.9993693777241955 	 0.29463390170511533


In [20]:
def predict(clf, X, Y, n_splits=5):
    kf = KFold(n_splits=n_splits, random_state=None, shuffle=True)
    
    accuracytrain = []
    accuracytest = []
    hammingtrain = []
    hammingtest = []
    f1train = []
    f1test = []
    precisiontrain = []
    precisiontest = []
    recalltrain = []
    recalltest = []
    
    for train_index, test_index in kf.split(X):
        Xtrain = X[train_index]
        Ytrain = Y[train_index]
        Xtest = X[test_index]
        Ytest = Y[test_index]
        
        clf.fit(Xtrain, Ytrain)

        Ytrainpred = clf.predict(Xtrain)
        Ytestpred = clf.predict(Xtest)
        
        Ytrain = (Ytrain==1).astype('int')
        Ytest = (Ytest==1).astype('int')
        Ytrainpred = (Ytrainpred==1).astype('int')
        Ytestpred = (Ytestpred==1).astype('int')
        
        accuracytrain.append(metrics.accuracy_score(Ytrain, Ytrainpred))
        accuracytest.append(metrics.accuracy_score(Ytest, Ytestpred))
        hammingtrain.append(1 - metrics.hamming_loss(Ytrain, Ytrainpred))
        hammingtest.append(1 - metrics.hamming_loss(Ytest, Ytestpred))
        f1train.append(metrics.f1_score(Ytrain, Ytrainpred, average='micro'))
        f1test.append(metrics.f1_score(Ytest, Ytestpred, average='micro'))
        precisiontrain.append(metrics.precision_score(Ytrain, Ytrainpred, average='micro'))
        precisiontest.append(metrics.precision_score(Ytest, Ytestpred, average='micro'))
        recalltrain.append(metrics.recall_score(Ytrain, Ytrainpred, average='micro'))
        recalltest.append(metrics.recall_score(Ytest, Ytestpred, average='micro'))
        
#         print(metrics.classification_report(Ytest, Ytestpred))
        
    print('Accuracy: \t \t {} \t {}'.format(np.array(accuracytrain).mean(), np.array(accuracytest).mean()))
    print('Hamming: \t \t {} \t {}'.format(np.array(hammingtrain).mean(), np.array(hammingtest).mean()))
    print('Precision: \t \t {} \t {}'.format(np.array(precisiontrain).mean(), np.array(precisiontest).mean()))
    print('Recall: \t \t {} \t {}'.format(np.array(recalltrain).mean(), np.array(recalltest).mean()))
    print('F1: \t \t \t {} \t {}'.format(np.array(f1train).mean(), np.array(f1test).mean()))

In [None]:
predict(RandomForestClassifier(n_estimators=100), Xtrain, Ytrain)