In [1]:
import numpy as np

from scipy.sparse import load_npz

In [2]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import metrics

### Reading data

In [3]:
# Load the sparse feature matrix from disk. Despite the name, this is the
# full dataset: predict() below receives it as X and performs its own
# train/test splitting per fold.
Xtest = load_npz('features_inga_duplicated.npz')
# Bare expression so the notebook displays the matrix summary.
Xtest

<1719x1676 sparse matrix of type '<class 'numpy.float64'>'
	with 27033 stored elements in Compressed Sparse Row format>

In [4]:
# Load the label array that goes with the features above. As with Xtest,
# this is the full label set that predict() later splits into folds.
Ytest = np.load('classes_inga_duplicated.npy')
# Display the shape as a quick check that the row count matches the features.
Ytest.shape

(1719, 71)

In [5]:
def predict(clf, X, Y, n_splits=5, random_state=None):
    """Cross-validate ``clf`` with shuffled K-fold and print mean scores.

    For every fold the estimator is refit on the training part and then
    scored on both the training part and the held-out part. Labels and
    predictions are binarised with ``== 1`` before scoring, so any value
    other than 1 is treated as 0.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, Y)`` and ``predict(X)``. The same instance
        is refit once per fold.
    X : indexable
        Feature matrix; must support row selection with an index array
        (``X[indices]``).
    Y : indexable
        Labels, row-aligned with ``X`` (the same index arrays are applied to
        both). Presumably a multilabel indicator matrix -- confirm against
        the data being passed in.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``KFold`` so the shuffled split can be reproduced.
        The default ``None`` keeps the previous behaviour: a different split
        on every call.

    Returns
    -------
    None
        Nothing is returned; one line per metric is printed, showing the
        mean over folds of the train score followed by the test score.
    """
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    # One row per reported metric, in print order: (output template, scorer).
    # Hamming is reported as ``1 - loss`` so that higher is better on every
    # row; precision, recall and F1 are micro-averaged.
    rows = (
        ('Accuracy: \t \t {} \t {}',
         metrics.accuracy_score),
        ('Hamming: \t \t {} \t {}',
         lambda y_true, y_pred: 1 - metrics.hamming_loss(y_true, y_pred)),
        ('Precision: \t \t {} \t {}',
         lambda y_true, y_pred: metrics.precision_score(y_true, y_pred, average='micro')),
        ('Recall: \t \t {} \t {}',
         lambda y_true, y_pred: metrics.recall_score(y_true, y_pred, average='micro')),
        ('F1: \t \t \t {} \t {}',
         lambda y_true, y_pred: metrics.f1_score(y_true, y_pred, average='micro')),
    )
    # Per-metric score lists, one entry appended per fold.
    train_scores = [[] for _ in rows]
    test_scores = [[] for _ in rows]

    for train_index, test_index in kf.split(X):
        X_fit, Y_fit = X[train_index], Y[train_index]
        X_held, Y_held = X[test_index], Y[test_index]

        clf.fit(X_fit, Y_fit)

        # Score the freshly fitted model on the data it saw and on the
        # held-out fold, using exactly the same procedure for both.
        evaluations = (
            (train_scores, Y_fit, clf.predict(X_fit)),
            (test_scores, Y_held, clf.predict(X_held)),
        )
        for collected, y_true, y_pred in evaluations:
            # Collapse to 0/1 so the metrics see a binary indicator array.
            y_true = (y_true == 1).astype('int')
            y_pred = (y_pred == 1).astype('int')
            for bucket, (_, scorer) in zip(collected, rows):
                bucket.append(scorer(y_true, y_pred))

    for (template, _), on_train, on_test in zip(rows, train_scores, test_scores):
        print(template.format(np.array(on_train).mean(), np.array(on_test).mean()))

In [6]:
# Cross-validate a decision tree with no depth limit (max_depth=None) on the
# full data; predict() prints the mean train and test score for each metric.
predict(DecisionTreeClassifier(max_depth=None), Xtest, Ytest)

Accuracy: 	 	 1.0 	 0.7614990846837075
Hamming: 	 	 1.0 	 0.988365265155241
Precision: 	 	 1.0 	 0.9183945977852433
Recall: 	 	 1.0 	 0.901839800936818
F1: 	 	 	 1.0 	 0.9100039404439778


In [7]:
# Same evaluation for a random forest of 100 trees. n_jobs=-1 follows the
# scikit-learn convention of using all available cores.
predict(RandomForestClassifier(n_estimators=100, n_jobs=-1), Xtest, Ytest)

Accuracy: 	 	 1.0 	 0.7527798494813208
Hamming: 	 	 1.0 	 0.9913895093389
Precision: 	 	 1.0 	 0.9719360594822504
Recall: 	 	 1.0 	 0.8937078397616137
F1: 	 	 	 1.0 	 0.9310911704445342


In [8]:
# Same evaluation for an extra-trees ensemble of 100 trees, for comparison
# with the random forest above.
predict(ExtraTreesClassifier(n_estimators=100, n_jobs=-1), Xtest, Ytest)

Accuracy: 	 	 1.0 	 0.7347379483354805
Hamming: 	 	 1.0 	 0.9903731119512518
Precision: 	 	 1.0 	 0.9619844487541835
Recall: 	 	 1.0 	 0.8874403373522833
F1: 	 	 	 1.0 	 0.9231872425778406
