In [19]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

### Reading data

In [4]:
Xtrain = np.load('Xtrain_dropped.npy')
Xtrain.shape

(842, 1257)

In [6]:
Ytrain = np.load('Ytrain.npy')
Ytrain.shape

(842, 71)

In [9]:
Xtest = np.load('Xtest_dropped.npy')
Xtest.shape

(2449, 1257)

In [18]:
classes_names = np.load('classes_names.npy')
classes_names.shape

(71,)

In [92]:
dict_testing = pd.read_csv('dict_testing.csv', squeeze=True, header=None)
dict_testing.shape

(2449,)

### Training Model

In [10]:
clf = RandomForestClassifier(n_estimators=100)

In [11]:
clf.fit(Xtrain, Ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Predicting

In [14]:
Ytestpred = clf.predict(Xtest)

In [104]:
pred = pd.DataFrame(Ytestpred, columns=classes_names, dtype='int8')
pred

Unnamed: 0,Phenolic,Flavonoid,Flavone,Flavonol,DihydroFlavone,Flavan3ol,Dihydroflavonol,Anthocyanidin,Chalcone,Aurone,...,Xanthonoid,QuinicAcid,Naringenin,Catechin_OR_Epicatechin,Myricetin,Kaempferol,Afzelechin,Gallocatechin,Quercetin,Apigenin
0,1,1,0,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
pred['compound_name'] = pred.index

In [106]:
pred['compound_name'] = pred['compound_name'].apply(lambda x: dict_testing[x])

In [107]:
pred

Unnamed: 0,Phenolic,Flavonoid,Flavone,Flavonol,DihydroFlavone,Flavan3ol,Dihydroflavonol,Anthocyanidin,Chalcone,Aurone,...,QuinicAcid,Naringenin,Catechin_OR_Epicatechin,Myricetin,Kaempferol,Afzelechin,Gallocatechin,Quercetin,Apigenin,compound_name
0,1,1,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,Inga_compound_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_3
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_4
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_5
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_6
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_7
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_8
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_9
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_10


### Predicting with probability

In [55]:
Ytestprob = clf.predict_proba(Xtest)

In [71]:
def converter(x):
    if x.shape[1] == 1:
        x2 = 1 - x
        return x2.reshape(-1,)
    else:
        return x[:,-1].reshape(-1,)

In [83]:
Ytestprob1 = np.array(list(map(converter, Ytestprob))).T
Ytestprob1

array([[ 1.  ,  0.95,  0.1 , ...,  0.44,  0.02,  0.07],
       [ 0.28,  0.1 ,  0.01, ...,  0.02,  0.  ,  0.  ],
       [ 0.29,  0.09,  0.01, ...,  0.01,  0.  ,  0.  ],
       ..., 
       [ 0.22,  0.05,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.37,  0.05,  0.01, ...,  0.  ,  0.  ,  0.  ],
       [ 1.  ,  0.96,  0.51, ...,  0.  ,  0.  ,  0.5 ]])

In [84]:
prob = pd.DataFrame(Ytestprob1, columns=classes_names)
prob

Unnamed: 0,Phenolic,Flavonoid,Flavone,Flavonol,DihydroFlavone,Flavan3ol,Dihydroflavonol,Anthocyanidin,Chalcone,Aurone,...,Xanthonoid,QuinicAcid,Naringenin,Catechin_OR_Epicatechin,Myricetin,Kaempferol,Afzelechin,Gallocatechin,Quercetin,Apigenin
0,1.00,0.95,0.10,0.77,0.03,0.08,0.55,0.00,0.0,0.0,...,0.01,0.02,0.03,0.57,0.19,0.01,0.01,0.44,0.02,0.07
1,0.28,0.10,0.01,0.02,0.01,0.07,0.00,0.00,0.0,0.0,...,0.00,0.01,0.01,0.03,0.02,0.00,0.02,0.02,0.00,0.00
2,0.29,0.09,0.01,0.01,0.01,0.07,0.00,0.00,0.0,0.0,...,0.00,0.01,0.01,0.04,0.01,0.00,0.02,0.01,0.00,0.00
3,0.34,0.07,0.01,0.00,0.01,0.06,0.00,0.00,0.0,0.0,...,0.00,0.00,0.01,0.03,0.00,0.00,0.01,0.02,0.00,0.00
4,0.19,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
5,0.31,0.02,0.00,0.00,0.00,0.02,0.00,0.00,0.0,0.0,...,0.00,0.11,0.00,0.02,0.00,0.00,0.00,0.00,0.00,0.00
6,0.39,0.17,0.02,0.03,0.01,0.12,0.01,0.00,0.0,0.0,...,0.00,0.00,0.01,0.05,0.02,0.00,0.05,0.01,0.00,0.01
7,0.17,0.04,0.00,0.01,0.00,0.03,0.00,0.00,0.0,0.0,...,0.00,0.00,0.00,0.02,0.01,0.00,0.01,0.00,0.00,0.00
8,0.20,0.06,0.01,0.00,0.01,0.05,0.00,0.00,0.0,0.0,...,0.00,0.01,0.01,0.02,0.00,0.00,0.02,0.01,0.00,0.00
9,0.09,0.04,0.01,0.01,0.00,0.02,0.00,0.00,0.0,0.0,...,0.00,0.00,0.00,0.00,0.01,0.00,0.02,0.00,0.00,0.01


In [108]:
prob['compound_name'] = prob.index

In [109]:
prob['compound_name'] = prob['compound_name'].apply(lambda x: dict_testing[x])

In [110]:
prob

Unnamed: 0,Phenolic,Flavonoid,Flavone,Flavonol,DihydroFlavone,Flavan3ol,Dihydroflavonol,Anthocyanidin,Chalcone,Aurone,...,QuinicAcid,Naringenin,Catechin_OR_Epicatechin,Myricetin,Kaempferol,Afzelechin,Gallocatechin,Quercetin,Apigenin,compound_name
0,1.00,0.95,0.10,0.77,0.03,0.08,0.55,0.00,0.0,0.0,...,0.02,0.03,0.57,0.19,0.01,0.01,0.44,0.02,0.07,Inga_compound_1
1,0.28,0.10,0.01,0.02,0.01,0.07,0.00,0.00,0.0,0.0,...,0.01,0.01,0.03,0.02,0.00,0.02,0.02,0.00,0.00,Inga_compound_2
2,0.29,0.09,0.01,0.01,0.01,0.07,0.00,0.00,0.0,0.0,...,0.01,0.01,0.04,0.01,0.00,0.02,0.01,0.00,0.00,Inga_compound_3
3,0.34,0.07,0.01,0.00,0.01,0.06,0.00,0.00,0.0,0.0,...,0.00,0.01,0.03,0.00,0.00,0.01,0.02,0.00,0.00,Inga_compound_4
4,0.19,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,Inga_compound_5
5,0.31,0.02,0.00,0.00,0.00,0.02,0.00,0.00,0.0,0.0,...,0.11,0.00,0.02,0.00,0.00,0.00,0.00,0.00,0.00,Inga_compound_6
6,0.39,0.17,0.02,0.03,0.01,0.12,0.01,0.00,0.0,0.0,...,0.00,0.01,0.05,0.02,0.00,0.05,0.01,0.00,0.01,Inga_compound_7
7,0.17,0.04,0.00,0.01,0.00,0.03,0.00,0.00,0.0,0.0,...,0.00,0.00,0.02,0.01,0.00,0.01,0.00,0.00,0.00,Inga_compound_8
8,0.20,0.06,0.01,0.00,0.01,0.05,0.00,0.00,0.0,0.0,...,0.01,0.01,0.02,0.00,0.00,0.02,0.01,0.00,0.00,Inga_compound_9
9,0.09,0.04,0.01,0.01,0.00,0.02,0.00,0.00,0.0,0.0,...,0.00,0.00,0.00,0.01,0.00,0.02,0.00,0.00,0.01,Inga_compound_10


### Writing to file

In [111]:
pred.to_csv('prediction_result.csv')

In [112]:
prob.to_csv('prediction_result_with_prob.csv')