In [4]:
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import csv
from sklearn.model_selection import train_test_split

data = open('data_Feature_Importance.txt').readlines()[1:]

feature_names = ['active_site','metal_binding_site','binding_site','site','dna_binding_site','nucleotide_phosphate_binding','zinc_finger','calcium_binding','region','repeat','coiled_coil','motif','domain','topological_domain','transmembrane','intramembrane','peptide','propeptide','transit_peptide','signal_peptide','modified_residues','lipidation','glycosylation','disulfide_bond','cross_link']\

X = []
y = []

for line in data:
    line = line[:-1].split('\t')
    if line[-1] == 'constrained':
        y.append(1)
    else:
        y.append(0)

    X.append(line[42-25-1:42-1])
    for i in range(len(X[-1])):
        X[-1][i] = int(X[-1][i])

X = np.array(X)
y = np.array(y)

# define the model
model = RandomForestClassifier(criterion = "gini")
model.fit(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

model.fit(X_train, y_train)

print('Training set metrics:')
print('Accuracy:', accuracy_score(y_train, model.predict(X_train)))
print('Precision:', precision_score(y_train, model.predict(X_train)))
print()
print('Test set metrics:')
print('Accuracy:', accuracy_score(y_test, model.predict(X_test)))
print('Precision:', precision_score(y_test, model.predict(X_test)))
print()

# get importance
results = permutation_importance(model, X, y, scoring='average_precision', n_repeats = 10)
importance = results.importances_mean
#print(importance)
indices = np.argsort(importance)[::-1]
#print(indices)
std = results.importances_std
#print(std)
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %s, Score: %.5f' % (feature_names[i],v))

# plot feature importance
pyplot.bar([feature_names[x] for x in indices], importance[indices],color="yellowgreen", yerr=std[indices], align="center")
pyplot.xticks(rotation=90)
pyplot.ylabel('Scores')
pyplot.xlabel('Feature Name')
fig = pyplot.gcf()
fig.set_size_inches(10.5, 8)
fig.tight_layout()
pyplot.show()
#fig.savefig('count_perm_gini_avg_precision.png', dpi=300)


Feature: active_site, Score: -0.00002
Feature: metal_binding_site, Score: -0.00007
Feature: binding_site, Score: 0.00556
Feature: site, Score: 0.00267
Feature: dna_binding_site, Score: 0.08125
Feature: nucleotide_phosphate_binding, Score: -0.00000
Feature: zinc_finger, Score: 0.00264
Feature: calcium_binding, Score: 0.00001
Feature: region, Score: 0.18543
Feature: repeat, Score: 0.01036
Feature: coiled_coil, Score: 0.00448
Feature: motif, Score: 0.10943
Feature: domain, Score: 0.05108
Feature: topological_domain, Score: 0.04093
Feature: transmembrane, Score: 0.03769
Feature: intramembrane, Score: 0.00000
Feature: peptide, Score: -0.00096
Feature: propeptide, Score: -0.00008
Feature: transit_peptide, Score: 0.00000
Feature: signal_peptide, Score: -0.00010
Feature: modified_residues, Score: 0.27825
Feature: lipidation, Score: 0.00030
Feature: glycosylation, Score: 0.00824
Feature: disulfide_bond, Score: 0.06104
Feature: cross_link, Score: 0.07978
