## The Tools

In [31]:
#navigation
import os

#data wrangling
import pandas as pd
import numpy as np

#model building
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.model_selection import cross_val_predict, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from itertools import combinations
from sklearn.feature_selection import RFE

#model scoring
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, r2_score, mean_squared_error, \
confusion_matrix, precision_score, f1_score, matthews_corrcoef

#data visualization
import matplotlib.pyplot as plt
import pylab 

#feature extraction
import BlackBoxAuditing as BBA
from BlackBoxAuditing.model_factories.SKLearnModelVisitor import SKLearnModelVisitor
import shap 
import math
from operator import itemgetter


## The Data

In [3]:
# Locations of the two input CSVs (absolute paths).
sd2e_csv = '/home/jupyter/tacc-work/test-harness-v3/versioned-datasets/data/protein-design/aggregated_data/all_libs_cleaned.v1.aggregated_data.csv'
spc_tbd_csv = "/home/jupyter/tacc-work/test-harness/protein-design/protstab_test_harness_and_leaderboard/model_runner_data/default_model_runner_data/spc_tbd_features.csv"

# Aggregated SD2E data; lines starting with "#" in the file are skipped by the parser.
df_sd2e = pd.read_csv(sd2e_csv, comment="#")

# S_PC and TbD feature table; any row containing a missing value is discarded.
df_fet = (
    pd.read_csv(spc_tbd_csv, comment="#")
    .dropna(axis=0)
)

# Attach the stability scores (plus name/topology) to the feature rows.
# No join key is given, so pandas joins (inner) on whichever of these
# columns df_fet also has.
stability_cols = ['stabilityscore', 'stabilityscore_calibrated', 'name', 'topology']
df = pd.merge(df_fet, df_sd2e[stability_cols])

### Data Prep for Audit Comparison of Features

In [8]:
# Full SD2E table joined with the feature table. No key is specified, so
# pandas performs an inner join on every column name the two frames share.
df_all = pd.merge(df_sd2e, df_fet)


In [9]:
# Persist the merged table for later reuse. The output directory is created
# first so the write does not fail on a fresh checkout where ./dataframes
# does not exist yet.
os.makedirs("./dataframes", exist_ok=True)
df_all.to_csv("./dataframes/df_all_features.csv",index=False)

In [10]:
# Preview only the first rows instead of rendering the whole merged frame.
df_all.head(10)

Unnamed: 0,dataset,name,sequence,dssp,topology,description,stabilityscore,stabilityscore_calibrated,stabilityscore_t,stabilityscore_calibrated_t,...,SS5,SS6,SS7,SS8,SS9,SS10,SS11,SS12,SS13,SS14
0,Eva1,bGM_490,KSVEVEVQNGDTKTKYEVPPDPTEARRATETVYKTYNAQEVHYEED...,LEEEEEEEELLEEEEEEELLLHHHHHHHHHHHHHHLLLLEEEEEEL...,beta_grasp,data/171207_Eva1/bGM_490,0.379,0.309716,0.379,0.309716,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,Eva1,ems_4hM_668,DPYQSAATVVNREGDVQKAAELLKKLVNNDESRKEADKVRKSSDPT...,LHHHHHHHHHHHLLLHHHHHHHHHHHLLLHHHHHHHHHHHHLLLHH...,4h,data/171207_Eva1/ems_4hM_668,0.039,-0.033396,0.039,-0.033396,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
2,Eva1,bGM_337,NVTVHIQSKSEQYQVDVRDREEAKKAVKEVASKLNAPFEVHTSGNS...,LEEEEEELLLLEEEEEELLHHHHHHHHHHHHHHHLLLEEEEEELLE...,beta_grasp,data/171207_Eva1/bGM_337,0.152,0.042282,0.152,0.083679,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,Eva1,ems_ferrM_346_0002,RVEIHVDDEKAADKVYTIAKRAGAEVHRKDGKLTVHVPDPEGAKKI...,LEEEEELLHHHHHHHHHHHHHHLLEEEEELLEEEEEELLHHHHHHH...,ferredoxin,data/171207_Eva1/ems_ferrM_346_0002,0.435,0.028514,0.731,0.661607,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,Eva1,ems_4hC_1122_0002,DDKISKKLSDKLRKSVEDKDKVNSVIERAKKMNLVKEAAKVLENGT...,LHHHHHHHHHHHHHHLLLHHHHHHHHHHHHHLLHHHHHHHHHHLLL...,4h,data/171207_Eva1/ems_4hC_1122_0002,-0.198,-0.405374,0.067,-0.006000,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
5,Eva1,ems_ferrM_3817,VTVTIHASDPRLHEAAKNVNAEFEIEQKPGTKVEIHLKIDPKEAKK...,LEEEEELLLLHHHHHHHHLLLLEEEEELLLLEEEEEELLLHHHHHH...,ferredoxin,data/171207_Eva1/ems_ferrM_3817,1.557,1.012885,1.557,1.497840,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
6,Eva1,ems_4hM_4969_0002,DAEEKLKKLLTKINPKAKEFVKEVNKEASKAEIPEQREKIIESYTD...,LHHHHHHHHHHHHLHHHHHHHHHHHHHHLLLLLHHHHHHHHHHHHL...,4h,data/171207_Eva1/ems_4hM_4969_0002,0.045,-0.027534,0.045,-0.027534,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
7,Eva1,ems_ferrM_2250_0002,SSYEVRVDNPEAAKIVAKVGNLEVHTLPGNKVSVKLTNVDPEQVKK...,LEEEEEELLHHHHHHHHHHHLLEEEEELLLEEEEEELLLLHHHHHH...,ferredoxin,data/171207_Eva1/ems_ferrM_2250_0002,0.978,0.911904,0.978,0.911904,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
8,Eva1,ems_4hC_1518_0002,DSEKARKFVETVSSQGKAPEAYKFYKKVKDSGNEEEAVKTVLSVLL...,LHHHHHHHHHHHHHLLLLHHHHHHHHHHHHLLLHHHHHHHHHHHHL...,4h,data/171207_Eva1/ems_4hC_1518_0002,-0.498,-0.574986,-0.498,-0.574986,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
9,Eva1,ems_4hM_3614,DLEEISKKVKEDAKKINDETQATKFVKKWAENNPVLKKIIDELSKK...,LHHHHHHHHHHHHHHHLLHHHHHHHHHHHHLLLHHHHHHHHHHHHH...,4h,data/171207_Eva1/ems_4hM_3614,1.883,1.385704,2.267,2.206778,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


### Data Prep for Regressions

In [43]:
# train_test_split returns a (train, test) pair for EACH array passed in, so
# unpacking four results requires both a feature table and a target; passing
# df_all alone yields only two values and the unpacking raises ValueError.
# NOTE(review): 'stabilityscore' is assumed to be the regression target, and X
# still contains every other df_all column (identifiers, sequences, the other
# stability-score columns) — confirm the target and select the model features
# before fitting.
target_col = 'stabilityscore'
X_train, X_test, y_train, y_test = train_test_split(
    df_all.drop(columns=[target_col]),
    df_all[target_col],
    random_state=0,  # fixed seed so the split is reproducible across runs
)

### Results for `shap` audit of all features

In [13]:
shap_audit_results = pd.DataFrame([('sum_best_frags', 0.01815889976698281), ('score_per_res', 0.017520350724301477), ('net_atr_net_sol_per_res', 0.01189227370628641), ('total_score', 0.011829266017298938), ('avg_all_frags', 0.011741001729944142), ('exposed_np_AFILMVWY', 0.010543621409788755), ('buried_np_AFILMVWY_per_res', 0.010540248058455893), ('avg_best_frag', 0.009757981815803734), ('contig_not_hp_avg', 0.0095713082268951), ('worst6frags', 0.00936997002649029), ('worstfrag', 0.00916483146463831), ('n_hydrophobic_noA', 0.007640735570839752), ('largest_hphob_cluster', 0.00711540462975133), ('buried_np_AFILMVWY', 0.007046774360452206), ('hbond_lr_bb_per_sheet', 0.006922759934165471), ('nres_helix', 0.00669566782821688), ('hphob_sc_contacts', 0.006666061527651647), ('nres_sheet', 0.006098045128477637), ('n_res', 0.0060340383306778074), ('nres', 0.00558786114196341), ('frac_sheet', 0.005443409235436654), ('frac_helix', 0.00506305341462748), ('net_sol_per_res', 0.004964121974125374), ('E_range_entropy', 0.004834128684660958), ('contact_core_SASA', 0.004623983826410658), ('hydrophobicity', 0.004324036665293184), ('contact_all', 0.004320481734360051), ('net_atr_per_res', 0.004309398109084828), ('E_min_entropy', 0.004085680726649463), ('contact_core_SCN', 0.004045378291850982), ('nearest_chymo_cut_to_Cterm', 0.003852773530249793), ('chymo_with_LM_cut_sites', 0.0037297389555669236), ('fa_atr_per_res', 0.0036844877844504774), ('E_max_entropy', 0.0036500242158029974), ('hbond_lr_bb', 0.003319106939951626), ('fa_intra_atr_xover4', 0.003209542568169975), ('holes', 0.0031911682562915777), ('hbond_bb_sc', 0.0029890158106729812), ('bb', 0.0028914097948769305), ('mismatch_probability', 0.0028274937833124643), ('lk_ball_iso', 0.0026742417788718337), ('buried_np_per_res', 0.002602523452919682), ('SumE_entropies', 0.002383685058554441), ('contig_not_hp_avg_norm', 0.002382130290837122), ('n_hydrophobic', 0.0021977034578310748), ('hbond_sr_bb', 0.002154133509845926), 
('chymo_cut_sites', 0.0019063479263131417), ('Mean_E_entropy', 0.0019029786518831992), ('fa_atr', 0.0018904620148159502), ('ref', 0.001834613320372878), ('exposed_total', 0.0017846223324374008), ('rama_prepro', 0.0017727517755189014), ('SumH_entropies', 0.001770024931358286), ('fa_sol', 0.0017504688684304552), ('lk_ball', 0.0016741013383724463), ('abego_res_profile_penalty', 0.0016178411985622917), ('nearest_tryp_cut_to_Nterm', 0.0015926861670506806), ('buried_np', 0.0013927817871500449), ('AlaCount', 0.00136842332922512), ('exposed_polars', 0.0012798175155687933), ('Mean_H_entropy', 0.0012228373266457385), ('hphob_sc_degree', 0.001210637917503637), ('omega', 0.0011656023504417951), ('H_range_entropy', 0.001137222441254191), ('Tminus1_netq', 0.001101318327693489), ('fa_intra_elec', 0.0010692975304956883), ('fa_dun_dev', 0.0009826786644904484), ('fa_dun_rot', 0.0009414045939471607), ('degree', 0.0008802571254872426), ('H_max_entropy', 0.0008766078806773833), ('loop_sc', 0.0008568005452666282), ('fa_intra_rep_xover4', 0.0008564709269013335), ('ss_sc', 0.0008056005449410899), ('fxn_exposed_is_np', 0.0007986990518131019), ('buns_nonheavy', 0.000794339531706785), ('H_min_entropy', 0.0007728769290057993), ('buried_minus_exposed', 0.0007483038237216485), ('L_range_entropy', 0.000736834881243391), ('Tend_netq', 0.0007073958163134551), ('T1_absq', 0.000706253662629004), ('frac_loop', 0.0006977709988094428), ('nearest_chymo_cut_to_Nterm', 0.000697425721997199), ('fa_elec', 0.0006881840284963385), ('hxl_tors', 0.0006519388391725977), ('hbond_sr_bb_per_helix', 0.0006500885437235953), ('abego_res_profile', 0.0006450496824560289), ('two_core_each', 0.0006313877688608209), ('lk_ball_bridge_uncpl', 0.000604797059397908), ('exposed_hydrophobics', 0.0006034701035793052), ('p_aa_pp', 0.0005802454716827955), ('nearest_chymo_cut_to_term', 0.000578493708062799), ('fa_rep', 0.0005355707101334389), ('SumL_entropies', 0.0005272858106716325), ('buried_over_exposed', 0.0005127505405961292), 
('fa_rep_per_res', 0.0004795098514057501), ('pro_close', 0.00046997824399698393), ('Tend_absq', 0.00045229701187650466), ('netcharge', 0.0004489435292843967), ('L_max_entropy', 0.0004214765843203605), ('one_core_each', 0.00040779969248768636), ('n_charged', 0.00040574626450546307), ('percent_core_SCN', 0.0004031410070779678), ('helix_sc', 0.0003968691504565582), ('contig_not_hp_internal_max', 0.00039587249960075364), ('Mean_L_entropy', 0.0003950561016098001), ('res_count_core_SCN', 0.00037215572081225335), ('nres_loop', 0.0003701203157123032), ('hbond_sc', 0.0003519196027118523), ('percent_core_SASA', 0.0003498034930054653), ('buns_bb_heavy', 0.00030831448530252924), ('fa_intra_sol_xover4', 0.0003013286548567961), ('lk_ball_bridge', 0.000292387895935223), ('T1_netq', 0.00027694961160889165), ('S_PC', 0.0002656390138732597), ('fa_dun_semi', 0.00025302325893629044), ('contig_not_hp_max', 0.000252782756600342), ('n_polar_core', 0.00025237856041252106), ('L_min_entropy', 0.0002485819678927337), ('tryp_cut_sites', 0.0002485142348164215), ('Mean_res_entropy', 0.00023830976509666435), ('pack', 0.00020426962103547547), ('nearest_tryp_cut_to_term', 0.00017011898117372235), ('res_count_core_SASA', 0.00015903111906336354), ('n_hphob_clusters', 0.00010625958075152066), ('nearest_tryp_cut_to_Cterm', 9.824158580684285e-05), ('Tminus1_absq', 7.200650018655219e-05), ('ss_contributes_core', 2.397517495085895e-05), ('buns_sc_heavy', 1.3571109906561356e-05), ('dslf_fa13', 0.0), ('entropy', 0.0)],columns=['feature_name','shap importance'])

In [21]:
# Column labels of the S_PC/TbD feature table.
features = df_fet.columns

# Flag each audited feature: True when its name is NOT one of df_fet's columns.
# NOTE(review): the name suggests those remaining features are the Rosetta
# ones — confirm.
is_rosetta = [
    feature_name not in features
    for feature_name in shap_audit_results['feature_name']
]
shap_audit_results['is_rosetta'] = is_rosetta

In [23]:
# Audited features whose is_rosetta flag is False (their name is a df_fet column).
shap_audit_results.loc[~shap_audit_results['is_rosetta']]

Unnamed: 0,feature_name,shap importance,is_rosetta
23,E_range_entropy,0.004834,False
28,E_min_entropy,0.004086,False
33,E_max_entropy,0.00365,False
42,SumE_entropies,0.002384,False
47,Mean_E_entropy,0.001903,False
52,SumH_entropies,0.00177,False
60,Mean_H_entropy,0.001223,False
63,H_range_entropy,0.001137,False
69,H_max_entropy,0.000877,False
75,H_min_entropy,0.000773,False


In [30]:
# Audited features whose is_rosetta flag is True (their name is not a df_fet column).
shap_audit_results.loc[shap_audit_results['is_rosetta']]

Unnamed: 0,feature_name,shap importance,is_rosetta
0,sum_best_frags,0.018159,True
1,score_per_res,0.017520,True
2,net_atr_net_sol_per_res,0.011892,True
3,total_score,0.011829,True
4,avg_all_frags,0.011741,True
5,exposed_np_AFILMVWY,0.010544,True
6,buried_np_AFILMVWY_per_res,0.010540,True
7,avg_best_frag,0.009758,True
8,contig_not_hp_avg,0.009571,True
9,worst6frags,0.009370,True


In [29]:
# Number of rows per topology label in the merged table, most frequent first.
df_all.loc[:, 'topology'].value_counts()

4h            14692
ferredoxin    14385
HHH           12016
EEHEE          5247
HEEH           4986
EHEE           3457
beta_grasp     2374
thio           2046
fold2           657
coil            573
fold4           255
Name: topology, dtype: int64

### Results for `BBA` audit for all features

In [35]:
#by accuracy
bba_audit_results_accuracy = pd.DataFrame([('S_PC', 0.09883037316665644), ('Mean_H_entropy', 0.09883037316665644), ('Mean_L_entropy', 0.09883037316665644), ('Mean_E_entropy', 0.09883037316665644), ('Mean_res_entropy', 0.09883037316665644), ('SumH_entropies', 0.09883037316665644), ('SumL_entropies', 0.09883037316665644), ('SumE_entropies', 0.09883037316665644), ('H_max_entropy', 0.09883037316665644), ('H_min_entropy', 0.09883037316665644), ('H_range_entropy', 0.09883037316665644), ('L_max_entropy', 0.09883037316665644), ('L_min_entropy', 0.09883037316665644), ('L_range_entropy', 0.09883037316665644), ('E_min_entropy', 0.09883037316665644), ('E_range_entropy', 0.09883037316665644), ('T1_absq', 0.09883037316665644), ('Tend_absq', 0.09883037316665644), ('Tend_netq', 0.09883037316665644), ('Tminus1_absq', 0.09883037316665644), ('Tminus1_netq', 0.09883037316665644), ('abego_res_profile', 0.09883037316665644), ('abego_res_profile_penalty', 0.09883037316665644), ('avg_all_frags', 0.09883037316665644), ('avg_best_frag', 0.09883037316665644), ('buns_bb_heavy', 0.09883037316665644), ('buried_minus_exposed', 0.09883037316665644), ('buried_np', 0.09883037316665644), ('buried_np_AFILMVWY', 0.09883037316665644), ('buried_np_AFILMVWY_per_res', 0.09883037316665644), ('buried_np_per_res', 0.09883037316665644), ('buried_over_exposed', 0.09883037316665644), ('chymo_cut_sites', 0.09883037316665644), ('chymo_with_LM_cut_sites', 0.09883037316665644), ('contact_all', 0.09883037316665644), ('contact_core_SASA', 0.09883037316665644), ('contact_core_SCN', 0.09883037316665644), ('contig_not_hp_avg', 0.09883037316665644), ('contig_not_hp_avg_norm', 0.09883037316665644), ('contig_not_hp_internal_max', 0.09883037316665644), ('contig_not_hp_max', 0.09883037316665644), ('degree', 0.09883037316665644), ('exposed_hydrophobics', 0.09883037316665644), ('exposed_np_AFILMVWY', 0.09883037316665644), ('exposed_polars', 0.09883037316665644), ('exposed_total', 0.09883037316665644), ('fa_atr', 
0.09883037316665644), ('fa_atr_per_res', 0.09883037316665644), ('fa_dun_dev', 0.09883037316665644), ('fa_dun_rot', 0.09883037316665644), ('fa_dun_semi', 0.09883037316665644), ('fa_elec', 0.09883037316665644), ('fa_intra_atr_xover4', 0.09883037316665644), ('fa_intra_elec', 0.09883037316665644), ('fa_intra_rep_xover4', 0.09883037316665644), ('fa_intra_sol_xover4', 0.09883037316665644), ('fa_rep', 0.09883037316665644), ('fa_rep_per_res', 0.09883037316665644), ('fa_sol', 0.09883037316665644), ('frac_helix', 0.09883037316665644), ('frac_loop', 0.09883037316665644), ('frac_sheet', 0.09883037316665644), ('fxn_exposed_is_np', 0.09883037316665644), ('hbond_bb_sc', 0.09883037316665644), ('hbond_lr_bb_per_sheet', 0.09883037316665644), ('hbond_sc', 0.09883037316665644), ('hbond_sr_bb', 0.09883037316665644), ('hbond_sr_bb_per_helix', 0.09883037316665644), ('helix_sc', 0.09883037316665644), ('holes', 0.09883037316665644), ('hphob_sc_degree', 0.09883037316665644), ('hxl_tors', 0.09883037316665644), ('hydrophobicity', 0.09883037316665644), ('largest_hphob_cluster', 0.09883037316665644), ('lk_ball', 0.09883037316665644), ('lk_ball_bridge', 0.09883037316665644), ('lk_ball_bridge_uncpl', 0.09883037316665644), ('lk_ball_iso', 0.09883037316665644), ('loop_sc', 0.09883037316665644), ('mismatch_probability', 0.09883037316665644), ('n_charged', 0.09883037316665644), ('n_hphob_clusters', 0.09883037316665644), ('n_hydrophobic_noA', 0.09883037316665644), ('n_polar_core', 0.09883037316665644), ('nearest_chymo_cut_to_term', 0.09883037316665644), ('nearest_tryp_cut_to_Cterm', 0.09883037316665644), ('nearest_tryp_cut_to_Nterm', 0.09883037316665644), ('net_atr_net_sol_per_res', 0.09883037316665644), ('net_atr_per_res', 0.09883037316665644), ('net_sol_per_res', 0.09883037316665644), ('netcharge', 0.09883037316665644), ('nres_helix', 0.09883037316665644), ('nres_loop', 0.09883037316665644), ('nres_sheet', 0.09883037316665644), ('omega', 0.09883037316665644), ('one_core_each', 0.09883037316665644), 
('p_aa_pp', 0.09883037316665644), ('pack', 0.09883037316665644), ('percent_core_SASA', 0.09883037316665644), ('percent_core_SCN', 0.09883037316665644), ('pro_close', 0.09883037316665644), ('rama_prepro', 0.09883037316665644), ('ref', 0.09883037316665644), ('res_count_core_SASA', 0.09883037316665644), ('score_per_res', 0.09883037316665644), ('ss_contributes_core', 0.09883037316665644), ('ss_sc', 0.09883037316665644), ('sum_best_frags', 0.09883037316665644), ('total_score', 0.09883037316665644), ('tryp_cut_sites', 0.09883037316665644), ('worst6frags', 0.09883037316665644), ('worstfrag', 0.09883037316665644), ('buns_sc_heavy', 0.09876848814901917), ('E_max_entropy', 0.09858283309610749), ('n_hydrophobic', 0.09833529302555855), ('bb', 0.08806238009777834), ('nres', 0.08595828949811257), ('n_res', 0.08577263444520089), ('hbond_lr_bb', 0.08571074942756363), ('hphob_sc_contacts', 0.08261649854570219), ('nearest_tryp_cut_to_term', 0.08137879819295757), ('two_core_each', 0.07661365183489077), ('AlaCount', 0.06194690265486735), ('T1_netq', 0.05705798626152614), ('nearest_chymo_cut_to_Nterm', 0.047651463580667186), ('nearest_chymo_cut_to_Cterm', 0.047032613404294876), ('res_count_core_SCN', 0.027662602883841925), ('buns_nonheavy', 0.01868927532644349), ('dslf_fa13', 0.0), ('entropy', 0.0)], columns=['feature_name','bba_importance'])
bba_audit_results_bcr = pd.DataFrame([('S_PC', 0.41826178772101974), ('Mean_H_entropy', 0.41826178772101974), ('Mean_L_entropy', 0.41826178772101974), ('Mean_E_entropy', 0.41826178772101974), ('Mean_res_entropy', 0.41826178772101974), ('SumH_entropies', 0.41826178772101974), ('SumL_entropies', 0.41826178772101974), ('SumE_entropies', 0.41826178772101974), ('H_max_entropy', 0.41826178772101974), ('H_min_entropy', 0.41826178772101974), ('H_range_entropy', 0.41826178772101974), ('L_max_entropy', 0.41826178772101974), ('L_min_entropy', 0.41826178772101974), ('L_range_entropy', 0.41826178772101974), ('E_min_entropy', 0.41826178772101974), ('E_range_entropy', 0.41826178772101974), ('T1_absq', 0.41826178772101974), ('Tend_absq', 0.41826178772101974), ('Tend_netq', 0.41826178772101974), ('Tminus1_absq', 0.41826178772101974), ('Tminus1_netq', 0.41826178772101974), ('abego_res_profile', 0.41826178772101974), ('abego_res_profile_penalty', 0.41826178772101974), ('avg_all_frags', 0.41826178772101974), ('avg_best_frag', 0.41826178772101974), ('buns_bb_heavy', 0.41826178772101974), ('buried_minus_exposed', 0.41826178772101974), ('buried_np', 0.41826178772101974), ('buried_np_AFILMVWY', 0.41826178772101974), ('buried_np_AFILMVWY_per_res', 0.41826178772101974), ('buried_np_per_res', 0.41826178772101974), ('buried_over_exposed', 0.41826178772101974), ('chymo_cut_sites', 0.41826178772101974), ('chymo_with_LM_cut_sites', 0.41826178772101974), ('contact_all', 0.41826178772101974), ('contact_core_SASA', 0.41826178772101974), ('contact_core_SCN', 0.41826178772101974), ('contig_not_hp_avg', 0.41826178772101974), ('contig_not_hp_avg_norm', 0.41826178772101974), ('contig_not_hp_internal_max', 0.41826178772101974), ('contig_not_hp_max', 0.41826178772101974), ('degree', 0.41826178772101974), ('exposed_hydrophobics', 0.41826178772101974), ('exposed_np_AFILMVWY', 0.41826178772101974), ('exposed_polars', 0.41826178772101974), ('exposed_total', 0.41826178772101974), ('fa_atr', 
0.41826178772101974), ('fa_atr_per_res', 0.41826178772101974), ('fa_dun_dev', 0.41826178772101974), ('fa_dun_rot', 0.41826178772101974), ('fa_dun_semi', 0.41826178772101974), ('fa_elec', 0.41826178772101974), ('fa_intra_atr_xover4', 0.41826178772101974), ('fa_intra_elec', 0.41826178772101974), ('fa_intra_rep_xover4', 0.41826178772101974), ('fa_intra_sol_xover4', 0.41826178772101974), ('fa_rep', 0.41826178772101974), ('fa_rep_per_res', 0.41826178772101974), ('fa_sol', 0.41826178772101974), ('frac_helix', 0.41826178772101974), ('frac_loop', 0.41826178772101974), ('frac_sheet', 0.41826178772101974), ('fxn_exposed_is_np', 0.41826178772101974), ('hbond_bb_sc', 0.41826178772101974), ('hbond_lr_bb_per_sheet', 0.41826178772101974), ('hbond_sc', 0.41826178772101974), ('hbond_sr_bb', 0.41826178772101974), ('hbond_sr_bb_per_helix', 0.41826178772101974), ('helix_sc', 0.41826178772101974), ('holes', 0.41826178772101974), ('hphob_sc_degree', 0.41826178772101974), ('hxl_tors', 0.41826178772101974), ('hydrophobicity', 0.41826178772101974), ('largest_hphob_cluster', 0.41826178772101974), ('lk_ball', 0.41826178772101974), ('lk_ball_bridge', 0.41826178772101974), ('lk_ball_bridge_uncpl', 0.41826178772101974), ('lk_ball_iso', 0.41826178772101974), ('loop_sc', 0.41826178772101974), ('mismatch_probability', 0.41826178772101974), ('n_charged', 0.41826178772101974), ('n_hphob_clusters', 0.41826178772101974), ('n_hydrophobic_noA', 0.41826178772101974), ('n_polar_core', 0.41826178772101974), ('nearest_chymo_cut_to_term', 0.41826178772101974), ('nearest_tryp_cut_to_Cterm', 0.41826178772101974), ('nearest_tryp_cut_to_Nterm', 0.41826178772101974), ('net_atr_net_sol_per_res', 0.41826178772101974), ('net_atr_per_res', 0.41826178772101974), ('net_sol_per_res', 0.41826178772101974), ('netcharge', 0.41826178772101974), ('nres_helix', 0.41826178772101974), ('nres_loop', 0.41826178772101974), ('nres_sheet', 0.41826178772101974), ('omega', 0.41826178772101974), ('one_core_each', 0.41826178772101974), 
('p_aa_pp', 0.41826178772101974), ('pack', 0.41826178772101974), ('percent_core_SASA', 0.41826178772101974), ('percent_core_SCN', 0.41826178772101974), ('pro_close', 0.41826178772101974), ('rama_prepro', 0.41826178772101974), ('ref', 0.41826178772101974), ('res_count_core_SASA', 0.41826178772101974), ('score_per_res', 0.41826178772101974), ('ss_contributes_core', 0.41826178772101974), ('ss_sc', 0.41826178772101974), ('sum_best_frags', 0.41826178772101974), ('total_score', 0.41826178772101974), ('tryp_cut_sites', 0.41826178772101974), ('worst6frags', 0.41826178772101974), ('worstfrag', 0.41826178772101974), ('buns_sc_heavy', 0.4176840294922197), ('E_max_entropy', 0.41743592588194356), ('n_hydrophobic', 0.41688009514761726), ('bb', 0.38064910331652047), ('hbond_lr_bb', 0.37395104804048407), ('n_res', 0.3664278731267878), ('nres', 0.36627093007993894), ('two_core_each', 0.31210824786298075), ('AlaCount', 0.29642351367004793), ('T1_netq', 0.2753534441270773), ('nearest_chymo_cut_to_Nterm', 0.24316060092793412), ('nearest_tryp_cut_to_term', 0.2361617587621434), ('nearest_chymo_cut_to_Cterm', 0.22361143229768887), ('hphob_sc_contacts', 0.15820161211356043), ('buns_nonheavy', 0.12547233958553383), ('res_count_core_SCN', 0.1039351519090983), ('dslf_fa13', 0.0), ('entropy', 0.0)], columns=['feature_name','bba_importance'])

In [36]:
# Show only the leading rows of the accuracy-based BBA audit table.
bba_audit_results_accuracy.head(10)

Unnamed: 0,feature_name,bba_importance
0,S_PC,0.098830
1,Mean_H_entropy,0.098830
2,Mean_L_entropy,0.098830
3,Mean_E_entropy,0.098830
4,Mean_res_entropy,0.098830
5,SumH_entropies,0.098830
6,SumL_entropies,0.098830
7,SumE_entropies,0.098830
8,H_max_entropy,0.098830
9,H_min_entropy,0.098830


In [37]:
# Show only the leading rows of the BCR-based BBA audit table.
bba_audit_results_bcr.head(10)

Unnamed: 0,feature_name,bba_importance
0,S_PC,0.418262
1,Mean_H_entropy,0.418262
2,Mean_L_entropy,0.418262
3,Mean_E_entropy,0.418262
4,Mean_res_entropy,0.418262
5,SumH_entropies,0.418262
6,SumL_entropies,0.418262
7,SumE_entropies,0.418262
8,H_max_entropy,0.418262
9,H_min_entropy,0.418262


## RandomForestRegressor

In [42]:
# Random forest regressor with library-default hyperparameters. The seed is
# fixed because the forest's bootstrap sampling and feature sub-sampling are
# random; without it, repeated runs give different models and scores.
rfr = RandomForestRegressor(random_state=0)