In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector, OneHotEncoder

In [2]:
# NOTE: Make sure that the class is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
tpot_data = pd.read_csv('rnaSeqMDD.csv', index_col=0, header=0)
tpot_data =tpot_data.astype(np.float64)
features = tpot_data.drop('phenotype', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['phenotype'], random_state=161803)

# Average CV score on the training set was:0.7259130434782609
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=12, subset_list="module23.csv"),
    OneHotEncoder(minimum_fraction=0.1, sparse=False, threshold=10),
    ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=0.05, min_samples_leaf=8, min_samples_split=8, n_estimators=100)
)

In [3]:
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('datasetselector', DatasetSelector(sel_subset=12, subset_list='module23.csv')), ('onehotencoder', OneHotEncoder(categorical_features=[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, Fals...imators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [4]:
print(exported_pipeline.steps[0][1].feat_list)

['PAXBP1', 'IQCC', 'NUP188', 'LRP2BP', 'PLCE1', 'RDH13', 'FNDC9', 'BAP1', 'RNF167', 'TRIM17', 'KCNH3', 'DDX58', 'FARP2', 'PLXNC1', 'LOC100132356', 'UBE2D4', 'BORA', 'BCL9', 'TSSK4', 'SPTBN2', 'DOPEY1', 'FAR2', 'TSSK3', 'EML5', 'SRPK3', 'PAPSS2', 'FMNL2', 'RAD9A', 'MYRIP', 'KCNT1', 'CEMIP', 'BYSL', 'LRRC8B', 'ZNF233', 'ZHX3', 'SEMA6A', 'LCAT', 'MTHFD2', 'WHSC1L1', 'LRRN2', 'SCN8A', 'LOC100996251', 'FAM193B', 'PCDH12', 'TOMM34', 'TEAD2', 'NOP16', 'VTN', 'CASP8', 'MICAL1', 'TOMM70A', 'RNASEH2C', 'LOC100506469', 'CCDC28B', 'GPT', 'NIF3L1', 'WDR5B', 'METTL12', 'FBXL16', 'MIR1282', 'GFM2', 'FUCA2', 'SYTL1', 'RHOF', 'TIMM17B', 'TUB', 'TBC1D10C', 'PEX5L', 'JPH3', 'SSBP3', 'SLC24A1', 'DGKQ', 'DPY19L2P2', 'NAALAD2', 'VPS52', 'TRAF3IP2', 'SNX22', 'CASKIN1', 'ORM2', 'PODXL', 'PPP1R16A', 'NCOR1', 'CROCCP3', 'EIF4ENIF1', 'PPAPDC1B', 'CISD2', 'H1FNT', 'SNX11', 'COA7', 'LOC101928943', 'SLFNL1', 'RPS6KA2', 'LOC101927211', 'MRPS7', 'SIK2', 'ZNF397', 'ARPC3', 'MXRA8', 'PRR3', 'C15orf40', 'ZNF213', 'ACAT1

In [5]:
exported_pipeline.steps[-1][1].feature_importances_

array([0.01635466, 0.01319688, 0.00471392, 0.00065353, 0.00043133,
       0.0033827 , 0.00784433, 0.00601431, 0.01702124, 0.00348735,
       0.00725501, 0.01249049, 0.00509962, 0.01939634, 0.00248395,
       0.0052077 , 0.00380103, 0.00536525, 0.00391898, 0.00366037,
       0.00243446, 0.00220728, 0.01775068, 0.00214831, 0.0032709 ,
       0.01073433, 0.00698201, 0.00285576, 0.00210634, 0.00584499,
       0.01402457, 0.00619465, 0.00134987, 0.00393446, 0.00411075,
       0.01982223, 0.01413088, 0.00290513, 0.00278306, 0.01974749,
       0.008806  , 0.000195  , 0.00144436, 0.00182034, 0.00091571,
       0.00619909, 0.01520571, 0.00500785, 0.00210366, 0.00061228,
       0.00220086, 0.00401822, 0.01168023, 0.00618071, 0.00807882,
       0.00405305, 0.01341466, 0.00417444, 0.00202111, 0.00328019,
       0.00223049, 0.00191604, 0.00067592, 0.0109131 , 0.00341344,
       0.00800704, 0.0114696 , 0.00365332, 0.00721481, 0.00104428,
       0.00436513, 0.00362481, 0.0043322 , 0.00100215, 0.00742

In [6]:
results = exported_pipeline.predict(testing_features)

In [8]:
feat_imp = pd.DataFrame({'feat': exported_pipeline.steps[0][1].feat_list, 
            'score': exported_pipeline.steps[-1][1].feature_importances_})

In [9]:
feat_imp.to_csv("featureImp12.csv")