In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from tpot.builtins import DatasetSelector

In [2]:
# NOTE: Make sure that the class is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
tpot_data = pd.read_csv('rnaSeqMDD.csv', index_col=0, header=0)
tpot_data =tpot_data.astype(np.float64)
features = tpot_data.drop('phenotype', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['phenotype'], random_state=161803)

# Average CV score on the training set was:0.7259130434782609
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=4, subset_list="module23.csv"),
    MaxAbsScaler(),
    ExtraTreesClassifier(bootstrap=True, criterion="entropy", max_features=0.4, min_samples_leaf=3, min_samples_split=3, n_estimators=100)
)

In [3]:
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('datasetselector', DatasetSelector(sel_subset=4, subset_list='module23.csv')), ('maxabsscaler', MaxAbsScaler(copy=True)), ('extratreesclassifier', ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.4, max_leaf_nodes=None,
  ...imators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [8]:
print(exported_pipeline.steps[0][1].feat_list)

['LOC101241902', 'ATAD5', 'HMBOX1', 'PTPN14', 'MIR3661', 'FDPS', 'MPL', 'CARD14', 'SNX21', 'TP53I3', 'NFYC', 'CAMK1', 'LOC101929767', 'HSPD1', 'PPP1R3B', 'UTP14A', 'USP34', 'FAM98B', 'TMEM217', 'MCTP1', 'CALB1', 'ZNF865', 'TAC4', 'ZNF407', 'TNFAIP8L1', 'NUDT5', 'SIGLEC16', 'DARS2', 'GUCY1B2', 'RUSC1', 'UBE2E1', 'ZC3H8', 'SPAG4', 'ITIH2', 'ZNF552', 'PIGZ', 'PSMD5', 'MIR635', 'TFIP11', 'RAD17', 'DCDC2B', 'PGM3', 'THADA', 'TBC1D8', 'CFD', 'MSL1', 'FOXM1', 'PRPF38A', 'UHRF1', 'FAM13A', 'MRS2', 'NT5C2', 'PPM1N', 'DCLRE1C', 'METTL25', 'RNF113A', 'CLEC2B', 'LOC646471', 'DPP7', 'CCDC104', 'MLLT10P1', 'GPR137', 'MYO19', 'ZNF57', 'PCP4L1', 'ALS2CR12', 'MARCH1', 'SBF1', 'UBTF', 'FKBP4', 'MMP19', 'REV3L', 'FAM86HP', 'HECTD1', 'LGALS8', 'SLC25A25', 'LENG9', 'ZNF506', 'ENTPD3', 'NUDT13', 'BMP8A', 'RRP7A', 'CD160', 'LOC100129148', 'TCF7L1', 'CNTRL', 'B9D2', 'BAD', 'NDUFA2', 'RNF149', 'PDE6C', 'OSGEPL1', 'PCCA', 'LOC102724246', 'MSTO1', 'NREP', 'TAF3', 'KDELR3', 'GPR18', 'NFE2L3', 'PLEKHB1', 'EGLN1', 

In [4]:
exported_pipeline.steps[-1][1].feature_importances_

array([0.00171307, 0.00461673, 0.00451463, 0.00468596, 0.00214541,
       0.00083695, 0.00117561, 0.00150725, 0.00194034, 0.00441193,
       0.00178796, 0.00323788, 0.00510511, 0.00168073, 0.0073903 ,
       0.00359235, 0.00174137, 0.00248937, 0.00265686, 0.        ,
       0.00178733, 0.00420438, 0.00862721, 0.00206305, 0.00330247,
       0.0043915 , 0.        , 0.00107515, 0.00186621, 0.00239664,
       0.01444189, 0.00258658, 0.0035314 , 0.00293179, 0.00288591,
       0.00212998, 0.02768891, 0.00141359, 0.00204136, 0.00208141,
       0.00467248, 0.00733866, 0.00292945, 0.00053519, 0.00098887,
       0.00170817, 0.00373915, 0.00236595, 0.0021229 , 0.00899772,
       0.00089884, 0.00464089, 0.00146718, 0.0017541 , 0.0022918 ,
       0.00141004, 0.00669811, 0.00512765, 0.00321898, 0.00113051,
       0.0005926 , 0.00564184, 0.00333274, 0.00342095, 0.00263959,
       0.00134803, 0.00091001, 0.00330588, 0.00407871, 0.00154361,
       0.00412519, 0.00176203, 0.00381653, 0.00237026, 0.00130

In [5]:
results = exported_pipeline.predict(testing_features)

In [6]:
results

array([0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1.,
       1., 0., 0., 1., 0., 0.])

In [23]:
feat_imp = pd.DataFrame({'feat': exported_pipeline.steps[0][1].feat_list, 
            'score': exported_pipeline.steps[-1][1].feature_importances_})

In [24]:
feat_imp.to_csv("featureImp5.csv")