In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from tpot.builtins import DatasetSelector
from sklearn import metrics
from tpot import TPOTClassifier

In [2]:
# Read the expression matrix: the first CSV column becomes the row index, the
# first row supplies the column names, and every value is cast to float64.
# NOTE: the class label is stored in the 'phenotype' column of the data file.
tpot_data = pd.read_csv('rnaSeqMDD.csv', index_col=0, header=0).astype(np.float64)

# Predictors are every column except the label. The split is seeded so the
# train/test partition is the same on every run.
features = tpot_data.drop(columns='phenotype')
(training_features,
 testing_features,
 training_target,
 testing_target) = train_test_split(
    features, tpot_data['phenotype'], random_state=1618
)

# Average CV score on the training set was:0.7259130434782609
exported_pipeline = make_pipeline(
    # Keep only the features listed for subset 4 of "module23.csv".
    DatasetSelector(sel_subset=4, subset_list="module23.csv"),
    MaxAbsScaler(),
    ExtraTreesClassifier(
        n_estimators=100,
        criterion="entropy",
        bootstrap=True,
        max_features=0.4,
        min_samples_leaf=3,
        min_samples_split=3,
    ),
)

In [3]:
# Seed the one pipeline step that exposes a `random_state` parameter so the
# fit is reproducible. This goes through scikit-learn's public nested
# `set_params` API (make_pipeline names the step 'extratreesclassifier')
# instead of TPOT's private `_set_param_recursive` helper, which required
# instantiating an otherwise unused TPOTClassifier.
exported_pipeline.set_params(extratreesclassifier__random_state=1618)

# Fit on the training split. As the last expression of the cell, the fitted
# Pipeline is what the notebook displays as the cell output.
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('datasetselector', DatasetSelector(sel_subset=4, subset_list='module23.csv')), ('maxabsscaler', MaxAbsScaler(copy=True)), ('extratreesclassifier', ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.4, max_leaf_nodes=None,
  ...tors=100, n_jobs=None,
           oob_score=False, random_state=1618, verbose=0, warm_start=False))])

In [4]:
# Show the feature names held by the fitted DatasetSelector (first pipeline step).
print(exported_pipeline.named_steps['datasetselector'].feat_list)

['S100A1', 'ARHGAP27', 'RPUSD3', 'WARS2', 'CACNB2', 'SPAG4', 'GUCY1B2', 'GLE1', 'WIPI1', 'CFD', 'CARD14', 'PCCA', 'OXR1', 'MMP19', 'CLEC2B', 'LOC100379224', 'SIGLEC16', 'ZNF57', 'KMT2C', 'TCF7L1', 'PPARA', 'DRG1', 'PRPF3', 'TRAF1', 'PCP4L1', 'ZNF552', 'SNRPD2', 'SLC25A25', 'ALG12', 'LOC101927275', 'PIGZ', 'THADA', 'PPM1N', 'ZNF83', 'ANXA6', 'SKIV2L', 'MEGF11', 'CAMK1', 'BAD', 'LOC101927901', 'LOC100129148', 'GALNT10', 'UBTF', 'NDUFAF2', 'RPRD1A', 'MROH6', 'COA5', 'TMEM263', 'TNFAIP8L1', 'DCLRE1C', 'RBM4B', 'RUSC1', 'C19orf68', 'GOLGA8H', 'PDE6C', 'LHX4', 'UTP14A', 'ENTPD3', 'TAC4', 'BSN', 'PGM3', 'LOC102723885', 'CREB1', 'SENCR', 'AQP11', 'STX17', 'CNTRL', 'CALB1', 'CUL4A', 'TFPT', 'NFYC', 'C9orf169', 'FAM103A1', 'HACL1', 'GPM6B', 'ATP5H', 'HEXIM1', 'AKAP5', 'ATG14', 'FAS', 'GNPTG', 'TP53I3', 'TNK1', 'DOT1L', 'CCDC181', 'FKBP4', 'OSGEPL1', 'SNX21', 'MARCH1', 'LYG2', 'CRLF3', 'PRPF38A', 'NT5C2', 'ZNF865', 'LGALS8', 'ZNF407', 'FOXP3', 'TXLNA', 'GALNS', 'DHODH', 'NFE2L3', 'ZNF646', 'ALOX1

In [5]:
# Importance scores from the fitted ExtraTreesClassifier (final pipeline step).
exported_pipeline.named_steps['extratreesclassifier'].feature_importances_

array([0.00373921, 0.00166101, 0.01831089, 0.00408702, 0.00249836,
       0.00058685, 0.00110793, 0.00131406, 0.00105909, 0.00195426,
       0.00499914, 0.00330212, 0.01920895, 0.01571573, 0.01270085,
       0.00328586, 0.00449239, 0.00172488, 0.00301095, 0.00420038,
       0.00434121, 0.0012803 , 0.00363005, 0.00230586, 0.00678344,
       0.001287  , 0.00360235, 0.00159611, 0.00027465, 0.0028394 ,
       0.00122492, 0.005076  , 0.00183827, 0.0028213 , 0.00466706,
       0.00228003, 0.00486999, 0.00404167, 0.00357327, 0.00294281,
       0.00154707, 0.00095279, 0.00204652, 0.00124853, 0.00629682,
       0.00962159, 0.00137508, 0.0051312 , 0.00327271, 0.00081084,
       0.003017  , 0.00220401, 0.00053113, 0.00564533, 0.00066824,
       0.0198139 , 0.00717809, 0.00055361, 0.00789731, 0.00175059,
       0.00439174, 0.00281088, 0.00410321, 0.00383826, 0.00777076,
       0.00881305, 0.00370055, 0.00255346, 0.00129993, 0.01258795,
       0.00274861, 0.00644344, 0.00083281, 0.00333491, 0.00072

In [6]:
# Class-probability estimates for the held-out test split (one row per sample).
results = exported_pipeline.predict_proba(testing_features)
# Bare expression so the notebook displays the array as the cell output.
results

array([[0.28849206, 0.71150794],
       [0.71278571, 0.28721429],
       [0.32866667, 0.67133333],
       [0.25847619, 0.74152381],
       [0.78915476, 0.21084524],
       [0.34344048, 0.65655952],
       [0.49583333, 0.50416667],
       [0.37552417, 0.62447583],
       [0.36893254, 0.63106746],
       [0.78603175, 0.21396825],
       [0.43585714, 0.56414286],
       [0.76822222, 0.23177778],
       [0.60864286, 0.39135714],
       [0.74984091, 0.25015909],
       [0.60344048, 0.39655952],
       [0.44616667, 0.55383333],
       [0.43606385, 0.56393615],
       [0.59539286, 0.40460714],
       [0.63633333, 0.36366667],
       [0.43540909, 0.56459091],
       [0.24166667, 0.75833333],
       [0.49658333, 0.50341667],
       [0.39922619, 0.60077381],
       [0.39537229, 0.60462771],
       [0.30154365, 0.69845635],
       [0.61553175, 0.38446825],
       [0.51177778, 0.48822222],
       [0.42375758, 0.57624242],
       [0.3739329 , 0.6260671 ],
       [0.58244048, 0.41755952],
       [0.

In [7]:
# Pair each selected feature name with the classifier's importance score for it.
feat_imp = pd.DataFrame({
    'feat': exported_pipeline.named_steps['datasetselector'].feat_list,
    'score': exported_pipeline.named_steps['extratreesclassifier'].feature_importances_,
})

In [8]:
# Save the feature/score table to disk; the DataFrame index is written as the
# first CSV column because `index=False` is not passed.
feat_imp.to_csv("featureImp5.csv")

In [9]:
# Save the true test labels next to the second probability column of `results`;
# the row index is left out of the file.
pd.DataFrame(
    {'y': testing_target, 'ypred': results[:, 1]}
).to_csv("predictions5.csv", index=False)

In [10]:
# ROC curve on the test split, scored with the second probability column.
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=testing_target,
    y_score=results[:, 1],
)
# Area under that curve; displayed as the cell output.
metrics.auc(x=fpr, y=tpr)

0.7020202020202021