In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from tpot.builtins import DatasetSelector
from sklearn import metrics

In [2]:
# NOTE: Make sure that the class is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
tpot_data = pd.read_csv('rnaSeqMDD.csv', index_col=0, header=0)
tpot_data =tpot_data.astype(np.float64)
features = tpot_data.drop('phenotype', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['phenotype'], random_state=1618)

# Average CV score on the training set was:0.7259130434782609
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=4, subset_list="module23.csv"),
    MaxAbsScaler(),
    ExtraTreesClassifier(bootstrap=True, criterion="entropy", max_features=0.4, min_samples_leaf=3, min_samples_split=3, n_estimators=100)
)

In [3]:
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('datasetselector', DatasetSelector(sel_subset=4, subset_list='module23.csv')), ('maxabsscaler', MaxAbsScaler(copy=True)), ('extratreesclassifier', ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.4, max_leaf_nodes=None,
  ...imators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [4]:
print(exported_pipeline.steps[0][1].feat_list)

['USP34', 'SPAG4', 'CALB1', 'GPR18', 'C9orf169', 'CREB1', 'PRPF38A', 'FAM13A', 'RPRD1A', 'DNAH1', 'CD160', 'TNK1', 'UHRF1', 'TTPAL', 'LOC646471', 'LOC100379224', 'PRKX', 'A2MP1', 'TMEM259', 'NREP', 'TRIM7', 'C6orf136', 'PARD6G', 'FTSJ3', 'JAM3', 'NFYC', 'REV3L', 'LOC101928063', 'MSTO1', 'SLC13A4', 'CACNB2', 'RHOQ', 'TACC1', 'CD72', 'ZNF552', 'ATP5H', 'PRPF3', 'SENP1', 'GTF2F1', 'TFIP11', 'LOC729732', 'STX17', 'HSPD1', 'REEP5', 'SKIV2L', 'DCLRE1C', 'CNTRL', 'TMEM217', 'PPWD1', 'UTP14A', 'ZNF814', 'ENTPD3', 'GNPTG', 'DARS2', 'NUDT5', 'PLEKHB1', 'ZNHIT3', 'ZNF805', 'SLC48A1', 'TMEM131', 'ATP2C2', 'FAS', 'COL4A4', 'ZNF544', 'GOLGA8H', 'FXR2', 'B9D2', 'GPR137', 'ERCC6L', 'LRP11', 'SAFB', 'C11orf42', 'ATG14', 'DHODH', 'DCDC2B', 'FAM86HP', 'GSTO1', 'PXK', 'TXLNA', 'DRG1', 'PPARA', 'PDE6C', 'GLE1', 'PROX2', 'ANAPC15', 'NDUFA2', 'GPM6B', 'TCF7L1', 'PCP4L1', 'ECT2L', 'ZNF865', 'MPL', 'RUFY3', 'PCCA', 'MCTP1', 'TNFAIP8L1', 'C1orf146', 'KMT2C', 'TRAF1', 'MRS2', 'NR2C2', 'GSE1', 'ARHGAP27', 'KDELR3

In [5]:
exported_pipeline.steps[-1][1].feature_importances_

array([0.01991641, 0.00360656, 0.00886395, 0.00439832, 0.00196295,
       0.00502269, 0.00483326, 0.01287379, 0.00680362, 0.        ,
       0.0018355 , 0.00439417, 0.00392417, 0.00157504, 0.00234971,
       0.00111453, 0.00574769, 0.00221801, 0.00414208, 0.00184699,
       0.0008995 , 0.00376232, 0.00229739, 0.00745691, 0.00362571,
       0.00283217, 0.00425491, 0.00239002, 0.00074589, 0.00220399,
       0.00502764, 0.00633839, 0.00141598, 0.00250764, 0.00359786,
       0.00388314, 0.00050359, 0.00177911, 0.00173426, 0.00205796,
       0.00258387, 0.00375539, 0.00277546, 0.00111043, 0.00122577,
       0.        , 0.00249182, 0.00387704, 0.00124792, 0.00854044,
       0.00153996, 0.00045052, 0.00058762, 0.00223733, 0.00385473,
       0.00237344, 0.00318229, 0.00156084, 0.00213517, 0.00328378,
       0.00297868, 0.00198907, 0.00062305, 0.00360559, 0.00343588,
       0.00125267, 0.        , 0.00195879, 0.00056907, 0.00415843,
       0.00024491, 0.00134641, 0.00178797, 0.        , 0.00793

In [6]:
results = exported_pipeline.predict_proba(testing_features)
results

array([[0.43302778, 0.56697222],
       [0.69330952, 0.30669048],
       [0.37064286, 0.62935714],
       [0.34458333, 0.65541667],
       [0.69472619, 0.30527381],
       [0.41180952, 0.58819048],
       [0.42347222, 0.57652778],
       [0.38197222, 0.61802778],
       [0.34455556, 0.65544444],
       [0.69352381, 0.30647619],
       [0.31869048, 0.68130952],
       [0.7495873 , 0.2504127 ],
       [0.50478571, 0.49521429],
       [0.78610714, 0.21389286],
       [0.66205556, 0.33794444],
       [0.36316667, 0.63683333],
       [0.32953571, 0.67046429],
       [0.52169048, 0.47830952],
       [0.70419048, 0.29580952],
       [0.40188889, 0.59811111],
       [0.19893651, 0.80106349],
       [0.56521429, 0.43478571],
       [0.40719048, 0.59280952],
       [0.45623016, 0.54376984],
       [0.32878968, 0.67121032],
       [0.55039286, 0.44960714],
       [0.55058333, 0.44941667],
       [0.40999206, 0.59000794],
       [0.36936905, 0.63063095],
       [0.547     , 0.453     ],
       [0.

In [7]:
feat_imp = pd.DataFrame({'feat': exported_pipeline.steps[0][1].feat_list, 
            'score': exported_pipeline.steps[-1][1].feature_importances_})

In [8]:
feat_imp.to_csv("featureImp5.csv")

In [9]:
pd.DataFrame({'y':testing_target, 'ypred':results[:,1]}).to_csv("predictions5.csv", index=False)

In [10]:
fpr, tpr, thresholds = metrics.roc_curve(testing_target, results[:,1])
metrics.auc(fpr, tpr)

0.7272727272727273