In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector, OneHotEncoder
from sklearn import metrics
from tpot import TPOTClassifier

In [2]:
# Load the data table from 'rnaSeqMDD.csv' (first CSV column becomes the row
# index) and cast every column to float64. The class label is read from the
# 'phenotype' column -- not 'target' as the original template note suggested.
tpot_data = pd.read_csv('rnaSeqMDD.csv', index_col=0, header=0).astype(np.float64)
features = tpot_data.drop(columns='phenotype')

# Hold out a test split; the fixed seed keeps the partition reproducible.
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
    features, tpot_data['phenotype'], random_state=1618)

# Average CV score on the training set was: 0.7259130434782609
exported_pipeline = make_pipeline(
    # Step 1: keep only the feature subset selected from "module23.csv".
    DatasetSelector(sel_subset=12, subset_list="module23.csv"),
    # Step 2: TPOT's one-hot encoder.
    OneHotEncoder(minimum_fraction=0.1, sparse=False, threshold=10),
    # Step 3: the classifier whose feature importances are inspected later.
    ExtraTreesClassifier(
        bootstrap=False,
        criterion="entropy",
        max_features=0.05,
        min_samples_leaf=8,
        min_samples_split=8,
        n_estimators=100,
    ),
)

In [3]:
# A throwaway TPOTClassifier is created only to reach its helper that sets a
# parameter on the pipeline steps; the fitted pipeline's repr shows
# random_state=1618 on the final estimator as a result.
tpot_obj = TPOTClassifier()
tpot_obj._set_param_recursive(
    exported_pipeline.steps,
    'random_state',
    1618,
)

# Fit on the training split. Kept as the last expression so the notebook
# displays the fitted Pipeline.
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('datasetselector', DatasetSelector(sel_subset=12, subset_list='module23.csv')), ('onehotencoder', OneHotEncoder(categorical_features=[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, Fals...tors=100, n_jobs=None,
           oob_score=False, random_state=1618, verbose=0, warm_start=False))])

In [4]:
# Show the feature names retained by the first pipeline step; it is looked up
# by the name make_pipeline gave it ('datasetselector').
print(exported_pipeline.named_steps['datasetselector'].feat_list)

['PRH2', 'MICAL1', 'FAM193B', 'SLFNL1', 'ZNF24', 'TRAF3IP2', 'TEAD2', 'WDR5B', 'TOMM34', 'CEMIP', 'PAPSS2', 'SNX11', 'LRP2BP', 'TSSK3', 'NUTM2G', 'DDX58', 'SNX22', 'GFM2', 'LRRN2', 'PODXL', 'FAM86A', 'NUP188', 'TSSK4', 'SLC34A3', 'MXRA8', 'FAR2', 'LOC101929010', 'MIR1282', 'PEX5L', 'RPS6KA2', 'ZNF397', 'ZHX3', 'MIR497HG', 'ZNF154', 'SH3BP5', 'H1FNT', 'LSR', 'PRR3', 'KCNH3', 'ZNF213', 'CASP8', 'METTL12', 'WHSC1L1', 'LOC100132356', 'SPTBN2', 'SLC1A4', 'LRRC8B', 'EIF4ENIF1', 'MYRIP', 'TIMM17B', 'TRIM17', 'ORM2', 'DISP2', 'VTN', 'RNASEH2C', 'FBXL16', 'JPH3', 'LOC100996251', 'FUCA2', 'CISD2', 'NOP16', 'ACAT1', 'DOPEY1', 'KCNT1', 'RDH13', 'NIF3L1', 'NCOR1', 'ABTB2', 'PPP1R16A', 'NAALAD2', 'RHOF', 'MRPS7', 'ARPC3', 'CCDC28B', 'TUB', 'ERMAP', 'SLC24A1', 'CROCCP3', 'LOC101928943', 'BCL9', 'SIK2', 'EML5', 'LOC100506469', 'SYTL1', 'THBS4', 'BYSL', 'BORA', 'UBE2D4', 'FARP2', 'PLXNC1', 'VPS52', 'SNED1', 'PFKP', 'ZBED4', 'RAD9A', 'GDF9', 'TOMM70A', 'CASKIN1', 'MTHFD2', 'C15orf40', 'SRPK3', 'SSBP3', 

In [5]:
# Importances from the final pipeline step (the ExtraTreesClassifier), looked
# up by the lowercase class name that make_pipeline assigns to it.
exported_pipeline.named_steps['extratreesclassifier'].feature_importances_

array([1.10552882e-02, 1.09108413e-03, 3.73529488e-03, 1.81416925e-02,
       6.14324869e-03, 1.04922722e-03, 2.91135916e-03, 2.18761360e-03,
       8.84466655e-03, 8.71681987e-03, 6.63151414e-03, 3.78299864e-03,
       8.31062192e-04, 1.24730007e-02, 1.09136612e-02, 2.81924919e-03,
       6.26609227e-03, 2.84021632e-03, 2.66934616e-02, 7.30935524e-03,
       1.81397630e-03, 6.32277358e-03, 4.29470925e-03, 1.91798377e-02,
       3.61927161e-02, 5.53949829e-03, 1.56447789e-03, 9.61824969e-03,
       3.45720676e-04, 7.69995126e-03, 5.82248161e-03, 4.96135538e-03,
       6.63861376e-03, 1.19324688e-02, 1.21136652e-02, 3.62769752e-04,
       1.62684303e-02, 5.70584421e-03, 4.28475582e-03, 1.33330493e-02,
       5.72654793e-03, 7.85291936e-03, 6.32147580e-03, 2.51073080e-03,
       7.90895374e-03, 2.32347941e-03, 2.67659091e-03, 4.96859742e-03,
       2.01510795e-03, 5.00997425e-04, 1.35392197e-02, 1.31499352e-02,
       3.62528034e-03, 2.60009760e-03, 6.06943467e-04, 2.19195832e-03,
      

In [6]:
# Class-probability predictions for the held-out split; later cells read
# column 1 of this array as the score for the positive class.
results = exported_pipeline.predict_proba(X=testing_features)

In [7]:
# Build a (feature name, importance) table.
#
# The names come from the first pipeline step and the scores from the last,
# and the two are paired purely by position. A OneHotEncoder sits between
# those steps, so the pairing is only trustworthy if that encoder left the
# columns untouched. Check this explicitly instead of silently mislabelling
# features or failing later with an opaque pandas length error.
_selected_names = exported_pipeline.steps[0][1].feat_list
_importances = exported_pipeline.steps[-1][1].feature_importances_

# After fitting, the encoder's `categorical_features` is a per-column boolean
# mask (see the fitted pipeline repr); any True entry means that column was
# one-hot encoded, which presumably changes column count/order -- see the
# TPOT OneHotEncoder reference.
_encoded_mask = exported_pipeline.steps[1][1].categorical_features
if np.any(_encoded_mask):
    raise ValueError(
        "OneHotEncoder encoded at least one column, so feature names can no "
        "longer be matched to feature_importances_ by position."
    )
if len(_selected_names) != len(_importances):
    raise ValueError(
        "Selected %d features but the classifier reports %d importances."
        % (len(_selected_names), len(_importances))
    )

feat_imp = pd.DataFrame({'feat': _selected_names, 'score': _importances})

In [8]:
# Persist the feature-importance table (the default integer index is written
# as the first CSV column).
feat_imp.to_csv(path_or_buf="featureImp13.csv")

In [9]:
# Save true labels next to the predicted score from column 1 of `results`;
# the row index is not written to the file.
(
    pd.DataFrame({'y': testing_target, 'ypred': results[:, 1]})
    .to_csv("predictions13.csv", index=False)
)

In [10]:
# ROC curve on the held-out split, scored with column 1 of `results`.
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=testing_target,
    y_score=results[:, 1],
)

# Area under that curve; left as the last expression so the cell displays it.
metrics.auc(x=fpr, y=tpr)

0.5883838383838383