In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector, OneHotEncoder
from sklearn import metrics

In [2]:
# NOTE: Make sure that the class is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
tpot_data = pd.read_csv('rnaSeqMDD.csv', index_col=0, header=0)
tpot_data =tpot_data.astype(np.float64)
features = tpot_data.drop('phenotype', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['phenotype'], random_state=1618)

# Average CV score on the training set was:0.7259130434782609
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=12, subset_list="module23.csv"),
    OneHotEncoder(minimum_fraction=0.1, sparse=False, threshold=10),
    ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=0.05, min_samples_leaf=8, min_samples_split=8, n_estimators=100)
)

In [3]:
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('datasetselector', DatasetSelector(sel_subset=12, subset_list='module23.csv')), ('onehotencoder', OneHotEncoder(categorical_features=[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, Fals...imators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [4]:
print(exported_pipeline.steps[0][1].feat_list)

['LSR', 'TSSK4', 'PRR3', 'SNX22', 'MRPS7', 'ZNF24', 'TEAD2', 'CLECL1', 'VTN', 'PPAPDC1B', 'MIR497HG', 'SYTL1', 'BCL9', 'RGL4', 'GDF9', 'RDH13', 'LOC101929010', 'ZNF397', 'PAXBP1', 'DOPEY1', 'TOMM70A', 'DISP2', 'FBXL16', 'KCNT1', 'THBS4', 'MICAL1', 'CISD2', 'SRPK3', 'NSUN5P2', 'PROC', 'SIK2', 'UBE2D4', 'WDR5B', 'CCDC28B', 'PLXNC1', 'TBC1D10C', 'TSSK3', 'LOC100132356', 'TMPRSS6', 'FNDC9', 'LCAT', 'PCDH12', 'GPT', 'ARPC3', 'MXRA8', 'GFM2', 'FARP2', 'PEX5L', 'LRRC8B', 'MIR1282', 'NOP16', 'LRP2BP', 'RHOF', 'EIF4ENIF1', 'PFKP', 'IQCC', 'NUTM2G', 'KCNH3', 'MYRIP', 'PDZK1', 'SEMA6A', 'CROCCP3', 'SLFNL1', 'VPS52', 'WHSC1L1', 'SLC1A4', 'ZNF154', 'MTHFD2', 'SCN8A', 'C15orf40', 'SLC24A1', 'COA7', 'SPTBN2', 'NAALAD2', 'CASP8', 'NCOR1', 'CEMIP', 'BAP1', 'RPS6KA2', 'SLC34A3', 'SNX11', 'PLCE1', 'CASKIN1', 'TIMM17B', 'FAM86A', 'EML5', 'PAPSS2', 'NUP188', 'ZNF233', 'ABTB2', 'ZBED4', 'CABP7', 'LOC101927211', 'SNED1', 'SSBP3', 'ORM2', 'DDX58', 'LOC101928943', 'RNF167', 'FUCA2', 'BORA', 'JPH3', 'FAR2', 'LO

In [5]:
exported_pipeline.steps[-1][1].feature_importances_

array([0.00738432, 0.0048871 , 0.00450648, 0.00555164, 0.00341666,
       0.00184246, 0.00386489, 0.00258396, 0.00165217, 0.00029053,
       0.00933002, 0.00171829, 0.00853102, 0.01280829, 0.00933864,
       0.00518477, 0.00470579, 0.004309  , 0.00735147, 0.0034345 ,
       0.00610977, 0.02464003, 0.00489563, 0.01051103, 0.00347736,
       0.00094209, 0.00840397, 0.00818802, 0.00398917, 0.00560028,
       0.0154187 , 0.00296717, 0.01590904, 0.00645005, 0.00917275,
       0.01598655, 0.02202255, 0.00362537, 0.00603207, 0.00882028,
       0.01383739, 0.00488203, 0.02364595, 0.00195361, 0.01884132,
       0.00880938, 0.00461281, 0.00378467, 0.00126554, 0.00558512,
       0.03908593, 0.00093542, 0.00786287, 0.00253368, 0.00656192,
       0.00266874, 0.01922   , 0.00476643, 0.00060716, 0.00602495,
       0.01001444, 0.00480421, 0.00891672, 0.00792756, 0.01580901,
       0.00317605, 0.01783115, 0.00608553, 0.02207125, 0.00870516,
       0.00469621, 0.00371708, 0.01506411, 0.00296731, 0.00783

In [6]:
results = exported_pipeline.predict_proba(testing_features)

In [7]:
feat_imp = pd.DataFrame({'feat': exported_pipeline.steps[0][1].feat_list, 
            'score': exported_pipeline.steps[-1][1].feature_importances_})

In [8]:
feat_imp.to_csv("featureImp13.csv")

In [9]:
pd.DataFrame({'y':testing_target, 'ypred':results[:,1]}).to_csv("predictions13.csv", index=False)

In [10]:
fpr, tpr, thresholds = metrics.roc_curve(testing_target, results[:,1])
metrics.auc(fpr, tpr)

0.6313131313131314