In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from tpot.builtins import DatasetSelector
from sklearn import metrics
from tpot import TPOTClassifier

In [2]:
# Load rnaSeqMDD.csv (first column is the row index, first row is the header)
# and cast every column to float64. The class label is the 'phenotype' column.
tpot_data = pd.read_csv('rnaSeqMDD.csv', index_col=0, header=0).astype(np.float64)

# Everything except the label column is used as a predictor.
features = tpot_data.drop(columns='phenotype')

# Fixed random_state keeps the train/test partition reproducible.
training_features, testing_features, training_target, testing_target = train_test_split(
    features,
    tpot_data['phenotype'],
    random_state=1618,
)

# Average CV score on the training set was:0.7259130434782609
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=4, subset_list="module23.csv"),
    MaxAbsScaler(),
    ExtraTreesClassifier(
        bootstrap=True,
        criterion="entropy",
        max_features=0.4,
        min_samples_leaf=3,
        min_samples_split=3,
        n_estimators=100,
    ),
)

In [3]:
# Seed the pipeline through scikit-learn's public `set_params` API rather than
# building a throwaway TPOTClassifier just to call its private
# `_set_param_recursive` helper. The ExtraTreesClassifier step is the only one
# in this pipeline that takes a `random_state`, so it is addressed by its
# make_pipeline-generated name.
exported_pipeline.set_params(extratreesclassifier__random_state=1618)

# Fit on the training split; as the last expression, the fitted pipeline's
# repr is shown as the cell output.
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('datasetselector', DatasetSelector(sel_subset=4, subset_list='module23.csv')), ('maxabsscaler', MaxAbsScaler(copy=True)), ('extratreesclassifier', ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.4, max_leaf_nodes=None,
  ...tors=100, n_jobs=None,
           oob_score=False, random_state=1618, verbose=0, warm_start=False))])

In [4]:
# Show the `feat_list` recorded on the fitted DatasetSelector step
# (the first step of the pipeline), looked up by its step name.
dataset_selector = exported_pipeline.named_steps['datasetselector']
print(dataset_selector.feat_list)

['OXR1', 'GTF2F1', 'TFIP11', 'CFD', 'PTPN14', 'TAC4', 'ZNF544', 'NT5C2', 'KDELR3', 'DRG1', 'FAM13A', 'GSE1', 'PRPF3', 'SLC26A2', 'FOXM1', 'MYO19', 'MIR635', 'GPR18', 'TACC1', 'BTBD2', 'REEP5', 'TUBB1', 'LHX4', 'C2orf88', 'TAF3', 'EVA1C', 'LOC101929767', 'DHODH', 'COQ6', 'PIGZ', 'MYO5B', 'CUL4A', 'ARAP1', 'LYG2', 'AKT1', 'MMP19', 'BCS1L', 'GPR155', 'RNF224', 'THADA', 'PSMD7', 'TBC1D8', 'ITIH2', 'TCF7L1', 'JAM3', 'PCP4L1', 'TRIM7', 'PXK', 'SLC25A25', 'ECT2L', 'RBM4B', 'TMEM221', 'LOC102724246', 'CD160', 'SEC13', 'REV3L', 'MCTP1', 'CREB1', 'JPX', 'DOT1L', 'NUDT13', 'MEGF11', 'GOLGA8H', 'TXLNA', 'CCT6P3', 'LARS', 'LRRC37A6P', 'INTS4L2', 'UBE2E1', 'SKIV2L', 'AKAP5', 'METTL25', 'TSC1', 'CCDC104', 'PARD6G', 'ICMT', 'RHOQ', 'A2MP1', 'POLR2J4', 'RNF149', 'DNAJC17', 'TMEM259', 'LOC100379224', 'CCDC181', 'ZNF814', 'LOC100996385', 'DARS2', 'FAM98B', 'RUFY3', 'PGM3', 'RNF113A', 'TRAF1', 'UHRF1', 'C11orf42', 'TMEM263', 'DPP7', 'SENCR', 'FDPS', 'RUSC1', 'KMT2C', 'ANXA6', 'HSPD1', 'ARHGEF26', 'ZNHIT3'

In [5]:
# Importance scores from the fitted ExtraTreesClassifier (the final pipeline
# step), looked up by its step name; left as the last expression so it displays.
exported_pipeline.named_steps['extratreesclassifier'].feature_importances_

array([0.02169967, 0.00200719, 0.00264165, 0.0025068 , 0.00173451,
       0.00662777, 0.00251391, 0.00106874, 0.00569524, 0.00308943,
       0.01601675, 0.00265213, 0.0054406 , 0.00241941, 0.00335645,
       0.00525842, 0.00222968, 0.00267979, 0.0016922 , 0.00072498,
       0.00125605, 0.00086005, 0.00989301, 0.0034698 , 0.00299947,
       0.00051697, 0.00559787, 0.00278037, 0.00131791, 0.00133638,
       0.00189669, 0.00717583, 0.00271621, 0.0040089 , 0.00308911,
       0.01078403, 0.00098162, 0.        , 0.0046175 , 0.00212951,
       0.00192265, 0.00730107, 0.00133578, 0.00490789, 0.00265714,
       0.00311932, 0.00107502, 0.0019026 , 0.00120436, 0.00287009,
       0.00443267, 0.00145254, 0.00232971, 0.00055431, 0.00081671,
       0.00458336, 0.0005396 , 0.00348774, 0.00314152, 0.0011632 ,
       0.00166789, 0.0042212 , 0.00426789, 0.00188104, 0.00276951,
       0.00165128, 0.00312265, 0.00168419, 0.00618806, 0.00560974,
       0.00279565, 0.00221321, 0.00377804, 0.0032919 , 0.00369

In [6]:
# Predicted class probabilities for the held-out test split; `results` is
# reused by the cells that export predictions and compute the ROC AUC.
results = exported_pipeline.predict_proba(X=testing_features)
results

array([[0.39289286, 0.60710714],
       [0.62545238, 0.37454762],
       [0.34197619, 0.65802381],
       [0.32921861, 0.67078139],
       [0.74655952, 0.25344048],
       [0.35694084, 0.64305916],
       [0.40665512, 0.59334488],
       [0.349671  , 0.650329  ],
       [0.28000794, 0.71999206],
       [0.75615909, 0.24384091],
       [0.37707143, 0.62292857],
       [0.78719048, 0.21280952],
       [0.61641667, 0.38358333],
       [0.73016667, 0.26983333],
       [0.62097619, 0.37902381],
       [0.45124242, 0.54875758],
       [0.34529762, 0.65470238],
       [0.61590909, 0.38409091],
       [0.70016667, 0.29983333],
       [0.41929762, 0.58070238],
       [0.19010714, 0.80989286],
       [0.50975433, 0.49024567],
       [0.39205195, 0.60794805],
       [0.45461905, 0.54538095],
       [0.318     , 0.682     ],
       [0.56715909, 0.43284091],
       [0.5862619 , 0.4137381 ],
       [0.43305159, 0.56694841],
       [0.43042857, 0.56957143],
       [0.59639286, 0.40360714],
       [0.

In [7]:
# Pair each entry of the selector's `feat_list` with the classifier's
# importance score at the same position.
selected_features = exported_pipeline.named_steps['datasetselector'].feat_list
importance_scores = exported_pipeline.named_steps['extratreesclassifier'].feature_importances_
feat_imp = pd.DataFrame({'feat': selected_features, 'score': importance_scores})

In [8]:
# Persist the feature/score table. The row index is written as well
# (index=True is the pandas default, spelled out here for clarity).
feat_imp.to_csv("featureImp5.csv", index=True)

In [9]:
# Save the true labels next to the second probability column of `results`,
# one row per test sample, without the row index.
predictions = pd.DataFrame({'y': testing_target, 'ypred': results[:, 1]})
predictions.to_csv("predictions5.csv", index=False)

In [10]:
# ROC curve on the test split, scored with the second probability column of
# `results`; the area under that curve is displayed as the cell output.
positive_scores = results[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_true=testing_target, y_score=positive_scores)
metrics.auc(fpr, tpr)

0.7045454545454546