In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from tpot import TPOTClassifier

In [2]:
# NOTE: Make sure that the class is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
tpot_data = pd.read_csv('rnaSeqMDD.csv', index_col=0, header=0)
tpot_data =tpot_data.astype(np.float64)
features = tpot_data.drop('phenotype', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['phenotype'], random_state=1618)

# Average CV score on the training set was:0.728
exported_pipeline = make_pipeline(
    Nystroem(gamma=0.2, kernel="linear", n_components=5),
    RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.05, min_samples_leaf=6, min_samples_split=13, n_estimators=100)
)

In [3]:
tpot_obj= TPOTClassifier()
tpot_obj._set_param_recursive(exported_pipeline.steps, 'random_state', 1618)
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('nystroem', Nystroem(coef0=None, degree=None, gamma=0.2, kernel='linear',
     kernel_params=None, n_components=5, random_state=1618)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.05, max_le...obs=None,
            oob_score=False, random_state=1618, verbose=0,
            warm_start=False))])

In [4]:
# print(exported_pipeline.steps[-1][1].feat_list)
# exported_pipeline.steps[-1][1].feature_importances_
# feat_imp = pd.DataFrame({'feat': exported_pipeline.steps[0][1].feat_list, 
#             'score': exported_pipeline.steps[-1][1].feature_importances_})
# feat_imp.to_csv("featureImpTPOTStand.csv")

In [5]:
results = exported_pipeline.predict_proba(testing_features)
results

array([[0.45903469, 0.54096531],
       [0.70813484, 0.29186516],
       [0.58972477, 0.41027523],
       [0.42041509, 0.57958491],
       [0.69518582, 0.30481418],
       [0.49428968, 0.50571032],
       [0.54709856, 0.45290144],
       [0.61398513, 0.38601487],
       [0.25654965, 0.74345035],
       [0.47191041, 0.52808959],
       [0.44418853, 0.55581147],
       [0.73714217, 0.26285783],
       [0.67533614, 0.32466386],
       [0.61292119, 0.38707881],
       [0.70357373, 0.29642627],
       [0.61870861, 0.38129139],
       [0.68995786, 0.31004214],
       [0.49555012, 0.50444988],
       [0.69619973, 0.30380027],
       [0.31169944, 0.68830056],
       [0.46675425, 0.53324575],
       [0.47867077, 0.52132923],
       [0.66788315, 0.33211685],
       [0.46070502, 0.53929498],
       [0.45617519, 0.54382481],
       [0.60997586, 0.39002414],
       [0.62414239, 0.37585761],
       [0.60832538, 0.39167462],
       [0.43136967, 0.56863033],
       [0.61315441, 0.38684559],
       [0.

In [6]:
pd.DataFrame({'y':testing_target, 'ypred':results[:,1]}).to_csv("predictionsTPOTStand.csv", index=False)

In [7]:
fpr, tpr, thresholds = metrics.roc_curve(testing_target, results[:,1])
metrics.auc(fpr, tpr)

0.6287878787878789