In [1]:
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.preprocessing import MaxAbsScaler
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.ensemble import GradientBoostingClassifier
from tpot.builtins import DatasetSelector, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import Normalizer

In [2]:
# Load the RNA-Seq MDD table. The first CSV column is discarded (presumably a
# saved row index -- TODO confirm); the class label is read from the
# 'phenotype' column below.
dat_name = 'RNASeq_MDD'
tpot_data = pd.read_csv('rnaSeqMDD.csv').pipe(
    lambda frame: frame.drop(columns=frame.columns[0])
)

# Everything except 'phenotype' is a predictor. The fixed random_state keeps
# the train/test partition identical from run to run.
features = tpot_data.drop(columns='phenotype')
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
    features, tpot_data['phenotype'], random_state=1618
)

# Average CV score on the training set was: 0.5854060066740823
exported_pipeline_XGB = XGBClassifier(
    learning_rate=0.5,
    max_depth=9,
    min_child_weight=1,
    n_estimators=100,
    nthread=1,
    subsample=0.45,
)

# Kernel approximation followed by gradient boosting.
exported_pipeline_TPOT = make_pipeline(  #22
    Nystroem(gamma=0.7000000000000001, kernel="linear", n_components=7),
    GradientBoostingClassifier(
        learning_rate=0.01,
        max_depth=7,
        max_features=0.8500000000000001,
        min_samples_leaf=12,
        min_samples_split=13,
        n_estimators=100,
        subsample=0.3,
    ),
)

# Feature-subset selection, one-hot encoding, then extra-trees.
exported_pipeline_DS = make_pipeline(  #66
    DatasetSelector(sel_subset=4, subset_list="module23.csv"),
    OneHotEncoder(minimum_fraction=0.25, sparse=False, threshold=10),
    ExtraTreesClassifier(
        bootstrap=True,
        criterion="gini",
        max_features=0.9500000000000001,
        min_samples_leaf=4,
        min_samples_split=18,
        n_estimators=100,
    ),
)

# Stand-alone random forest.
exported_pipeline_RF = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=0.05,
    min_samples_leaf=10,
    min_samples_split=16,
    n_estimators=100,
)

# Default-constructed TPOT estimator; the evaluation loop calls its
# `_set_param_recursive` method to seed pipeline steps.
tpot_obj = TPOTClassifier()

In [3]:
# Held-out accuracy per seed, one list per model.
XGBscores, DSscores, TPOTscores = [], [], []

# TPOT-exported pipelines paired with the list that collects their scores,
# in the order they are fitted for each seed.
tpot_pipelines = (
    (exported_pipeline_DS, DSscores),
    (exported_pipeline_TPOT, TPOTscores),
)

for seed in range(100):
    # XGBoost is a bare estimator, so its seed is set directly.
    exported_pipeline_XGB.set_params(random_state=seed)
    exported_pipeline_XGB.fit(training_features, training_target)
    XGBscores.append(
        exported_pipeline_XGB.score(testing_features, testing_target)
    )

    for pipeline, scores in tpot_pipelines:
        # `_set_param_recursive` has a leading underscore, i.e. it is a
        # private TPOT method by convention; it is handed the pipeline's
        # steps so the seed reaches them.
        tpot_obj._set_param_recursive(pipeline.steps, 'random_state', seed)
        pipeline.fit(training_features, training_target)
        scores.append(pipeline.score(testing_features, testing_target))

In [4]:
# One column per model, one row per seed; written without the index column.
score_columns = {
    'XGBoost': XGBscores,
    'TPOT': TPOTscores,
    'TPOT-DS': DSscores,
}
bestaccu = pd.DataFrame(score_columns)
bestaccu.to_csv('bestAccuracies.csv', index=False)