In [1]:
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn import metrics
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.ensemble import GradientBoostingClassifier
from tpot.builtins import DatasetSelector
from sklearn.ensemble import RandomForestClassifier

In [2]:
# NOTE: the label column in the data file must be named 'class'
# (it is the column dropped from the features and used as the target below).
# Load the CSV, cast every column to float64, then discard the first column.
tpot_data = (
    pd.read_csv('simulatedGenex.csv', index_col=False, header=0)
    .astype(np.float64)
    .pipe(lambda frame: frame.drop(frame.columns[0], axis=1))
)

# Everything except the label column is a predictor.
features = tpot_data.drop(columns='class')

# Single fixed train/test partition shared by all models.
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['class'], random_state=1618)

# Average CV score on the training set was:0.5854060066740823
# NOTE(review): the comment above does not say which pipeline it refers to.

# Plain XGBoost classifier.
exported_pipeline_XGB = XGBClassifier(
    learning_rate=1.0,
    max_depth=5,
    min_child_weight=7,
    n_estimators=100,
    nthread=1,
    subsample=0.4,
)

# Dataset-selector pipeline: subset selection -> Nystroem map -> gradient boosting.
exported_pipeline_DS = make_pipeline(
    DatasetSelector(sel_subset=0, subset_list="subsets.csv"),
    Nystroem(gamma=0.8500000000000001, kernel="linear", n_components=7),
    GradientBoostingClassifier(
        learning_rate=0.5,
        max_depth=3,
        max_features=0.5,
        min_samples_leaf=4,
        min_samples_split=6,
        n_estimators=100,
        subsample=0.5,
    ),
)

# RBF random-feature map followed by extra-trees.
exported_pipeline_TPOT = make_pipeline(
    RBFSampler(gamma=0.9),
    ExtraTreesClassifier(
        bootstrap=True,
        criterion="gini",
        max_features=0.9000000000000001,
        min_samples_leaf=18,
        min_samples_split=6,
        n_estimators=100,
    ),
)

# Stand-alone random forest.
exported_pipeline_RF = RandomForestClassifier(
    bootstrap=False,
    criterion="gini",
    max_features=0.8500000000000001,
    min_samples_leaf=5,
    min_samples_split=20,
    n_estimators=100,
)

# Unfitted TPOT instance; the seed-sweep cell only uses its
# `_set_param_recursive` helper.
tpot_obj = TPOTClassifier()

    

In [3]:
# Test-set accuracy of each model, one entry per seed (index == seed).
XGBscores = []
DSscores = []
TPOTscores = []
RFscores = []

# Refit every model on the same train/test split with random_state = 0..99
# and record its accuracy on the held-out test set.
for seed in range(100):
    # XGBoost: plain estimator, so set_params is sufficient.
    exported_pipeline_XGB.set_params(random_state=seed)
    exported_pipeline_XGB.fit(training_features, training_target)
    XGBscores.append(
        exported_pipeline_XGB.score(testing_features, testing_target)
    )

    # Random forest: seed it like the other models. Previously this estimator
    # was refit without any random_state, so its 100 scores came from the
    # unseeded global RNG and were neither reproducible nor tied to `seed`.
    exported_pipeline_RF.set_params(random_state=seed)
    exported_pipeline_RF.fit(training_features, training_target)
    RFscores.append(
        exported_pipeline_RF.score(testing_features, testing_target)
    )

    # TPOT-DS pipeline: seed the pipeline steps through TPOT's helper.
    # NOTE(review): `_set_param_recursive` is a private TPOT method; presumably
    # it sets `random_state` on every step that exposes it -- confirm against
    # the installed TPOT version.
    tpot_obj._set_param_recursive(exported_pipeline_DS.steps, 'random_state', seed)
    exported_pipeline_DS.fit(training_features, training_target)
    DSscores.append(
        exported_pipeline_DS.score(testing_features, testing_target)
    )

    # TPOT pipeline: same seeding approach as the TPOT-DS pipeline.
    tpot_obj._set_param_recursive(exported_pipeline_TPOT.steps, 'random_state', seed)
    exported_pipeline_TPOT.fit(training_features, training_target)
    TPOTscores.append(
        exported_pipeline_TPOT.score(testing_features, testing_target)
    )
    


In [4]:
# Gather the recorded accuracy lists into one table (one column per model)
# and write it to disk without the row index.
score_columns = {
    'Random forest': RFscores,
    'XGBoost': XGBscores,
    'TPOT': TPOTscores,
    'TPOT-DS': DSscores,
}
bestaccu = pd.DataFrame(score_columns)
bestaccu.to_csv('bestAccuracies.csv', index=False)