In [1]:
import numpy as np
import pandas as pd
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn import metrics
from tpot import TPOTClassifier

In [2]:
# Load the dataset: the first CSV column becomes the row index and the first
# row supplies the column names; every value is then cast to float64.
# NOTE: the label column in this data file must be named 'class'.
tpot_data = pd.read_csv('simulatedGenex.csv', index_col=0, header=0).astype(np.float64)

# Predictors are every column except the label.
features = tpot_data.drop('class', axis=1)

# Seeded train/test split (split proportions are train_test_split's defaults).
training_features, testing_features, training_target, testing_target = train_test_split(
    features,
    tpot_data['class'],
    random_state=1618,
)

# Average CV score on the training set was:0.728
exported_pipeline = make_pipeline(
    RBFSampler(gamma=0.2),
    XGBClassifier(
        learning_rate=0.1,
        max_depth=2,
        min_child_weight=6,
        n_estimators=100,
        nthread=1,
        subsample=0.9500000000000001,
    ),
)

In [3]:
# Seed both pipeline steps so the fit is repeatable, then train on the
# training split.
#
# The step names are the lowercase class names that make_pipeline assigns
# automatically ('rbfsampler', 'xgbclassifier'). scikit-learn's public
# set_params is used here rather than instantiating a throwaway
# TPOTClassifier just to reach its private _set_param_recursive helper;
# the parameters being set are the same two random_state values.
exported_pipeline.set_params(
    rbfsampler__random_state=1618,
    xgbclassifier__random_state=1618,
)
# Kept as the last expression so the notebook displays the fitted pipeline.
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('rbfsampler', RBFSampler(gamma=0.2, n_components=100, random_state=1618)), ('xgbclassifier', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=2, min_child_weight=6, missing=None, n_e...=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.9500000000000001))])

In [4]:
# print(exported_pipeline.steps[-1][1].feat_list)
# exported_pipeline.steps[-1][1].feature_importances_
# feat_imp = pd.DataFrame({'feat': exported_pipeline.steps[0][1].feat_list, 
#             'score': exported_pipeline.steps[-1][1].feature_importances_})
# feat_imp.to_csv("featureImpTPOTStand.csv")

In [5]:
# Predicted class probabilities for the held-out test rows, as returned by the
# fitted pipeline's predict_proba (the recorded output has two columns).
results = exported_pipeline.predict_proba(testing_features)
# Bare last expression so the notebook renders the array.
results

array([[0.35124522, 0.6487548 ],
       [0.5990188 , 0.4009812 ],
       [0.67077464, 0.32922536],
       [0.16664416, 0.83335584],
       [0.44365013, 0.5563499 ],
       [0.47675008, 0.5232499 ],
       [0.6705998 , 0.32940015],
       [0.50640136, 0.49359864],
       [0.37945485, 0.62054515],
       [0.7802849 , 0.21971509],
       [0.31384468, 0.6861553 ],
       [0.6321698 , 0.36783022],
       [0.31908715, 0.68091285],
       [0.38846594, 0.61153406],
       [0.6273544 , 0.37264565],
       [0.70583844, 0.29416156],
       [0.26752096, 0.73247904],
       [0.12197167, 0.87802833],
       [0.5822952 , 0.4177048 ],
       [0.4141314 , 0.5858686 ],
       [0.52744734, 0.4725527 ],
       [0.43562955, 0.56437045],
       [0.47876137, 0.5212386 ],
       [0.6425054 , 0.35749462],
       [0.6661116 , 0.3338884 ],
       [0.08308607, 0.9169139 ],
       [0.2147739 , 0.7852261 ],
       [0.19074613, 0.8092539 ],
       [0.6547303 , 0.34526965],
       [0.6884652 , 0.31153482],
       [0.

In [6]:
# Pair each true test label with the score from the second probability column,
# then write the table to disk without the row index.
predictions = pd.DataFrame({'y': testing_target, 'ypred': results[:, 1]})
predictions.to_csv("predictionsTPOTStand.csv", index=False)

In [7]:
# Scores used for ranking: the second column of the predicted probabilities.
test_scores = results[:, 1]

# Trace the ROC curve on the test split, then integrate it to get the AUC.
fpr, tpr, thresholds = metrics.roc_curve(testing_target, test_scores)
metrics.auc(fpr, tpr)

0.577922077922078