In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector
from sklearn import metrics
from tpot import TPOTClassifier

In [14]:
# Load the simulated expression table. The label column used below is 'class'.
# Earlier loading variants, kept for reference:
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# tpot_data = pd.read_csv('simulatedGenex.csv', index_col=0, header=0)
# tpot_data =tpot_data.astype(np.float64)
# tpot_data = tpot_data.drop(tpot_data.columns[0], axis=1)
tpot_data = pd.read_csv('simulatedGenex.csv')

# Predictors: remove the label column first, then discard the leading
# remaining column (presumably a saved row index -- TODO confirm against the CSV).
Xdata = tpot_data.drop('class', axis=1)
Xdata = Xdata.drop(Xdata.columns[0], axis=1)

# Labels to predict.
Ydata = tpot_data['class']

# `features` is simply another name for the predictor frame built above.
features = Xdata

# 75% / 25% hold-out split; the fixed random_state keeps the partition repeatable.
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
    features,
    Ydata,
    random_state=1618,
    train_size=0.75,
    test_size=0.25,
)

# Average CV score on the training set was:0.7882832777159806
# NOTE(review): it is not clear from this cell which of the two pipelines
# below that score was measured on -- confirm before quoting it.

# Earlier candidate pipeline, disabled:
# exported_pipeline = make_pipeline(
#     DatasetSelector(sel_subset=0, subset_list="subsets.csv"),
#     Nystroem(gamma=1.0, kernel="linear", n_components=8),
#     ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.9000000000000001, min_samples_leaf=3, min_samples_split=3, n_estimators=100)
# )

# Active pipeline: feature-subset selection -> Nystroem feature map (7 components)
# -> gradient boosting classifier.
exported_pipeline = make_pipeline(
    # Presumably picks subset 0 of the feature subsets listed in subsets.csv -- TODO confirm.
    DatasetSelector(sel_subset=0, subset_list="subsets.csv"),
    # NOTE(review): gamma may have no effect with kernel="linear" -- confirm.
    Nystroem(
        gamma=0.8500000000000001,
        kernel="linear",
        n_components=7,
    ),
    GradientBoostingClassifier(
        learning_rate=0.5,
        max_depth=3,
        max_features=0.5,
        min_samples_leaf=4,
        min_samples_split=6,
        n_estimators=100,
        subsample=0.5,
    ),
)

In [15]:
# Disabled: would write the four train/test split objects to CSV files under
# simdat/ (without the index). Nothing in this cell executes.
# training_features.to_csv("simdat/Xtrain.csv", index = False)
# testing_features.to_csv("simdat/Xtest.csv", index = False)
# training_target.to_csv("simdat/ytrain.csv", index = False)
# testing_target.to_csv("simdat/ytest.csv", index = False)

In [4]:
# A TPOTClassifier instance is created only to reach its private
# _set_param_recursive helper; no TPOT search is run in this cell.
tpot_obj= TPOTClassifier()
# Called with the pipeline's steps, the parameter name 'random_state' and the
# value 42. The fitted-pipeline repr shown in the output lists random_state=42
# on the Nystroem step. This must run before fit for the seed to apply.
tpot_obj._set_param_recursive(exported_pipeline.steps, 'random_state', 42)
# Fit on the training split; as the last expression, the returned pipeline is
# what the cell displays.
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('datasetselector', DatasetSelector(sel_subset=0, subset_list='subsets.csv')), ('nystroem', Nystroem(coef0=None, degree=None, gamma=0.8500000000000001, kernel='linear',
     kernel_params=None, n_components=7, random_state=42)), ('gradientboostingclassifier', GradientBoostingClassifier(criter...    subsample=0.5, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))])

In [5]:
# Re-run the first two fitted pipeline stages by hand to inspect the
# Nystroem output for the training rows.
# Columns retained by the fitted selector step.
myfeats = exported_pipeline.named_steps['datasetselector'].feat_list
# Restrict the training frame to exactly those columns.
subX = training_features.loc[:, myfeats]
# Apply the fitted Nystroem map; the resulting array is the cell's output.
exported_pipeline.named_steps['nystroem'].transform(subX)

array([[40.50729669, 41.22941652, 27.70089311, ..., 37.90519529,
        43.45484734, 39.92312012],
       [39.56545527, 43.84349688, 30.31950352, ..., 39.47166252,
        39.98423539, 48.02010881],
       [41.15327175, 37.5759686 , 33.99165089, ..., 37.56229649,
        40.20594512, 42.60497186],
       ...,
       [34.40600659, 34.37979951, 36.95168721, ..., 39.29911117,
        35.45443869, 38.62063792],
       [44.66662732, 47.49895114, 23.58328011, ..., 37.56471859,
        43.32114551, 47.64949449],
       [39.44181604, 35.46612351, 26.5480226 , ..., 37.46290617,
        37.9011257 , 41.31167556]])

In [6]:
# Number of columns kept by the fitted selector step.
len(exported_pipeline.named_steps['datasetselector'].feat_list)

207

In [7]:
# Importances reported by the final (gradient boosting) step. Its inputs are
# the Nystroem components, not the original data columns.
exported_pipeline.named_steps['gradientboostingclassifier'].feature_importances_

array([0.15498023, 0.20081026, 0.14441425, 0.18249907, 0.0558853 ,
       0.14060386, 0.12080703])

In [8]:
# Class-probability predictions for the held-out test split. Later cells read
# column 1 of this array as the score for ROC/AUC and for the CSV export.
results = exported_pipeline.predict_proba(testing_features)

In [9]:
# Disabled: export of a feature/importance table.
# NOTE(review): as written this pairs the selector's feat_list (207 names per
# the len(...) cell output) with the classifier's feature_importances_
# (7 values per the recorded output), so the two columns would have
# different lengths -- resolve before re-enabling.
# feat_imp = pd.DataFrame({'feat': exported_pipeline.steps[0][1].feat_list, 
#             'score': exported_pipeline.steps[-1][1].feature_importances_})
# feat_imp.to_csv("featureImpBest.csv")

In [10]:
# Save the held-out labels next to the column-1 predicted probability
# (presumably the positive class -- confirm against the classifier's classes_).
pd.DataFrame(
    {
        'y': testing_target,
        'ypred': results[:, 1],
    }
).to_csv("predictionsBest.csv", index=False)

In [11]:
# ROC curve on the held-out split, scored with column 1 of predict_proba.
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=testing_target,
    y_score=results[:, 1],
)
# Area under that curve; shown as the cell's output.
metrics.auc(x=fpr, y=tpr)

0.737012987012987

In [12]:
# Score of the fitted pipeline on the held-out split. Pipeline.score delegates
# to the final estimator's score method (mean accuracy for scikit-learn
# classifiers, per the scikit-learn docs).
exported_pipeline.score(testing_features, testing_target)

0.6