In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector
from sklearn import metrics
from tpot import TPOTClassifier

In [2]:
# NOTE: Make sure that the class is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
tpot_data = pd.read_csv('simulatedGenex.csv', index_col=0, header=0)
tpot_data =tpot_data.astype(np.float64)
features = tpot_data.drop('class', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['class'], random_state=1618)

# Average CV score on the training set was:0.7882832777159806
exported_pipeline = make_pipeline(
    DatasetSelector(sel_subset=0, subset_list="subsets.csv"),
    Nystroem(gamma=1.0, kernel="linear", n_components=8),
    ExtraTreesClassifier(bootstrap=False, criterion="gini", max_features=0.9000000000000001, min_samples_leaf=3, min_samples_split=3, n_estimators=100)
)

In [3]:
training_features.to_csv("simdat/Xtrain.csv", index = False)
testing_features.to_csv("simdat/Xtest.csv", index = False)
training_target.to_csv("simdat/ytrain.csv", index = False)
testing_target.to_csv("simdat/ytest.csv", index = False)

In [4]:
tpot_obj= TPOTClassifier()
tpot_obj._set_param_recursive(exported_pipeline.steps, 'random_state', 1618)
exported_pipeline.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('datasetselector', DatasetSelector(sel_subset=0, subset_list='subsets.csv')), ('nystroem', Nystroem(coef0=None, degree=None, gamma=1.0, kernel='linear',
     kernel_params=None, n_components=8, random_state=1618)), ('extratreesclassifier', ExtraTreesClassifier(bootstrap=False, class_weight=N...imators=100, n_jobs=1, oob_score=False, random_state=1618,
           verbose=0, warm_start=False))])

In [5]:
print(exported_pipeline.steps[0][1].feat_list)

['simvar142', 'var4500', 'var3883', 'var1912', 'var1059', 'simvar48', 'var4053', 'var235', 'simvar92', 'simvar80', 'var459', 'simvar102', 'var846', 'var3439', 'simvar187', 'var2310', 'var298', 'var1074', 'simvar153', 'var2982', 'var86', 'simvar77', 'simvar121', 'simvar165', 'var3443', 'var392', 'simvar117', 'var1160', 'var610', 'simvar114', 'simvar103', 'simvar163', 'simvar87', 'var3044', 'var967', 'var3784', 'var3858', 'var109', 'simvar89', 'var2632', 'var1692', 'var2150', 'simvar183', 'simvar84', 'simvar128', 'var1173', 'simvar20', 'var4383', 'var1503', 'simvar110', 'var150', 'var3552', 'var4039', 'var352', 'simvar71', 'var4326', 'var4191', 'var2077', 'var343', 'simvar184', 'var1229', 'simvar182', 'var3872', 'var4016', 'var1161', 'simvar2', 'simvar27', 'var668', 'var3234', 'var2658', 'var757', 'simvar10', 'var1272', 'simvar51', 'var2626', 'simvar16', 'simvar81', 'var2954', 'var3134', 'simvar107', 'simvar97', 'simvar129', 'var3214', 'var2937', 'simvar94', 'var2816', 'var3866', 'var467

In [6]:
exported_pipeline.steps[-1][1].feature_importances_

array([0.08637112, 0.08985964, 0.10628944, 0.11987067, 0.06575571,
       0.10645718, 0.0899919 , 0.33540434])

In [7]:
results = exported_pipeline.predict_proba(testing_features)
results

array([[0.51555952, 0.48444048],
       [0.60290476, 0.39709524],
       [0.4275873 , 0.5724127 ],
       [0.95661111, 0.04338889],
       [0.63111905, 0.36888095],
       [0.44685714, 0.55314286],
       [0.6365    , 0.3635    ],
       [0.89761905, 0.10238095],
       [0.63214922, 0.36785078],
       [0.37829762, 0.62170238],
       [0.43470238, 0.56529762],
       [0.31940476, 0.68059524],
       [0.84292857, 0.15707143],
       [0.6245873 , 0.3754127 ],
       [0.88305579, 0.11694421],
       [0.80098039, 0.19901961],
       [0.42366667, 0.57633333],
       [0.5866651 , 0.4133349 ],
       [0.36025   , 0.63975   ],
       [0.42604762, 0.57395238],
       [0.50059524, 0.49940476],
       [0.63461111, 0.36538889],
       [0.19178571, 0.80821429],
       [0.82855952, 0.17144048],
       [0.53688095, 0.46311905],
       [0.30954762, 0.69045238],
       [0.59645238, 0.40354762],
       [0.67341667, 0.32658333],
       [0.44644048, 0.55355952],
       [0.59000758, 0.40999242],
       [0.

In [8]:
feat_imp = pd.DataFrame({'feat': exported_pipeline.steps[0][1].feat_list, 
            'score': exported_pipeline.steps[-1][1].feature_importances_})

ValueError: arrays must all be same length

In [None]:
feat_imp.to_csv("featureImpBest.csv")

In [None]:
pd.DataFrame({'y':testing_target, 'ypred':results[:,1]}).to_csv("predictionsBest.csv", index=False)

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(testing_target, results[:,1])
metrics.auc(fpr, tpr)