In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import FeatureAgglomeration
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from tpot.builtins import DatasetSelector
from sklearn import metrics
from tpot import TPOTClassifier
import eli5
from eli5.sklearn import PermutationImportance

In [2]:
# Load the data set. The first CSV column is used as the row index and the
# first row as the header. The class label is expected in the 'phenotype'
# column; every column (label included) is cast to float64 after loading.
tpot_data = pd.read_csv('rnaSeqMDD.csv', index_col=0, header=0).astype(np.float64)

# Predictors are all columns except the label.
features = tpot_data.drop('phenotype', axis='columns')

# Hold out a test split; the fixed seed keeps the partition reproducible.
training_features, testing_features, training_target, testing_target = train_test_split(
    features,
    tpot_data['phenotype'],
    random_state=1618,
)

# Average CV score on the training set was:0.736
# Pipeline: feature-subset selection -> feature agglomeration -> k-NN classifier.
exported_pipeline = make_pipeline(  #53
    DatasetSelector(subset_list="module23.csv", sel_subset=12),
    FeatureAgglomeration(linkage="average", affinity="euclidean"),
    KNeighborsClassifier(weights="distance", p=2, n_neighbors=19),
)

In [3]:
# Preprocessing-only pipeline: the same first two steps as `exported_pipeline`
# (subset selection + feature agglomeration), without the final classifier.
exported_pipeline_process = make_pipeline(  #53
    DatasetSelector(subset_list="module23.csv", sel_subset=12),
    FeatureAgglomeration(linkage="average", affinity="euclidean"),
)

# Fit on the training split; the fitted pipeline is the cell's displayed value.
exported_pipeline_process.fit(training_features, training_target)

Pipeline(memory=None,
     steps=[('datasetselector', DatasetSelector(sel_subset=12, subset_list='module23.csv')), ('featureagglomeration', FeatureAgglomeration(affinity='euclidean', compute_full_tree='auto',
           connectivity=None, linkage='average', memory=None, n_clusters=2,
           pooling_func=<function mean at 0x10d88a620>))])

In [4]:
# A throwaway TPOTClassifier is created only to reach its private helper.
# NOTE(review): `_set_param_recursive` presumably sets `random_state=42` on
# every pipeline step that exposes that parameter — confirm against the
# installed TPOT version, since this is not a public API.
tpot_obj = TPOTClassifier()
tpot_obj._set_param_recursive(
    exported_pipeline.steps,
    'random_state',
    42,
)

# Fit the full pipeline on the training split; `model` is the fitted pipeline.
model = exported_pipeline.fit(training_features, training_target)

In [5]:
# Permutation importance of each input column for the already-fitted `model`,
# scored on the held-out test split with 100 shuffling rounds.
# `random_state` is fixed (same seed value the pipeline cell uses) so the
# importances — and the CSV written from them — are reproducible across runs;
# without it every execution produces different scores.
perm = PermutationImportance(model, n_iter=100, random_state=42).fit(
    testing_features, testing_target
)

In [6]:
# Class-probability predictions for the held-out samples.
results = exported_pipeline.predict_proba(testing_features)

# Pair each input column name with its permutation-importance score and save.
feat_imp = pd.DataFrame({
    'feat': list(testing_features.columns),
    'score': perm.feature_importances_,
})
feat_imp.to_csv("featureImp13.csv")

# Second column of the probability matrix.
# NOTE(review): presumably the positive class — confirm the class ordering.
ypred = results[:, 1]

# Save true labels next to the predicted probabilities (row index omitted).
predictions = pd.DataFrame({'y': testing_target, 'ypred': ypred})
predictions.to_csv("predictions13.csv", index=False)

# ROC curve on the test split; the area under it is the cell's displayed value.
fpr, tpr, thresholds = metrics.roc_curve(testing_target, ypred)
metrics.auc(fpr, tpr)

0.5404040404040404