In [39]:
# generic
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
import missingno as msno
from math import ceil
import plotly.express as px
# sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score, StratifiedKFold, LeaveOneOut, ShuffleSplit, LeaveOneGroupOut, StratifiedGroupKFold, StratifiedShuffleSplit, GroupShuffleSplit, GroupKFold
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score, matthews_corrcoef, f1_score, precision_score, recall_score, roc_curve, roc_auc_score, RocCurveDisplay, make_scorer
from sklearn.linear_model import LogisticRegression, RidgeCV, LassoCV, RidgeClassifier, ElasticNetCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif, f_classif, SelectFromModel, RFECV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, NMF
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.inspection import permutation_importance
# keras to create ANN
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
!pip install scikit-optimize
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
!pip install scikeras
from scikeras.wrappers import KerasClassifier

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### FS library setup

In [40]:
# library for MRMR, CONDRED, DISR FS techniques
# !pip install -U fstoolbox
# import fstoolbox

In [41]:
# another attempt to setup FS toolbox
# !git clone https://github.com/EESI/PyFeast.git
# !python PyFeast/setup.py build
# !python feast.py
# from feast.py import DISR

## methods

In [42]:
# get a list of models to evaluate
def get_models():
    """Build the candidate classifiers that are compared against each other.

    Returns:
        dict: short model label -> freshly constructed, unfitted estimator,
        in the order in which the models are evaluated.
    """
    # Candidates currently disabled:
    #   'et': ExtraTreesClassifier()
    #   'bayes': GaussianNB()
    return {
        'knn': KNeighborsClassifier(),
        'dt': DecisionTreeClassifier(),
        'rf': RandomForestClassifier(),
        'l_svm': LinearSVC(),
        'svm': SVC(kernel='linear'),
        'lr': LogisticRegression(max_iter=1000),
        'en': LogisticRegression(penalty='elasticnet', solver='saga',
                                 l1_ratio=0.5, max_iter=1000),
        'lda': LinearDiscriminantAnalysis(),
        'ridge': RidgeClassifier(),
    }

In [43]:
# get a list of cross-validation methods to evaluate
def get_cvs():
    """Build the cross-validation splitters that are compared against each other.

    Returns:
        dict: short label -> splitter instance.
    """
    return {
        'RSK': RepeatedStratifiedKFold(n_splits=10, n_repeats=50, random_state=1),
        'LOO': LeaveOneOut(),
        'SSS': StratifiedShuffleSplit(n_splits=10),
        'SS': ShuffleSplit(n_splits=10, test_size=0.3),
    }

In [44]:
def performance_metrics(model, y_test, y_pred, X_data=None, y_data=None):
    """Print hold-out and cross-validated performance figures for `model`.

    The confusion matrix and classification report are computed from
    `y_test` / `y_pred`. The remaining figures are means of cross-validated
    scores (10 folds x 50 repeats, stratified) printed one per line in this
    order: ROC AUC, balanced accuracy, F1, recall (sensitivity),
    recall of class 0 (specificity), Matthews correlation coefficient.

    Args:
        model: estimator that is cross-validated.
        y_test: true labels of the hold-out set.
        y_pred: predicted labels for the hold-out set.
        X_data: features used for cross-validation; defaults to the
            module-level `X`.
        y_data: labels used for cross-validation; defaults to the
            module-level `y`.

    Returns:
        None. All results are printed.
    """
    # Fall back to the notebook-level data so existing three-argument calls
    # behave exactly as before.
    if X_data is None:
        X_data = X
    if y_data is None:
        y_data = y

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=50, random_state=1)

    # scikit-learn layout (rows = true class, columns = predicted class):
    # [ TN  FP ]
    # [ FN  TP ]
    print("=== Confusion Matrix ===")
    print(confusion_matrix(y_test, y_pred))
    print("=== Classification Report ===")
    print(classification_report(y_test, y_pred))
    print("=== Metrics ===")

    # Specificity is the recall of the negative class (label 0).
    specificity = make_scorer(recall_score, pos_label=0)
    scorers = [
        'roc_auc',
        'balanced_accuracy',
        'f1',
        'recall',
        specificity,
        'matthews_corrcoef',
    ]
    for scorer in scorers:
        scores = cross_val_score(model, X_data, y_data, cv=cv, scoring=scorer)
        print("%.3f" % scores.mean())

In [45]:
def plot_distributions(data, columns):
    """Draw one histogram per column on a grid that is at most four plots wide.

    Args:
        data: table passed to seaborn as `data`.
        columns: iterable of column names of `data` to plot.

    Returns:
        None. The figure is created through matplotlib's pyplot interface.
    """
    # Materialise first: a pandas Index cannot be used in a truth test.
    columns = list(columns)
    if not columns:
        # Nothing to draw; a 0x0 subplot grid cannot be created.
        return

    maxCols = 4
    numCols = min(len(columns), maxCols)
    numRows = ceil(len(columns) / maxCols)

    # squeeze=False keeps `axs` two-dimensional even when the grid has a
    # single row or a single column, so [row, col] indexing always works.
    fig, axs = plt.subplots(numRows, numCols, squeeze=False)
    fig.set_figwidth(5 * numCols)
    # Height must follow the number of rows (it used to follow the columns).
    fig.set_figheight(3 * numRows)

    for position, c in enumerate(columns):
        ax = axs[position // maxCols, position % maxCols]
        sns.histplot(data=data, x=c, ax=ax)
        # sns.kdeplot(data=data, x=c, ax=ax)
        # sns.boxplot(data=data, x=c, ax=ax)
        ax.set_title(c)

    # Hide the unused cells of the last row.
    for ax in axs.flat[len(columns):]:
        ax.set_visible(False)

    # Lay out after plotting so titles and tick labels are taken into account.
    fig.tight_layout(pad=5.0)

## RPPA Preprocessing

### loading data

In [46]:
# Tab-separated clinical table, indexed by its "participant" column.
c_df = (
    pd.read_csv("/content/sample_data/clinical-rppa.txt", sep="\t")
    .set_index("participant")
)

In [47]:
# Protein table: one row per protein, one column per tissue sample, so each
# cell is the level of that protein in that tissue sample.
p_df = pd.read_csv("/content/sample_data/protein.txt", delimiter="\t")

In [48]:
# sns.heatmap(c_df.corr(), cmap="Blues")
# msno.bar(c_df)

### separating X and y

In [49]:
# single-column dataframe holding `pfi`, indexed like c_df
pfi_df = pd.DataFrame(data=c_df.pfi)
# use the "Sample REF" column as p_df's index so that, after transposing, the
# former columns of p_df become the rows of transpose_df
# NOTE: this set_index is in place; re-running the cell without reloading p_df
# presumably fails because the "Sample REF" column is gone by then.
p_df.set_index("Sample REF", inplace=True)
transpose_df = p_df.transpose()
# merge clinical pfi result and protein data
# NOTE(review): `on=` receives the index values of transpose_df rather than a
# column name, and the key column of the result is called "key_0". Confirm
# that this pairs rows by participant ID and not merely by row position.
merged_df = transpose_df.merge(pfi_df, how="inner", on=transpose_df.index)
merged_df.set_index("key_0", inplace=True)
merged_df

Unnamed: 0_level_0,14-3-3_epsilon,4E-BP1,4E-BP1_pS65,4E-BP1_pT37T46,4E-BP1_pT70,53BP1,ACC1,ACC_pS79,AMPK_alpha,AMPK_pT172,...,p27_pT157,p27_pT198,p38_MAPK,p38_pT180_Y182,p53,p63,p70S6K,p70S6K_pT389,p90RSK_pT359_S363,pfi
key_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4074,-0.975455,-0.344284,0.461891,1.795396,1.76321,0.940777,2.52912,-0.673478,1.312644,-0.470337,...,1.252789,0.107269,0.516851,0.000823,0.459285,1.816838,0.191172,-2.19129,-0.743105,0
4078,-1.023625,0.33599,-0.042486,1.065434,0.595061,1.315773,2.119877,-0.878761,1.848433,2.194422,...,0.400928,0.155596,1.030774,1.941106,0.071299,2.852411,0.484189,-2.548276,-0.337144,1
5149,-0.656239,1.112516,0.524127,0.412476,1.08562,1.450957,1.806726,-1.032176,0.710373,1.551747,...,0.74028,-0.221206,0.563064,0.724724,0.577707,2.276514,-0.102408,-2.40165,-1.125616,1
5151,-0.417939,1.837359,0.363827,0.262011,1.287822,0.978116,2.832971,-0.122219,1.584088,1.813148,...,0.786717,-0.091238,0.786525,0.735234,0.293831,2.273316,0.230295,-2.180772,-1.328046,0
5152,-0.771338,1.38695,0.059264,0.576787,0.821192,0.813838,3.509105,-0.082333,1.688436,2.15773,...,0.731234,-0.127136,0.876586,0.974012,0.010363,1.938218,1.059708,-2.069344,-1.370619,1
5153,-1.177445,0.458458,0.057335,0.889808,0.445702,1.284402,2.100869,-0.062238,1.432684,2.949738,...,0.290629,0.076513,0.775424,0.796941,-0.523405,2.098402,0.84987,-2.577867,-0.838349,1
5555,-0.89929,1.175845,0.376164,1.122129,0.987013,0.445268,2.569775,-0.260467,1.378232,2.208302,...,0.381795,-0.067311,0.910765,1.706154,0.364286,2.751934,0.263214,-2.683242,-0.813617,0
5556,-1.307023,0.825999,0.859202,2.140152,1.190538,0.335052,1.508071,-1.470166,1.178747,0.410832,...,0.421548,0.01491,0.378915,1.254009,-0.529587,1.691012,-0.288981,-2.134266,-1.011465,1
5557,-0.681439,0.817223,0.178751,-0.748593,0.955448,0.242571,2.415125,-1.157911,1.301458,1.007503,...,1.180263,0.130057,0.572511,-0.176515,-0.22135,0.058433,0.070605,-2.219878,-1.173273,0
5559,-1.085889,0.810669,0.024167,0.466862,0.663202,1.572463,1.686433,-1.183921,1.292243,2.424905,...,0.379145,0.110331,0.667008,0.753264,-0.47165,1.818901,0.181642,-2.85574,-0.843307,1


In [50]:
# to predict: the `pfi` column
y = merged_df['pfi']
# input: every other column
X = merged_df.drop(columns='pfi')

In [51]:
# PFI class imbalance
y.value_counts()

0    118
1     94
Name: pfi, dtype: int64

In [52]:
# split the data twice to have training (60%), validation (20%) and testing
# (20%) sets; the fixed seed makes the partition reproducible across runs
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.6, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=1)

### Feature distribution

In [None]:
plot_distributions(c_df, c_df.columns)

In [None]:
# Counts ordered by the value of `female`; assumes the column has exactly two
# distinct values and that the smaller one means male - TODO confirm.
fc, tc = c_df["female"].value_counts().sort_index().tolist()
d = dict(male=[fc], female=[tc])
sex = pd.DataFrame(d)
sns.barplot(sex, palette="Blues").set(title='sex', ylabel='Count')

## DNAm PFI Preprocessing/EDA



### Loading data

In [None]:
# Index the clinical table by "participant.tissue" to align the participant
# number with the predicted-protein Hybridization REF.
mc_df = (
    pd.read_csv("/content/sample_data/clinical-dnam.txt", sep="\t")
    .set_index("participant.tissue")
)
mc_df

In [None]:
# Select the numeric/boolean columns explicitly: mc_df also holds text columns
# (e.g. 'tumor.or.normal'), and pandas >= 2.0 raises on them instead of
# silently leaving them out of the correlation.
mc_df.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Correlate only the numeric/boolean columns: mc_df also holds text columns
# (e.g. 'tumor.or.normal'), which make corr() raise on pandas >= 2.0.
sns.heatmap(mc_df.select_dtypes(include=["number", "bool"]).corr(), cmap="Blues")

In [None]:
# Predicted-protein table: row = protein, column = DNAm, cell = EpiScore
pp_df = pd.read_csv("/content/sample_data/predicted-proteins.txt", delimiter="\t")

### separating X and y

In [None]:
# one-row dataframe built from the `pfi` series of mc_df; it is transposed in
# the merge call below, which turns it into a single `pfi` column
mc3_df = pd.DataFrame(data=[mc_df.pfi])
# transpose so that the columns of pp_df become rows
# NOTE(review): unlike the RPPA cell, no set_index call precedes this
# transpose - confirm that pp_df is already indexed as intended.
transpose_dnam_df = pp_df.transpose()
# (bare expression in the middle of a cell: it is not the last expression, so
# nothing is displayed for it)
transpose_dnam_df
# merge clinical pfi result and protein data
# NOTE(review): `on=` receives the index values of transpose_dnam_df rather
# than a column name; confirm that rows are paired by sample ID and not merely
# by row position.
merged_dnam_df = transpose_dnam_df.merge(mc3_df.transpose(), how="inner", on=transpose_dnam_df.index)
merged_dnam_df.set_index("key_0", inplace=True)
merged_dnam_df

In [None]:
# to predict: the `pfi` column
y = merged_dnam_df['pfi']
# input: every other column
X = merged_dnam_df.drop(columns='pfi')

In [None]:
y.value_counts()

In [None]:
# split the data twice to have training (60%), validation (20%) and testing
# (20%) sets; the fixed seed makes the partition reproducible across runs
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.6, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=1)

### Feature distribution

In [None]:
plot_distributions(mc_df, mc_df.columns)

In [None]:
# Counts ordered by the value of `female`; assumes the column has exactly two
# distinct values and that the smaller one means male - TODO confirm.
fc, tc = mc_df["female"].value_counts().sort_index().tolist()
d = dict(male=[fc], female=[tc])
sex = pd.DataFrame(d)
sns.barplot(sex, palette="Blues").set(title='sex', ylabel='Count')

## DNAm TN Preprocessing/EDA

### Loading data

In [None]:
# Index the clinical table by "participant.tissue" to align the participant
# number with the predicted-protein Hybridization REF.
mc_df = (
    pd.read_csv("/content/sample_data/clinical-dnam.txt", sep="\t")
    .set_index("participant.tissue")
)
mc_df

In [None]:
# Predicted-protein table: row = protein, column = DNAm, cell = EpiScore
pp_df = pd.read_csv("/content/sample_data/predicted-proteins.txt", delimiter="\t")

### separating X and y

In [None]:
# one-row dataframe built from the 'tumor.or.normal' series of mc_df; it is
# transposed in the merge call below, which turns it into a single column
mc3_df = pd.DataFrame(data=[mc_df['tumor.or.normal']])
# transpose so that the columns of pp_df become rows
# NOTE(review): unlike the RPPA cell, no set_index call precedes this
# transpose - confirm that pp_df is already indexed as intended.
transpose_dnam_df = pp_df.transpose()
# (bare expression in the middle of a cell: it is not the last expression, so
# nothing is displayed for it)
transpose_dnam_df
# merge the clinical tumor/normal label with the protein data
# NOTE(review): `on=` receives the index values of transpose_dnam_df rather
# than a column name; confirm that rows are paired by sample ID and not merely
# by row position.
merged_dnam_df = transpose_dnam_df.merge(mc3_df.transpose(), how="inner", on=transpose_dnam_df.index)
merged_dnam_df.set_index("key_0", inplace=True)
merged_dnam_df

In [None]:
# input
X = merged_dnam_df.drop('tumor.or.normal', axis=1)
# to predict: encode the label as tumor -> 0, normal -> 1.
# The encoded column is assigned back explicitly; calling
# replace(..., inplace=True) on a selected column is not guaranteed to write
# through to merged_dnam_df.
merged_dnam_df['tumor.or.normal'] = merged_dnam_df['tumor.or.normal'].replace(
    ['tumor', 'normal'], [0, 1])
y = merged_dnam_df['tumor.or.normal']

In [None]:
y.value_counts()

In [None]:
# split the data twice to have training (60%), validation (20%) and testing
# (20%) sets; the fixed seed makes the partition reproducible across runs
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.6, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=1)

## Anomaly Detection

In [None]:
def find_zscore_outliers(df, col_name, bottom_threshold=-9, top_threshold=9):
    """Record the rows of `df` whose `col_name` value is an outlier.

    Every value is scored as (value - mean) / MAD, where MAD is the mean
    absolute deviation of the column. A row is an outlier when its score is
    below `bottom_threshold` or above `top_threshold`.

    Args:
        df: DataFrame to inspect.
        col_name: name of the column of `df` to score.
        bottom_threshold: scores strictly below this are outliers.
        top_threshold: scores strictly above this are outliers.

    Side effects:
        * appends the index of the outlier rows to the module-level list
          `final_outliers`, which must exist before the call;
        * overwrites the module-level `zscore_outliers` and
          `zscore_non_outliers` with the lists of outlying values and of
          values strictly inside both thresholds.

    Returns:
        None.
    """
    global zscore_outliers
    global zscore_non_outliers

    values = df[col_name]
    mean = values.mean()
    # Series.mad() no longer exists in pandas >= 2.0, so the mean absolute
    # deviation is computed explicitly.
    mad = (values - mean).abs().mean()

    scores = (values - mean) / mad
    is_outlier = (scores < bottom_threshold) | (scores > top_threshold)
    # Strict on both sides: a score exactly on a threshold is in neither list.
    is_inlier = (scores < top_threshold) & (scores > bottom_threshold)

    zscore_outliers = values[is_outlier].tolist()
    zscore_non_outliers = values[is_inlier].tolist()

    # print(f"{col_name} - {df.loc[is_outlier].index}")
    final_outliers.append(df.loc[is_outlier].index)

In [None]:
def get_outlier_count(outliers):
    """Print how many outliers were collected and which of them are distinct.

    Args:
        outliers: iterable of iterables of outlier identifiers.

    Prints the total number of identifiers, the number of distinct
    identifiers, and the sorted array of distinct identifiers.
    """
    flattened = [outlier for outlier_list in outliers for outlier in outlier_list]
    # total number of outliers found
    print("total number of outliers: " + str(len(flattened)))
    distinct = np.unique(flattened)
    # number of unique outlier indexes
    print("number of unique outliers: " + str(distinct.size))
    print(distinct)

In [None]:
# Collect, for every feature column, the index of the rows flagged by
# find_zscore_outliers; each call appends one entry to final_outliers.
final_outliers = []
filtered_zscore = merged_df.copy()
# filtered_zscore = merged_dnam_df.copy()

# the last column is the label (PFI), so it is skipped
for i in filtered_zscore.columns[:-1]:
    # find_zscore_outliers only records the outliers; it does not remove any
    # rows from filtered_zscore
    find_zscore_outliers(filtered_zscore, i)

get_outlier_count(final_outliers)

In [None]:
# NOTE(review): the result of drop() is neither assigned nor applied in place
# here, so merged_df (and the X / y taken from it in an earlier cell) appear to
# still contain these rows - confirm whether the outliers were meant to be
# removed before modelling.
merged_df.drop(['4725', '4726', '4728', '5326', '5434', '5977', '6222', '6225', '6960'])

In [None]:
# NOTE(review): the result of drop() is neither assigned nor applied in place
# here, so merged_dnam_df appears to still contain these rows - confirm
# whether the outliers were meant to be removed before modelling.
merged_dnam_df.drop(['4722-01', '4727-01', '7863-01', 'A6T2-01'])

## PCA

In [None]:
def plot_PCA():
    """Scatter-plot the first two principal components of the scaled features.

    Reads the module-level `X` (features) and `y` (values used to colour the
    points); the features pass through StandardScaler before PCA.

    Returns:
        None. The figure is shown with plt.show().
    """
    pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA())])
    Xt = pipe.fit_transform(X)
    plot = plt.scatter(Xt[:, 0], Xt[:, 1], c=y)
    # legend_elements() yields one handle and one matching label per distinct
    # colour value. Pairing those handles with list(y), which has one entry
    # per sample, attached the wrong labels to the legend entries.
    handles, labels = plot.legend_elements()
    plt.legend(handles=handles, labels=labels)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.show()

## Model selection

### FS techniques

In [None]:
sel = VarianceThreshold(threshold=0.001)

In [None]:
fs = RidgeCV(alphas=np.logspace(-6, 6, num=5)).fit(X_train, y_train)
sel = SelectFromModel(fs)

In [None]:
fs = LassoCV(tol=0.001).fit(X_train, y_train)
sel = SelectFromModel(fs)

In [None]:
fs = ElasticNetCV(tol=0.01).fit(X_train, y_train)
sel = SelectFromModel(fs, prefit=True)

In [None]:
# Recursive feature elimination with 5-fold CV, scored by ROC AUC.
min_features_to_select = 10
# Alternative base estimators:
# fs_model = RandomForestClassifier()
# fs_model = LogisticRegression(penalty="l1", solver='liblinear')
# fs_model = LinearSVC(C=0.1)
fs_model = SVC(C=1, kernel='linear')
sel = RFECV(
    fs_model,
    scoring='roc_auc',
    min_features_to_select=min_features_to_select,
    cv=5,
    step=1,
)

In [None]:
sel = SelectKBest(f_classif, k=20)

In [None]:
sel = SelectKBest(mutual_info_classif, k=20)

In [None]:
# method from fstoolbox library
# sel = feast('mrmr', 10, data, labels)

In [None]:
# sel.get_support()

### Remove non-selected features

In [None]:
sel.fit(X_train, y_train)

# Evaluate the selection mask once and reuse it everywhere below.
selected_mask = sel.get_support()
# columns rejected by the selector
concol = list(X_train.columns[~selected_mask])

for features in X_train.columns[selected_mask]:
    print(features)

# Build reduced frames instead of dropping in place, and reduce the validation
# set as well so that all three subsets keep the same feature columns.
X_train = X_train.drop(columns=concol)
X_val = X_val.drop(columns=concol)
X_test = X_test.drop(columns=concol)

In [None]:
X_train

### Model cross-validated F1 scores

In [None]:
def evaluate_model(model, X_train, y_train):
    """Cross-validate `model` and return its F1 score for every fold.

    Uses stratified 10-fold CV repeated 50 times (random_state=1); errors
    raised while fitting or scoring are propagated instead of being scored.
    """
    return cross_val_score(
        model,
        X_train,
        y_train,
        scoring='f1',
        cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=50, random_state=1),
        error_score='raise',
    )

In [None]:
def run_cv(X_train, y_train):
    """Cross-validate every model from get_models() and compare them.

    Prints the mean and standard deviation of the F1 scores per model, then
    shows one box plot per model.
    """
    scores_by_model = {}
    # evaluate the models and store results
    for label, estimator in get_models().items():
        fold_scores = evaluate_model(estimator, X_train, y_train)
        scores_by_model[label] = fold_scores
        print('>%s %.3f (%.3f)' % (label, np.mean(fold_scores), np.std(fold_scores)))

    # plot model performance for comparison
    plt.boxplot(list(scores_by_model.values()),
                labels=list(scores_by_model.keys()),
                showmeans=True)
    plt.xlabel("ML Model")
    plt.ylabel("F1 score")
    plt.show()

In [None]:
run_cv(X_train, y_train)

### Model ROCs

In [None]:
def show_model_rocs():
    """Fit every model from get_models() and overlay their ROC curves.

    Reads the module-level `X_train` / `y_train` for fitting and
    `X_test` / `y_test` for the curves, and draws on the current axes.

    Returns:
        None. The figure is shown with plt.show().
    """
    ax = plt.gca()
    fig = plt.gcf()
    fig.set_size_inches(9, 5)

    models = get_models()
    for name, model in models.items():
        model.fit(X_train, y_train)
        # Label each curve with its short model key; the default label is the
        # estimator class name, which is the same for 'lr' and 'en'.
        RocCurveDisplay.from_estimator(model, X_test, y_test, ax=ax,
                                       alpha=0.8, name=name)
    plt.ylabel("Sensitivity", fontsize=14)
    plt.xlabel("1-Specificity", fontsize=14)
    for label in (ax.get_xticklabels() + ax.get_yticklabels()):
        label.set_fontsize(14)
    plt.show()

In [None]:
show_model_rocs()

## Pipelines

In [None]:
# KNN pipeline: Normalizer step followed by the classifier.
# Optional steps, currently disabled: ('smt', SMOTE()), ('selection', sel)
pipeline = Pipeline(steps=[
    ('normalize', Normalizer()),
    ('model', KNeighborsClassifier()),
])

In [None]:
# Logistic-regression pipeline: Normalizer step followed by the classifier.
# Optional steps, currently disabled: ('smt', SMOTE()), ('selection', sel)
pipeline = Pipeline(steps=[
    ('normalize', Normalizer()),
    ('model', LogisticRegression()),
])

In [None]:
# SVC pipeline: Normalizer, SMOTE oversampling, then the classifier.
# Optional step, currently disabled: ('selection', sel)
pipeline = Pipeline(steps=[
    ('normalize', Normalizer()),
    ('smt', SMOTE()),
    ('model', SVC()),
])

In [None]:
# LinearSVC pipeline: MinMaxScaler step followed by the classifier.
# Optional steps, currently disabled: ('smt', SMOTE()), ('selection', sel)
pipeline = Pipeline(steps=[
    ('normalize', MinMaxScaler()),
    ('model', LinearSVC()),
])

In [None]:
# Decision-tree pipeline: Normalizer, SMOTE oversampling, then the classifier.
# Optional step, currently disabled: ('selection', sel)
pipeline = Pipeline(steps=[
    ('normalize', Normalizer()),
    ('smt', SMOTE()),
    ('model', DecisionTreeClassifier()),
])

In [None]:
# Random-forest pipeline: Normalizer, SMOTE oversampling, then the classifier.
# Optional step, currently disabled: ('selection', sel)
pipeline = Pipeline(steps=[
    ('normalize', Normalizer()),
    ('smt', SMOTE()),
    ('model', RandomForestClassifier()),
])

In [None]:
# LDA pipeline: Normalizer, SMOTE oversampling, then the classifier.
# Optional step, currently disabled: ('selection', sel)
pipeline = Pipeline(steps=[
    ('normalize', Normalizer()),
    ('smt', SMOTE()),
    ('model', LinearDiscriminantAnalysis()),
])

In [None]:
# Elastic-net pipeline: Normalizer, SMOTE oversampling, then the classifier.
# penalty='elasticnet' needs an l1_ratio; 0.5 matches the 'en' model in
# get_models() and is overridden whenever the search grid sets
# model__l1_ratio.
pipeline = Pipeline([('normalize', Normalizer()),
                      ('smt', SMOTE()),
                    #  ('selection', sel),
                     ('model', LogisticRegression(penalty = 'elasticnet', solver = 'saga', l1_ratio = 0.5))])

In [None]:
# Ridge pipeline: Normalizer, SMOTE oversampling, then the classifier.
# Optional step, currently disabled: ('selection', sel)
pipeline = Pipeline(steps=[
    ('normalize', Normalizer()),
    ('smt', SMOTE()),
    ('model', RidgeClassifier()),
])

### Pipeline execution - GridSearchCV

In [None]:
### KNN

# KNN search grid: neighbourhood sizes 2..9 and both weighting schemes.
# Optional entry, currently disabled:
#     'selection__k': list(range(1, X.shape[1]+1))
parameters = dict(
    model__n_neighbors=[k for k in range(2, 10)],
    model__weights=['uniform', 'distance'],
)

### LASSO

# parameters = {
#     # 'selection__k': list(range(1, X.shape[1]+1)),
#     'model__C': [1e-6, 1],
#     'model__penalty': ['l1', 'l2'],
#     'model__solver': ['liblinear'],
#     'model__tol': [0.01, 0.0001, 0.000001],
#     'model__class_weight': [None, 'balanced'],
# }

### RIDGE

# parameters = {
#     # 'selection__k': list(range(1, X.shape[1]+1)),
#     'model__alpha': list(range(1,3)),
#     'model__tol': [0.0001, 0.001],
#     'model__class_weight': [None, 'balanced'],
# }

# parameters = {
#     'model__C': [1e-6, 1],
#     'model__solver': ['liblinear', 'lbfgs'],
# }

### SVC

# parameters = {
#     # 'selection__k': list(range(1, X.shape[1]+1)),
#     'model__C': [1e-6, 1],
#     'model__kernel': ['linear', 'rbf'],
#     'model__class_weight': [None, 'balanced']
# }

### L-SVC

# parameters = {
#     'model__penalty': ['l1', 'l2'],
#     'model__loss': ['squared_hinge'],
#     'model__dual': [False],
#     'model__tol': [1e-02, 1e-04, 1e-06],
#     'model__C': [1e-06, 1e-02, 1, 10],
#     'model__max_iter': [1000, 2000],
#     'model__class_weight': [None, 'balanced']
# }

### LDA

# parameters = {
#     'selection__k': list(range(1, X.shape[1]+1)),
#     'model__solver': ['svd'],
# }

### RF 

# parameters = {
    # 'selection__k': list(range(1, X.shape[1]+1)),
    # 'model__n_estimators': [10, 100],
    # 'model__min_samples_leaf': [1, 5],
    # 'model__min_impurity_decrease': [1e-6, 1],
# }

### DT

# parameters = {
#     # 'selection__k': list(range(1, X.shape[1]+1)),
#     'model__min_samples_leaf': [1,5],
#     'model__min_impurity_decrease': [1e-6],
# }

### EN

# parameters = {
    # 'selection__k': list(range(16, X.shape[1]+1)),
    # 'model__class_weight': [None, 'balanced'],
    # 'model__l1_ratio': [0.1, 0.5, 0.9]
# }

# Exhaustive search of `parameters` over `pipeline`: 10-fold CV, scored by F1,
# with fit/score errors raised instead of being recorded as a score.
# Alternative CV scheme:
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring="f1",
    cv=10,
    error_score='raise',
)
grid.fit(X_train, y_train)

best_estimator = grid.best_estimator_
best_params = grid.best_params_
print("the best estimator is \n {} ".format(best_estimator))
print("the best parameters are \n {}".format(best_params))

### Pipeline execution - BayesSearchCV

In [None]:
# A Pipeline is used as the estimator so that the search can also range over
# different model types through its 'model' step.
pipe = Pipeline(steps=[('model', RidgeClassifier())])

# lsvc_search = {
#     'model': [LinearSVC(max_iter=1000)],
#     'model__C': (1e-6, 1e+6, 'log-uniform'),
# }

# explicit dimension classes can be specified like this
# svc_search = {
#     'model': Categorical([SVC()]),
#     'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
#     'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
#     'model__degree': Integer(1,8),
#     'model__kernel': Categorical(['linear', 'poly', 'rbf']),
# }

# rf_search = {
#     'model': Categorical([RandomForestClassifier()]),
#     'model__n_estimators': [10, 20, 50, 100],
#     'model__min_samples_leaf': [1, 2, 5],
#     'model__min_impurity_decrease': Real(1e-6, 1, prior='log-uniform'),
# }

# lr_search = {
#     'model': Categorical([LogisticRegression(penalty='l1')]),
#     # 'model__dual': [True, False],
#     'model__C': Real(1e-6, 1, prior='log-uniform'),
#     'model__solver': ['liblinear', 'lbfgs'],
#     'model__max_iter': [100, 500, 1000],
# }

# lr_search = {
#     'model': Categorical([LogisticRegression(penalty='l2')]),
#     'model__dual': [True, False],
#     'model__C': Real(1e-6, 1, prior='log-uniform'),
#     'model__solver': ['liblinear'],
#     'model__max_iter': [100, 500, 1000],
# }

# lr_search = {
#     'model': Categorical([LogisticRegression(penalty='elasticnet')]),
#     # 'model__dual': [True, False],
#     'model__C': Real(1e-6, 1, prior='log-uniform'),
#     'model__solver': ['liblinear', 'lbfgs'],
#     'model__max_iter': [100, 500, 1000],
# }

# Bayesian search over `pipe` with 10-fold CV. The search space is the
# `parameters` mapping defined in the grid-search cell, sampled for 10
# iterations.
opt = BayesSearchCV(pipe, search_spaces=[(parameters, 10)], cv=10)

opt.fit(X_train, y_train)

best_params = opt.best_params_
print("best params: %s" % str(best_params))

## Performance metrics

In [None]:
# Predict with the whole tuned pipeline so that X_test passes through the same
# preprocessing steps the final model was fitted behind; calling only the
# extracted 'model' step would feed it the raw, untransformed features.
y_pred = grid.best_estimator_.predict(X_test)

In [None]:
# Cross-validate the whole tuned pipeline rather than only its 'model' step,
# so every fold applies the same preprocessing that was used during tuning.
performance_metrics(grid.best_estimator_, y_test, y_pred)

## Feature Importance

In [None]:
model = opt.best_estimator_

In [None]:
model = grid.best_estimator_.named_steps['model']

In [None]:
# Permute the raw test features and score the whole tuned pipeline, so the
# model sees inputs that went through the same preprocessing as in training.
imps = permutation_importance(grid.best_estimator_, X_test, y_test)
importance = imps.importances_mean

!!! ensure any features not selected by the implemented FS techniques have been removed from X_train and X_test before collating feature importances

In [None]:
# RFI
importance = model.feature_importances_

In [None]:
# coefficients
importance = model.coef_[0]

In [None]:
# Show every row of the listing; note this changes the display option for the
# remainder of the session.
pd.set_option("display.max_rows", None)
# Label each importance with its feature name and rank them, highest first.
importances = pd.Series(data=importance, index=X_train.columns)
fi = importances.sort_values(ascending=False)
print(fi)

In [None]:
# Horizontal bar chart of the ranked importances; the y axis is inverted so
# that the first entry of `fi` is drawn at the top.
fi.plot(kind='barh')
axes = plt.gca()
axes.invert_yaxis()
axes.set_ylabel("Protein feature")
axes.set_xlabel("Permutation importance")

## ANN

In [None]:
def create_baseline(n_features=60):
    """Build and compile the baseline binary-classification network.

    The network has one hidden ReLU layer of 60 units and a single sigmoid
    output unit; it is compiled with binary cross-entropy, the Adam optimiser
    and accuracy as metric.

    Args:
        n_features: number of input features the network expects. Defaults to
            60, the previously hard-coded value, so existing no-argument
            callers are unaffected.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    model.add(Dense(60, input_shape=(n_features,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
estimator = KerasClassifier(model=create_baseline, epochs=100, batch_size=5, verbose=0)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=50, random_state=1)
# Store the scores under their own name: assigning them to `f1_score` would
# overwrite the sklearn.metrics.f1_score function imported at the top of the
# notebook.
ann_f1_scores = cross_val_score(estimator, X_train, y_train, cv=cv, scoring='f1')
# F1 is a fraction between 0 and 1, so it is printed without a percent sign.
print("%.3f (%.3f)" % (ann_f1_scores.mean(), ann_f1_scores.std()))