# __Machine Learning for Prioritizing Blood Pressure Genes__ 

## Import modules:

In [9]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

import seaborn as sns
import shap
import statsmodels.api as sm

%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)
from sklearn.metrics import *
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_convergence
from sklearn.utils import class_weight

sns.set_style("darkgrid")
sns.mpl.rcParams["figure.figsize"] = (15.0, 9.0)

import warnings

import matplotlib
import matplotlib.pyplot as plt

warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings
warnings.filterwarnings('ignore')

filterwarnings("ignore")

seed = 0

# Load data:

In [27]:
data = pd.read_csv("2021-11-19_training_cleaned.csv", header=0, sep=",")

In [28]:
data['BPlabel_encoded'] = data['BPlabel'].map( {'most likely':1,'probable':2, 'least likely':3})
classes_weights_all = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=data["BPlabel_encoded"]
)

In [29]:
Y = data["BPlabel_encoded"] 
data = data.drop(["BPlabel"],1)
data.shape 

(243, 103)

In [30]:
X = pd.read_csv("2021-11-19_imputed_training_data.csv", header=0)
X.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]

In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

In [32]:
classes_weights_train = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=Y_train
)

# Building Models:
- Parameter tuning with Bayesian optimization over hyper parameters

In [34]:
xgb = xgboost.XGBClassifier(random_state=seed, num_class=3, objective='multi:softmax', eval_metric='mlogloss') 
xgb_params = {
    'max_depth':  (1, 4), #Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit.
    'learning_rate': (0.01, 0.5, 'log-uniform'),  #Step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features
    'n_estimators':  (10, 50), #Number of gradient boosted trees. Equivalent to number of boosting rounds.
    'reg_alpha':  (1, 10, 'log-uniform'), #L1 regularization term on weights. Increasing this value will make model more conservative.
    'reg_lambda':  (1, 10, 'log-uniform')} #L2 regularization term on weights. Increasing this value will make model more conservative.


gb = GradientBoostingClassifier(random_state=seed)
gb_params = {
    'learning_rate': (0.01, 0.5),
    'max_depth': (1, 4),
    "max_features":["log2","sqrt", "auto"],
    "criterion": ["friedman_mse", "mse", "mae"],
    'n_estimators': (10, 50)
    }

rf = RandomForestClassifier(random_state=seed)
rf_params={'n_estimators': (10, 50), 
             'max_features': ['auto', 'sqrt', 'log2'],
             'max_depth' : (1, 4),
             'criterion' :["gini", "entropy"]} 

dt = DecisionTreeClassifier(random_state=seed)
dt_params= {"criterion": ["gini", "entropy"],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth' : (1, 4)}

extra = ExtraTreesClassifier(random_state=seed)
extra_params ={'n_estimators': (10, 50), 
             'max_features': ['auto', 'sqrt', 'log2'],
             'max_depth' : (1, 4),
             'criterion' :["gini", "entropy"]}


inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

models = []

models.append(('XGB', BayesSearchCV(xgb, xgb_params, cv=inner_cv, iid=False, n_jobs=1))) 
models.append(('RF', BayesSearchCV(rf, rf_params, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('GB', BayesSearchCV(gb, gb_params, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('DT', BayesSearchCV(dt, dt_params, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('ExtraTrees', BayesSearchCV(extra, extra_params, cv=inner_cv, iid=False, n_jobs=1)))
#models.append(('KNN', BayesSearchCV(knn, knn_params, cv=inner_cv, iid=False, n_jobs=1)))
#models.append(('SVC', BayesSearchCV(svc, svc_params, cv=inner_cv, iid=False, n_jobs=1)))

results = []
results1 = []
results2 = []
results3 = []
names = []
names2 =[]
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted', 
          'precision_weighted','recall_weighted'] #https://scikit-learn.org/stable/modules/model_evaluation.html


## Model Benchmarking - all features:
### Tree-based models:

In [35]:
for name, model in models:
    nested_cv_results = model_selection.cross_validate(model, X , Y, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X , Y, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train, Y_train, sample_weight)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    print(name, 'Train accuracy:', accuracy_score(Y_train, y_pred_train), 'Test accuracy:', accuracy_score(Y_test, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
    print(name, 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results)    

XGB Nested CV results for all scores: 
 {'fit_time': array([68.36431885, 70.71956396, 70.11045504, 73.22160363, 70.91914225]), 'score_time': array([0.00626302, 0.00587416, 0.00563717, 0.00658727, 0.00670791]), 'test_accuracy': array([0.69387755, 0.73469388, 0.83673469, 0.72916667, 0.70833333]), 'test_balanced_accuracy': array([0.65      , 0.68013468, 0.72979798, 0.69347319, 0.62356902]), 'test_f1_weighted': array([0.63798701, 0.71292142, 0.81418853, 0.723557  , 0.67040073]), 'test_precision_weighted': array([0.61451247, 0.70828331, 0.80915609, 0.7232599 , 0.67830882]), 'test_recall_weighted': array([0.69387755, 0.73469388, 0.83673469, 0.72916667, 0.70833333])} 

XGB Accuracy Nested CV Average 0.7405612244897959
XGB Balanced Accuracy Nested CV Average 0.6753949753949754
XGB F1 Nested CV Average 0.7118109377196912
XGB Precision Nested CV Average 0.7067041202418629
XGB Recall Nested CV Average 0.7405612244897959
Best Parameters: 
OrderedDict([('learning_rate', 0.01), ('max_depth', 3), ('n

In [37]:
for name, model in models:
    model.fit(X_train, Y_train, sample_weight=classes_weights_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    print(name, 'Train accuracy:', accuracy_score(Y_train, y_pred_train), 'Test accuracy:', accuracy_score(Y_test, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
    print(name, 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test, y_pred,average='weighted'))
    print('\n')


Best Parameters: 
OrderedDict([('learning_rate', 0.14296452942858345), ('max_depth', 4), ('n_estimators', 49), ('reg_alpha', 1), ('reg_lambda', 9)])

Non-nested CV Results:
XGB Train accuracy: 0.9948453608247423 Test accuracy: 0.6326530612244898
XGB Train balanced accuracy: 0.9971988795518207 Test balanced accuracy: 0.6666666666666666
XGB Train F1 0.9948697918063256 Test F1: 0.6229400030688967
XGB Train recall: 0.9948453608247423 Test recall: 0.6326530612244898
XGB Train precision: 0.9949846753970466 Test precision: 0.6392813535670678


Best Parameters: 
OrderedDict([('criterion', 'entropy'), ('max_depth', 4), ('max_features', 'sqrt'), ('n_estimators', 50)])

Non-nested CV Results:
RF Train accuracy: 0.9742268041237113 Test accuracy: 0.7551020408163265
RF Train balanced accuracy: 0.9687567334626158 Test balanced accuracy: 0.6833333333333332
RF Train F1 0.9741133645287551 Test F1: 0.6829593949380994
RF Train recall: 0.9742268041237113 Test recall: 0.7551020408163265
RF Train precision: 

### Others models:
- all other models require feature scaling before running

In [38]:
knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors':[7,9,11,13,15,17],
    'weights' : ['uniform','distance'],
    'metric' : ['euclidean','manhattan', 'minkowski']}


lr = LogisticRegression(penalty='l1', solver='liblinear',multi_class='auto',random_state=seed)
lr_params= {
    'penalty':['l1', 'l2'], 
    'C': [0.5, 1, 5, 10], 
    'max_iter':[500, 1000, 2500]}

svc = SVC()
svc_params = {
    'kernel': ['rbf'],
   'C': (1e0, 1e3),
   'gamma': ['scale', 'auto']}

othermodels = []

othermodels.append(('KNN', BayesSearchCV(knn, knn_params, cv=inner_cv, iid=False, n_jobs=1)))
othermodels.append(('SVC', BayesSearchCV(svc, svc_params, cv=inner_cv, iid=False, n_jobs=1)))
othermodels.append(('LR', BayesSearchCV(lr, lr_params, cv=inner_cv, iid=False, n_jobs=1)))

results = []
results1 = []
results2 = []
results3 = []
names = []
names2 =[]
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted', 
          'precision_weighted','recall_weighted'] #https://scikit-learn.org/stable/modules/model_evaluation.html


In [39]:
X2 = MinMaxScaler().fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X2, Y, test_size=0.2, random_state=0)

In [40]:
classes_weights_train = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=Y_train
)

In [20]:
for name, model in othermodels:
    nested_cv_results = model_selection.cross_validate(model, X2 , Y, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X2 , Y, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train, Y_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    print(name, 'Train accuracy:', accuracy_score(Y_train, y_pred_train), 'Test accuracy:', accuracy_score(Y_test, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
    print(name, 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results) 

KNN Nested CV results for all scores: 
 {'fit_time': array([45.75626874, 51.53988719, 52.73966575, 47.63927698, 53.21452904]), 'score_time': array([0.00563407, 0.00351691, 0.00375724, 0.00381708, 0.00376678]), 'test_accuracy': array([0.6122449 , 0.71428571, 0.85714286, 0.66666667, 0.58333333]), 'test_balanced_accuracy': array([0.51587302, 0.59499759, 0.73989899, 0.55011655, 0.40864198]), 'test_f1_weighted': array([0.54093279, 0.69764521, 0.82877411, 0.57936508, 0.48511905]), 'test_precision_weighted': array([0.48510756, 0.68992462, 0.83642826, 0.51801802, 0.45203488]), 'test_recall_weighted': array([0.6122449 , 0.71428571, 0.85714286, 0.66666667, 0.58333333])} 

KNN Accuracy Nested CV Average 0.686734693877551
KNN Balanced Accuracy Nested CV Average 0.5619056252389586
KNN F1 Nested CV Average 0.6263672473974655
KNN Precision Nested CV Average 0.5963026683541635
KNN Recall Nested CV Average 0.686734693877551
Best Parameters: 
OrderedDict([('metric', 'manhattan'), ('n_neighbors', 17), ('

In [41]:
for name, model in othermodels:
    model.fit(X_train, Y_train, sample_weight=classes_weights_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    print(name, 'Train accuracy:', accuracy_score(Y_train, y_pred_train), 'Test accuracy:', accuracy_score(Y_test, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
    print(name, 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test, y_pred,average='weighted'))
    print('\n')


TypeError: fit() got an unexpected keyword argument 'sample_weight'

## Model benchmarking - BorutaShap feature selection

In [42]:
X_boruta_sel = pd.read_csv("2021-11-19_selected_features_training_data.csv", header=0)
X_boruta_sel.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X_boruta_sel.columns.values
]

In [43]:
X_train_boruta, X_test_boruta, Y_train_boruta, Y_test_boruta = train_test_split(X_boruta_sel, Y, test_size=0.2, random_state=0)
classes_weights_train = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=Y_train_boruta
)

In [44]:
for name, model in models:
    nested_cv_results = model_selection.cross_validate(model, X_boruta_sel , Y, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X_boruta_sel , Y, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train_boruta, Y_train_boruta)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train_boruta)
    y_pred = model.predict(X_test_boruta)
    print(name, 'Train accuracy:', accuracy_score(Y_train_boruta, y_pred_train), 'Test accuracy:', accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train F1', f1_score(Y_train_boruta, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train_boruta, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test_boruta, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train_boruta, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test_boruta, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results)    

XGB Nested CV results for all scores: 
 {'fit_time': array([53.2495141 , 57.86611295, 55.35747099, 54.30866885, 52.22345304]), 'score_time': array([0.00477004, 0.00446701, 0.00481415, 0.00500607, 0.00476193]), 'test_accuracy': array([0.65306122, 0.7755102 , 0.85714286, 0.79166667, 0.77083333]), 'test_balanced_accuracy': array([0.64444444, 0.7003367 , 0.77146465, 0.76689977, 0.66363636]), 'test_f1_weighted': array([0.62934888, 0.74400617, 0.83932493, 0.78510802, 0.70287056]), 'test_precision_weighted': array([0.61294063, 0.81709957, 0.87380952, 0.78207672, 0.83717105]), 'test_recall_weighted': array([0.65306122, 0.7755102 , 0.85714286, 0.79166667, 0.77083333])} 

XGB Accuracy Nested CV Average 0.7696428571428571
XGB Balanced Accuracy Nested CV Average 0.7093563843563844
XGB F1 Nested CV Average 0.7401317115108192
XGB Precision Nested CV Average 0.7846194987830326
XGB Recall Nested CV Average 0.7696428571428571
Best Parameters: 
OrderedDict([('learning_rate', 0.4885189317071115), ('max_d

In [44]:
for name, model in models:
    model.fit(X_train_boruta, Y_train_boruta,  sample_weight=classes_weights_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train_boruta)
    y_pred = model.predict(X_test_boruta)
    print(name, 'Train accuracy:', accuracy_score(Y_train_boruta, y_pred_train), 'Test accuracy:', accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train F1', f1_score(Y_train_boruta, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train_boruta, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test_boruta, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train_boruta, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test_boruta, y_pred,average='weighted'))
    print('\n')


Best Parameters: 
OrderedDict([('learning_rate', 0.3142036021192166), ('max_depth', 3), ('n_estimators', 50), ('reg_alpha', 1), ('reg_lambda', 1)])

Non-nested CV Results:
XGB Train accuracy: 0.9742268041237113 Test accuracy: 0.6122448979591837
XGB Train balanced accuracy: 0.9859943977591037 Test balanced accuracy: 0.6222222222222222
XGB Train F1 0.9743759511026445 Test F1: 0.5980983302411873
XGB Train recall: 0.9742268041237113 Test recall: 0.6122448979591837
XGB Train precision: 0.9758739632586622 Test precision: 0.5886621315192743


Best Parameters: 
OrderedDict([('criterion', 'entropy'), ('max_depth', 4), ('max_features', 'log2'), ('n_estimators', 20)])

Non-nested CV Results:
RF Train accuracy: 0.9175257731958762 Test accuracy: 0.6122448979591837
RF Train balanced accuracy: 0.8977231918408389 Test balanced accuracy: 0.6555555555555556
RF Train F1 0.9153398200961462 Test F1: 0.6168498168498169
RF Train recall: 0.9175257731958762 Test recall: 0.6122448979591837
RF Train precision: 0

In [None]:
X2_boruta_sel = MinMaxScaler().fit_transform(X_boruta_sel)
X_train_boruta, X_test_boruta, Y_train_boruta, Y_test_boruta = train_test_split(X2_boruta_sel, Y, test_size=0.2, random_state=0)

In [None]:
classes_weights_train = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=Y_train_boruta
)

In [25]:
for name, model in othermodels:
    nested_cv_results = model_selection.cross_validate(model, X2_boruta_sel , Y, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X2_boruta_sel , Y, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train_boruta, Y_train_boruta)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train_boruta)
    y_pred = model.predict(X_test_boruta)
    print(name, 'Train accuracy:', accuracy_score(Y_train_boruta, y_pred_train), 'Test accuracy:', accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train F1', f1_score(Y_train_boruta, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train_boruta, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test_boruta, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train_boruta, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test_boruta, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results)    

KNN Nested CV results for all scores: 
 {'fit_time': array([42.36386037, 45.76399589, 44.42084885, 48.70277309, 45.12233591]), 'score_time': array([0.00371099, 0.00355411, 0.00468826, 0.00357103, 0.00330877]), 'test_accuracy': array([0.69387755, 0.69387755, 0.75510204, 0.85416667, 0.72916667]), 'test_balanced_accuracy': array([0.61349206, 0.64935065, 0.59469697, 0.78787879, 0.61795735]), 'test_f1_weighted': array([0.63324703, 0.69356358, 0.73704867, 0.82879002, 0.66363304]), 'test_precision_weighted': array([0.62364405, 0.69451531, 0.7466248 , 0.88510101, 0.79922027]), 'test_recall_weighted': array([0.69387755, 0.69387755, 0.75510204, 0.85416667, 0.72916667])} 

KNN Accuracy Nested CV Average 0.7452380952380951
KNN Balanced Accuracy Nested CV Average 0.652675164341831
KNN F1 Nested CV Average 0.7112564669642738
KNN Precision Nested CV Average 0.7498210890222168
KNN Recall Nested CV Average 0.7452380952380951
Best Parameters: 
OrderedDict([('metric', 'manhattan'), ('n_neighbors', 15), (

In [None]:
for name, model in othermodels:
    model.fit(X_train_boruta, Y_train_boruta, sample_weight=classes_weights_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train_boruta)
    y_pred = model.predict(X_test_boruta)
    print(name, 'Train accuracy:', accuracy_score(Y_train_boruta, y_pred_train), 'Test accuracy:', accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train F1', f1_score(Y_train_boruta, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train_boruta, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test_boruta, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train_boruta, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test_boruta, y_pred,average='weighted'))
    print('\n')

In [14]:
xgb = xgboost.XGBClassifier(random_state=seed, num_class=3, objective='multi:softmax', eval_metric='mlogloss', learning_rate=0.4885189317071115,
                           max_depth=3, n_estimators=49, reg_alpha=7, reg_lambda=3) 


gb = GradientBoostingClassifier(random_state=seed, criterion='mae', learning_rate=0.260757764244599, max_depth=2, max_features='sqrt',
                               n_estimators=21)

rf = RandomForestClassifier(random_state=seed, criterion='entropy', max_depth=4, max_features='auto', n_estimators=27)

dt = DecisionTreeClassifier(random_state=seed, criterion='entropy', max_depth=4, max_features='log2')


extra = ExtraTreesClassifier(random_state=seed, criterion='gini', max_depth=3, max_features='log2', n_estimators=29)

knn = KNeighborsClassifier(metric='manhattan', n_neighbors=15, weights='distance')

svm = SVC(C=95.4041342841097, gamma='auto', kernel='rbf')

lr = LogisticRegression(solver='liblinear',multi_class='auto',random_state=seed, C=1, max_iter=1000, penalty='l2')


In [23]:
target_names = ['most likely', 'probable', 'least likely']
xgb.fit(X_train_boruta, Y_train_boruta)
predictions = list(xgb.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.67      0.17      0.27        12
    probable       0.76      0.93      0.84        30
least likely       0.78      1.00      0.88         7

    accuracy                           0.76        49
   macro avg       0.73      0.70      0.66        49
weighted avg       0.74      0.76      0.70        49



In [25]:
gb.fit(X_train_boruta, Y_train_boruta)
predictions = list(gb.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.43      0.25      0.32        12
    probable       0.76      0.83      0.79        30
least likely       0.78      1.00      0.88         7

    accuracy                           0.71        49
   macro avg       0.65      0.69      0.66        49
weighted avg       0.68      0.71      0.69        49



In [26]:
rf.fit(X_train_boruta, Y_train_boruta)
predictions = list(rf.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.17      0.08      0.11        12
    probable       0.65      0.80      0.72        30
least likely       0.83      0.71      0.77         7

    accuracy                           0.61        49
   macro avg       0.55      0.53      0.53        49
weighted avg       0.56      0.61      0.58        49



In [27]:
dt.fit(X_train_boruta, Y_train_boruta)
predictions = list(dt.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.43      0.25      0.32        12
    probable       0.70      0.87      0.78        30
least likely       1.00      0.71      0.83         7

    accuracy                           0.69        49
   macro avg       0.71      0.61      0.64        49
weighted avg       0.68      0.69      0.67        49



In [28]:
extra.fit(X_train_boruta, Y_train_boruta)
predictions = list(extra.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.50      0.08      0.14        12
    probable       0.72      0.97      0.83        30
least likely       0.86      0.86      0.86         7

    accuracy                           0.73        49
   macro avg       0.69      0.64      0.61        49
weighted avg       0.69      0.73      0.66        49



In [29]:
knn.fit(X_train_boruta, Y_train_boruta)
predictions = list(knn.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.67      0.17      0.27        12
    probable       0.74      0.87      0.80        30
least likely       0.45      0.71      0.56         7

    accuracy                           0.67        49
   macro avg       0.62      0.58      0.54        49
weighted avg       0.68      0.67      0.63        49



In [30]:
svm.fit(X_train_boruta, Y_train_boruta)
predictions = list(svm.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.14      0.08      0.11        12
    probable       0.65      0.80      0.72        30
least likely       1.00      0.71      0.83         7

    accuracy                           0.61        49
   macro avg       0.60      0.53      0.55        49
weighted avg       0.57      0.61      0.58        49



In [31]:
lr.fit(X_train_boruta, Y_train_boruta)
predictions = list(lr.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.33      0.17      0.22        12
    probable       0.72      0.87      0.79        30
least likely       0.57      0.57      0.57         7

    accuracy                           0.65        49
   macro avg       0.54      0.53      0.53        49
weighted avg       0.61      0.65      0.62        49

