# __Machine Learning for Prioritizing Blood Pressure Genes__ 

## Import modules:

In [1]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

import seaborn as sns
import shap
import statsmodels.api as sm

%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)
from sklearn.metrics import *
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)


from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC, RandomOverSampler
from imblearn.base import BaseSampler
from imblearn.datasets import make_imbalance


from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

sns.set_style("darkgrid")
sns.mpl.rcParams["figure.figsize"] = (15.0, 9.0)

import warnings

import matplotlib
import matplotlib.pyplot as plt

warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings
warnings.filterwarnings('ignore')

filterwarnings("ignore")

seed = 0

# Load data:

In [2]:
data = pd.read_csv("2021-11-19_training_cleaned.csv", header=0, sep=",")

In [3]:
data['BPlabel_encoded'] = data['BPlabel'].map( {'most likely':1,'probable':2, 'least likely':3})
Y = data["BPlabel_encoded"] 
Y2 = Y
data = data.drop(["BPlabel"],1)
data.shape  #Data has IPA and ensembl features without possible label

(243, 103)

In [4]:
X = pd.read_csv("2021-11-19_imputed_training_data.csv", header=0)
X.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]

In [5]:
print('Before OverSampling, the shape of X: {}'.format(X.shape))
print('Before OverSampling, the shape of y: {} \n'.format(Y.shape))

sm = SMOTE(random_state=seed)
X, Y = sm.fit_resample(X, Y)

print('After OverSampling, the shape of X: {}'.format(X.shape))
print('After OverSampling, the shape of y: {} \n'.format(Y.shape))

print("After OverSampling, counts of label '1': {}".format(sum(Y==1)))
print("After OverSampling, counts of label '2': {}".format(sum(Y==2)))
print("After OverSampling, counts of label '3': {}".format(sum(Y==3)))

Before OverSampling, the shape of X: (243, 101)
Before OverSampling, the shape of y: (243,) 

After OverSampling, the shape of X: (447, 101)
After OverSampling, the shape of y: (447,) 

After OverSampling, counts of label '1': 149
After OverSampling, counts of label '2': 149
After OverSampling, counts of label '3': 149


In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

# Building Models:
- Parameter tuning with Bayesian optimization over hyper parameters

In [7]:
xgb = xgboost.XGBClassifier(random_state=seed, num_class=3, objective='multi:softmax', eval_metric='mlogloss') 
xgb_params = {
    'max_depth':  (1, 4), #Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit.
    'learning_rate': (0.01, 0.5, 'log-uniform'),  #Step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features
    'n_estimators':  (10, 50), #Number of gradient boosted trees. Equivalent to number of boosting rounds.
    'reg_alpha':  (1, 10, 'log-uniform'), #L1 regularization term on weights. Increasing this value will make model more conservative.
    'reg_lambda':  (1, 10, 'log-uniform')} #L2 regularization term on weights. Increasing this value will make model more conservative.


gb = GradientBoostingClassifier(random_state=seed)
gb_params = {
    'learning_rate': (0.01, 0.5),
    'max_depth': (1, 4),
    "max_features":["log2","sqrt", "auto"],
    "criterion": ["friedman_mse", "mse", "mae"],
    'n_estimators': (10, 50)
    }

rf = RandomForestClassifier(random_state=seed)
rf_params={'n_estimators': (10, 50), 
             'max_features': ['auto', 'sqrt', 'log2'],
             'max_depth' : (1, 4),
             'criterion' :["gini", "entropy"]} 

dt = DecisionTreeClassifier(random_state=seed)
dt_params= {"criterion": ["gini", "entropy"],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth' : (1, 4)}

extra = ExtraTreesClassifier(random_state=seed)
extra_params ={'n_estimators': (10, 50), 
             'max_features': ['auto', 'sqrt', 'log2'],
             'max_depth' : (1, 4),
             'criterion' :["gini", "entropy"]}


inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

models = []

models.append(('XGB', BayesSearchCV(xgb, xgb_params, cv=inner_cv, iid=False, n_jobs=1))) 
models.append(('RF', BayesSearchCV(rf, rf_params, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('GB', BayesSearchCV(gb, gb_params, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('DT', BayesSearchCV(dt, dt_params, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('ExtraTrees', BayesSearchCV(extra, extra_params, cv=inner_cv, iid=False, n_jobs=1)))
#models.append(('KNN', BayesSearchCV(knn, knn_params, cv=inner_cv, iid=False, n_jobs=1)))
#models.append(('SVC', BayesSearchCV(svc, svc_params, cv=inner_cv, iid=False, n_jobs=1)))

results = []
results1 = []
results2 = []
results3 = []
names = []
names2 =[]
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted', 
          'precision_weighted','recall_weighted'] #https://scikit-learn.org/stable/modules/model_evaluation.html


## Model Benchmarking - all features:
### Tree-based models:

In [16]:
for name, model in models:
    nested_cv_results = model_selection.cross_validate(model, X , Y, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X , Y, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train, Y_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    print(name, 'Train accuracy:', accuracy_score(Y_train, y_pred_train), 'Test accuracy:', accuracy_score(Y_test, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
    print(name, 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results)    

XGB Nested CV results for all scores: 
 {'fit_time': array([68.7247901 , 70.22632599, 81.06028295, 69.1443491 , 66.36307406]), 'score_time': array([0.00567508, 0.00560474, 0.00560713, 0.00544095, 0.00571179]), 'test_accuracy': array([0.87777778, 0.84444444, 0.85393258, 0.8988764 , 0.8988764 ]), 'test_balanced_accuracy': array([0.86517783, 0.86125966, 0.85490196, 0.90029762, 0.89402174]), 'test_f1_weighted': array([0.8742252 , 0.84151585, 0.85324681, 0.89896089, 0.89905719]), 'test_precision_weighted': array([0.88126984, 0.84284398, 0.85468433, 0.89936384, 0.9037659 ]), 'test_recall_weighted': array([0.87777778, 0.84444444, 0.85393258, 0.8988764 , 0.8988764 ])} 

XGB Accuracy Nested CV Average 0.8747815230961298
XGB Balanced Accuracy Nested CV Average 0.8751317614518876
XGB F1 Nested CV Average 0.8734011866417388
XGB Precision Nested CV Average 0.8763855767439054
XGB Recall Nested CV Average 0.8747815230961298
Best Parameters: 
OrderedDict([('learning_rate', 0.49999999999999994), ('max_

### Others models:
- all other models require feature scaling before running

In [8]:
knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors':[7,9,11,13,15,17],
    'weights' : ['uniform','distance'],
    'metric' : ['euclidean','manhattan', 'minkowski']}


lr = LogisticRegression(penalty='l1', solver='liblinear',multi_class='auto',random_state=seed)
lr_params= {
    'penalty':['l1', 'l2'], 
    'C': [0.5, 1, 5, 10], 
    'max_iter':[500, 1000, 2500]}

svc = SVC()
svc_params = {
    'kernel': ['rbf'],
   'C': (1e0, 1e3),
   'gamma': ['scale', 'auto']}

othermodels = []

othermodels.append(('KNN', BayesSearchCV(knn, knn_params, cv=inner_cv, iid=False, n_jobs=1)))
othermodels.append(('SVC', BayesSearchCV(svc, svc_params, cv=inner_cv, iid=False, n_jobs=1)))
othermodels.append(('LR', BayesSearchCV(lr, lr_params, cv=inner_cv, iid=False, n_jobs=1)))

results = []
results1 = []
results2 = []
results3 = []
names = []
names2 =[]
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted', 
          'precision_weighted','recall_weighted'] #https://scikit-learn.org/stable/modules/model_evaluation.html


In [9]:
X2 = MinMaxScaler().fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X2, Y, test_size=0.2, random_state=0)

In [19]:
for name, model in othermodels:
    nested_cv_results = model_selection.cross_validate(model, X2 , Y, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X2 , Y, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train, Y_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    print(name, 'Train accuracy:', accuracy_score(Y_train, y_pred_train), 'Test accuracy:', accuracy_score(Y_test, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
    print(name, 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results) 

KNN Nested CV results for all scores: 
 {'fit_time': array([51.11454415, 43.7562201 , 48.19915128, 49.96571183, 53.49711418]), 'score_time': array([0.00604296, 0.00554085, 0.00598884, 0.00545311, 0.00550771]), 'test_accuracy': array([0.8       , 0.71111111, 0.70786517, 0.78651685, 0.84269663]), 'test_balanced_accuracy': array([0.78163772, 0.74272133, 0.74156863, 0.80059524, 0.81748188]), 'test_f1_weighted': array([0.78905386, 0.69633279, 0.65682681, 0.76172256, 0.83694235]), 'test_precision_weighted': array([0.81866763, 0.72046176, 0.77165082, 0.82020501, 0.84197296]), 'test_recall_weighted': array([0.8       , 0.71111111, 0.70786517, 0.78651685, 0.84269663])} 

KNN Accuracy Nested CV Average 0.7696379525593009
KNN Balanced Accuracy Nested CV Average 0.7768009595364805
KNN F1 Nested CV Average 0.7481756714519485
KNN Precision Nested CV Average 0.7945916362787007
KNN Recall Nested CV Average 0.7696379525593009
Best Parameters: 
OrderedDict([('metric', 'manhattan'), ('n_neighbors', 7), (

## Model benchmarking - BorutaShap feature selection

In [10]:
X_boruta_sel = pd.read_csv("2021-11-19_selected_features_training_data.csv", header=0)
X_boruta_sel.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X_boruta_sel.columns.values
]

In [11]:
print('Before OverSampling, the shape of X: {}'.format(X_boruta_sel.shape))
print('Before OverSampling, the shape of y: {} \n'.format(Y2.shape))

sm = SMOTE(random_state=seed)
X_boruta_sel, Y2 = sm.fit_resample(X_boruta_sel, Y2)

print('After OverSampling, the shape of X: {}'.format(X_boruta_sel.shape))
print('After OverSampling, the shape of y: {} \n'.format(Y2.shape))

print("After OverSampling, counts of label '1': {}".format(sum(Y2==1)))
print("After OverSampling, counts of label '2': {}".format(sum(Y2==2)))
print("After OverSampling, counts of label '3': {}".format(sum(Y2==3)))

Before OverSampling, the shape of X: (243, 8)
Before OverSampling, the shape of y: (243,) 

After OverSampling, the shape of X: (447, 8)
After OverSampling, the shape of y: (447,) 

After OverSampling, counts of label '1': 149
After OverSampling, counts of label '2': 149
After OverSampling, counts of label '3': 149


In [12]:
X_train_boruta, X_test_boruta, Y_train_boruta, Y_test_boruta = train_test_split(X_boruta_sel, Y2, test_size=0.2, random_state=0)

In [30]:
for name, model in models:
    nested_cv_results = model_selection.cross_validate(model, X_boruta_sel , Y2, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X_boruta_sel , Y2, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train_boruta, Y_train_boruta)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train_boruta)
    y_pred = model.predict(X_test_boruta)
    print(name, 'Train accuracy:', accuracy_score(Y_train_boruta, y_pred_train), 'Test accuracy:', accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train F1', f1_score(Y_train_boruta, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train_boruta, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test_boruta, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train_boruta, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test_boruta, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results)    

XGB Nested CV results for all scores: 
 {'fit_time': array([71.16217709, 61.01804209, 58.0935061 , 56.19244814, 56.32305813]), 'score_time': array([0.00542712, 0.0054822 , 0.00525498, 0.00480199, 0.00499487]), 'test_accuracy': array([0.81111111, 0.76666667, 0.83146067, 0.88764045, 0.87640449]), 'test_balanced_accuracy': array([0.80479735, 0.79233512, 0.83176471, 0.88839286, 0.86911232]), 'test_f1_weighted': array([0.80632035, 0.75945479, 0.83234129, 0.88681636, 0.8771488 ]), 'test_precision_weighted': array([0.81329778, 0.76972222, 0.83577124, 0.8863504 , 0.88083033]), 'test_recall_weighted': array([0.81111111, 0.76666667, 0.83146067, 0.88764045, 0.87640449])} 

XGB Accuracy Nested CV Average 0.8346566791510612
XGB Balanced Accuracy Nested CV Average 0.8372804701829535
XGB F1 Nested CV Average 0.8324163155814748
XGB Precision Nested CV Average 0.8371943936813204
XGB Recall Nested CV Average 0.8346566791510612
Best Parameters: 
OrderedDict([('learning_rate', 0.328070202737032), ('max_de

In [31]:
X2_boruta_sel = MinMaxScaler().fit_transform(X_boruta_sel)
X_train_boruta, X_test_boruta, Y_train_boruta, Y_test_boruta = train_test_split(X2_boruta_sel, Y2, test_size=0.2, random_state=0)

In [32]:
for name, model in othermodels:
    nested_cv_results = model_selection.cross_validate(model, X2_boruta_sel , Y2, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X2_boruta_sel , Y2, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train_boruta, Y_train_boruta)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train_boruta)
    y_pred = model.predict(X_test_boruta)
    print(name, 'Train accuracy:', accuracy_score(Y_train_boruta, y_pred_train), 'Test accuracy:', accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train F1', f1_score(Y_train_boruta, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train_boruta, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test_boruta, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train_boruta, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test_boruta, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results)    

KNN Nested CV results for all scores: 
 {'fit_time': array([46.83808827, 47.92462993, 48.16604519, 52.48140407, 44.60405922]), 'score_time': array([0.00366807, 0.00394177, 0.00403595, 0.00392318, 0.00411892]), 'test_accuracy': array([0.8       , 0.68888889, 0.76404494, 0.73033708, 0.85393258]), 'test_balanced_accuracy': array([0.78577337, 0.7228164 , 0.78      , 0.74297003, 0.83258738]), 'test_f1_weighted': array([0.7960436 , 0.67814754, 0.75381585, 0.71048137, 0.85078389]), 'test_precision_weighted': array([0.79424991, 0.68585637, 0.78423075, 0.74502717, 0.8501133 ]), 'test_recall_weighted': array([0.8       , 0.68888889, 0.76404494, 0.73033708, 0.85393258])} 

KNN Accuracy Nested CV Average 0.7674406991260925
KNN Balanced Accuracy Nested CV Average 0.7728294362650873
KNN F1 Nested CV Average 0.7578544496047168
KNN Precision Nested CV Average 0.7718955011836707
KNN Recall Nested CV Average 0.7674406991260925
Best Parameters: 
OrderedDict([('metric', 'manhattan'), ('n_neighbors', 7), (

In [13]:
xgb = xgboost.XGBClassifier(random_state=seed, num_class=3, objective='multi:softmax', eval_metric='mlogloss', learning_rate=0.328070202737032,
                           max_depth=4, n_estimators=50, reg_alpha=1, reg_lambda=1) 

gb = GradientBoostingClassifier(random_state=seed, criterion='friedman_mse', learning_rate=0.2805312174298497, max_depth=3, max_features='log2',
                               n_estimators=50)

rf = RandomForestClassifier(random_state=seed, criterion='entropy', max_depth=4, max_features='auto', n_estimators=43)

dt = DecisionTreeClassifier(random_state=seed, criterion='entropy', max_depth=3, max_features='log2')

extra = ExtraTreesClassifier(random_state=seed, criterion='gini', max_depth=4, max_features='log2', n_estimators=24)

knn = KNeighborsClassifier(metric='manhattan', n_neighbors=7, weights='distance')

svm = SVC(C=681.3426161935705, gamma='scale', kernel='rbf')

lr = LogisticRegression(solver='liblinear', multi_class='auto', random_state=seed, C=5.0, max_iter=500, penalty='l2')


In [14]:
target_names = ['most likely', 'probable', 'least likely']
xgb.fit(X_train_boruta, Y_train_boruta)
predictions = list(xgb.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.88      0.71      0.79        31
    probable       0.77      0.88      0.82        26
least likely       0.94      1.00      0.97        33

    accuracy                           0.87        90
   macro avg       0.86      0.86      0.86        90
weighted avg       0.87      0.87      0.86        90



In [15]:
gb.fit(X_train_boruta, Y_train_boruta)
predictions = list(gb.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.92      0.71      0.80        31
    probable       0.73      0.92      0.81        26
least likely       1.00      1.00      1.00        33

    accuracy                           0.88        90
   macro avg       0.88      0.88      0.87        90
weighted avg       0.89      0.88      0.88        90



In [16]:
rf.fit(X_train_boruta, Y_train_boruta)
predictions = list(rf.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.80      0.77      0.79        31
    probable       0.73      0.73      0.73        26
least likely       0.97      1.00      0.99        33

    accuracy                           0.84        90
   macro avg       0.83      0.83      0.83        90
weighted avg       0.84      0.84      0.84        90



In [17]:
dt.fit(X_train_boruta, Y_train_boruta)
predictions = list(dt.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.85      0.55      0.67        31
    probable       0.63      0.85      0.72        26
least likely       0.91      0.97      0.94        33

    accuracy                           0.79        90
   macro avg       0.80      0.79      0.78        90
weighted avg       0.81      0.79      0.78        90



In [18]:
extra.fit(X_train_boruta, Y_train_boruta)
predictions = list(extra.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.79      0.74      0.77        31
    probable       0.60      0.69      0.64        26
least likely       0.87      0.82      0.84        33

    accuracy                           0.76        90
   macro avg       0.75      0.75      0.75        90
weighted avg       0.77      0.76      0.76        90



In [19]:
knn.fit(X_train_boruta, Y_train_boruta)
predictions = list(knn.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.80      0.77      0.79        31
    probable       0.84      0.62      0.71        26
least likely       0.80      1.00      0.89        33

    accuracy                           0.81        90
   macro avg       0.82      0.80      0.80        90
weighted avg       0.81      0.81      0.80        90



In [20]:
svm.fit(X_train_boruta, Y_train_boruta)
predictions = list(svm.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.80      0.77      0.79        31
    probable       0.70      0.62      0.65        26
least likely       0.89      1.00      0.94        33

    accuracy                           0.81        90
   macro avg       0.80      0.80      0.79        90
weighted avg       0.80      0.81      0.81        90



In [21]:
lr.fit(X_train_boruta, Y_train_boruta)
predictions = list(lr.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.74      0.65      0.69        31
    probable       0.47      0.62      0.53        26
least likely       0.86      0.76      0.81        33

    accuracy                           0.68        90
   macro avg       0.69      0.67      0.68        90
weighted avg       0.71      0.68      0.69        90

