# __Machine Learning for Prioritizing Blood Pressure Genes__ 

## Import modules:

In [1]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

# Matches any of the characters "[", "]" or "<". It is used further down to
# replace those characters with "_" in feature (column) names.
# NOTE(review): presumably needed because XGBoost rejects these characters in
# feature names - confirm.
# A character class replaces the former single-character alternation, and the
# re.IGNORECASE flag was dropped because the pattern contains no letters.
regex = re.compile(r"[\[\]<]")

import seaborn as sns
import shap
import statsmodels.api as sm

%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)
from sklearn.metrics import *
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

# Global plot styling: seaborn "darkgrid" theme and a 15x9 default figure size
# (set through the matplotlib module that seaborn exposes as `sns.mpl`).
sns.set_style("darkgrid")
sns.mpl.rcParams.update({"figure.figsize": (15.0, 9.0)})

import warnings

import matplotlib
import matplotlib.pyplot as plt

# Silence every warning category for the rest of the notebook.
# The original cell installed a FutureWarning-only filter followed by two
# identical blanket "ignore" filters; the blanket filter supersedes the narrower
# one, so a single call has the same effect.
# NOTE(review): a blanket ignore also hides deprecation warnings from pandas,
# scikit-learn and scikit-optimize - consider narrowing it while developing.
from warnings import filterwarnings  # bare name kept available for later cells

filterwarnings("ignore")

# Seed passed as `random_state` to the estimators, CV splitters and data splits.
seed = 0

# Load data:

In [2]:
# Cleaned training table; its "BPlabel" column holds the class label that the
# next cell encodes.
data = pd.read_csv(
    "2021-11-19_training_cleaned.csv",
    sep=",",
    header=0,
)

In [3]:
# Encode the ordinal BPlabel categories as integers 1 (most likely) .. 4 (least likely).
BPLABEL_CODES = {"most likely": 1, "probable": 2, "possible": 3, "least likely": 4}

# Guarded so that re-running the cell after "BPlabel" has already been dropped
# does not fail with a KeyError.
if "BPlabel" in data.columns:
    encoded_labels = data["BPlabel"].map(BPLABEL_CODES)
    # `.map` turns any label missing from the dictionary into NaN; fail loudly
    # instead of passing NaN targets to the models.
    unmapped = data.loc[encoded_labels.isna(), "BPlabel"].unique()
    if len(unmapped):
        raise ValueError(f"Unexpected BPlabel values: {list(unmapped)}")
    data["BPlabel_encoded"] = encoded_labels
    # Use the `columns=` keyword: passing the axis positionally (`drop([...], 1)`)
    # is deprecated and no longer accepted by pandas 2.x.
    data = data.drop(columns=["BPlabel"])

# Target vector used by every model below.
Y = data["BPlabel_encoded"]
data.shape

(327, 109)

In [4]:
# Imputed feature matrix (all features).
X = pd.read_csv("2021-11-19_imputed_training_data.csv", header=0)

# Replace "[", "]" and "<" in column names with "_"; names without any of these
# characters are kept as they are.
sanitized_columns = []
for column in X.columns.values:
    needs_cleaning = any(char in str(column) for char in ("[", "]", "<"))
    sanitized_columns.append(regex.sub("_", column) if needs_cleaning else column)
X.columns = sanitized_columns

In [5]:
# 80/20 hold-out split of the full feature set, seeded for repeatability.
# The split is not stratified by class.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=seed,
)

# Building Models:
- Hyperparameter tuning with Bayesian optimization (scikit-optimize `BayesSearchCV`)

In [6]:
# Base estimators and their BayesSearchCV search spaces for the tree-based models.
# In a search space, a (low, high) tuple is a numeric range and a list is a set
# of categorical choices.
# NOTE(review): some option values below ("mse", "mae", max_features="auto") and
# the `iid` argument are deprecated or removed in newer scikit-learn /
# scikit-optimize releases - confirm against the pinned environment.

xgb = xgboost.XGBClassifier(
    random_state=seed,
    num_class=4,  # four encoded BPlabel classes (was 3, which contradicted the 4-class target)
    objective='multi:softmax',
    eval_metric='mlogloss',
)
xgb_params = {
    # Maximum tree depth; larger values make the model more complex and more likely to overfit.
    'max_depth': (1, 4),
    # Step-size shrinkage applied after each boosting step to prevent overfitting.
    'learning_rate': (0.01, 0.5, 'log-uniform'),
    # Number of gradient-boosted trees (boosting rounds).
    'n_estimators': (10, 50),
    # L1 regularization on the weights; larger values make the model more conservative.
    'reg_alpha': (1, 10, 'log-uniform'),
    # L2 regularization on the weights; larger values make the model more conservative.
    'reg_lambda': (1, 10, 'log-uniform'),
}

gb = GradientBoostingClassifier(random_state=seed)
gb_params = {
    'learning_rate': (0.01, 0.5),
    'max_depth': (1, 4),
    "max_features": ["log2", "sqrt", "auto"],
    "criterion": ["friedman_mse", "mse", "mae"],
    'n_estimators': (10, 50),
}

rf = RandomForestClassifier(random_state=seed)
rf_params = {
    'n_estimators': (10, 50),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': (1, 4),
    'criterion': ["gini", "entropy"],
}

dt = DecisionTreeClassifier(random_state=seed)
dt_params = {
    "criterion": ["gini", "entropy"],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': (1, 4),
}

extra = ExtraTreesClassifier(random_state=seed)
extra_params = {
    'n_estimators': (10, 50),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': (1, 4),
    'criterion': ["gini", "entropy"],
}

# Inner folds drive the hyperparameter search; outer folds estimate generalisation
# (nested cross-validation).
inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

# One Bayesian search per estimator. `random_state=seed` makes the sequence of
# sampled hyperparameters repeatable between runs; it was previously unset.
tree_search_spaces = (
    ('XGB', xgb, xgb_params),
    ('RF', rf, rf_params),
    ('GB', gb, gb_params),
    ('DT', dt, dt_params),
    ('ExtraTrees', extra, extra_params),
)
models = [
    (label, BayesSearchCV(estimator, space, cv=inner_cv, iid=False, n_jobs=1, random_state=seed))
    for label, estimator, space in tree_search_spaces
]

# Accumulators filled by the benchmarking cells.
results = []
results1 = []
results2 = []
results3 = []
names = []
names2 = []
# Scorer names: https://scikit-learn.org/stable/modules/model_evaluation.html
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted',
           'precision_weighted', 'recall_weighted']


## Model Benchmarking - all features:
### Tree-based models:

In [7]:
# Benchmark every tuned tree-based model on the full feature set:
# nested cross-validation first, then a single hold-out fit for reference.
for name, model in models:
    nested_cv_results = model_selection.cross_validate(
        model, X, Y, cv=outer_cv, scoring=scoring, error_score="raise"
    )
    # Reuse the balanced-accuracy fold scores computed above. A separate
    # cross_val_score call would repeat the whole nested search, doubling the
    # runtime, and may produce fold scores that differ from the ones printed below.
    nested_cv_results2 = nested_cv_results['test_balanced_accuracy']
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    for label, key in (
        ('Accuracy', 'test_accuracy'),
        ('Balanced Accuracy', 'test_balanced_accuracy'),
        ('F1', 'test_f1_weighted'),
        ('Precision', 'test_precision_weighted'),
        ('Recall', 'test_recall_weighted'),
    ):
        print(name, label + ' Nested CV Average', np.mean(nested_cv_results[key]))

    # Hold-out evaluation: tune on the training split, score on both splits.
    model.fit(X_train, Y_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    print(name, 'Train accuracy:', accuracy_score(Y_train, y_pred_train),
          'Test accuracy:', accuracy_score(Y_test, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train),
          'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
    print(name, 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'),
          'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),
          'Test recall:', recall_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train, y_pred_train, average='weighted'),
          'Test precision:', precision_score(Y_test, y_pred, average='weighted'))
    print('\n')


print('All balanced accuracy results:', results)

XGB Nested CV results for all scores: 
 {'fit_time': array([64.80858803, 63.89787507, 59.02371597, 63.03747582, 65.11796403]), 'score_time': array([0.00574422, 0.00547695, 0.00478101, 0.00519896, 0.00508308]), 'test_accuracy': array([0.59090909, 0.54545455, 0.6       , 0.55384615, 0.49230769]), 'test_balanced_accuracy': array([0.55022321, 0.45921921, 0.54380342, 0.50288462, 0.48075739]), 'test_f1_weighted': array([0.54641332, 0.47677161, 0.57545924, 0.49082777, 0.47723866]), 'test_precision_weighted': array([0.56456612, 0.56895769, 0.58230191, 0.44881165, 0.47459096]), 'test_recall_weighted': array([0.59090909, 0.54545455, 0.6       , 0.55384615, 0.49230769])} 

XGB Accuracy Nested CV Average 0.5565034965034965
XGB Balanced Accuracy Nested CV Average 0.5073775691694907
XGB F1 Nested CV Average 0.5133421198420437
XGB Precision Nested CV Average 0.527845667048939
XGB Recall Nested CV Average 0.5565034965034965
Best Parameters: 
OrderedDict([('learning_rate', 0.01), ('max_depth', 3), ('n_

### Other models:
- These models (KNN, SVC and logistic regression) require feature scaling before fitting.

In [8]:
# Base estimators and BayesSearchCV search spaces for the non-tree models.
# In a search space, a list is a set of categorical choices and a (low, high)
# tuple is a numeric range.

knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors': [7, 9, 11, 13, 15, 17],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}

# liblinear is used because it supports both the l1 and l2 penalties searched below.
lr = LogisticRegression(penalty='l1', solver='liblinear', multi_class='auto', random_state=seed)
lr_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.5, 1, 5, 10],
    'max_iter': [500, 1000, 2500],
}

svc = SVC()
svc_params = {
    'kernel': ['rbf'],
    'C': (1e0, 1e3),
    'gamma': ['scale', 'auto'],
}

# One Bayesian search per estimator. `random_state=seed` makes the sequence of
# sampled hyperparameters repeatable between runs; it was previously unset.
# NOTE(review): `iid` is deprecated in recent scikit-optimize releases - confirm
# against the pinned environment.
other_search_spaces = (
    ('KNN', knn, knn_params),
    ('SVC', svc, svc_params),
    ('LR', lr, lr_params),
)
othermodels = [
    (label, BayesSearchCV(estimator, space, cv=inner_cv, iid=False, n_jobs=1, random_state=seed))
    for label, estimator, space in other_search_spaces
]

# Reset the accumulators filled by the benchmarking cells.
results = []
results1 = []
results2 = []
results3 = []
names = []
names2 = []
# Scorer names: https://scikit-learn.org/stable/modules/model_evaluation.html
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted',
           'precision_weighted', 'recall_weighted']


In [9]:
# Min-max scale every feature, then redo the 80/20 hold-out split on the scaled
# matrix. This overwrites X_train / X_test / Y_train / Y_test from the earlier
# unscaled split.
# NOTE(review): the scaler is fitted on all rows before splitting, so hold-out
# and CV test rows influence the scaling - consider fitting it on training rows only.
X2 = MinMaxScaler().fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(
    X2,
    Y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Benchmark the scaling-sensitive models (KNN, SVC, LR) on the scaled full
# feature set: nested cross-validation first, then a single hold-out fit.
for name, model in othermodels:
    nested_cv_results = model_selection.cross_validate(
        model, X2, Y, cv=outer_cv, scoring=scoring, error_score="raise"
    )
    # Reuse the balanced-accuracy fold scores computed above. A separate
    # cross_val_score call would repeat the whole nested search, doubling the
    # runtime, and may produce fold scores that differ from the ones printed below.
    nested_cv_results2 = nested_cv_results['test_balanced_accuracy']
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    for label, key in (
        ('Accuracy', 'test_accuracy'),
        ('Balanced Accuracy', 'test_balanced_accuracy'),
        ('F1', 'test_f1_weighted'),
        ('Precision', 'test_precision_weighted'),
        ('Recall', 'test_recall_weighted'),
    ):
        print(name, label + ' Nested CV Average', np.mean(nested_cv_results[key]))

    # Hold-out evaluation: tune on the training split, score on both splits.
    model.fit(X_train, Y_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    print(name, 'Train accuracy:', accuracy_score(Y_train, y_pred_train),
          'Test accuracy:', accuracy_score(Y_test, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train),
          'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
    print(name, 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'),
          'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),
          'Test recall:', recall_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train, y_pred_train, average='weighted'),
          'Test precision:', precision_score(Y_test, y_pred, average='weighted'))
    print('\n')


print('All balanced accuracy results:', results)

KNN Nested CV results for all scores: 
 {'fit_time': array([50.26002002, 50.16565108, 48.5950501 , 44.82649708, 50.59111404]), 'score_time': array([0.00375915, 0.0064888 , 0.005862  , 0.00524998, 0.00477386]), 'test_accuracy': array([0.42424242, 0.40909091, 0.44615385, 0.49230769, 0.47692308]), 'test_balanced_accuracy': array([0.31473214, 0.31754032, 0.32834758, 0.44775641, 0.43965517]), 'test_f1_weighted': array([0.36943509, 0.36250358, 0.34621735, 0.42124451, 0.40014036]), 'test_precision_weighted': array([0.36352041, 0.34158981, 0.30372781, 0.38182958, 0.39311966]), 'test_recall_weighted': array([0.42424242, 0.40909091, 0.44615385, 0.49230769, 0.47692308])} 

KNN Accuracy Nested CV Average 0.44974358974358974
KNN Balanced Accuracy Nested CV Average 0.369606325291114
KNN F1 Nested CV Average 0.37990817741218497
KNN Precision Nested CV Average 0.35675745355434574
KNN Recall Nested CV Average 0.44974358974358974
Best Parameters: 
OrderedDict([('metric', 'minkowski'), ('n_neighbors', 17

## Model benchmarking - BorutaShap feature selection

In [7]:
# Feature matrix restricted to the BorutaShap-selected features.
X_boruta_sel = pd.read_csv("2021-11-19_selected_features_training_data.csv", header=0)

# Replace "[", "]" and "<" in column names with "_"; names without any of these
# characters are kept as they are.
sanitized_boruta_columns = []
for column in X_boruta_sel.columns.values:
    needs_cleaning = any(char in str(column) for char in ("[", "]", "<"))
    sanitized_boruta_columns.append(regex.sub("_", column) if needs_cleaning else column)
X_boruta_sel.columns = sanitized_boruta_columns

In [8]:
# 80/20 hold-out split of the selected-feature matrix (random_state fixed at 0).
# The split is not stratified by class.
X_train_boruta, X_test_boruta, Y_train_boruta, Y_test_boruta = train_test_split(
    X_boruta_sel,
    Y,
    test_size=0.2,
    random_state=0,
)

In [13]:
# Benchmark every tuned tree-based model on the BorutaShap-selected features:
# nested cross-validation first, then a single hold-out fit for reference.
# NOTE(review): `results` / `names` are appended to without being reset here, so
# they also keep whatever earlier benchmarking cells stored - confirm this is intended.
for name, model in models:
    nested_cv_results = model_selection.cross_validate(
        model, X_boruta_sel, Y, cv=outer_cv, scoring=scoring, error_score="raise"
    )
    # Reuse the balanced-accuracy fold scores computed above. A separate
    # cross_val_score call would repeat the whole nested search, doubling the
    # runtime, and may produce fold scores that differ from the ones printed below.
    nested_cv_results2 = nested_cv_results['test_balanced_accuracy']
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    for label, key in (
        ('Accuracy', 'test_accuracy'),
        ('Balanced Accuracy', 'test_balanced_accuracy'),
        ('F1', 'test_f1_weighted'),
        ('Precision', 'test_precision_weighted'),
        ('Recall', 'test_recall_weighted'),
    ):
        print(name, label + ' Nested CV Average', np.mean(nested_cv_results[key]))

    # Hold-out evaluation: tune on the training split, score on both splits.
    model.fit(X_train_boruta, Y_train_boruta)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train_boruta)
    y_pred = model.predict(X_test_boruta)
    print(name, 'Train accuracy:', accuracy_score(Y_train_boruta, y_pred_train),
          'Test accuracy:', accuracy_score(Y_test_boruta, y_pred))
    # Score against Y_train_boruta (the labels this model was fitted on); the
    # original compared with Y_train left over from a different split cell.
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train_boruta, y_pred_train),
          'Test balanced accuracy:', balanced_accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train F1', f1_score(Y_train_boruta, y_pred_train, average='weighted'),
          'Test F1:', f1_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train_boruta, y_pred_train, average='weighted'),
          'Test recall:', recall_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train_boruta, y_pred_train, average='weighted'),
          'Test precision:', precision_score(Y_test_boruta, y_pred, average='weighted'))
    print('\n')


print('All balanced accuracy results:', results)

XGB Nested CV results for all scores: 
 {'fit_time': array([62.71318293, 54.88111281, 66.87375093, 59.91220093, 64.24479914]), 'score_time': array([0.00586295, 0.00561619, 0.00560021, 0.00496197, 0.00529289]), 'test_accuracy': array([0.62121212, 0.51515152, 0.67692308, 0.50769231, 0.64615385]), 'test_balanced_accuracy': array([0.56994048, 0.48895711, 0.59170228, 0.48466117, 0.6253592 ]), 'test_f1_weighted': array([0.5733198 , 0.48261395, 0.63828203, 0.49217391, 0.63739287]), 'test_precision_weighted': array([0.60576599, 0.47752936, 0.7467366 , 0.51141167, 0.66503945]), 'test_recall_weighted': array([0.62121212, 0.51515152, 0.67692308, 0.50769231, 0.64615385])} 

XGB Accuracy Nested CV Average 0.5934265734265733
XGB Balanced Accuracy Nested CV Average 0.5521240468786353
XGB F1 Nested CV Average 0.5647565143307052
XGB Precision Nested CV Average 0.6012966123238168
XGB Recall Nested CV Average 0.5934265734265733
Best Parameters: 
OrderedDict([('learning_rate', 0.03772101259984952), ('max_

In [14]:
# Min-max scale the selected features, then redo the 80/20 hold-out split on the
# scaled matrix. This overwrites the *_boruta split variables created from the
# unscaled data.
# NOTE(review): the scaler is fitted on all rows before splitting, so hold-out
# and CV test rows influence the scaling - consider fitting it on training rows only.
X2_boruta_sel = MinMaxScaler().fit_transform(X_boruta_sel)
X_train_boruta, X_test_boruta, Y_train_boruta, Y_test_boruta = train_test_split(
    X2_boruta_sel,
    Y,
    test_size=0.2,
    random_state=0,
)

In [15]:
# Benchmark the scaling-sensitive models (KNN, SVC, LR) on the scaled
# BorutaShap-selected features: nested cross-validation, then a hold-out fit.
# NOTE(review): `results` / `names` are appended to without being reset here, so
# they also keep whatever earlier benchmarking cells stored - confirm this is intended.
for name, model in othermodels:
    nested_cv_results = model_selection.cross_validate(
        model, X2_boruta_sel, Y, cv=outer_cv, scoring=scoring, error_score="raise"
    )
    # Reuse the balanced-accuracy fold scores computed above. A separate
    # cross_val_score call would repeat the whole nested search, doubling the
    # runtime, and may produce fold scores that differ from the ones printed below.
    nested_cv_results2 = nested_cv_results['test_balanced_accuracy']
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    for label, key in (
        ('Accuracy', 'test_accuracy'),
        ('Balanced Accuracy', 'test_balanced_accuracy'),
        ('F1', 'test_f1_weighted'),
        ('Precision', 'test_precision_weighted'),
        ('Recall', 'test_recall_weighted'),
    ):
        print(name, label + ' Nested CV Average', np.mean(nested_cv_results[key]))

    # Hold-out evaluation: tune on the training split, score on both splits.
    model.fit(X_train_boruta, Y_train_boruta)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train_boruta)
    y_pred = model.predict(X_test_boruta)
    print(name, 'Train accuracy:', accuracy_score(Y_train_boruta, y_pred_train),
          'Test accuracy:', accuracy_score(Y_test_boruta, y_pred))
    # Score against Y_train_boruta (the labels this model was fitted on); the
    # original compared with Y_train left over from a different split cell.
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train_boruta, y_pred_train),
          'Test balanced accuracy:', balanced_accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train F1', f1_score(Y_train_boruta, y_pred_train, average='weighted'),
          'Test F1:', f1_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train_boruta, y_pred_train, average='weighted'),
          'Test recall:', recall_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train_boruta, y_pred_train, average='weighted'),
          'Test precision:', precision_score(Y_test_boruta, y_pred, average='weighted'))
    print('\n')


print('All balanced accuracy results:', results)

KNN Nested CV results for all scores: 
 {'fit_time': array([50.18574595, 51.53358197, 49.24860287, 47.97537804, 49.53082395]), 'score_time': array([0.00379801, 0.00373316, 0.00505018, 0.00517917, 0.00524998]), 'test_accuracy': array([0.56060606, 0.46969697, 0.61538462, 0.55384615, 0.50769231]), 'test_balanced_accuracy': array([0.57217262, 0.40281342, 0.52475071, 0.49876374, 0.47834565]), 'test_f1_weighted': array([0.51483539, 0.42741925, 0.58610409, 0.52698804, 0.47855214]), 'test_precision_weighted': array([0.50291944, 0.44113117, 0.63838303, 0.56445601, 0.52307692]), 'test_recall_weighted': array([0.56060606, 0.46969697, 0.61538462, 0.55384615, 0.50769231])} 

KNN Accuracy Nested CV Average 0.5414452214452214
KNN Balanced Accuracy Nested CV Average 0.49536922651772486
KNN F1 Nested CV Average 0.5067797825335347
KNN Precision Nested CV Average 0.5339933131134902
KNN Recall Nested CV Average 0.5414452214452214
Best Parameters: 
OrderedDict([('metric', 'manhattan'), ('n_neighbors', 9), 

In [9]:
# Final estimators with fixed hyperparameters.
# NOTE(review): the values appear to be copied from the "Best Parameters" output
# of the BorutaShap searches (the XGB learning rate and the KNN settings match
# the printed output) - confirm for the remaining models.
xgb = xgboost.XGBClassifier(
    objective='multi:softmax',
    num_class=4,
    eval_metric='mlogloss',
    learning_rate=0.03772101259984952,
    max_depth=3,
    n_estimators=42,
    reg_alpha=1,
    reg_lambda=1,
    random_state=seed,
)

gb = GradientBoostingClassifier(
    criterion='mse',
    learning_rate=0.19910731251957545,
    max_depth=3,
    max_features='sqrt',
    n_estimators=23,
    random_state=seed,
)

rf = RandomForestClassifier(
    criterion='entropy',
    max_depth=4,
    max_features='sqrt',
    n_estimators=38,
    random_state=seed,
)

dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=3,
    max_features='log2',
    random_state=seed,
)

extra = ExtraTreesClassifier(
    criterion='gini',
    max_depth=4,
    max_features='sqrt',
    n_estimators=24,
    random_state=seed,
)

knn = KNeighborsClassifier(n_neighbors=9, weights='distance', metric='manhattan')

# Bound to `svm` here, whereas the search cell used the name `svc`.
svm = SVC(kernel='rbf', C=422.07858224451616, gamma='auto')

lr = LogisticRegression(
    penalty='l2',
    C=5.0,
    max_iter=2500,
    solver='liblinear',
    multi_class='auto',
    random_state=seed,
)


In [10]:
# Display names for the encoded classes 1-4, listed in encoding order.
target_names = ['most likely', 'probable', 'possible', 'least likely']

# Fit XGBoost on the selected-feature training split and report per-class
# metrics on the hold-out split.
xgb.fit(X_train_boruta, Y_train_boruta)
predictions = list(xgb.predict(X_test_boruta))
print(
    classification_report(
        Y_test_boruta,
        predictions,
        # Pin the encoded class ids so each target name stays attached to its
        # class even if one class is missing from the hold-out split; without
        # `labels`, a missing class makes the target_names length mismatch fail.
        labels=[1, 2, 3, 4],
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

 most likely       0.17      0.17      0.17         6
    probable       0.65      0.88      0.75        32
    possible       0.67      0.29      0.40        21
least likely       0.88      1.00      0.93         7

    accuracy                           0.64        66
   macro avg       0.59      0.58      0.56        66
weighted avg       0.64      0.64      0.60        66



In [11]:
# Fit gradient boosting on the selected-feature training split and report
# per-class metrics on the hold-out split.
gb.fit(X_train_boruta, Y_train_boruta)
predictions = list(gb.predict(X_test_boruta))
print(
    classification_report(
        Y_test_boruta,
        predictions,
        # Pin the encoded class ids (1-4) so each target name stays attached to
        # its class even if one class is missing from the hold-out split.
        labels=[1, 2, 3, 4],
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

 most likely       0.29      0.33      0.31         6
    probable       0.62      0.72      0.67        32
    possible       0.44      0.33      0.38        21
least likely       1.00      0.86      0.92         7

    accuracy                           0.58        66
   macro avg       0.59      0.56      0.57        66
weighted avg       0.57      0.58      0.57        66



In [12]:
# Fit the random forest on the selected-feature training split and report
# per-class metrics on the hold-out split.
rf.fit(X_train_boruta, Y_train_boruta)
predictions = list(rf.predict(X_test_boruta))
print(
    classification_report(
        Y_test_boruta,
        predictions,
        # Pin the encoded class ids (1-4) so each target name stays attached to
        # its class even if one class is missing from the hold-out split.
        labels=[1, 2, 3, 4],
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

 most likely       0.33      0.17      0.22         6
    probable       0.63      1.00      0.77        32
    possible       0.75      0.14      0.24        21
least likely       0.88      1.00      0.93         7

    accuracy                           0.65        66
   macro avg       0.65      0.58      0.54        66
weighted avg       0.67      0.65      0.57        66



In [13]:
# Fit the decision tree on the selected-feature training split and report
# per-class metrics on the hold-out split.
dt.fit(X_train_boruta, Y_train_boruta)
predictions = list(dt.predict(X_test_boruta))
print(
    classification_report(
        Y_test_boruta,
        predictions,
        # Pin the encoded class ids (1-4) so each target name stays attached to
        # its class even if one class is missing from the hold-out split.
        labels=[1, 2, 3, 4],
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

 most likely       0.00      0.00      0.00         6
    probable       0.54      0.81      0.65        32
    possible       0.31      0.24      0.27        21
least likely       1.00      0.29      0.44         7

    accuracy                           0.50        66
   macro avg       0.46      0.33      0.34        66
weighted avg       0.47      0.50      0.45        66



In [14]:
# Fit the extra-trees ensemble on the selected-feature training split and report
# per-class metrics on the hold-out split.
extra.fit(X_train_boruta, Y_train_boruta)
predictions = list(extra.predict(X_test_boruta))
print(
    classification_report(
        Y_test_boruta,
        predictions,
        # Pin the encoded class ids (1-4) so each target name stays attached to
        # its class even if one class is missing from the hold-out split.
        labels=[1, 2, 3, 4],
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

 most likely       0.00      0.00      0.00         6
    probable       0.54      0.91      0.67        32
    possible       0.43      0.14      0.21        21
least likely       1.00      0.57      0.73         7

    accuracy                           0.55        66
   macro avg       0.49      0.41      0.40        66
weighted avg       0.50      0.55      0.47        66



In [15]:
# Fit k-nearest neighbours on the selected-feature training split and report
# per-class metrics on the hold-out split.
# NOTE(review): whether X_train_boruta is scaled here depends on whether the
# MinMaxScaler cell was run beforehand - confirm the intended input.
knn.fit(X_train_boruta, Y_train_boruta)
predictions = list(knn.predict(X_test_boruta))
print(
    classification_report(
        Y_test_boruta,
        predictions,
        # Pin the encoded class ids (1-4) so each target name stays attached to
        # its class even if one class is missing from the hold-out split.
        labels=[1, 2, 3, 4],
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

 most likely       0.20      0.17      0.18         6
    probable       0.59      0.69      0.64        32
    possible       0.56      0.48      0.51        21
least likely       0.67      0.57      0.62         7

    accuracy                           0.56        66
   macro avg       0.50      0.48      0.49        66
weighted avg       0.55      0.56      0.55        66



In [16]:
# Fit the RBF support-vector classifier on the selected-feature training split
# and report per-class metrics on the hold-out split.
# NOTE(review): whether X_train_boruta is scaled here depends on whether the
# MinMaxScaler cell was run beforehand - confirm the intended input.
svm.fit(X_train_boruta, Y_train_boruta)
predictions = list(svm.predict(X_test_boruta))
print(
    classification_report(
        Y_test_boruta,
        predictions,
        # Pin the encoded class ids (1-4) so each target name stays attached to
        # its class even if one class is missing from the hold-out split.
        labels=[1, 2, 3, 4],
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

 most likely       0.00      0.00      0.00         6
    probable       0.53      0.66      0.58        32
    possible       0.17      0.10      0.12        21
least likely       1.00      0.57      0.73         7

    accuracy                           0.41        66
   macro avg       0.42      0.33      0.36        66
weighted avg       0.41      0.41      0.40        66



In [17]:
# Fit logistic regression on the selected-feature training split and report
# per-class metrics on the hold-out split.
# NOTE(review): whether X_train_boruta is scaled here depends on whether the
# MinMaxScaler cell was run beforehand - confirm the intended input.
lr.fit(X_train_boruta, Y_train_boruta)
predictions = list(lr.predict(X_test_boruta))
print(
    classification_report(
        Y_test_boruta,
        predictions,
        # Pin the encoded class ids (1-4) so each target name stays attached to
        # its class even if one class is missing from the hold-out split.
        labels=[1, 2, 3, 4],
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

 most likely       0.00      0.00      0.00         6
    probable       0.55      0.88      0.67        32
    possible       0.43      0.14      0.21        21
least likely       0.67      0.57      0.62         7

    accuracy                           0.53        66
   macro avg       0.41      0.40      0.38        66
weighted avg       0.47      0.53      0.46        66

