# __Machine Learning for Prioritizing Blood Pressure Genes__ 

## Import modules:

In [1]:
import re

import numpy as np
import pandas as pd
from numpy import sort
from scipy.cluster import hierarchy
from scipy.stats import spearmanr

regex = re.compile(r"\[|\]|<", re.IGNORECASE)

import seaborn as sns
import shap
import statsmodels.api as sm

%matplotlib inline
%config InlineBackend.figure_format ='retina'
import statsmodels.stats.api as sms
import xgboost
from sklearn import datasets, metrics, model_selection, preprocessing
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score
)
from sklearn.metrics import *
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    RepeatedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
    validation_curve,
)


from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC, RandomOverSampler
from imblearn.base import BaseSampler
from imblearn.datasets import make_imbalance


from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from skopt.plots import plot_convergence

sns.set_style("darkgrid")
sns.mpl.rcParams["figure.figsize"] = (15.0, 9.0)

import warnings

import matplotlib
import matplotlib.pyplot as plt

warnings.simplefilter(action="ignore", category=FutureWarning)
from warnings import filterwarnings
warnings.filterwarnings('ignore')

filterwarnings("ignore")

seed = 0

# Load data:

In [2]:
data = pd.read_csv("2021-11-19_training_cleaned.csv", header=0, sep=",")

In [3]:
data['BPlabel_encoded'] = data['BPlabel'].map( {'most likely':1,'probable':2, 'possible':3, 'least likely':4})
Y = data["BPlabel_encoded"] 
Y2 = Y
data = data.drop(["BPlabel"],1)
data.shape  #Data has IPA and ensembl features without possible label

(327, 109)

In [4]:
X = pd.read_csv("2021-11-19_imputed_training_data.csv", header=0)
X.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X.columns.values
]

In [5]:
print('Before OverSampling, the shape of X: {}'.format(X.shape))
print('Before OverSampling, the shape of y: {} \n'.format(Y.shape))

sm = SMOTE(random_state=seed)
X, Y = sm.fit_resample(X, Y)

print('After OverSampling, the shape of X: {}'.format(X.shape))
print('After OverSampling, the shape of y: {} \n'.format(Y.shape))

print("After OverSampling, counts of label '1': {}".format(sum(Y==1)))
print("After OverSampling, counts of label '2': {}".format(sum(Y==2)))
print("After OverSampling, counts of label '3': {}".format(sum(Y==3)))
print("After OverSampling, counts of label '4': {}".format(sum(Y==4)))

Before OverSampling, the shape of X: (327, 107)
Before OverSampling, the shape of y: (327,) 

After OverSampling, the shape of X: (596, 107)
After OverSampling, the shape of y: (596,) 

After OverSampling, counts of label '1': 149
After OverSampling, counts of label '2': 149
After OverSampling, counts of label '3': 149
After OverSampling, counts of label '4': 149


In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

# Building Models:
- Parameter tuning with Bayesian optimization over hyper parameters

In [10]:
xgb = xgboost.XGBClassifier(random_state=seed, num_class=3, objective='multi:softmax', eval_metric='mlogloss') 
xgb_params = {
    'max_depth':  (1, 4), #Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit.
    'learning_rate': (0.01, 0.5, 'log-uniform'),  #Step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features
    'n_estimators':  (10, 50), #Number of gradient boosted trees. Equivalent to number of boosting rounds.
    'reg_alpha':  (1, 10, 'log-uniform'), #L1 regularization term on weights. Increasing this value will make model more conservative.
    'reg_lambda':  (1, 10, 'log-uniform')} #L2 regularization term on weights. Increasing this value will make model more conservative.


gb = GradientBoostingClassifier(random_state=seed)
gb_params = {
    'learning_rate': (0.01, 0.5),
    'max_depth': (1, 4),
    "max_features":["log2","sqrt", "auto"],
    "criterion": ["friedman_mse", "mse", "mae"],
    'n_estimators': (10, 50)
    }

rf = RandomForestClassifier(random_state=seed)
rf_params={'n_estimators': (10, 50), 
             'max_features': ['auto', 'sqrt', 'log2'],
             'max_depth' : (1, 4),
             'criterion' :["gini", "entropy"]} 

dt = DecisionTreeClassifier(random_state=seed)
dt_params= {"criterion": ["gini", "entropy"],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth' : (1, 4)}

extra = ExtraTreesClassifier(random_state=seed)
extra_params ={'n_estimators': (10, 50), 
             'max_features': ['auto', 'sqrt', 'log2'],
             'max_depth' : (1, 4),
             'criterion' :["gini", "entropy"]}


inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

models = []

models.append(('XGB', BayesSearchCV(xgb, xgb_params, cv=inner_cv, iid=False, n_jobs=1))) 
models.append(('RF', BayesSearchCV(rf, rf_params, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('GB', BayesSearchCV(gb, gb_params, cv=inner_cv,iid=False, n_jobs=1)))
models.append(('DT', BayesSearchCV(dt, dt_params, cv=inner_cv, iid=False, n_jobs=1)))
models.append(('ExtraTrees', BayesSearchCV(extra, extra_params, cv=inner_cv, iid=False, n_jobs=1)))
#models.append(('KNN', BayesSearchCV(knn, knn_params, cv=inner_cv, iid=False, n_jobs=1)))
#models.append(('SVC', BayesSearchCV(svc, svc_params, cv=inner_cv, iid=False, n_jobs=1)))

results = []
results1 = []
results2 = []
results3 = []
names = []
names2 =[]
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted', 
          'precision_weighted','recall_weighted'] #https://scikit-learn.org/stable/modules/model_evaluation.html


## Model Benchmarking - all features:
### Tree-based models:

In [11]:
for name, model in models:
    nested_cv_results = model_selection.cross_validate(model, X , Y, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X , Y, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train, Y_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    print(name, 'Train accuracy:', accuracy_score(Y_train, y_pred_train), 'Test accuracy:', accuracy_score(Y_test, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
    print(name, 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results)    

XGB Nested CV results for all scores: 
 {'fit_time': array([75.04053569, 84.01658392, 76.79741311, 76.62657213, 81.51650786]), 'score_time': array([0.00570416, 0.00551081, 0.00544214, 0.0060091 , 0.00647497]), 'test_accuracy': array([0.75833333, 0.74789916, 0.80672269, 0.74789916, 0.76470588]), 'test_balanced_accuracy': array([0.76560407, 0.75836406, 0.79787212, 0.75348039, 0.76830357]), 'test_f1_weighted': array([0.75507565, 0.74100258, 0.79958552, 0.74210849, 0.76311296]), 'test_precision_weighted': array([0.75415675, 0.75212894, 0.79864559, 0.74619301, 0.76416492]), 'test_recall_weighted': array([0.75833333, 0.74789916, 0.80672269, 0.74789916, 0.76470588])} 

XGB Accuracy Nested CV Average 0.7651120448179272
XGB Balanced Accuracy Nested CV Average 0.7687248415772084
XGB F1 Nested CV Average 0.7601770408870026
XGB Precision Nested CV Average 0.76305784270752
XGB Recall Nested CV Average 0.7651120448179272
Best Parameters: 
OrderedDict([('learning_rate', 0.22364286948058623), ('max_de

### Others models:
- all other models require feature scaling before running

In [12]:
knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors':[7,9,11,13,15,17],
    'weights' : ['uniform','distance'],
    'metric' : ['euclidean','manhattan', 'minkowski']}


lr = LogisticRegression(penalty='l1', solver='liblinear',multi_class='auto',random_state=seed)
lr_params= {
    'penalty':['l1', 'l2'], 
    'C': [0.5, 1, 5, 10], 
    'max_iter':[500, 1000, 2500]}

svc = SVC()
svc_params = {
    'kernel': ['rbf'],
   'C': (1e0, 1e3),
   'gamma': ['scale', 'auto']}

othermodels = []

othermodels.append(('KNN', BayesSearchCV(knn, knn_params, cv=inner_cv, iid=False, n_jobs=1)))
othermodels.append(('SVC', BayesSearchCV(svc, svc_params, cv=inner_cv, iid=False, n_jobs=1)))
othermodels.append(('LR', BayesSearchCV(lr, lr_params, cv=inner_cv, iid=False, n_jobs=1)))

results = []
results1 = []
results2 = []
results3 = []
names = []
names2 =[]
scoring = ['accuracy', 'balanced_accuracy', 'f1_weighted', 
          'precision_weighted','recall_weighted'] #https://scikit-learn.org/stable/modules/model_evaluation.html


In [13]:
X2 = MinMaxScaler().fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X2, Y, test_size=0.2, random_state=0)

In [14]:
for name, model in othermodels:
    nested_cv_results = model_selection.cross_validate(model, X2 , Y, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X2 , Y, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train, Y_train)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)
    print(name, 'Train accuracy:', accuracy_score(Y_train, y_pred_train), 'Test accuracy:', accuracy_score(Y_test, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test, y_pred))
    print(name, 'Train F1', f1_score(Y_train, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results) 

KNN Nested CV results for all scores: 
 {'fit_time': array([45.73063397, 45.01727414, 46.19174695, 47.83670712, 44.06176591]), 'score_time': array([0.00728798, 0.00749803, 0.00714302, 0.00731897, 0.00761104]), 'test_accuracy': array([0.80833333, 0.7394958 , 0.7394958 , 0.72268908, 0.71428571]), 'test_balanced_accuracy': array([0.79708623, 0.76857143, 0.72810324, 0.74244281, 0.71733631]), 'test_f1_weighted': array([0.80068617, 0.69678406, 0.72377004, 0.67413358, 0.68180516]), 'test_precision_weighted': array([0.80816983, 0.7585699 , 0.73178033, 0.76631749, 0.72905632]), 'test_recall_weighted': array([0.80833333, 0.7394958 , 0.7394958 , 0.72268908, 0.71428571])} 

KNN Accuracy Nested CV Average 0.7448599439775911
KNN Balanced Accuracy Nested CV Average 0.7507080033266174
KNN F1 Nested CV Average 0.7154358019534685
KNN Precision Nested CV Average 0.7587787727645342
KNN Recall Nested CV Average 0.7448599439775911
Best Parameters: 
OrderedDict([('metric', 'manhattan'), ('n_neighbors', 7), (

## Model benchmarking - BorutaShap feature selection

In [7]:
X_boruta_sel = pd.read_csv("2021-11-19_selected_features_training_data.csv", header=0)
X_boruta_sel.columns = [
    regex.sub("_", col) if any(x in str(col) for x in set(("[", "]", "<"))) else col
    for col in X_boruta_sel.columns.values
]

In [8]:
print('Before OverSampling, the shape of X: {}'.format(X_boruta_sel.shape))
print('Before OverSampling, the shape of y: {} \n'.format(Y2.shape))

sm = SMOTE(random_state=seed)
X_boruta_sel, Y2 = sm.fit_resample(X_boruta_sel, Y2)

print('After OverSampling, the shape of X: {}'.format(X_boruta_sel.shape))
print('After OverSampling, the shape of y: {} \n'.format(Y2.shape))

print("After OverSampling, counts of label '1': {}".format(sum(Y2==1)))
print("After OverSampling, counts of label '2': {}".format(sum(Y2==2)))
print("After OverSampling, counts of label '3': {}".format(sum(Y2==3)))
print("After OverSampling, counts of label '4': {}".format(sum(Y2==4)))

Before OverSampling, the shape of X: (327, 6)
Before OverSampling, the shape of y: (327,) 

After OverSampling, the shape of X: (596, 6)
After OverSampling, the shape of y: (596,) 

After OverSampling, counts of label '1': 149
After OverSampling, counts of label '2': 149
After OverSampling, counts of label '3': 149
After OverSampling, counts of label '4': 149


In [9]:
X_train_boruta, X_test_boruta, Y_train_boruta, Y_test_boruta = train_test_split(X_boruta_sel, Y2, test_size=0.2, random_state=0)

In [18]:
for name, model in models:
    nested_cv_results = model_selection.cross_validate(model, X_boruta_sel , Y2, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X_boruta_sel , Y2, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train_boruta, Y_train_boruta)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train_boruta)
    y_pred = model.predict(X_test_boruta)
    print(name, 'Train accuracy:', accuracy_score(Y_train_boruta, y_pred_train), 'Test accuracy:', accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train F1', f1_score(Y_train_boruta, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train_boruta, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test_boruta, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train_boruta, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test_boruta, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results)    

XGB Nested CV results for all scores: 
 {'fit_time': array([54.93386698, 50.17822003, 53.28910208, 65.77923989, 52.71464276]), 'score_time': array([0.005162  , 0.00528002, 0.00525904, 0.00514293, 0.00522232]), 'test_accuracy': array([0.65      , 0.7394958 , 0.76470588, 0.70588235, 0.71428571]), 'test_balanced_accuracy': array([0.67418707, 0.74443548, 0.75660199, 0.69735294, 0.71863198]), 'test_f1_weighted': array([0.65265543, 0.73864912, 0.76338582, 0.70634345, 0.70808351]), 'test_precision_weighted': array([0.67290487, 0.7414081 , 0.7630932 , 0.7110463 , 0.71162077]), 'test_recall_weighted': array([0.65      , 0.7394958 , 0.76470588, 0.70588235, 0.71428571])} 

XGB Accuracy Nested CV Average 0.714873949579832
XGB Balanced Accuracy Nested CV Average 0.7182418911154494
XGB F1 Nested CV Average 0.7138234659837958
XGB Precision Nested CV Average 0.7200146474711575
XGB Recall Nested CV Average 0.714873949579832
Best Parameters: 
OrderedDict([('learning_rate', 0.49999999999999994), ('max_de

In [19]:
X2_boruta_sel = MinMaxScaler().fit_transform(X_boruta_sel)
X_train_boruta, X_test_boruta, Y_train_boruta, Y_test_boruta = train_test_split(X2_boruta_sel, Y2, test_size=0.2, random_state=0)

In [20]:
for name, model in othermodels:
    nested_cv_results = model_selection.cross_validate(model, X2_boruta_sel , Y2, cv=outer_cv, scoring=scoring, error_score="raise")
    nested_cv_results2 = model_selection.cross_val_score(model, X2_boruta_sel , Y2, cv=outer_cv, scoring='balanced_accuracy', error_score="raise")
    results.append(nested_cv_results2)
    names.append(name)
    print(name, 'Nested CV results for all scores:', '\n', nested_cv_results, '\n')
    print(name, 'Accuracy Nested CV Average', np.mean(nested_cv_results['test_accuracy']))
    print(name, 'Balanced Accuracy Nested CV Average', np.mean(nested_cv_results['test_balanced_accuracy'] ))
    print(name, 'F1 Nested CV Average', np.mean(nested_cv_results['test_f1_weighted'] ))
    print(name, 'Precision Nested CV Average', np.mean(nested_cv_results['test_precision_weighted'] ))
    print(name, 'Recall Nested CV Average', np.mean(nested_cv_results['test_recall_weighted'] ))
    model.fit(X_train_boruta, Y_train_boruta)
    print("Best Parameters: \n{}\n".format(model.best_params_))
    print('Non-nested CV Results:')
    y_pred_train = model.predict(X_train_boruta)
    y_pred = model.predict(X_test_boruta)
    print(name, 'Train accuracy:', accuracy_score(Y_train_boruta, y_pred_train), 'Test accuracy:', accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train balanced accuracy:', balanced_accuracy_score(Y_train, y_pred_train), 'Test balanced accuracy:', balanced_accuracy_score(Y_test_boruta, y_pred))
    print(name, 'Train F1', f1_score(Y_train_boruta, y_pred_train, average='weighted'), 'Test F1:', f1_score(Y_test_boruta, y_pred, average='weighted'))
    print(name, 'Train recall:', recall_score(Y_train_boruta, y_pred_train, average='weighted'),'Test recall:', recall_score(Y_test_boruta, y_pred,average='weighted'))
    print(name, 'Train precision:', precision_score(Y_train_boruta, y_pred_train,average='weighted'), 'Test precision:', precision_score(Y_test_boruta, y_pred,average='weighted'))
    print('\n')

    
print('All balanced accuracy results:', results)    

KNN Nested CV results for all scores: 
 {'fit_time': array([44.11364293, 46.92315698, 47.5888021 , 43.04875612, 45.40666294]), 'score_time': array([0.00383282, 0.00414491, 0.00374079, 0.00370193, 0.00381899]), 'test_accuracy': array([0.63333333, 0.74789916, 0.76470588, 0.68907563, 0.71428571]), 'test_balanced_accuracy': array([0.6753136 , 0.75193548, 0.75707576, 0.68487745, 0.72114122]), 'test_f1_weighted': array([0.63262374, 0.74186013, 0.75957142, 0.68307302, 0.70802666]), 'test_precision_weighted': array([0.67764661, 0.74333925, 0.76738782, 0.69472989, 0.707024  ]), 'test_recall_weighted': array([0.63333333, 0.74789916, 0.76470588, 0.68907563, 0.71428571])} 

KNN Accuracy Nested CV Average 0.709859943977591
KNN Balanced Accuracy Nested CV Average 0.7180687033708228
KNN F1 Nested CV Average 0.7050309936330413
KNN Precision Nested CV Average 0.7180255144535623
KNN Recall Nested CV Average 0.709859943977591
Best Parameters: 
OrderedDict([('metric', 'manhattan'), ('n_neighbors', 7), ('w

In [15]:
xgb = xgboost.XGBClassifier(random_state=seed, num_class=3, objective='multi:softmax', eval_metric='mlogloss', learning_rate=0.49999999999999994,
                           max_depth=3, n_estimators=32, reg_alpha=1, reg_lambda=1) 


gb = GradientBoostingClassifier(random_state=seed, criterion='friedman_mse', learning_rate=0.28483530005634594, max_depth=4, max_features='sqrt',
                               n_estimators=41)

rf = RandomForestClassifier(random_state=seed, criterion='gini', max_depth=4, max_features='auto', n_estimators=25)

dt = DecisionTreeClassifier(random_state=seed, criterion='entropy', max_depth=4, max_features='sqrt')

extra = ExtraTreesClassifier(random_state=seed, criterion='entropy', max_depth=4, max_features='sqrt', n_estimators=50)

knn = KNeighborsClassifier(metric='manhattan', n_neighbors=7, weights='distance')

svm = SVC(C=1000, gamma='scale', kernel='rbf')

lr = LogisticRegression(solver='liblinear',multi_class='auto',random_state=seed, C=0.5, max_iter=1000, penalty='l1')


In [16]:
target_names = ['most likely', 'probable', 'possible', 'least likely']
xgb.fit(X_train_boruta, Y_train_boruta)
predictions = list(xgb.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.83      0.57      0.68        42
    probable       0.50      0.48      0.49        23
    possible       0.52      0.68      0.59        34
least likely       0.84      1.00      0.91        21

    accuracy                           0.66       120
   macro avg       0.67      0.68      0.67       120
weighted avg       0.68      0.66      0.66       120



In [17]:
gb.fit(X_train_boruta, Y_train_boruta)
predictions = list(gb.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.79      0.55      0.65        42
    probable       0.48      0.52      0.50        23
    possible       0.48      0.59      0.53        34
least likely       0.88      1.00      0.93        21

    accuracy                           0.63       120
   macro avg       0.66      0.66      0.65       120
weighted avg       0.66      0.63      0.64       120



In [18]:
rf.fit(X_train_boruta, Y_train_boruta)
predictions = list(rf.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.74      0.40      0.52        42
    probable       0.36      0.57      0.44        23
    possible       0.53      0.59      0.56        34
least likely       0.91      1.00      0.95        21

    accuracy                           0.59       120
   macro avg       0.63      0.64      0.62       120
weighted avg       0.64      0.59      0.59       120



In [19]:
dt.fit(X_train_boruta, Y_train_boruta)
predictions = list(dt.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.57      0.19      0.29        42
    probable       0.24      0.22      0.23        23
    possible       0.45      0.82      0.58        34
least likely       0.91      1.00      0.95        21

    accuracy                           0.52       120
   macro avg       0.54      0.56      0.51       120
weighted avg       0.53      0.52      0.48       120



In [20]:
extra.fit(X_train_boruta, Y_train_boruta)
predictions = list(extra.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.71      0.40      0.52        42
    probable       0.40      0.78      0.53        23
    possible       0.50      0.18      0.26        34
least likely       0.54      1.00      0.70        21

    accuracy                           0.52       120
   macro avg       0.54      0.59      0.50       120
weighted avg       0.56      0.52      0.48       120



In [21]:
knn.fit(X_train_boruta, Y_train_boruta)
predictions = list(knn.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.83      0.60      0.69        42
    probable       0.38      0.39      0.38        23
    possible       0.58      0.74      0.65        34
least likely       0.87      0.95      0.91        21

    accuracy                           0.66       120
   macro avg       0.66      0.67      0.66       120
weighted avg       0.68      0.66      0.66       120



In [22]:
svm.fit(X_train_boruta, Y_train_boruta)
predictions = list(svm.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.61      0.26      0.37        42
    probable       0.25      0.65      0.36        23
    possible       0.50      0.12      0.19        34
least likely       0.48      0.76      0.59        21

    accuracy                           0.38       120
   macro avg       0.46      0.45      0.38       120
weighted avg       0.49      0.38      0.35       120



In [23]:
lr.fit(X_train_boruta, Y_train_boruta)
predictions = list(lr.predict(X_test_boruta))
print(classification_report(Y_test_boruta, predictions, target_names=target_names))

              precision    recall  f1-score   support

 most likely       0.62      0.50      0.55        42
    probable       0.32      0.70      0.44        23
    possible       1.00      0.06      0.11        34
least likely       0.56      0.90      0.69        21

    accuracy                           0.48       120
   macro avg       0.62      0.54      0.45       120
weighted avg       0.66      0.48      0.43       120

