In [3]:
import pandas as pd, numpy as np
import os, json
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score, confusion_matrix, accuracy_score, f1_score, precision_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from skopt import BayesSearchCV, Optimizer
from skopt.space import Real, Categorical, Integer
import xgboost as xgb

In [19]:
# for experiment in experiments, set pipeline flags and run bayesian optimisation on each model
# record results in new scoreboard - are there any improvements?
# if yes, use these models for ensemble, otherwise use gridsearch acquired models for ensemble

# to dynamically toggle flags for each experiment

experiments = {'snv_FS_svmsmote':{'SNV':True, 'SVMSMOTE':True, 'FEATURE_SELECT':True}, 
               'kmeanssmote':{'KMEANSSMOTE':True}, 
               'augmentedv3_FS':{'AUGMENTEDv3':True, 'FEATURE_SELECT':True}, 
               'bordersmote':{'BORDERSMOTE':True}, 
               'snv_augmentedv3':{'SNV':True, 'AUGMENTEDv3':True}, 
               'augmentedv3':{'AUGMENTEDv3':True}, 
               'adasynsmote':{'ADASYNSMOTE':True}, 
               'smote':{'SMOTE':True}, 
               'raw':{}}

for experiment in experiments:
    SNV = False
    FEATURE_SELECT = False
    FEATURE_SELECTv2 = False
    BALANCED = False
    MY_BALANCED = False
    MY_BALANCEDv2 = False
    AUGMENTED = False # oversampling of neoplasia
    AUGMENTEDv2 = False # actual augmented
    AUGMENTEDv3 = False # augmented neoplasia only
    SMOTE = False
    SVMSMOTE = False
    KMEANSSMOTE = False
    ADASYNSMOTE = False
    BORDERSMOTE = False
    
    # set flags
    try:
        SNV = experiments[experiment]['SNV']
    except:
        pass
        # print(f'SNV not in {experiment}')

    try:
        FEATURE_SELECT = experiments[experiment]['FEATURE_SELECT']
    except:
        pass
        # print(f'FEATURE_SELECT not in {experiment}')

    try:
        FEATURE_SELECTv2 = experiments[experiment]['FEATURE_SELECTv2']
    except:
        pass
        # print(f'FEATURE_SELECTv2 not in {experiment}')

    try:
        BALANCED = experiments[experiment]['BALANCED']
    except:
        pass
        # print(f'BALANCED not in {experiment}')

    try:
        AUGMENTED = experiments[experiment]['AUGMENTED']
    except:
        pass
        # print(f'AUGMENTED not in {experiment}')
    
    try:
        MY_BALANCEDv2 = experiments[experiment]['MY_BALANCEDv2']
    except:
        pass
        # print(f'MY_BALANCEDv2 not in {experiment}')
    
    try:
        AUGMENTEDv2 = experiments[experiment]['AUGMENTEDv2']
    except:
        pass
        # print(f'AUGMENTEDv2 not in {experiment}')
    
    try:
        AUGMENTEDv3 = experiments[experiment]['AUGMENTEDv3']
    except:
        pass
        # print(f'AUGMENTEDv3 not in {experiment}')

    try:
        SMOTE = experiments[experiment]['SMOTE']
    except:
        pass
        # print(f'SMOTE not in {experiment}')

    try:
        SVMSMOTE = experiments[experiment]['SVMSMOTE']
    except:
        pass
        # print(f'SVMSMOTE not in {experiment}')

    try:
        KMEANSSMOTE = experiments[experiment]['KMEANSSMOTE']
    except:
        pass
        # print(f'KMEANSSMOTE not in {experiment}')

    try:
        ADASYNSMOTE = experiments[experiment]['ADASYNSMOTE']
    except:
        pass
        # print(f'ADASYNSMOTE not in {experiment}')

    try:
        BORDERSMOTE = experiments[experiment]['BORDERSMOTE']
    except:
        pass
        # print(f'BORDERSMOTE not in {experiment}')

    # Choose dataset
    DATASET = 'raw'
    FILENAME_ACC = 'BOmetrics/raw_dataset/model_metrics_accuracy_ensemble.json'
    FILENAME_RECALL = 'BOmetrics/raw_dataset/model_metrics_recall_ensemble.json'

    if BALANCED:
        DATASET = 'balanced'
        FILENAME_ACC = 'BOmetrics/balanced_dataset/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/balanced_dataset/model_metrics_recall_ensemble.json'
        if SNV:
            DATASET = 'snv_balanced'
            FILENAME_ACC = 'BOmetrics/balanced_dataset/model_metrics_accuracy_ensemble_snv.json'
            FILENAME_RECALL = 'BOmetrics/balanced_dataset/model_metrics_recall_ensemble_snv.json'
            if FEATURE_SELECT:
                DATASET = 'snv_FS_balanced'
                FILENAME_ACC = 'BOmetrics/balanced_dataset/model_metrics_accuracy_ensemble_snv_FS.json'
                FILENAME_RECALL = 'BOmetrics/balanced_dataset/model_metrics_recall_ensemble_snv_FS.json'
    elif MY_BALANCED:
        DATASET = 'my_balanced'
        FILENAME_ACC = 'BOmetrics/my_balanced_dataset/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/my_balanced_dataset/model_metrics_recall_ensemble.json'
        if FEATURE_SELECT:
            DATASET = 'FS_my_balanced'
            FILENAME_ACC = 'BOmetrics/my_balanced_dataset/model_metrics_accuracy_ensemble_FS.json'
            FILENAME_RECALL = 'BOmetrics/my_balanced_dataset/model_metrics_recall_ensemble_FS.json'
            if SNV:
                DATASET = 'snv_FS_my_balanced'
                FILENAME_ACC = 'BOmetrics/my_balanced_dataset/model_metrics_accuracy_ensemble_snv_FS.json'
                FILENAME_RECALL = 'BOmetrics/my_balanced_dataset/model_metrics_recall_ensemble_snv_FS.json'
        elif FEATURE_SELECTv2:
            DATASET = 'FSv2_my_balanced'
            FILENAME_ACC = 'BOmetrics/my_balanced_dataset/model_metrics_accuracy_ensemble_FSv2.json'
            FILENAME_RECALL = 'BOmetrics/my_balanced_dataset/model_metrics_recall_ensemble_FSv2.json'
        elif SNV:
            DATASET = 'snv_my_balanced'
            FILENAME_ACC = 'BOmetrics/my_balanced_dataset/model_metrics_accuracy_ensemble_snv.json'
            FILENAME_RECALL = 'BOmetrics/my_balanced_dataset/model_metrics_recall_ensemble_snv.json'
    elif AUGMENTED:
        DATASET = 'augmented'
        FILENAME_ACC = 'BOmetrics/augmented_dataset/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/augmented_dataset/model_metrics_recall_ensemble.json'
        if FEATURE_SELECT:
            DATASET = 'augmented_FS'
            FILENAME_ACC = 'BOmetrics/augmented_dataset/model_metrics_accuracy_ensemble_FS.json'
            FILENAME_RECALL = 'BOmetrics/augmented_dataset/model_metrics_recall_ensemble_FS.json'
    elif AUGMENTEDv2:
        DATASET = 'augmentedv2'
        FILENAME_ACC = 'BOmetrics/augmentedv2_dataset/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/augmentedv2_dataset/model_metrics_recall_ensemble.json'
        if FEATURE_SELECT:
            DATASET = 'augmentedv2_FS'
            FILENAME_ACC = 'BOmetrics/augmentedv2_dataset/model_metrics_accuracy_ensemble_FS.json'
            FILENAME_RECALL = 'BOmetrics/augmentedv2_dataset/model_metrics_recall_ensemble_FS.json'
    elif AUGMENTEDv3:
        DATASET = 'augmentedv3'
        FILENAME_ACC = 'BOmetrics/augmentedv3_dataset/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/augmentedv3_dataset/model_metrics_recall_ensemble.json'
        if SNV:
            DATASET = 'snv_augmentedv3'
            FILENAME_ACC = 'BOmetrics/augmentedv3_dataset/model_metrics_accuracy_ensemble_snv.json'
            FILENAME_RECALL = 'BOmetrics/augmentedv3_dataset/model_metrics_recall_ensemble_snv.json'
        if FEATURE_SELECT:
            DATASET = 'augmentedv3_FS'
            FILENAME_ACC = 'BOmetrics/augmentedv3_dataset/model_metrics_accuracy_ensemble_FS.json'
            FILENAME_RECALL = 'BOmetrics/augmentedv3_dataset/model_metrics_recall_ensemble_FS.json'
    elif SMOTE:
        DATASET = 'smote'
        FILENAME_ACC = 'BOmetrics/smote/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/smote/model_metrics_recall_ensemble.json'
    elif SVMSMOTE:
        DATASET = 'svmsmote'
        FILENAME_ACC = 'BOmetrics/svmsmote/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/svmsmote/model_metrics_recall_ensemble.json'
        if FEATURE_SELECT:
            DATASET = 'FS_svmsmote'
            FILENAME_ACC = 'BOmetrics/svmsmote/model_metrics_accuracy_ensemble_FS.json'
            FILENAME_RECALL = 'BOmetrics/svmsmote/model_metrics_recall_ensemble_FS.json'
            if SNV:
                DATASET = 'snv_FS_svmsmote'
                FILENAME_ACC = 'BOmetrics/svmsmote/model_metrics_accuracy_ensemble_snv_FS.json'
                FILENAME_RECALL = 'BOmetrics/svmsmote/model_metrics_recall_ensemble_snv_FS.json'
        if SNV:
                DATASET = 'snv_svmsmote'
                FILENAME_ACC = 'BOmetrics/svmsmote/model_metrics_accuracy_ensemble_FS.json'
                FILENAME_RECALL = 'BOmetrics/svmsmote/model_metrics_recall_ensemble_FS.json'
    elif KMEANSSMOTE:
        DATASET = 'kmeanssmote'
        FILENAME_ACC = 'BOmetrics/kmeanssmote/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/kmeanssmote/model_metrics_recall_ensemble.json'
    elif ADASYNSMOTE:
        DATASET = 'adasynsmote'
        FILENAME_ACC = 'BOmetrics/adasynsmote/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/adasynsmote/model_metrics_recall_ensemble.json'
    elif BORDERSMOTE:
        DATASET = 'bordersmote'
        FILENAME_ACC = 'BOmetrics/bordersmote/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/bordersmote/model_metrics_recall_ensemble.json'
    elif MY_BALANCEDv2:
        DATASET = 'my_balancedv2'
        FILENAME_ACC = 'BOmetrics/my_balancedv2_dataset/model_metrics_accuracy_ensemble.json'
        FILENAME_RECALL = 'BOmetrics/my_balancedv2_dataset/model_metrics_recall_ensemble.json'
    elif SNV:
        DATASET = 'snv_raw'
        FILENAME_ACC = 'BOmetrics/raw_dataset/model_metrics_accuracy_snv.json'
        FILENAME_RECALL = 'BOmetrics/raw_dataset/model_metrics_recall_snv.json'
    elif FEATURE_SELECT:
        DATASET = 'feature_select'
        FILENAME_ACC = 'BOmetrics/selected_features/model_metrics_accuracy.json'
        FILENAME_RECALL = 'BOmetrics/selected_features/model_metrics_recall.json'
    elif FEATURE_SELECTv2:
        DATASET = 'feature_selectv2'
        FILENAME_ACC = 'BOmetrics/selected_features/model_metrics_accuracy2.json'
        FILENAME_RECALL = 'BOmetrics/selected_features/model_metrics_recall2.json'

    # choose dataset and set x_train, x_test, y_train, y_test
    if MY_BALANCED:
        print('my_balanced')
        # x_train
        training_data = pd.read_csv('../data/balanced_data/train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/balanced_data/train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/balanced_data/test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/balanced_data/test_label.csv', header = None)
    elif BALANCED:
        print('balanced')
        # x_train
        training_data = pd.read_csv('../data/original_data/balanced_train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/original_data/balanced_train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/balanced_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/balanced_test_label.csv', header = None)
    elif AUGMENTED:
        print('augmented')
        # x_train
        training_data = pd.read_csv('../data/augmented_data/train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/augmented_data/train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)
    elif AUGMENTEDv2:
        print('augmentedv2')
        # x_train
        training_data = pd.read_csv('../data/augmented_datav2/train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/augmented_datav2/train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)
    elif AUGMENTEDv3:
        print('augmentedv3')
        # x_train
        training_data = pd.read_csv('../data/augmented_datav3/train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/augmented_datav3/train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)
    elif MY_BALANCEDv2:
        print('my_balancedv2')
        # x_train
        training_data = pd.read_csv('../data/balancedv2_data/train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/balancedv2_data/train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)
    elif SMOTE:
        print('smote')
        # x_train
        training_data = pd.read_csv('../data/SMOTE/train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/SMOTE/train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)
    elif SVMSMOTE:
        print('svmsmote')
        # x_train
        training_data = pd.read_csv('../data/SMOTE/svm/train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/SMOTE/svm/train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)
    elif KMEANSSMOTE:
        print('kmeanssmote')
        # x_train
        training_data = pd.read_csv('../data/SMOTE/kmeans/train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/SMOTE/kmeans/train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)
    elif ADASYNSMOTE:
        print('adasynsmote')
        # x_train
        training_data = pd.read_csv('../data/SMOTE/adasyn/train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/SMOTE/adasyn/train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)
    elif BORDERSMOTE:
        print('bordersmote')
        # x_train
        training_data = pd.read_csv('../data/SMOTE/border/train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/SMOTE/border/train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)
    else:
        print('raw')
        # x_train
        training_data = pd.read_csv('../data/original_data/noExclusion_train_data.csv', header = None)
        # y_train
        training_labels = pd.read_csv('../data/original_data/noExclusion_train_label.csv', header = None)
        # x_test
        testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
        # y_test
        testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)

    # print(f"training labels vc: \n{training_labels.value_counts()}, \ntesting labels vc: \n{testing_labels.value_counts()}")
    # print(len(training_data), len(training_labels), len(testing_data), len(testing_labels))

    # type cast labels to ints
    training_labels[0] = training_labels[0].astype(int)
    # testing_labels
    testing_labels[0] = testing_labels[0].astype(int)

    # encode labels, using sklearn, to pass to xgboost
    # this code was inspired by the snippet from:
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
    le = LabelEncoder()
    # fit the classes to the encoder and transform labels to be 0,1,2
    training_labels = le.fit_transform(training_labels[0].to_list())
    testing_labels = le.fit_transform(testing_labels[0].to_list())

    # bayessearch results
    gs_results = {DATASET:{}}
    # print(np.unique(training_labels))

    # FEATURE SELECT
    # RUN THIS TO APPLY FEATURE SELECTION TO TRAINING DATA
    if FEATURE_SELECT == True:
        ADD_POSSIBLE_FIGURES = True

        # figures contain features from (figure_num*4)+1 
        selected_figures = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,
                            21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,
                            39,40,41,42,43,44,45,46,47,48,49,50,51,52]

        # figures that MAY be decent - noisy but different peaks
        possible_figures = [53,54,56,57,58,59,60,61,62,63,64,65]

        # decent features - eyeballed

        # generate set of selected features
        selected_features = []

        for figure_num in selected_figures:
            for i in range(0,4):
                # print(f"figure: {figure_num}, feature: feature_{(figure_num*4)+i}")
                selected_features.append((figure_num*4)+i)

        # to add possible features
        if ADD_POSSIBLE_FIGURES == True:
            for figure_num in possible_figures:
                for i in range(0,4):
                    # print(f"figure: {figure_num}, feature: feature_{(figure_num*4)+i}")
                    selected_features.append((figure_num*4)+i)
                    
        training_data = training_data[selected_features]
        testing_data = testing_data[selected_features]
    else:
        print('no feature selection')

    # FSv2
    if FEATURE_SELECTv2:    
        # selected features round 2
        selected_features = []
        a = np.arange(30,85).tolist()
        b = np.arange(203,235).tolist()

        selected_features = np.concatenate([a,b]).tolist()
        print(selected_features)
        # (Cena, 2018)
        training_data = training_data[selected_features]
        testing_data = testing_data[selected_features]
    else:
        print('no FSv2')

    # SNV
    # apply SNV to training data - inspired by code from my ML CW
    # (Hamzah Hafejee, 2022, COMP3611_Coursework_Assessment.ipynb, Comp 3611, University of Leeds)
    # (Sklearn, 2023)
    if SNV == True:
        print(len(training_data))
        # fit to training data
        scaler = StandardScaler().fit(training_data)
        training_data = scaler.transform(training_data)
        testing_data = scaler.transform(testing_data)
        print("After: \n", len(training_data))
        len(training_data)
    else:
        print('no SNV standardisation')

    #### train the models ####

    # CART
    # bayes search experiment (Skopt, 2017)
    clf_cart = tree.DecisionTreeClassifier(criterion="gini", random_state=1)
    # find optimal parameter values for CART
    NoneList = [None]
    NumList = range(1,40)
    for num in NumList:
        NoneList.append(num)

    params = {
        'max_depth': Categorical(NoneList), # control overfitting,
        'max_features': Categorical([None, 'sqrt', 'log2']) # performance 
    }
    
    bayes_search = BayesSearchCV(clf_cart, params, scoring='accuracy', cv=5, n_iter=100, random_state=1)
    _ = bayes_search.fit(training_data, np.ravel(training_labels))
    best_params = bayes_search.best_params_
    best_score = bayes_search.best_score_
    print(f"params = {best_params} \nbest_score: {best_score}")
    best_cart = bayes_search.best_estimator_
    gs_results[DATASET]['CART'] = {'accuracy':best_score, 'params':best_params}

    # make Gaussian Naive Bayes classifier
    clf_nb = GaussianNB()
    params = {
        # use log-uniform for quicker convergence (Lewinson, 2022)
        'var_smoothing':Real(1e-20, 0.1, prior='log-uniform') # from less smoothing to more aggressive smoothing
    }
    bayes_search = BayesSearchCV(clf_nb, params, scoring='accuracy', cv=5, n_iter=100, random_state=1)
    _ = bayes_search.fit(training_data, np.ravel(training_labels))
    best_params = bayes_search.best_params_
    best_score = bayes_search.best_score_
    print(f"params = {best_params} \nbest_score: {best_score}")
    best_nb = bayes_search.best_estimator_
    gs_results[DATASET]['GNB'] = {'accuracy':best_score, 'params':best_params}

    # make k-Nearest Neighbours classifier
    clf_knn = KNeighborsClassifier(n_jobs=-1) # use all processes for parellelisation
    params = {
        'n_neighbors': Integer(1,15)
    }
    bayes_search = BayesSearchCV(clf_knn, params, scoring='accuracy', cv=5, n_iter=100, random_state=1)
    _ = bayes_search.fit(training_data, np.ravel(training_labels))
    best_params = bayes_search.best_params_
    best_score = bayes_search.best_score_
    print(f"params = {best_params} \nbest_score: {best_score}")
    best_knn = bayes_search.best_estimator_
    gs_results[DATASET]['kNN'] = {'accuracy':best_score, 'params':best_params}

    # make SVM-RBF classifier
    clf_svmrbf = SVC(kernel='rbf', random_state=1)
    params = {
        'C': Real(0.1,5000, prior='log-uniform'), # high to low regularisation strength
        'gamma' : Categorical(['scale', 'auto']), # need to research this parameter more
    }

    bayes_search = BayesSearchCV(clf_svmrbf, params, scoring='accuracy', cv=5, n_iter=100, random_state=1)
    _ = bayes_search.fit(training_data, np.ravel(training_labels))
    best_params = bayes_search.best_params_
    best_score = bayes_search.best_score_
    print(f"params = {best_params} \nbest_score: {best_score}")
    best_svmrbf = bayes_search.best_estimator_
    gs_results[DATASET]['SVM-RBF'] = {'accuracy':best_score, 'params':best_params}

    # make SVM linear classifier
    clf_lin = SVC(kernel='linear', random_state=1)
    params = {
        # the different SVM kernels have different C values as I adjusetd it after seeing
        # how previous experiments went and where they most commonly performed best
        'C': Real(0.05, 1000, prior='log-uniform'), # high to low regularisation strength
        'gamma' : Categorical(['scale', 'auto']), # need to research this parameter more
        # 'gamma' : [], # need to research this parameter more - remember i started off 
        # testing each param separately to lower range of values for each 
    }

    bayes_search = BayesSearchCV(clf_lin, params, scoring='accuracy', cv=5, n_iter=100, random_state=1)
    _ = bayes_search.fit(training_data, np.ravel(training_labels))
    best_params = bayes_search.best_params_
    best_score = bayes_search.best_score_
    print(f"params = {best_params} \nbest_score: {best_score}")
    best_svmlin = bayes_search.best_estimator_
    gs_results[DATASET]['SVM-Lin'] = {'accuracy':best_score, 'params':best_params}

    # make svm sigmoidal classifier
    clf_sig = SVC(kernel='sigmoid', random_state=1)
    params = {
        'C': Real(0.00001, 100, prior='log-uniform'), # high to low regularisation strength
        'gamma' : Categorical(['scale', 'auto']), # need to research this parameter more
        # 'gamma' : [], # need to research this parameter more
    }
        
    bayes_search = BayesSearchCV(clf_sig, params, scoring='accuracy', cv=5, n_iter=100, random_state=1)
    _ = bayes_search.fit(training_data, np.ravel(training_labels))
    best_params = bayes_search.best_params_
    best_score = bayes_search.best_score_
    print(f"params = {best_params} \nbest_score: {best_score}")
    best_svmsig = bayes_search.best_estimator_
    gs_results[DATASET]['SVM-Sig'] = {'accuracy':best_score, 'params':best_params}

    # make xgboost classifier (Piotr Płoński, 2021)
    clf_xgb = xgb.XGBClassifier(random_state = 1)

    params = {
        'n_estimators': Integer(10, 1000, prior='log-uniform'), # no. boosting rounds
        'max_depth': Integer(1,30) # control overfitting
    }

    bayes_search = BayesSearchCV(clf_xgb, params, scoring='accuracy', cv=5, n_iter=100, random_state=1)
    _ = bayes_search.fit(training_data, np.ravel(training_labels))
    best_params = bayes_search.best_params_
    best_score = bayes_search.best_score_
    print(f"params = {best_params} \nbest_score: {best_score}")
    best_xgb = bayes_search.best_estimator_
    gs_results[DATASET]['XGB'] = {'accuracy':best_score, 'params':best_params}

    # make adaboost classifier
    clf_ada = AdaBoostClassifier(random_state=1)
    params = {
        'n_estimators': Integer(10,1000,prior='log-uniform'),
        'learning_rate': Real(0.001, 10, prior='log-uniform') # weight applied to each clf at each boosting iteration
    }

    bayes_search = BayesSearchCV(clf_ada, params, scoring='accuracy', cv=5, n_iter=100, random_state=1)
    _ = bayes_search.fit(training_data, np.ravel(training_labels))
    best_params = bayes_search.best_params_
    best_score = bayes_search.best_score_
    print(f"params = {best_params} \nbest_score: {best_score}")
    best_ada = bayes_search.best_estimator_
    gs_results[DATASET]['ADA'] = {'accuracy':best_score, 'params':best_params}

    # # make Logistic Regressor
    # clf_lr = LogisticRegression(random_state=1, max_iter=1000)
    # params = {
    #     'penalty': ['l1', 'l2'], # type of regularisation 
    #     'C': [0.1, 1, 10, 100], # regularisation strength
    #     'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'] # approach to finding best weights
    # }

    # # print(DATASET)

    # if DATASET == 'raw':
    #     # raw dataset
    #     params = {
    #         'penalty': ['l2'], # type of regularisation 
    #         'C': [0.1], # regularisation strength
    #         'solver': ['lbfgs'] # approach to finding best weights
    #     }
    # elif DATASET == 'feature_select':
    #     # selected + possible features
    #     params = {
    #         'penalty': ['l2'], # type of regularisation 
    #         'C': [0.1], # regularisation strength
    #         'solver': ['newton-cg'] # approach to finding best weights
    #     }
    # elif DATASET == 'feature_selectv2':
    #     # FSv2
    #     params = {'C': [1], 'penalty': ['l2'], 'solver': ['lbfgs']} 
    # elif DATASET == 'snv_raw':
    #     # SNV + raw
    #     params = {'C': [0.1], 'penalty': ['l2'], 'solver': ['lbfgs']} 
    # elif DATASET == 'balanced':
    #     params = {'C': [100], 'penalty': ['l2'], 'solver': ['saga']} 
    # elif DATASET == 'snv_balanced':
    #     params = {'C': [0.1], 'penalty': ['l2'], 'solver': ['saga']} 

    # elif DATASET == 'my_balanced':
    #     params = {'C': [10], 'penalty': ['l2'], 'solver': ['saga']} 
        
    # elif DATASET == 'FS_my_balanced':
    #     params = {'C': [10], 'penalty': ['l2'], 'solver': ['saga']} 
        
    # elif DATASET == 'FSv2_my_balanced':
    #     params = {'C': [10], 'penalty': ['l2'], 'solver': ['saga']} 
        
    # elif DATASET == 'snv_my_balanced':
    #     params = {'C': [1], 'penalty': ['l1'], 'solver': ['saga']} 
        
    # elif DATASET == 'snv_FS_my_balanced':
    #     params = {'C': [1], 'penalty': ['l2'], 'solver': ['saga']} 
        
    # elif DATASET == 'augmented':
    #     params = {'C': [10], 'penalty': ['l2'], 'solver': ['lbfgs']} 
        
    # elif DATASET == 'augmented_FS':
    #     params = {'C': [100], 'penalty': ['l1'], 'solver': ['liblinear']} 

    # elif DATASET == 'smote':
    #     params = {'C': [10], 'penalty': ['l2'], 'solver': ['saga']} 

    # elif DATASET == 'augmentedv3':
    #     params = {'C': [100], 'penalty': ['l1'], 'solver': ['liblinear']} 
        
    # elif DATASET == 'snv_augmentedv3':
    #     params = {'C': [10], 'penalty': ['l2'], 'solver': ['lbfgs']} 
        
    # elif DATASET == 'snv_FS_my_balanced':
    #     params = {'C': [10], 'penalty': ['l1'], 'solver': ['saga']} 

    # elif DATASET == 'augmentedv3_FS':
    #     params = {'C': [100], 'penalty': ['l1'], 'solver': ['liblinear']} 
        
    # elif DATASET == 'svmsmote':
    #     params = {'C': [100], 'penalty': ['l2'], 'solver': ['lbfgs']} 
        
    # elif DATASET == 'kmeanssmote':
    #     params = {'C': [10], 'penalty': ['l2'], 'solver': ['lbfgs']} 
        
    # elif DATASET == 'adasynsmote':
    #     params = {'C': [100], 'penalty': ['l1'], 'solver': ['liblinear']} 
        
    # elif DATASET == 'bordersmote':
    #     params = {'C': [100], 'penalty': ['l1'], 'solver': ['liblinear']} 
        
    # elif DATASET == 'snv_svmsmote':
    #     params = {'C': [100], 'penalty': ['l2'], 'solver': ['lbfgs']} 
        
    # elif DATASET == 'snv_FS_svmsmote':
    #     params = {'C': [100], 'penalty': ['l2'], 'solver': ['lbfgs']} 
        
    # grid_search = GridSearchCV(clf_lr, params, scoring='accuracy', cv=10)
    # grid_search.fit(training_data, np.ravel(training_labels))
    # best_params = grid_search.best_params_
    # best_score = grid_search.best_score_
    # print(f"params = {best_params} \nbest_score: {best_score}")
    # gs_results['LR'] = {'accuracy':best_score, 'params':best_params}
    # best_lr = bayes_search.best_estimator_

    
    # make Random Forest classifier
    clf_rf = RandomForestClassifier(random_state=1)
    NoneList = [None]
    NumList = range(1,50)
    for num in NumList:
        NoneList.append(num)
        
    params = {
    'n_estimators': Integer(10,300,prior='log-uniform'),
    'max_depth': Categorical(NoneList),
    "max_features" : Categorical(NoneList)
    }

    bayes_search = BayesSearchCV(clf_rf, params, scoring='accuracy', cv=5, n_iter=100, random_state=1)
    _ = bayes_search.fit(training_data, np.ravel(training_labels))
    best_params = bayes_search.best_params_
    best_score = bayes_search.best_score_
    print(f"params = {best_params} \nbest_score: {best_score}")
    best_rf = bayes_search.best_estimator_
    gs_results[DATASET]['RF'] = {'accuracy':best_score, 'params':best_params}


    # ensemble model (Sklearn, 2014), (Sklearn, 2023)
    ensemble = VotingClassifier(estimators=[
        ('rf', best_rf), 
        ('knn', best_knn), 
        ('xgb', best_xgb), 
        ('svmrbf', best_svmrbf), 
        ('nb', best_nb)],
        voting='hard',
        n_jobs=-1)

    ensemble.fit(training_data, training_labels)

    accuracy = ensemble.score(testing_data, testing_labels)
    predictions = ensemble.transform(testing_data)
    gs_results[DATASET]['Ensemble'] = {'accuracy':accuracy}


    # sorted GS models
    print(gs_results)
    # print params to file
    with open(f'BOmetrics/params/{DATASET}', 'a+') as fp:
        json.dump(gs_results, fp)
        fp.close()

    gs_sorted_models = dict(sorted(gs_results[DATASET].items(), key=lambda item: item[1]['accuracy'], reverse=True))
    print(gs_sorted_models.keys())



    # evaluate models (Sklearn, 2023)
    # model_metrics = {'accuracy', 'recall', 'precision', 'F1-score', 'ROC-AUC'}
    model_metrics = {}

    # all of the models
    models = [best_cart, best_rf, best_nb, best_knn, best_svmrbf, best_svmlin, best_svmsig, best_xgb, best_ada, ensemble]
    model_names = ['CART', 'RF', 'GNB', 'kNN', 'SVM-RBF', 'SVM-Lin', 'SVM-Sig', 'XGB', 'ADA', 'Ensemble']
    i=0
    for model in models:
        # train on test set
        predicted = model.predict(testing_data)
        # generate cm against test labels
        cm = confusion_matrix(testing_labels, predicted)
        # print(cm)
        accuracy = accuracy_score(testing_labels, predicted)
        recall = recall_score(testing_labels, predicted, average=None)
        precision = precision_score(testing_labels, predicted, average=None)
        f1 = f1_score(testing_labels, predicted, average=None)

        try:
            predicted_prob = model.predict_proba(testing_data)
            roc = roc_auc_score(testing_labels, predicted_prob, average=None, multi_class='ovr') 
            # print(accuracy, recall, precision, f1, roc)
            model_metrics[model_names[i]] = {
                                                'accuracy':accuracy, 
                                                'recall':{
                                                    1:recall[0], 
                                                    2:recall[1], 
                                                    3:recall[2]
                                                },
                                                'precision':{
                                                    1:precision[0], 
                                                    2:precision[1], 
                                                    3:precision[2]
                                                },
                                                'f1_score':{
                                                    1:f1[0], 
                                                    2:f1[1], 
                                                    3:f1[2]
                                                },
                                                'ROC-AUC':{
                                                    1:roc[0], 
                                                    2:roc[1], 
                                                    3:roc[2]
                                                }
            }
        except:
            # print(f"can't predict class probilities for {model_names[i]}")
            # print(accuracy, recall, precision, f1)
            model_metrics[model_names[i]] = {
                                                'accuracy':accuracy, 
                                                'recall':{
                                                    1:recall[0], 
                                                    2:recall[1], 
                                                    3:recall[2]
                                                },
                                                'precision':{
                                                    1:precision[0], 
                                                    2:precision[1], 
                                                    3:precision[2]
                                                },
                                                'f1_score':{
                                                    1:f1[0], 
                                                    2:f1[1], 
                                                    3:f1[2]
                                                }
            }
        i+=1

    # (Gern Blanston, 2009)- sort by neoplasia recall
    sorted_metrics = dict(sorted(model_metrics.items(), key=lambda item: item[1]['recall'][3], reverse=True))
    # (holys, 2013)
    with open(FILENAME_RECALL, 'w+') as fp:
        json.dump(sorted_metrics, fp)
        fp.close()

    # redo but sort by accuracy
    # (Gern Blanston, 2009)
    sorted_metrics_acc = dict(sorted(model_metrics.items(), key=lambda item: item[1]['accuracy'], reverse=True))
    # (holys, 2013)
    with open(FILENAME_ACC, 'w+') as fp:
        json.dump(sorted_metrics_acc, fp)
        fp.close()



    # (Neekhara, 2019)
    # DATASET = 'snv_svmsmote'
    # function to add to JSON
    def write_json(new_data, ds, filename='BOmetrics/scoreboard.json'):
        with open(filename,'r+') as file:
            # First we load existing data into a dict.
            file_data = json.load(file)
            # Join new_data with file_data inside emp_details
            file_data[ds] = (new_data)
            file_data = dict(sorted(file_data.items(), key=lambda item: item[1]['all']['accuracy'], reverse=True))
            # Sets file's current position at offset.
            file.seek(0)
            # convert back to json.
            json.dump(file_data, file, indent = 4)
            file.close()

    # calculate avg accuracy and recall
    accuracy = 0
    recall = 0
    count = 0
    for model in model_names:
        # print(model_metrics[model]['accuracy'])
        accuracy += model_metrics[model]['accuracy']
        recall += model_metrics[model]['recall'][3]
        count +=1

    t6acc = 0
    t6rec = 0
    count2 = 0
    for key in sorted_metrics_acc:
        if count2 ==6:
            break
        t6acc += sorted_metrics_acc[key]['accuracy']
        t6rec += sorted_metrics_acc[key]['recall'][3]
        count2 +=1

    avg = {'all' : {'accuracy': accuracy/count, 'neoplasia recall': recall/count},
        'top6' : {'accuracy': t6acc/count2, 'recall':t6rec/count2}}
    # print(accuracy, recall, count)
    # print(t6acc, t6rec, count2)
    if DATASET != 'test':
        write_json(avg, ds=DATASET)
    else:
        print(DATASET)


svmsmote
no FSv2
771
After: 
 771




params = OrderedDict([('max_depth', 9), ('max_features', 'log2')]) 
best_score: 0.839204021784667




params = OrderedDict([('var_smoothing', 0.04266297483349839)]) 
best_score: 0.6783493925429409




params = OrderedDict([('n_neighbors', 1)]) 
best_score: 0.8885043988269794
params = OrderedDict([('C', 3520.1620340718778), ('gamma', 'auto')]) 
best_score: 0.907993297025555
params = OrderedDict([('C', 867.4830574123015), ('gamma', 'auto')]) 
best_score: 0.8068789275240889
params = OrderedDict([('C', 0.060516359494983865), ('gamma', 'scale')]) 
best_score: 0.6459572685379136




params = OrderedDict([('max_depth', 9), ('n_estimators', 21)]) 
best_score: 0.8625974025974026




params = OrderedDict([('learning_rate', 0.0011271706792373793), ('n_estimators', 1000)]) 
best_score: 0.6822454964390448




params = OrderedDict([('max_depth', 46), ('max_features', 8), ('n_estimators', 300)]) 
best_score: 0.8729451193967324
{'snv_svmsmote': {'CART': {'accuracy': 0.839204021784667, 'params': OrderedDict([('max_depth', 9), ('max_features', 'log2')])}, 'GNB': {'accuracy': 0.6783493925429409, 'params': OrderedDict([('var_smoothing', 0.04266297483349839)])}, 'kNN': {'accuracy': 0.8885043988269794, 'params': OrderedDict([('n_neighbors', 1)])}, 'SVM-RBF': {'accuracy': 0.907993297025555, 'params': OrderedDict([('C', 3520.1620340718778), ('gamma', 'auto')])}, 'SVM-Lin': {'accuracy': 0.8068789275240889, 'params': OrderedDict([('C', 867.4830574123015), ('gamma', 'auto')])}, 'SVM-Sig': {'accuracy': 0.6459572685379136, 'params': OrderedDict([('C', 0.060516359494983865), ('gamma', 'scale')])}, 'XGB': {'accuracy': 0.8625974025974026, 'params': OrderedDict([('max_depth', 9), ('n_estimators', 21)])}, 'ADA': {'accuracy': 0.6822454964390448, 'params': OrderedDict([('learning_rate', 0.0011271706792373793), ('



params = OrderedDict([('max_depth', 10), ('max_features', 'log2')]) 
best_score: 0.8696774193548388




params = OrderedDict([('var_smoothing', 0.0002236267099962189)]) 
best_score: 0.7238709677419355




params = OrderedDict([('n_neighbors', 1)]) 
best_score: 0.9083870967741936




params = OrderedDict([('C', 3108.199200662262), ('gamma', 'scale')]) 
best_score: 0.9058064516129033
params = OrderedDict([('C', 16.13326760979134), ('gamma', 'auto')]) 
best_score: 0.8103225806451613




params = OrderedDict([('C', 5.5525133262431385), ('gamma', 'auto')]) 
best_score: 0.6812903225806451




params = OrderedDict([('max_depth', 10), ('n_estimators', 109)]) 
best_score: 0.8967741935483872
params = OrderedDict([('learning_rate', 0.006901967393669901), ('n_estimators', 95)]) 
best_score: 0.8077419354838711




params = OrderedDict([('max_depth', 49), ('max_features', 4), ('n_estimators', 300)]) 
best_score: 0.9083870967741936
{'kmeanssmote': {'CART': {'accuracy': 0.8696774193548388, 'params': OrderedDict([('max_depth', 10), ('max_features', 'log2')])}, 'GNB': {'accuracy': 0.7238709677419355, 'params': OrderedDict([('var_smoothing', 0.0002236267099962189)])}, 'kNN': {'accuracy': 0.9083870967741936, 'params': OrderedDict([('n_neighbors', 1)])}, 'SVM-RBF': {'accuracy': 0.9058064516129033, 'params': OrderedDict([('C', 3108.199200662262), ('gamma', 'scale')])}, 'SVM-Lin': {'accuracy': 0.8103225806451613, 'params': OrderedDict([('C', 16.13326760979134), ('gamma', 'auto')])}, 'SVM-Sig': {'accuracy': 0.6812903225806451, 'params': OrderedDict([('C', 5.5525133262431385), ('gamma', 'auto')])}, 'XGB': {'accuracy': 0.8967741935483872, 'params': OrderedDict([('max_depth', 10), ('n_estimators', 109)])}, 'ADA': {'accuracy': 0.8077419354838711, 'params': OrderedDict([('learning_rate', 0.006901967393669901), 



params = OrderedDict([('max_depth', 4), ('max_features', None)]) 
best_score: 0.792




params = OrderedDict([('var_smoothing', 2.2400352698102787e-07)]) 
best_score: 0.6599999999999999




params = OrderedDict([('n_neighbors', 1)]) 
best_score: 0.8266666666666665




params = OrderedDict([('C', 4999.999999999999), ('gamma', 'scale')]) 
best_score: 0.8813333333333333




params = OrderedDict([('C', 1000.0), ('gamma', 'scale')]) 
best_score: 0.8093333333333333




params = OrderedDict([('C', 0.0016331202762812677), ('gamma', 'scale')]) 
best_score: 0.4746666666666666




params = OrderedDict([('max_depth', 5), ('n_estimators', 46)]) 
best_score: 0.8266666666666665




params = OrderedDict([('learning_rate', 0.10224582835490328), ('n_estimators', 18)]) 
best_score: 0.72
params = OrderedDict([('max_depth', 41), ('max_features', 6), ('n_estimators', 79)]) 
best_score: 0.8426666666666668
{'augmentedv3_FS': {'CART': {'accuracy': 0.792, 'params': OrderedDict([('max_depth', 4), ('max_features', None)])}, 'GNB': {'accuracy': 0.6599999999999999, 'params': OrderedDict([('var_smoothing', 2.2400352698102787e-07)])}, 'kNN': {'accuracy': 0.8266666666666665, 'params': OrderedDict([('n_neighbors', 1)])}, 'SVM-RBF': {'accuracy': 0.8813333333333333, 'params': OrderedDict([('C', 4999.999999999999), ('gamma', 'scale')])}, 'SVM-Lin': {'accuracy': 0.8093333333333333, 'params': OrderedDict([('C', 1000.0), ('gamma', 'scale')])}, 'SVM-Sig': {'accuracy': 0.4746666666666666, 'params': OrderedDict([('C', 0.0016331202762812677), ('gamma', 'scale')])}, 'XGB': {'accuracy': 0.8266666666666665, 'params': OrderedDict([('max_depth', 5), ('n_estimators', 46)])}, 'ADA': {'accuracy': 0.

  _warn_prf(average, modifier, msg_start, len(result))


bordersmote
no feature selection
no FSv2
no SNV standardisation




params = OrderedDict([('max_depth', 9), ('max_features', 'sqrt')]) 
best_score: 0.8457059069962295




params = OrderedDict([('var_smoothing', 0.0032573414545298976)]) 
best_score: 0.6563049853372434




params = OrderedDict([('n_neighbors', 1)]) 
best_score: 0.9001591956430666




params = OrderedDict([('C', 4999.999999999999), ('gamma', 'scale')]) 
best_score: 0.907984918307499
params = OrderedDict([('C', 293.5206434095224), ('gamma', 'scale')]) 
best_score: 0.8055048177628823




params = OrderedDict([('C', 0.11938924466862906), ('gamma', 'auto')]) 
best_score: 0.45378299120234605




params = OrderedDict([('max_depth', 17), ('n_estimators', 56)]) 
best_score: 0.9014914118139924
params = OrderedDict([('learning_rate', 1.8277468780511987), ('n_estimators', 1000)]) 
best_score: 0.7354168412232929
params = OrderedDict([('max_depth', 32), ('max_features', 11), ('n_estimators', 300)]) 
best_score: 0.9092668621700879
{'bordersmote': {'CART': {'accuracy': 0.8457059069962295, 'params': OrderedDict([('max_depth', 9), ('max_features', 'sqrt')])}, 'GNB': {'accuracy': 0.6563049853372434, 'params': OrderedDict([('var_smoothing', 0.0032573414545298976)])}, 'kNN': {'accuracy': 0.9001591956430666, 'params': OrderedDict([('n_neighbors', 1)])}, 'SVM-RBF': {'accuracy': 0.907984918307499, 'params': OrderedDict([('C', 4999.999999999999), ('gamma', 'scale')])}, 'SVM-Lin': {'accuracy': 0.8055048177628823, 'params': OrderedDict([('C', 293.5206434095224), ('gamma', 'scale')])}, 'SVM-Sig': {'accuracy': 0.45378299120234605, 'params': OrderedDict([('C', 0.11938924466862906), ('gamma', 'auto')]



params = OrderedDict([('var_smoothing', 2.2400352698102787e-07)]) 
best_score: 0.6759999999999999




params = OrderedDict([('n_neighbors', 4)]) 
best_score: 0.8306666666666667
params = OrderedDict([('C', 183.6149954203013), ('gamma', 'scale')]) 
best_score: 0.8813333333333334


In [18]:
# TO USE AVG RECALL AS METRIC FOR GS
# (gunes, 2019)
gs_recall = make_scorer(recall_score, average='macro')
DATASET

'snv_svmsmote'

In [59]:
# print highest acc models from gridsearch
print(f"gs_sorted_models (acc): \n{gs_sorted_models.keys()}\n")

# highest acc models from test set
print(f"sorted models (acc): \n{sorted_metrics_acc.keys()}\n")

# highest recall from test set
print(f"sorted models (recall): \n{sorted_metrics.keys()}")


gs_sorted_models (acc): 
dict_keys(['SVM-RBF', 'RF', 'XGB', 'kNN', 'Ensemble', 'CART', 'SVM-Lin', 'LR', 'ADA', 'SVM-Sig', 'GNB'])

sorted models (acc): 
dict_keys(['SVM-RBF', 'Ensemble', 'XGB', 'kNN', 'CART', 'RF', 'LR', 'SVM-Lin', 'GNB', 'ADA', 'SVM-Sig'])

sorted models (recall): 
dict_keys(['GNB', 'SVM-RBF', 'CART', 'kNN', 'Ensemble', 'XGB', 'LR', 'RF', 'SVM-Lin', 'SVM-Sig', 'ADA'])


In [60]:
DATASET

'feature_selectv2'

In [14]:

avg



{'all': {'accuracy': 0.737062937062937,
  'neoplasia recall': 0.6982758620689655},
 'top6': {'accuracy': 0.81002331002331, 'recall': 0.6896551724137931}}

# observations on raw dataset
XGBoost and ADAboost seem to have really overfit, because they severely underperform on unseen test data, compared to the accuracies they were achieving with gridsearch. IGNORE THIS: it is just because the test labels were not normalised!

Although GNB has higher recall for neoplasia than kNN, kNN seems to be the best classifier overall. While GNB has highest recall for neoplasia, has 3rd lowest accuracy.

Top models based on accuracy, from gridsearch, were RF, kNN, XGB, SVM-RBF, CART. Top models based on accuracy, from test set, were RF, kNN, SVM-RBF, CART, SVM-Lin. Therefore, RF, kNN, SVM-RBF, CART seem to perform well, in terms of accuracy, and don't seem to produce drastically different results with the test set, suggesting there isn't much overfitting

gs_sorted_models (acc): 
(['RF', 'kNN', 'XGB', 'SVM-RBF', 'CART', 'LR', 'SVM-Lin', 'ADA', 'SVM-Sig', 'GNB'])

sorted models (acc): 
(['RF', 'XGB', 'kNN', 'SVM-RBF', 'CART', 'SVM-Lin', 'LR', 'SVM-Sig', 'GNB', 'ADA'])

sorted models (recall): 
(['GNB', 'kNN', 'CART', 'RF', 'SVM-Lin', 'XGB', 'LR', 'SVM-RBF', 'SVM-Sig', 'ADA'])

# observations on feature selected dataset
Some models decreased in performance, some increased, with largest increase being 6% increase in accuracy for SVM-Lin model. But overall, not worth, since the max accuracy of any of the models was lower than without feature selection. Maybe better feature selection is needed - an analytical solution rather than eyeball

gs_sorted_models (acc): 
(['RF', 'SVM-RBF', 'kNN', 'XGB', 'CART', 'SVM-Lin', 'LR', 'ADA', 'GNB', 'SVM-Sig'])

sorted models (acc): 
(['RF', 'XGB', 'kNN', 'SVM-RBF', 'CART', 'LR', 'SVM-Lin', 'GNB', 'ADA', 'SVM-Sig'])

sorted models (recall): 
(['GNB', 'SVM-RBF', 'XGB', 'RF', 'kNN', 'SVM-Lin', 'CART', 'LR', 'SVM-Sig', 'ADA'])