In [15]:
import pandas as pd, numpy as np
import os, json
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score, confusion_matrix, accuracy_score, f1_score, precision_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb

In [36]:
# x_train
training_data = pd.read_csv('../data/original_data/noExclusion_train_data.csv', header = None)
# y_train
training_labels = pd.read_csv('../data/original_data/noExclusion_train_label.csv', header = None)
# x_test
testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
# y_test
testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)


In [37]:
# type cast labels to ints
training_labels[0] = training_labels[0].astype(int)
# testing_labels
testing_labels[0] = testing_labels[0].astype(int)

# encode labels, using sklearn, to pass to xgboost
# this code was inspired by the snippet from:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
le = LabelEncoder()
# fit the classes to the encoder and transform labels to be 0,1,2
training_labels = le.fit_transform(training_labels[0].to_list())
testing_labels = le.fit_transform(testing_labels[0].to_list())

# apply SNV to training data - inspired by code from my ML CW
# (Hamzah Hafejee, 2022, COMP3611_Coursework_Assessment.ipynb, Comp 3611, University of Leeds)
# (Anon, 2023b)
print(type(training_data))
# fit to training data
scaler = StandardScaler().fit(training_data)
training_data = scaler.transform(training_data)
testing_data = scaler.transform(testing_data)
print("After: \n", type(training_data))

# gridsearch results
gs_results = {}

<class 'pandas.core.frame.DataFrame'>
After: 
 <class 'numpy.ndarray'>


In [38]:
len(training_data)

572

In [18]:
# RUN THIS TO APPLY FEATURE SELECTION TO TRAINING DATA
ADD_POSSIBLE_FIGURES = True

# figures contain features from (figure_num*4)+1 
selected_figures = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,
                    21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,
                    39,40,41,42,43,44,45,46,47,48,49,50,51,52]

# figures that MAY be decent - noisy but different peaks
possible_figures = [53,54,56,57,58,59,60,61,62,63,64,65]

# decent features - eyeballed

# generate set of selected features
selected_features = []

for figure_num in selected_figures:
    for i in range(0,4):
        # print(f"figure: {figure_num}, feature: feature_{(figure_num*4)+i}")
        selected_features.append((figure_num*4)+i)

# to add possible features
if ADD_POSSIBLE_FIGURES == True:
    for figure_num in possible_figures:
        for i in range(0,4):
            # print(f"figure: {figure_num}, feature: feature_{(figure_num*4)+i}")
            selected_features.append((figure_num*4)+i)

# selected features round 2
selected_features = []
a = np.arange(30,85).tolist()
b = np.arange(203,235).tolist()

selected_features = np.concatenate([a,b]).tolist()
print(selected_features)
# (Cena, 2018)
training_data = training_data[selected_features]
testing_data = testing_data[selected_features]


[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234]


IndexError: index 203 is out of bounds for axis 0 with size 143

In [27]:
# make CART classifier
clf_cart = tree.DecisionTreeClassifier(criterion="gini", random_state=1)
# find optimal parameter values for CART
params = {
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40], # control overfitting,
    'max_features': [None, 'sqrt', 'log2'] # performance 
}

# raw dataset
# params = {
#     'max_depth': [None], # control overfitting,
#     'max_features': ['log2'] # performance 
# }

# selected + possible features
# params = {
#     'max_depth': [5], # control overfitting,
#     'max_features': [None] # performance 
# }

# feature select v2
# params = {'max_depth': [None], 'max_features': ['sqrt']} 

# SNV + raw
# params = {'max_depth': [None], 'max_features': ['sqrt']} 

grid_search = GridSearchCV(clf_cart, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"params = {best_params} \nbest_score: {best_score}")
best_cart = grid_search.best_estimator_
gs_results['CART'] = {'accuracy':best_score, 'params':best_params}

params = {'max_depth': None, 'max_features': 'log2'} 
best_score: 0.806019358741682


In [14]:
# make Logistic Regressor
clf_lr = LogisticRegression(random_state=1)
params = {
    'penalty': ['l1', 'l2'], # type of regularisation 
    'C': [0.1, 1, 10, 100], # regularisation strength
    'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'] # approach to finding best weights
}

# raw dataset
# params = {
#     'penalty': ['l2'], # type of regularisation 
#     'C': [0.1], # regularisation strength
#     'solver': ['lbfgs'] # approach to finding best weights
# }

# selected + possible features
# params = {
#     'penalty': ['l2'], # type of regularisation 
#     'C': [0.1], # regularisation strength
#     'solver': ['newton-cg'] # approach to finding best weights
# }

# FSv2
# params = {'C': [1], 'penalty': ['l2'], 'solver': ['lbfgs']} 

# SNV + raw

grid_search = GridSearchCV(clf_lr, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"params = {best_params} \nbest_score: {best_score}")
gs_results['LR'] = {'accuracy':best_score, 'params':best_params}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

params = {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'} 
best_score: 0.7570780399274046


80 fits failed out of a total of 320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packa

In [19]:
print(f"params = {best_params} \nbest_score: {best_score}")
best_lr = grid_search.best_estimator_

params = {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'} 
best_score: 0.7570780399274046


In [28]:
# make Gaussian Naive Bayes classifier
clf_nb = GaussianNB()
params = {
    'var_smoothing':[1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9], # from less smoothing to more aggressive smoothing
}
grid_search = GridSearchCV(clf_nb, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"params = {best_params} \nbest_score: {best_score}")
best_nb = grid_search.best_estimator_
gs_results['GNB'] = {'accuracy':best_score, 'params':best_params}

params = {'var_smoothing': 1e-20} 
best_score: 0.6347549909255898


In [29]:
# make k-Nearest Neighbours classifier
clf_knn = KNeighborsClassifier(n_jobs=-1) # use all processes for parellelisation
params = {
    'n_neighbors': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
}
grid_search = GridSearchCV(clf_knn, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"params = {best_params} \nbest_score: {best_score}")
best_knn = grid_search.best_estimator_
gs_results['kNN'] = {'accuracy':best_score, 'params':best_params}

params = {'n_neighbors': 3} 
best_score: 0.7990320629159104


In [30]:
# make SVM-RBF classifier
clf_svmrbf = SVC(kernel='rbf', random_state=1)
params = {
    'C': [0.1, 1, 10, 100, 1000], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
}

# raw dataset
# params = {
#     'C': [100], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

# selected + possible features
# params = {
#     'C': [1000], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

# selected + possible features v2
# params = {
#     'C': [1000], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

# SNV + raw
# params = {
#     'C': [1000], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

grid_search = GridSearchCV(clf_svmrbf, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"params = {best_params} \nbest_score: {best_score}")
best_svmrbf = grid_search.best_estimator_
gs_results['SVM-RBF'] = {'accuracy':best_score, 'params':best_params}

params = {'C': 10, 'gamma': 'scale'} 
best_score: 0.8323049001814882


In [31]:
# make SVM linear classifier
clf_lin = SVC(kernel='linear', random_state=1)
params = {
    'C': [0.1, 1, 10, 100, 1000], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
    # 'gamma' : [], # need to research this parameter more
}

# raw dataset
# params = {
#     'C': [10], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

# selected + possible features
# params = {
#     'C': [1], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

# FSv2
# params = {
#     'C': [1], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

# SNV + raw
# params = {
#     'C': [1000], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

grid_search = GridSearchCV(clf_lin, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_svmlin = grid_search.best_estimator_
gs_results['SVM-Lin'] = {'accuracy':best_score, 'params':best_params}

best_params: {'C': 0.1, 'gamma': 'scale'} 
best_score: 0.7501814882032668


In [32]:
# make svm sigmoidal classifier
clf_sig = SVC(kernel='sigmoid', random_state=1)
params = {
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
    # 'gamma' : [], # need to research this parameter more
}

# raw dataset
# params = {
#     'C': [10], # high to low regularisation strength
#     'gamma' : ['auto'], # need to research this parameter more
# }

# selected + possible features
# params = {
#     'C': [1e-05], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

# FSv2
# params = {
#     'C': [0.1], # high to low regularisation strength
#     'gamma' : ['auto'], # need to research this parameter more
# }

# SNV + raw
# params = {
#     'C': [1000], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

grid_search = GridSearchCV(clf_sig, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"params = {best_params} \nbest_score: {best_score}")
best_svmsig = grid_search.best_estimator_
gs_results['SVM-Sig'] = {'accuracy':best_score, 'params':best_params}

params = {'C': 0.1, 'gamma': 'scale'} 
best_score: 0.5909558378705384


In [33]:
# make xgboost classifier
clf_xgb = xgb.XGBClassifier(random_state = 1)

params = {
    'n_estimators': [10,100, 500, 1000], # no. boosting rounds
    'max_depth': [3,5,7,10,15] # control overfitting
}

# raw dataset
# params = {
#     'n_estimators': [100], # no. boosting rounds
#     'max_depth': [10] # control overfitting
# }

# selected + possible features
# params = {
#     'n_estimators': [10], # no. boosting rounds
#     'max_depth': [5] # control overfitting
# }

# FSv2
# params = {
#     'n_estimators': [100], # no. boosting rounds
#     'max_depth': [5] # control overfitting
# }

# SNV + raw
# params = {
#     'n_estimators': [100], # no. boosting rounds
#     'max_depth': [5] # control overfitting
# }

grid_search = GridSearchCV(clf_xgb, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, training_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"params = {best_params} \nbest_score: {best_score}")
best_xgb = grid_search.best_estimator_
gs_results['XGB'] = {'accuracy':best_score, 'params':best_params}

params = {'max_depth': 10, 'n_estimators': 100} 
best_score: 0.8463399879007865


In [34]:
# make adaboost classifier
clf_ada = AdaBoostClassifier(random_state=1)
params = {
    'n_estimators': [10, 50, 100, 500, 1000],
    'learning_rate': [0, 0.01, 0.1, 1, 10] # weight applied to each clf at each boosting iteration
}

# raw dataset
# params = {
#     'n_estimators': [50],
#     'learning_rate': [0.01] # weight applied to each clf at each boosting iteration
# }

# selected + possible features
# params = {
#     'n_estimators': [10],
#     'learning_rate': [0.01] # weight applied to each clf at each boosting iteration
# }

# FSv2
# params = {
#     'n_estimators': [50],
#     'learning_rate': [0.01] # weight applied to each clf at each boosting iteration
# }

# SNV + raw
# params = {
#     'n_estimators': [50],
#     'learning_rate': [0.01] # weight applied to each clf at each boosting iteration
# }

grid_search = GridSearchCV(clf_ada, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, training_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"params = {best_params} \nbest_score: {best_score}")
best_ada = grid_search.best_estimator_
gs_results['ADA'] = {'accuracy':best_score, 'params':best_params}

50 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 124, in fit
    self._validate_params()
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\base.py", line 600, in 

params = {'learning_rate': 0.01, 'n_estimators': 50} 
best_score: 0.6696007259528131


In [39]:
# make Random Forest classifier
clf_rf = RandomForestClassifier(random_state=1)
params = {
  'n_estimators': [10, 50, 100, 200, 300],
  'max_depth': [None, 5, 10, 20, 30, 40],
  "max_features" : [None, 1, 5, 10, 20, 30]
}

# raw dataset
# params = {
#   'n_estimators': [50],
#   'max_depth': [10],
#   "max_features" : [30]
# }


# selected + possible features
# params = {
#   'n_estimators': [200],
#   'max_depth': [None],
#   "max_features" : [5]
# }

# feature select v2
# params = {'max_depth': [None], 'max_features': [5], 'n_estimators': [300]}

# SNV + raw
# params = {'max_depth': [None], 'max_features': [5], 'n_estimators': [300]}


grid_search  = GridSearchCV(clf_rf, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"params = {best_params} \nbest_score: {best_score}")
best_rf = grid_search.best_estimator_
gs_results['RF'] = {'accuracy':best_score, 'params':best_params}

params = {'max_depth': 10, 'max_features': 30, 'n_estimators': 50} 
best_score: 0.851482153660012


In [40]:
# sorted GS models
print(gs_results)
gs_sorted_models = dict(sorted(gs_results.items(), key=lambda item: item[1]['accuracy'], reverse=True))
print(gs_sorted_models.keys())

{'RF': {'accuracy': 0.851482153660012, 'params': {'max_depth': 10, 'max_features': 30, 'n_estimators': 50}}}
dict_keys(['RF'])


In [41]:
# evaluate models (Anon, 2023b)
# model_metrics = {'accuracy', 'recall', 'precision', 'F1-score', 'ROC-AUC'}
model_metrics = {}

# all of the models
models = [best_cart, best_rf, best_lr, best_nb, best_knn, best_svmrbf, best_svmlin, best_svmsig, best_xgb, best_ada]
model_names = ['CART', 'RF', 'LR', 'GNB', 'kNN', 'SVM-RBF', 'SVM-Lin', 'SVM-Sig', 'XGB', 'ADA']
i=0
for model in models:
    # train on test set
    predicted = model.predict(testing_data)
    # generate cm against test labels
    cm = confusion_matrix(testing_labels, predicted)
    # print(cm)
    accuracy = accuracy_score(testing_labels, predicted)
    recall = recall_score(testing_labels, predicted, average=None)
    precision = precision_score(testing_labels, predicted, average=None)
    f1 = f1_score(testing_labels, predicted, average=None)

    try:
        predicted_prob = model.predict_proba(testing_data)
        roc = roc_auc_score(testing_labels, predicted_prob, average=None, multi_class='ovr') 
        # print(accuracy, recall, precision, f1, roc)
        model_metrics[model_names[i]] = {
                                            'accuracy':accuracy, 
                                            'recall':{
                                                1:recall[0], 
                                                2:recall[1], 
                                                3:recall[2]
                                            },
                                            'precision':{
                                                1:precision[0], 
                                                2:precision[1], 
                                                3:precision[2]
                                            },
                                            'f1_score':{
                                                1:f1[0], 
                                                2:f1[1], 
                                                3:f1[2]
                                            },
                                            'ROC-AUC':{
                                                1:roc[0], 
                                                2:roc[1], 
                                                3:roc[2]
                                            }
        }
    except:
        print(f"can't predict class probilities for {model_names[i]}")
        # print(accuracy, recall, precision, f1)
        model_metrics[model_names[i]] = {
                                            'accuracy':accuracy, 
                                            'recall':{
                                                1:recall[0], 
                                                2:recall[1], 
                                                3:recall[2]
                                            },
                                            'precision':{
                                                1:precision[0], 
                                                2:precision[1], 
                                                3:precision[2]
                                            },
                                            'f1_score':{
                                                1:f1[0], 
                                                2:f1[1], 
                                                3:f1[2]
                                            }
        }
    i+=1

# (Gern Blanston, 2009)- sort by neoplasia recall
sorted_metrics = dict(sorted(model_metrics.items(), key=lambda item: item[1]['recall'][3], reverse=True))
# (holys, 2013)
with open('metrics/raw_dataset/model_metrics_recall_snv.json', 'w') as fp:
    json.dump(sorted_metrics, fp)

# redo but sort by accuracy
# (Gern Blanston, 2009)
sorted_metrics_acc = dict(sorted(model_metrics.items(), key=lambda item: item[1]['accuracy'], reverse=True))
# (holys, 2013)
with open('metrics/raw_dataset/model_metrics_accuracy_snv.json', 'w') as fp:
    json.dump(sorted_metrics_acc, fp)
model_metrics
sorted_metrics


can't predict class probilities for SVM-RBF
can't predict class probilities for SVM-Lin
can't predict class probilities for SVM-Sig


  _warn_prf(average, modifier, msg_start, len(result))


{'GNB': {'accuracy': 0.6573426573426573,
  'recall': {1: 1.0, 2: 0.47619047619047616, 3: 0.7241379310344828},
  'precision': {1: 0.6470588235294118,
   2: 0.6976744186046512,
   3: 0.6363636363636364},
  'f1_score': {1: 0.7857142857142858,
   2: 0.5660377358490566,
   3: 0.6774193548387097},
  'ROC-AUC': {1: 0.9504132231404959,
   2: 0.7593253968253968,
   3: 0.7606490872210954}},
 'SVM-Lin': {'accuracy': 0.7342657342657343,
  'recall': {1: 0.9545454545454546,
   2: 0.6825396825396826,
   3: 0.7068965517241379},
  'precision': {1: 0.875, 2: 0.7288135593220338, 3: 0.6833333333333333},
  'f1_score': {1: 0.9130434782608695,
   2: 0.7049180327868851,
   3: 0.6949152542372882}},
 'CART': {'accuracy': 0.7762237762237763,
  'recall': {1: 0.9090909090909091,
   2: 0.8412698412698413,
   3: 0.6551724137931034},
  'precision': {1: 0.8695652173913043,
   2: 0.7464788732394366,
   3: 0.7755102040816326},
  'f1_score': {1: 0.888888888888889,
   2: 0.791044776119403,
   3: 0.7102803738317757},
  'RO

In [42]:
# print highest acc models from gridsearch
print(f"gs_sorted_models (acc): \n{gs_sorted_models.keys()}\n")

# highest acc models from test set
print(f"sorted models (acc): \n{sorted_metrics_acc.keys()}\n")

# highest recall from test set
print(f"sorted models (recall): \n{sorted_metrics.keys()}")


gs_sorted_models (acc): 
dict_keys(['RF'])

sorted models (acc): 
dict_keys(['RF', 'XGB', 'SVM-RBF', 'CART', 'kNN', 'SVM-Lin', 'LR', 'GNB', 'ADA', 'SVM-Sig'])

sorted models (recall): 
dict_keys(['GNB', 'SVM-Lin', 'CART', 'RF', 'kNN', 'LR', 'XGB', 'SVM-RBF', 'SVM-Sig', 'ADA'])


# observations on raw dataset
XGBoost and ADAboost seem to have really overfit, because they severely underperform on unseen test data, compared to the accuracies they were achieving with gridsearch. IGNORE THIS: it is just because the test labels were not normalised!

Although GNB has higher recall for neoplasia than kNN, kNN seems to be the best classifier overall. While GNB has highest recall for neoplasia, has 3rd lowest accuracy.

Top models based on accuracy, from gridsearch, were RF, kNN, XGB, SVM-RBF, CART. Top models based on accuracy, from test set, were RF, kNN, SVM-RBF, CART, SVM-Lin. Therefore, RF, kNN, SVM-RBF, CART seem to perform well, in terms of accuracy, and don't seem to produce drastically different results with the test set, suggesting there isn't much overfitting

gs_sorted_models (acc): 
(['RF', 'kNN', 'XGB', 'SVM-RBF', 'CART', 'LR', 'SVM-Lin', 'ADA', 'SVM-Sig', 'GNB'])

sorted models (acc): 
(['RF', 'XGB', 'kNN', 'SVM-RBF', 'CART', 'SVM-Lin', 'LR', 'SVM-Sig', 'GNB', 'ADA'])

sorted models (recall): 
(['GNB', 'kNN', 'CART', 'RF', 'SVM-Lin', 'XGB', 'LR', 'SVM-RBF', 'SVM-Sig', 'ADA'])

# observations on feature selected dataset
Some models decreased in performance, some increased, with largest increase being 6% increase in accuracy for SVM-Lin model. But overall, not worth, since the max accuracy of any of the models was lower than without feature selection. Maybe better feature selection is needed - an analytical solution rather than eyeball

gs_sorted_models (acc): 
(['RF', 'SVM-RBF', 'kNN', 'XGB', 'CART', 'SVM-Lin', 'LR', 'ADA', 'GNB', 'SVM-Sig'])

sorted models (acc): 
(['RF', 'XGB', 'kNN', 'SVM-RBF', 'CART', 'LR', 'SVM-Lin', 'GNB', 'ADA', 'SVM-Sig'])

sorted models (recall): 
(['GNB', 'SVM-RBF', 'XGB', 'RF', 'kNN', 'SVM-Lin', 'CART', 'LR', 'SVM-Sig', 'ADA'])