In [1]:
import pandas as pd, numpy as np
import os, json
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score, confusion_matrix, accuracy_score, f1_score, precision_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [2]:
# x_train
training_data = pd.read_csv('../data/original_data/noExclusion_train_data.csv', header = None)
# y_train
training_labels = pd.read_csv('../data/original_data/noExclusion_train_label.csv', header = None)
# x_test
testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
# y_test
testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)


In [3]:
# type cast labels to ints
training_labels[0] = training_labels[0].astype(int)
testing_labels[0] = testing_labels[0].astype(int)
# testing_labels
# gridsearch results
gs_results = {}

In [4]:
# RUN THIS TO APPLY FEATURE SELECTION TO TRAINING DATA
ADD_POSSIBLE_FIGURES = False

# figures contain features from (figure_num*4)+1 
selected_figures = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,
                    21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,
                    39,40,41,42,43,44,45,46,47,48,49,50,51,52]

# figures that MAY be decent - noisy but different peaks
possible_figures = [53,54,56,57,58,59,60,61,62,63,64,65]

# decent features - eyeballed

# generate set of selected features
selected_features = []

for figure_num in selected_figures:
    for i in range(0,4):
        # print(f"figure: {figure_num}, feature: feature_{(figure_num*4)+i}")
        selected_features.append((figure_num*4)+i)

# to add possible features
if ADD_POSSIBLE_FIGURES == True:
    for figure_num in possible_figures:
        for i in range(0,4):
            # print(f"figure: {figure_num}, feature: feature_{(figure_num*4)+i}")
            selected_features.append((figure_num*4)+i)

# (Cena, 2018)
training_data = training_data[selected_features]
testing_data = testing_data[selected_features]


In [5]:
# make CART classifier
clf_cart = tree.DecisionTreeClassifier(criterion="gini", random_state=1)
# find optimal parameter values for CART
params = {
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40], # control overfitting,
    'max_features': [None, 'sqrt', 'log2'] # performance 
}

# params = {
#     'max_depth': [None], # control overfitting,
#     'max_features': ['log2'] # performance 
# }

grid_search = GridSearchCV(clf_cart, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_cart = grid_search.best_estimator_
gs_results['CART'] = {'accuracy':best_score, 'params':best_params}

best_params: {'max_depth': None, 'max_features': None} 
best_score: 0.8236237144585601


In [6]:
# make Random Forest classifier
clf_rf = RandomForestClassifier(random_state=1)
params = {
  'n_estimators': [10, 50, 100, 200, 300],
  'max_depth': [None, 5, 10, 20, 30, 40],
  "max_features" : [None, 1, 5, 10, 20, 30]
}

# so it doesn't redo lengthy GS
# params = {
#   'n_estimators': [50],
#   'max_depth': [10],
#   "max_features" : [30]
# }

grid_search  = GridSearchCV(clf_rf, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_rf = grid_search.best_estimator_
gs_results['RF'] = {'accuracy':best_score, 'params':best_params}

best_params: {'max_depth': 10, 'max_features': 30, 'n_estimators': 50} 
best_score: 0.846098003629764


In [7]:
# make Logistic Regressor
clf_lr = LogisticRegression(random_state=1)
params = {
    'penalty': ['l1', 'l2'], # type of regularisation 
    'C': [0.1, 1, 10, 100], # regularisation strength
    'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'] # approach to finding best weights
}
# so it doesn't redo lengthy GS
# params = {
#     'penalty': ['l2'], # type of regularisation 
#     'C': [0.1], # regularisation strength
#     'solver': ['lbfgs'] # approach to finding best weights
# }

grid_search = GridSearchCV(clf_lr, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
gs_results['LR'] = {'accuracy':best_score, 'params':best_params}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best_params: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'} 
best_score: 0.7501209921355112


80 fits failed out of a total of 320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packa

In [8]:
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_lr = grid_search.best_estimator_

best_params: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'} 
best_score: 0.7501209921355112


In [9]:
# make Gaussian Naive Bayes classifier
clf_nb = GaussianNB()
params = {
    'var_smoothing':[1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9], # from less smoothing to more aggressive smoothing
}
grid_search = GridSearchCV(clf_nb, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_nb = grid_search.best_estimator_
gs_results['GNB'] = {'accuracy':best_score, 'params':best_params}

best_params: {'var_smoothing': 1e-15} 
best_score: 0.6049304295220811


In [10]:
# make k-Nearest Neighbours classifier
clf_knn = KNeighborsClassifier(n_jobs=-1) # use all processes for parellelisation
params = {
    'n_neighbors': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
}
grid_search = GridSearchCV(clf_knn, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_knn = grid_search.best_estimator_
gs_results['kNN'] = {'accuracy':best_score, 'params':best_params}

best_params: {'n_neighbors': 3} 
best_score: 0.8286146400483968


In [11]:
# make SVM-RBF classifier
clf_svmrbf = SVC(kernel='rbf', random_state=1)
params = {
    'C': [0.1, 1, 10, 100, 1000], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
}

# so it doesn't redo lengthy GS
# params = {
#     'C': [100], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }

grid_search = GridSearchCV(clf_svmrbf, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_svmrbf = grid_search.best_estimator_
gs_results['SVM-RBF'] = {'accuracy':best_score, 'params':best_params}

best_params: {'C': 1000, 'gamma': 'scale'} 
best_score: 0.8287356321839081


In [12]:
# make SVM linear classifier
clf_lin = SVC(kernel='linear', random_state=1)
params = {
    'C': [0.1, 1, 10, 100, 1000], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
    # 'gamma' : [], # need to research this parameter more
}

# so it doesn't redo lengthy GS
# params = {
#     'C': [10], # high to low regularisation strength
#     'gamma' : ['scale'], # need to research this parameter more
# }


grid_search = GridSearchCV(clf_lin, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_svmlin = grid_search.best_estimator_
gs_results['SVM-Lin'] = {'accuracy':best_score, 'params':best_params}

best_params: {'C': 1, 'gamma': 'scale'} 
best_score: 0.7571385359951603


In [13]:
# make svm sigmoidal classifier
clf_sig = SVC(kernel='sigmoid', random_state=1)
params = {
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
    # 'gamma' : [], # need to research this parameter more
}

# so it doesn't redo lengthy GS
# params = {
#     'C': [10], # high to low regularisation strength
#     'gamma' : ['auto'], # need to research this parameter more
# }

grid_search = GridSearchCV(clf_sig, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_svmsig = grid_search.best_estimator_
gs_results['SVM-Sig'] = {'accuracy':best_score, 'params':best_params}

best_params: {'C': 1e-05, 'gamma': 'scale'} 
best_score: 0.4493042952208105


In [14]:
# make xgboost classifier
clf_xgb = xgb.XGBClassifier(random_state = 1)

# encode labels, using sklearn, to pass to xgboost
# this code was inspired by the snippet from:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
le = LabelEncoder()
# fit the classes to the encoder and transform labels
e_train_labels = le.fit_transform(training_labels[0].to_list())

params = {
    'n_estimators': [10,100, 500, 1000], # no. boosting rounds
    'max_depth': [3,5,7,10,15] # control overfitting
}

# so it doesn't redo lengthy GS
# params = {
#     'n_estimators': [100], # no. boosting rounds
#     'max_depth': [10] # control overfitting
# }

grid_search = GridSearchCV(clf_xgb, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, e_train_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_xgb = grid_search.best_estimator_
gs_results['XGB'] = {'accuracy':best_score, 'params':best_params}

best_params: {'max_depth': 10, 'n_estimators': 100} 
best_score: 0.8269812462189957


In [15]:
# make adaboost classifier
clf_ada = AdaBoostClassifier(random_state=1)
params = {
    'n_estimators': [10, 50, 100, 500, 1000],
    'learning_rate': [0, 0.01, 0.1, 1, 10] # weight applied to each clf at each boosting iteration
}

# so it doesn't redo lengthy GS
# params = {
#     'n_estimators': [50],
#     'learning_rate': [0.01] # weight applied to each clf at each boosting iteration
# }

grid_search = GridSearchCV(clf_ada, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, e_train_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_ada = grid_search.best_estimator_
gs_results['ADA'] = {'accuracy':best_score, 'params':best_params}

best_params: {'learning_rate': 0.1, 'n_estimators': 10} 
best_score: 0.6696007259528131


50 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 124, in fit
    self._validate_params()
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\base.py", line 600, in 

In [16]:
# sorted GS models
print(gs_results)
gs_sorted_models = dict(sorted(gs_results.items(), key=lambda item: item[1]['accuracy'], reverse=True))
print(gs_sorted_models.keys())

{'CART': {'accuracy': 0.8236237144585601, 'params': {'max_depth': None, 'max_features': None}}, 'RF': {'accuracy': 0.846098003629764, 'params': {'max_depth': 10, 'max_features': 30, 'n_estimators': 50}}, 'LR': {'accuracy': 0.7501209921355112, 'params': {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}}, 'GNB': {'accuracy': 0.6049304295220811, 'params': {'var_smoothing': 1e-15}}, 'kNN': {'accuracy': 0.8286146400483968, 'params': {'n_neighbors': 3}}, 'SVM-RBF': {'accuracy': 0.8287356321839081, 'params': {'C': 1000, 'gamma': 'scale'}}, 'SVM-Lin': {'accuracy': 0.7571385359951603, 'params': {'C': 1, 'gamma': 'scale'}}, 'SVM-Sig': {'accuracy': 0.4493042952208105, 'params': {'C': 1e-05, 'gamma': 'scale'}}, 'XGB': {'accuracy': 0.8269812462189957, 'params': {'max_depth': 10, 'n_estimators': 100}}, 'ADA': {'accuracy': 0.6696007259528131, 'params': {'learning_rate': 0.1, 'n_estimators': 10}}}
dict_keys(['RF', 'SVM-RBF', 'kNN', 'XGB', 'CART', 'SVM-Lin', 'LR', 'ADA', 'GNB', 'SVM-Sig'])


In [19]:
# evaluate models (Anon, 2023b)
# model_metrics = {'accuracy', 'recall', 'precision', 'F1-score', 'ROC-AUC'}
model_metrics = {}

# all of the models
models = [best_cart, best_rf, best_lr, best_nb, best_knn, best_svmrbf, best_svmlin, best_svmsig, best_xgb, best_ada]
model_names = ['CART', 'RF', 'LR', 'GNB', 'kNN', 'SVM-RBF', 'SVM-Lin', 'SVM-Sig', 'XGB', 'ADA']
i=0
for model in models:
    # train on test set
    predicted = model.predict(testing_data)
    # generate cm against test labels
    cm = confusion_matrix(testing_labels, predicted)
    print(cm)
    accuracy = accuracy_score(testing_labels, predicted)
    recall = recall_score(testing_labels, predicted, average=None)
    precision = precision_score(testing_labels, predicted, average=None)
    f1 = f1_score(testing_labels, predicted, average=None)

    try:
        predicted_prob = model.predict_proba(testing_data)
        roc = roc_auc_score(testing_labels, predicted_prob, average=None, multi_class='ovr') 
        # print(accuracy, recall, precision, f1, roc)
        model_metrics[model_names[i]] = {
                                            'accuracy':accuracy, 
                                            'recall':{
                                                1:recall[0], 
                                                2:recall[1], 
                                                3:recall[2]
                                            },
                                            'precision':{
                                                1:precision[0], 
                                                2:precision[1], 
                                                3:precision[2]
                                            },
                                            'f1_score':{
                                                1:f1[0], 
                                                2:f1[1], 
                                                3:f1[2]
                                            },
                                            'ROC-AUC':{
                                                1:roc[0], 
                                                2:roc[1], 
                                                3:roc[2]
                                            }
        }
    except:
        print(f"can't predict class probilities for {model_names[i]}")
        # print(accuracy, recall, precision, f1)
        model_metrics[model_names[i]] = {
                                            'accuracy':accuracy, 
                                            'recall':{
                                                1:recall[0], 
                                                2:recall[1], 
                                                3:recall[2]
                                            },
                                            'precision':{
                                                1:precision[0], 
                                                2:precision[1], 
                                                3:precision[2]
                                            },
                                            'f1_score':{
                                                1:f1[0], 
                                                2:f1[1], 
                                                3:f1[2]
                                            }
        }
    i+=1

# (Gern Blanston, 2009)
sorted_metrics = dict(sorted(model_metrics.items(), key=lambda item: item[1]['recall'][3], reverse=True))
# (holys, 2013)
with open('metrics/selected_features/model_metrics_recall.json', 'w') as fp:
    json.dump(sorted_metrics, fp)

# redo but sort by accuracy
# (Gern Blanston, 2009)
sorted_metrics_acc = dict(sorted(model_metrics.items(), key=lambda item: item[1]['accuracy'], reverse=True))
# (holys, 2013)
with open('metrics/selected_features/model_metrics_accuracy.json', 'w') as fp:
    json.dump(sorted_metrics_acc, fp)
model_metrics
sorted_metrics


[[21  1  0]
 [ 0 53 10]
 [ 1 19 38]]
[[21  1  0]
 [ 0 60  3]
 [ 0 20 38]]
[[21  1  0]
 [ 1 53  9]
 [ 2 24 32]]
[[21  1  0]
 [ 8 26 29]
 [ 3 13 42]]
[[22  0  0]
 [ 0 57  6]
 [ 2 17 39]]
[[21  0  1]
 [ 1 58  4]
 [ 1 20 37]]
can't predict class probilities for SVM-RBF
[[22  0  0]
 [ 1 50 12]
 [ 2 21 35]]
can't predict class probilities for SVM-Lin
[[ 0 22  0]
 [ 0 63  0]
 [ 0 58  0]]
can't predict class probilities for SVM-Sig
[[ 0  0  0  0]
 [21  1  0  0]
 [ 0 58  5  0]
 [ 1 18 39  0]]
[[ 0  0  0  0]
 [20  2  0  0]
 [ 0 63  0  0]
 [ 1 57  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'GNB': {'accuracy': 0.6223776223776224,
  'recall': {1: 0.9545454545454546,
   2: 0.4126984126984127,
   3: 0.7241379310344828},
  'precision': {1: 0.65625, 2: 0.65, 3: 0.5915492957746479},
  'f1_score': {1: 0.7777777777777778,
   2: 0.5048543689320388,
   3: 0.6511627906976745},
  'ROC-AUC': {1: 0.9564237415477085,
   2: 0.7830357142857143,
   3: 0.7481744421906694}},
 'kNN': {'accuracy': 0.8251748251748252,
  'recall': {1: 1.0, 2: 0.9047619047619048, 3: 0.6724137931034483},
  'precision': {1: 0.9166666666666666,
   2: 0.7702702702702703,
   3: 0.8666666666666667},
  'f1_score': {1: 0.9565217391304348,
   2: 0.832116788321168,
   3: 0.7572815533980582},
  'ROC-AUC': {1: 0.9994365138993238,
   2: 0.8826388888888888,
   3: 0.8644016227180528}},
 'CART': {'accuracy': 0.7832167832167832,
  'recall': {1: 0.9545454545454546,
   2: 0.8412698412698413,
   3: 0.6551724137931034},
  'precision': {1: 0.9545454545454546,
   2: 0.726027397260274,
   3: 0.7916666666666666},
  'f1_score': {1: 0.954

In [20]:
# print highest acc models from gridsearch
print(f"gs_sorted_models (acc): \n{gs_sorted_models.keys()}\n")

# highest acc models from test set
print(f"sorted models (acc): \n{sorted_metrics_acc.keys()}\n")

# highest recall from test set
print(f"sorted models (recall): \n{sorted_metrics.keys()}")


gs_sorted_models (acc): 
dict_keys(['RF', 'SVM-RBF', 'kNN', 'XGB', 'CART', 'SVM-Lin', 'LR', 'ADA', 'GNB', 'SVM-Sig'])

sorted models (acc): 
dict_keys(['RF', 'kNN', 'SVM-RBF', 'CART', 'SVM-Lin', 'LR', 'GNB', 'SVM-Sig', 'XGB', 'ADA'])

sorted models (recall): 
dict_keys(['GNB', 'kNN', 'CART', 'RF', 'SVM-RBF', 'SVM-Lin', 'LR', 'XGB', 'SVM-Sig', 'ADA'])


# observations on raw dataset
XGBoost and ADAboost seem to have really overfit, because they severely underperform on unseen test data, compared to the accuracies they were achieving with gridsearch.

Although GNB has higher recall for neoplasia than kNN, kNN seems to be the best classifier overall. While GNB has highest recall for neoplasia, has 3rd lowest accuracy.

Top models based on accuracy, from gridsearch, were RF, kNN, XGB, SVM-RBF, CART. Top models based on accuracy, from test set, were RF, kNN, SVM-RBF, CART, SVM-Lin. Therefore, RF, kNN, SVM-RBF, CART seem to perform well, in terms of accuracy, and don't seem to produce drastically different results with the test set, suggesting there isn't much overfitting