In [55]:
import pandas as pd, numpy as np
import os, json
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score, confusion_matrix, accuracy_score, f1_score, precision_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [2]:
# x_train
training_data = pd.read_csv('../data/original_data/noExclusion_train_data.csv', header = None)
# y_train
training_labels = pd.read_csv('../data/original_data/noExclusion_train_label.csv', header = None)
# x_test
testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
# y_test
testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)


In [3]:
# type cast labels to ints
# training_labels.columns
# training_labels[0]
training_labels[0] = training_labels[0].astype(int)
testing_labels[0] = testing_labels[0].astype(int)
# testing_labels

In [6]:
# make CART classifier
clf_cart = tree.DecisionTreeClassifier(criterion="gini", random_state=1)
# find optimal parameter values for CART
params = {
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40], # control overfitting,
    'max_features': [None, 'sqrt', 'log2'] # performance 
}

grid_search = GridSearchCV(clf_cart, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_cart = grid_search.best_estimator_


best_params: {'max_depth': None, 'max_features': 'log2'} 
best_score: 0.806019358741682


In [8]:
# make Random Forest classifier
clf_rf = RandomForestClassifier(random_state=1)
params = {
  'n_estimators': [10, 50, 100, 200, 300],
  'max_depth': [None, 5, 10, 20, 30, 40],
  "max_features" : [None, 1, 5, 10, 20, 30]
}

# so it doesn't redo lengthy GS
params = {
  'n_estimators': [50],
  'max_depth': [10],
  "max_features" : [30]
}

grid_search  = GridSearchCV(clf_rf, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_rf = grid_search.best_estimator_

best_params: {'max_depth': 10, 'max_features': 30, 'n_estimators': 50} 
best_score: 0.851482153660012


In [10]:
# make Logistic Regressor
clf_lr = LogisticRegression(random_state=1)
params = {
    'penalty': ['l1', 'l2'], # type of regularisation 
    'C': [0.1, 1, 10, 100], # regularisation strength
    'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'] # approach to finding best weights
}
# so it doesn't redo lengthy GS
params = {
    'penalty': ['l2'], # type of regularisation 
    'C': [0.1], # regularisation strength
    'solver': ['lbfgs'] # approach to finding best weights
}

grid_search = GridSearchCV(clf_lr, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best_params: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'} 
best_score: 0.7658197217180883


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_lr = grid_search.best_estimator_

best_params: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'} 
best_score: 0.7658197217180883


In [12]:
# make Gaussian Naive Bayes classifier
clf_nb = GaussianNB()
params = {
    'var_smoothing':[1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9], # from less smoothing to more aggressive smoothing
}
grid_search = GridSearchCV(clf_nb, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_nb = grid_search.best_estimator_

best_params: {'var_smoothing': 1e-15} 
best_score: 0.6347549909255898


In [14]:
# make k-Nearest Neighbours classifier
clf_knn = KNeighborsClassifier(n_jobs=-1) # use all processes for parellelisation
params = {
    'n_neighbors': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
}
grid_search = GridSearchCV(clf_knn, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_knn = grid_search.best_estimator_

best_params: {'n_neighbors': 1} 
best_score: 0.8496067755595886


In [15]:
# make SVM-RBF classifier
clf_svmrbf = SVC(kernel='rbf', random_state=1)
params = {
    'C': [0.1, 1, 10, 100, 1000], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
}

# so it doesn't redo lengthy GS
params = {
    'C': [100], # high to low regularisation strength
    'gamma' : ['scale'], # need to research this parameter more
}

grid_search = GridSearchCV(clf_svmrbf, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_svmrbf = grid_search.best_estimator_

best_params: {'C': 100, 'gamma': 'scale'} 
best_score: 0.8357229280096792


In [16]:
# make SVM linear classifier
clf_lin = SVC(kernel='linear', random_state=1)
params = {
    'C': [0.1, 1, 10, 100, 1000], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
    # 'gamma' : [], # need to research this parameter more
}
grid_search = GridSearchCV(clf_lin, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_svmlin = grid_search.best_estimator_

best_params: {'C': 10, 'gamma': 'scale'} 
best_score: 0.7553539019963702


In [17]:
# make svm sigmoidal classifier
clf_sig = SVC(kernel='sigmoid', random_state=1)
params = {
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
    # 'gamma' : [], # need to research this parameter more
}
grid_search = GridSearchCV(clf_sig, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_svmsig = grid_search.best_estimator_

best_params: {'C': 10, 'gamma': 'auto'} 
best_score: 0.6658802177858438


In [18]:
# make xgboost classifier
clf_xgb = xgb.XGBClassifier(random_state = 1)

# encode labels, using sklearn, to pass to xgboost
# this code was inspired by the snippet from:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
le = LabelEncoder()
# fit the classes to the encoder and transform labels
e_train_labels = le.fit_transform(training_labels[0].to_list())

params = {
    'n_estimators': [10,100, 500, 1000], # no. boosting rounds
    'max_depth': [3,5,7,10,15] # control overfitting
}

# so it doesn't redo lengthy GS
params = {
    'n_estimators': [100], # no. boosting rounds
    'max_depth': [10] # control overfitting
}

grid_search = GridSearchCV(clf_xgb, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, e_train_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_xgb = grid_search.best_estimator_

best_params: {'max_depth': 10, 'n_estimators': 100} 
best_score: 0.8463399879007865


In [19]:
# make adaboost classifier
clf_ada = AdaBoostClassifier(random_state=1)
params = {
    'n_estimators': [10, 50, 100, 500, 1000],
    'learning_rate': [0, 0.01, 0.1, 1, 10] # weight applied to each clf at each boosting iteration
}

# so it doesn't redo lengthy GS
params = {
    'n_estimators': [50],
    'learning_rate': [0.01] # weight applied to each clf at each boosting iteration
}

grid_search = GridSearchCV(clf_ada, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, e_train_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_ada = grid_search.best_estimator_

best_params: {'learning_rate': 0.01, 'n_estimators': 50} 
best_score: 0.6696007259528131


In [65]:
# evaluate models (Anon, 2023b)
# model_metrics = {'accuracy', 'recall', 'precision', 'F1-score', 'ROC-AUC'}
model_metrics = {}

# all of the models
models = [best_cart, best_rf, best_lr, best_nb, best_knn, best_svmrbf, best_svmlin, best_svmsig, best_xgb, best_ada]
model_names = ['CART', 'RF', 'LR', 'GNB', 'kNN', 'SVM-RBF', 'SVM-Lin', 'SVM-Sig', 'XGB', 'ADA']
i=0
for model in models:
    # train on test set
    predicted = model.predict(testing_data)
    # generate cm against test labels
    cm = confusion_matrix(testing_labels, predicted)
    print(cm)
    accuracy = accuracy_score(testing_labels, predicted)
    recall = recall_score(testing_labels, predicted, average=None)
    precision = precision_score(testing_labels, predicted, average=None)
    f1 = f1_score(testing_labels, predicted, average=None)

    try:
        predicted_prob = model.predict_proba(testing_data)
        roc = roc_auc_score(testing_labels, predicted_prob, average=None, multi_class='ovr') 
        # print(accuracy, recall, precision, f1, roc)
        model_metrics[model_names[i]] = {
                                            'accuracy':accuracy, 
                                            'recall':{
                                                1:recall[0], 
                                                2:recall[1], 
                                                3:recall[2]
                                            },
                                            'precision':{
                                                1:precision[0], 
                                                2:precision[1], 
                                                3:precision[2]
                                            },
                                            'f1_score':{
                                                1:f1[0], 
                                                2:f1[1], 
                                                3:f1[2]
                                            },
                                            'ROC-AUC':{
                                                1:roc[0], 
                                                2:roc[1], 
                                                3:roc[2]
                                            }
        }
    except:
        print(f"can't predict class probilities for {model_names[i]}")
        # print(accuracy, recall, precision, f1)
        model_metrics[model_names[i]] = {
                                            'accuracy':accuracy, 
                                            'recall':{
                                                1:recall[0], 
                                                2:recall[1], 
                                                3:recall[2]
                                            },
                                            'precision':{
                                                1:precision[0], 
                                                2:precision[1], 
                                                3:precision[2]
                                            },
                                            'f1_score':{
                                                1:f1[0], 
                                                2:f1[1], 
                                                3:f1[2]
                                            }
        }
    i+=1

# (Gern Blanston, 2009)
sorted_metrics = dict(sorted(model_metrics.items(), key=lambda item: item[1]['recall'][3], reverse=True))
# (holys, 2013)
with open('metrics/model_metrics_recall.json', 'w') as fp:
    json.dump(sorted_metrics, fp)

# redo but sort by accuracy
# (Gern Blanston, 2009)
sorted_metrics_acc = dict(sorted(model_metrics.items(), key=lambda item: item[1]['accuracy'], reverse=True))
# (holys, 2013)
with open('metrics/model_metrics_accuracy.json', 'w') as fp:
    json.dump(sorted_metrics_acc, fp)
model_metrics
sorted_metrics


[[20  0  2]
 [ 1 53  9]
 [ 2 18 38]]
[[20  2  0]
 [ 0 62  1]
 [ 1 19 38]]
[[21  1  0]
 [ 1 48 14]
 [ 2 21 35]]
[[22  0  0]
 [ 9 30 24]
 [ 3 13 42]]
[[20  1  1]
 [ 1 56  6]
 [ 0 17 41]]
[[22  0  0]
 [ 0 62  1]
 [ 2 23 33]]
can't predict class probilities for SVM-RBF
[[22  0  0]
 [ 0 50 13]
 [ 2 19 37]]
can't predict class probilities for SVM-Lin
[[18  4  0]
 [ 1 52 10]
 [ 6 24 28]]
can't predict class probilities for SVM-Sig
[[ 0  0  0  0]
 [22  0  0  0]
 [ 0 61  2  0]
 [ 1 20 37  0]]
[[ 0  0  0  0]
 [20  2  0  0]
 [ 0 63  0  0]
 [ 1 57  0  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'GNB': {'accuracy': 0.6573426573426573,
  'recall': {1: 1.0, 2: 0.47619047619047616, 3: 0.7241379310344828},
  'precision': {1: 0.6470588235294118,
   2: 0.6976744186046512,
   3: 0.6363636363636364},
  'f1_score': {1: 0.7857142857142858,
   2: 0.5660377358490566,
   3: 0.6774193548387097},
  'ROC-AUC': {1: 0.9504132231404959,
   2: 0.7593253968253968,
   3: 0.7606490872210954}},
 'kNN': {'accuracy': 0.8181818181818182,
  'recall': {1: 0.9090909090909091,
   2: 0.8888888888888888,
   3: 0.7068965517241379},
  'precision': {1: 0.9523809523809523,
   2: 0.7567567567567568,
   3: 0.8541666666666666},
  'f1_score': {1: 0.9302325581395349,
   2: 0.8175182481751826,
   3: 0.7735849056603773},
  'ROC-AUC': {1: 0.9504132231404959,
   2: 0.8319444444444444,
   3: 0.8122718052738336}},
 'CART': {'accuracy': 0.7762237762237763,
  'recall': {1: 0.9090909090909091,
   2: 0.8412698412698413,
   3: 0.6551724137931034},
  'precision': {1: 0.8695652173913043,
   2: 0.7464788732394366,
   3: 0.77551020

# observations
XGBoost and ADAboost seem to have really overfit, because they severely underperform on unseen test data, compared to the accuracies they were achieving with gridsearch.

Although GNB has higher recall for neoplasia than kNN, kNN seems to be the best classifier overall. While GNB has highest recall for neoplasia, has 3rd lowest accuracy.

Top models based on accuracy, from gridsearch, were RF, kNN, XGB, SVM-RBF, CART. Top models based on accuracy, from test set, were RF, kNN, SVM-RBF, CART, SVM-Lin. Therefore, RF, kNN, SVM-RBF, CART seem to perform well, in terms of accuracy, and don't seem to produce drastically different results with the test set, suggesting there isn't much overfitting