In [1]:
import pandas as pd, numpy as np
import os
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [2]:
# x_train
training_data = pd.read_csv('../data/original_data/noExclusion_train_data.csv', header = None)
# y_train
training_labels = pd.read_csv('../data/original_data/noExclusion_train_label.csv', header = None)
# x_test
testing_data = pd.read_csv('../data/original_data/noExclusion_test_data.csv', header = None)
# y_test
testing_labels = pd.read_csv('../data/original_data/noExclusion_test_label.csv', header = None)


In [3]:
# type cast labels to ints
# training_labels.columns
# training_labels[0]
training_labels[0] = training_labels[0].astype(int)
testing_labels[0] = testing_labels[0].astype(int)
# testing_labels

In [6]:
# make CART classifier
clf_cart = tree.DecisionTreeClassifier(criterion="gini", random_state=1)
# find optimal parameter values for CART
params = {
    'max_depth': [None, 5, 10, 15, 20, 25, 30, 35, 40], # control overfitting,
    'max_features': [None, 'sqrt', 'log2'] # performance 
}

grid_search = GridSearchCV(clf_cart, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_cart = grid_search.best_estimator_


best_params: {'max_depth': None, 'max_features': 'log2'} 
best_score: 0.806019358741682


In [8]:
# make Random Forest classifier
clf_rf = RandomForestClassifier(random_state=1)
params = {
  'n_estimators': [10, 50, 100, 200, 300],
  'max_depth': [None, 5, 10, 20, 30, 40],
  "max_features" : [None, 1, 5, 10, 20, 30]
}

# so it doesn't redo lengthy GS
params = {
  'n_estimators': [50],
  'max_depth': [10],
  "max_features" : [30]
}

grid_search  = GridSearchCV(clf_rf, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_rf = grid_search.best_estimator_

best_params: {'max_depth': 10, 'max_features': 30, 'n_estimators': 50} 
best_score: 0.851482153660012


In [10]:
# make Logistic Regressor
clf_lr = LogisticRegression(random_state=1)
params = {
    'penalty': ['l1', 'l2'], # type of regularisation 
    'C': [0.1, 1, 10, 100], # regularisation strength
    'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'] # approach to finding best weights
}
# so it doesn't redo lengthy GS
params = {
    'penalty': ['l2'], # type of regularisation 
    'C': [0.1], # regularisation strength
    'solver': ['lbfgs'] # approach to finding best weights
}

grid_search = GridSearchCV(clf_lr, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best_params: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'} 
best_score: 0.7658197217180883


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_lr = grid_search.best_estimator_

best_params: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'} 
best_score: 0.7658197217180883


In [12]:
# make Gaussian Naive Bayes classifier
clf_nb = GaussianNB()
params = {
    'var_smoothing':[1e-15, 1e-14, 1e-13, 1e-12, 1e-11, 1e-10, 1e-9], # from less smoothing to more aggressive smoothing
}
grid_search = GridSearchCV(clf_nb, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_nb = grid_search.best_estimator_

best_params: {'var_smoothing': 1e-15} 
best_score: 0.6347549909255898


In [14]:
# make k-Nearest Neighbours classifier
clf_knn = KNeighborsClassifier(n_jobs=-1) # use all processes for parellelisation
params = {
    'n_neighbors': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
}
grid_search = GridSearchCV(clf_knn, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_knn = grid_search.best_estimator_

best_params: {'n_neighbors': 1} 
best_score: 0.8496067755595886


In [15]:
# make SVM-RBF classifier
clf_svmrbf = SVC(kernel='rbf', random_state=1)
params = {
    'C': [0.1, 1, 10, 100, 1000], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
}

# so it doesn't redo lengthy GS
params = {
    'C': [100], # high to low regularisation strength
    'gamma' : ['scale'], # need to research this parameter more
}

grid_search = GridSearchCV(clf_svmrbf, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_svmrbf = grid_search.best_estimator_

best_params: {'C': 100, 'gamma': 'scale'} 
best_score: 0.8357229280096792


In [16]:
# make SVM linear classifier
clf_lin = SVC(kernel='linear', random_state=1)
params = {
    'C': [0.1, 1, 10, 100, 1000], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
    # 'gamma' : [], # need to research this parameter more
}
grid_search = GridSearchCV(clf_lin, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_svmlin = grid_search.best_estimator_

In [None]:
# make svm sigmoidal classifier
clf_sig = SVC(kernel='sigmoid', random_state=1)
params = {
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], # high to low regularisation strength
    'gamma' : ['scale', 'auto'], # need to research this parameter more
    # 'gamma' : [], # need to research this parameter more
}
grid_search = GridSearchCV(clf_sig, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, np.ravel(training_labels))
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_svmsig = grid_search.best_estimator_

best_params: {'C': 1e-05} 
best_score: 0.4493042952208105


In [None]:
# make xgboost classifier
clf_xgb = xgb.XGBClassifier(random_state = 1)

# encode labels, using sklearn, to pass to xgboost
# this code was inspired by the snippet from:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
le = LabelEncoder()
# fit the classes to the encoder and transform labels
e_train_labels = le.fit_transform(training_labels[0].to_list())

params = {
    'n_estimators': [10,100, 500, 1000], # no. boosting rounds
    'max_depth': [3,5,7,10,15] # control overfitting
}

# so it doesn't redo lengthy GS
params = {
    'n_estimators': [100], # no. boosting rounds
    'max_depth': [10] # control overfitting
}

grid_search = GridSearchCV(clf_xgb, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, e_train_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_xgb = grid_search.best_estimator_

best_params: {'max_depth': 10, 'n_estimators': 100} 
best_score: 0.8463399879007865


In [None]:
# make adaboost classifier
clf_ada = AdaBoostClassifier(random_state=1)
params = {
    'n_estimators': [10, 50, 100, 500, 1000],
    'learning_rate': [0, 0.01, 0.1, 1, 10] # weight applied to each clf at each boosting iteration
}

# so it doesn't redo lengthy GS
params = {
    'n_estimators': [50],
    'learning_rate': [0.01] # weight applied to each clf at each boosting iteration
}

grid_search = GridSearchCV(clf_ada, params, scoring='accuracy', cv=10)
grid_search.fit(training_data, e_train_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f"best_params: {best_params} \nbest_score: {best_score}")
best_ada = grid_search.best_estimator_

50 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 124, in fit
    self._validate_params()
  File "c:\Users\sc19mhh\Desktop\Hamzah\Uni\CompSci\ThirdYear\FYP\MyCode\ML-cancer-detection\.venv\lib\site-packages\sklearn\base.py", line 600, in 

best_params: {'learning_rate': 0.01, 'n_estimators': 50} 
best_score: 0.6696007259528131
