In [2]:
#Basic imports for all datasets
import numpy as np 
import pandas as pd   # for data reading 
import matplotlib.pyplot as plt
import sklearn
import sklearn.linear_model
import sklearn.tree
import sklearn.ensemble
import sklearn.naive_bayes
import sklearn.neural_network
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn.metrics           # For accuracy_score
import sklearn.model_selection   # For GridSearchCV and RandomizedSearchCV
import scipy
import scipy.stats               # For reciprocal distribution
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)  # Ignore sklearn deprecation warnings
warnings.filterwarnings("ignore", category=FutureWarning)       # Ignore sklearn deprecation warnings
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, RationalQuadratic, ExpSineSquared
from sklearn.impute import SimpleImputer

In [3]:
data = np.loadtxt('breast-cancer-wisconsin.data',dtype = 'str', delimiter=',')
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
data[data == '?'] = np.nan
data = imp.fit_transform(data)
data = data.astype(np.int32)
X_train, X_test, y_train, y_test = train_test_split(
     data[:,:10], data[:,10], test_size=0.2, random_state=0)

In [4]:
#k-Nearest neighbours classification
knn_model = sklearn.neighbors.KNeighborsClassifier(n_jobs=-1)
param_grid = {'n_neighbors':(np.arange(2,52, 5))}

mdls = sklearn.model_selection.GridSearchCV(knn_model, param_grid, verbose=1,cv=3).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=17, p=2,
                     weights='uniform')


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    2.0s finished


0.5642857142857143

In [5]:
#SVM
svm_model = sklearn.svm.SVC(kernel = 'linear')
svm_model.fit(X_train, y_train);
y_pred = svm_model.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

0.6785714285714286

In [6]:
#Decision tree classification
DTC_model = sklearn.tree.DecisionTreeClassifier(random_state=0)
Max_features = ['auto', 'sqrt', 'log2']
Max_depths = np.arange(1,34,2)
Min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
Min_samples_leafs = np.linspace(0.01, 0.05, 5, endpoint=True)
param_grid = {'max_features': Max_features, 'max_depth': Max_depths,  'min_samples_split': Min_samples_splits, 'min_samples_leaf': Min_samples_leafs}

mdls = sklearn.model_selection.GridSearchCV(DTC_model, param_grid, verbose=0,cv=3).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.01, min_samples_split=0.1,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')


0.9571428571428572

In [7]:
#Random forest classification
RFC_model = sklearn.ensemble.RandomForestClassifier(random_state=0)
Estimators = np.arange(100,105,1)
Max_features = ['auto', 'sqrt', 'log2']
param_grid = {'n_estimators': Estimators,'max_features': Max_features, }

mdls = sklearn.model_selection.GridSearchCV(RFC_model, param_grid, verbose=0,cv=3).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)


0.9785714285714285

In [8]:
#AdaBoost classification
ABC_model = sklearn.ensemble.AdaBoostClassifier(random_state=0)
Estimators = np.arange(50,100,10)
Learning_rates = [0.01,0.05,0.1,0.3,1]
algorithm = ['SAMME', 'SAMME.R']
param_grid = {'n_estimators': Estimators, 'learning_rate': Learning_rates, 'algorithm': algorithm}

mdls = sklearn.model_selection.GridSearchCV(ABC_model, param_grid, verbose=0,cv=3).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=70, random_state=0)


0.9428571428571428

In [9]:
#Logistic regression (for classification)
#Fit_intercept is set to True because we don't have bias
# logistic_model = sklearn.linear_model.LogisticRegression(fit_intercept=True)
logistic_model = sklearn.linear_model.LogisticRegression(n_jobs=-1)
param_grid = { "fit_intercept":[True], "solver":['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
             "max_iter":np.arange(100,400, 100)}


mdls = sklearn.model_selection.GridSearchCV(logistic_model, param_grid, verbose=0,cv=3).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)


0.9714285714285714

In [10]:
#Gaussian naive Bayes classification

zero_prob = y_train[y_train == 2].shape[0]/y_train.shape[0]
one_prob = 1 - zero_prob
prob = np.array([zero_prob,one_prob])
GNB_model = sklearn.naive_bayes.GaussianNB(priors = prob)
GNB_model.fit(X_train, y_train)
# mdls = sklearn.model_selection.GridSearchCV(GNB_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
# print(mdls.best_estimator_)

y_pred = GNB_model.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

0.7428571428571429

In [None]:
#Neural network classification
NNC_model = sklearn.neural_network.MLPClassifier()
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
learn_rate = [0.001, 0.01, 0.1]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8]
neurons = [1, 5, 10, 15, 20, 25, 30] 
activation = ['identity', 'logistic', 'tanh', 'relu']
alpha = [0.0001,0.002]
param_grid = {'batch_size':batch_size,  'momentum':momentum, 
              'activation' : activation, 'alpha':alpha }

mdls = sklearn.model_selection.GridSearchCV(NNC_model, param_grid, verbose=0,cv=3).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)





















In [None]:
# #SVM classifier
# svm_model = sklearn.svm.SVC()
# Kernels = ['linear']
# Epsilons = [0.1,0.2,0.5,0.3]
# Cs = [0.001, 0.01, 0.1, 1, 10]
# Gammas = [0.001, 0.01, 0.1, 1]
# param_grid = {'C': Cs, 'gamma' : Gammas, 'kernel': Kernels}

# mdls = sklearn.model_selection.GridSearchCV(svm_model, param_grid, verbose=1,cv=3).fit(X_train, y_train)
# print(mdls.best_estimator_)

# y_pred = mdls.best_estimator_.predict(X_test)
# sklearn.metrics.accuracy_score(y_test, y_pred)