Breast Cancer Wisconsin Classification

In [2]:
# Import all libraries
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import pandas as pd
import numpy as np

In [3]:
# Load Breast Cancer Wisconsin Dataset
# We have known that the '?' represents NA, so we replace all the '?' to NA
df = pd.read_csv('breast-cancer-wisconsin.csv',na_values=["?"])
print(df.head())
print(df.info())

   Clump Thickness  Uniformity of Cell Size  Uniformity of Cell Shape  \
0                5                        1                         1   
1                5                        4                         4   
2                3                        1                         1   
3                6                        8                         8   
4                4                        1                         1   

   Marginal Adhesion   Single Epithelial Cell Size  Bare Nuclei  \
0                  1                             2          1.0   
1                  5                             7         10.0   
2                  1                             2          2.0   
3                  1                             3          4.0   
4                  3                             2          1.0   

   Bland Chromatin  Normal Nucleoli  Mitoses   class  
0                3                1        1  class1  
1                3                2        1  cl

In [4]:
# Pre-process dataset
# Replace missing attibute values with mean value of the column
replace_na_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
replace_na_mean.fit(df.drop(['class'], axis = 1))
df_replaced_na_array = replace_na_mean.transform(df.drop(['class'], axis = 1))

# Normalization
# Each attribute should be performed using a min-max scaler to normalise the values between [0,1]
minmaxscaler = MinMaxScaler()
minmaxscaler.fit(df_replaced_na_array)
scaled_df = minmaxscaler.transform(df_replaced_na_array)
# Create a new dataframe
df_cleaned = pd.DataFrame(scaled_df)

# Change the class values
# The classes class1 and class2 should be changed to 0 and 1 respectively
array_class = np.array(df['class'])
new_class = []
for i in array_class:
  if i == 'class1':
    new_class.append(0)
  else:
    new_class.append(1)
df_cleaned['class'] = new_class # Add 0 and 1 back to cleaned dataset
df_cleaned.columns=df.columns # Repalce the names of column
print('After replacing NA value, the total NA values in the dataframe are ' + str(df_cleaned.isnull().sum().sum())+'.\n')
print(df_cleaned.head(10))
print(df_cleaned.info())

After replacing NA value, the total NA values in the dataframe are 0.

   Clump Thickness  Uniformity of Cell Size  Uniformity of Cell Shape  \
0         0.444444                 0.000000                  0.000000   
1         0.444444                 0.333333                  0.333333   
2         0.222222                 0.000000                  0.000000   
3         0.555556                 0.777778                  0.777778   
4         0.333333                 0.000000                  0.000000   
5         0.777778                 1.000000                  1.000000   
6         0.000000                 0.000000                  0.000000   
7         0.111111                 0.000000                  0.111111   
8         0.111111                 0.000000                  0.000000   
9         0.333333                 0.111111                  0.000000   

   Marginal Adhesion   Single Epithelial Cell Size  Bare Nuclei  \
0           0.000000                      0.111111     0.0

In [5]:
# Print first ten rows of pre-processed dataset to 4 decimal places
pd.options.display.float_format = '{:.4f}'.format
df_cleaned.head(10)

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,class
0,0.4444,0.0,0.0,0.0,0.1111,0.0,0.2222,0.0,0.0,0
1,0.4444,0.3333,0.3333,0.4444,0.6667,1.0,0.2222,0.1111,0.0,0
2,0.2222,0.0,0.0,0.0,0.1111,0.1111,0.2222,0.0,0.0,0
3,0.5556,0.7778,0.7778,0.0,0.2222,0.3333,0.2222,0.6667,0.0,0
4,0.3333,0.0,0.0,0.2222,0.1111,0.0,0.2222,0.0,0.0,0
5,0.7778,1.0,1.0,0.7778,0.6667,1.0,0.8889,0.6667,0.0,1
6,0.0,0.0,0.0,0.0,0.1111,1.0,0.2222,0.0,0.0,0
7,0.1111,0.0,0.1111,0.0,0.1111,0.0,0.2222,0.0,0.0,0
8,0.1111,0.0,0.0,0.0,0.1111,0.0,0.0,0.0,0.4444,0
9,0.3333,0.1111,0.0,0.0,0.1111,0.0,0.1111,0.0,0.0,0


In [6]:
# Print the values of the first ten rows
output = ''
for rows in range(10):
  for columns in range(0, df_cleaned.shape[1]):
    if columns != df_cleaned.shape[1]-1:
      output += '{:.4f}'.format(df_cleaned.iloc[rows, columns]) + ','
    else:
      output += str(int(df_cleaned.iloc[rows, columns]))
  print(output)
  output = ''

0.4444,0.0000,0.0000,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.4444,0.3333,0.3333,0.4444,0.6667,1.0000,0.2222,0.1111,0.0000,0
0.2222,0.0000,0.0000,0.0000,0.1111,0.1111,0.2222,0.0000,0.0000,0
0.5556,0.7778,0.7778,0.0000,0.2222,0.3333,0.2222,0.6667,0.0000,0
0.3333,0.0000,0.0000,0.2222,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.7778,1.0000,1.0000,0.7778,0.6667,1.0000,0.8889,0.6667,0.0000,1
0.0000,0.0000,0.0000,0.0000,0.1111,1.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.1111,0.0000,0.1111,0.0000,0.2222,0.0000,0.0000,0
0.1111,0.0000,0.0000,0.0000,0.1111,0.0000,0.0000,0.0000,0.4444,0
0.3333,0.1111,0.0000,0.0000,0.1111,0.0000,0.1111,0.0000,0.0000,0


In [7]:
# Divide data into labels and features, and infer X and y
# Let X contains the attribute values and y contains the class
X = df_cleaned.drop(['class'], axis = 1)
y = df_cleaned['class']

### Part 1: Cross validation without parameter tuning

In [8]:
## Setting the 10 fold stratified cross-validation
cvKFold=StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# The stratified folds from cvKFold should be provided to the classifiers

In [9]:
# K-Nearest Neighbour
def kNNClassifier(X, y, k):
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=cvKFold)
    return scores.mean()

In [10]:
# Logistic Regression
def logregClassifier(X, y):
    logreg = LogisticRegression()
    scores = cross_val_score(logreg, X, y, cv=cvKFold)
    return scores.mean()

In [11]:
# Naïve Bayes
# We will create a NB for this data which is a numeric dataset, so we will use the GaussianNB class to create the classifier
def nbClassifier(X, y):
    nb = GaussianNB()
    scores = cross_val_score(nb, X, y, cv=cvKFold)
    return scores.mean()

In [12]:
# Decision Tree
def dtClassifier(X, y):
    tree = DecisionTreeClassifier(criterion = 'entropy')
    scores = cross_val_score(tree, X, y, cv=cvKFold)
    return scores.mean()

In [13]:
# Ensembles: Bagging, Ada Boost and Gradient Boosting
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    bag = BaggingClassifier(DecisionTreeClassifier(max_depth=max_depth,criterion = 'entropy'),n_estimators = n_estimators,max_samples = max_samples)
    scores = cross_val_score(bag, X, y, cv=cvKFold)
    return scores.mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=max_depth,criterion = 'entropy'),n_estimators = n_estimators,learning_rate = learning_rate)
    scores = cross_val_score(ada, X, y, cv=cvKFold)
    return scores.mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    gb = GradientBoostingClassifier(n_estimators = n_estimators,learning_rate = learning_rate)
    scores = cross_val_score(gb, X, y, cv=cvKFold)
    return scores.mean()

### Part 1 Results

In [14]:
# Parameters for Part 1:
#KNN
k=3

#Bagging
bag_n_estimators = 50
bag_max_samples = 100
bag_max_depth = 5

#AdaBoost
ada_n_estimators = 50
ada_learning_rate = 0.5
ada_bag_max_depth = 5

#GB
gb_n_estimators = 50
gb_learning_rate = 0.5

# Print results for each classifier in part 1 to 4 decimal places here:
knn_score = kNNClassifier(X, y, k)
logreg_score = logregClassifier(X, y)
nb_score = nbClassifier(X, y)
tree_score = dtClassifier(X, y)
bagging_score = bagDTClassifier(X, y, bag_n_estimators, bag_max_samples, bag_max_depth)
ada_score = adaDTClassifier(X, y, ada_n_estimators, ada_learning_rate, ada_bag_max_depth)
gb_score = gbClassifier(X, y, gb_n_estimators, gb_learning_rate)
print("kNN average cross-validation accuracy: {:.4f}".format(knn_score))
print("LR average cross-validation accuracy: {:.4f}".format(logreg_score))
print("NB average cross-validation accuracy: {:.4f}".format(nb_score))
print("DT average cross-validation accuracy: {:.4f}".format(tree_score))
print("Bagging average cross-validation accuracy: {:.4f}".format(bagging_score))
print("AdaBoost average cross-validation accuracy: {:.4f}".format(ada_score))
print("GB average cross-validation accuracy: {:.4f}".format(gb_score))

kNN average cross-validation accuracy: 0.9642
LR average cross-validation accuracy: 0.9642
NB average cross-validation accuracy: 0.9585
DT average cross-validation accuracy: 0.9385
Bagging average cross-validation accuracy: 0.9585
AdaBoost average cross-validation accuracy: 0.9571
GB average cross-validation accuracy: 0.9642


### Part 2: Cross validation with parameter tuning

In [15]:
# Linear SVM
# You should use SVC from sklearn.svm with kernel set to 'linear'
C = {0.001, 0.01, 0.1, 1, 10, 100} 
gamma = {0.001, 0.01, 0.1, 1, 10, 100} 
def bestLinClassifier(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    pamarms = [{
    'C':list(C),
    'gamma':list(gamma)
    }]
    grid_search = GridSearchCV(SVC(kernel = 'linear'),pamarms,cv = cvKFold, return_train_score=True)
    grid_search.fit(X_train, y_train)
    test_set_score = grid_search.score(X_test, y_test)
    cross_validation_score = grid_search.best_score_
    best_C = grid_search.best_params_['C']
    best_gamma = grid_search.best_params_['gamma']
    return  test_set_score,cross_validation_score,best_C,best_gamma

In [16]:
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to ‘sqrt’.
n_estimators = {10, 20, 30, 50, 100}
max_leaf_nodes = {4, 10, 16, 20, 30}
def bestRFClassifier(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    pamarms = [{
    'n_estimators':list(n_estimators),
    'max_leaf_nodes':list(max_leaf_nodes)
    }]
    grid_search = GridSearchCV(RandomForestClassifier(criterion = 'entropy',max_features = 'sqrt'),pamarms,cv = cvKFold, return_train_score=True)
    grid_search.fit(X_train, y_train)
    test_set_score = grid_search.score(X_test, y_test)
    cross_validation_score = grid_search.best_score_
    best_n = grid_search.best_params_['n_estimators']
    best_leaf_nodes = grid_search.best_params_['max_leaf_nodes']
    return  test_set_score,cross_validation_score,best_n,best_leaf_nodes

### Part 2 Results

In [17]:
# Perform Grid Search with 10-fold Stratified Cross Validation (GridSearchCV in sklearn). 
# The stratified folds from cvKFold should be provided to GridSearchV

# This should include using train_test_split from sklearn.model_selection with stratification and random_state=0
# Print results for each classifier here. All results should be printed to 4 decimal places except for
# "n_estimators" and "max_leaf_nodes" which should be printed as integers.
svm_test_set_score,svm_cross_validation_score,svm_best_C,svm_best_gamma = bestLinClassifier(X,y)
print("SVM best C:  {:.4f}".format(svm_best_C))
print("SVM best gamma:  {:.4f}".format(svm_best_gamma))
print("SVM cross-validation accuracy:  {:.4f}".format(svm_cross_validation_score))
print("SVM test set accuracy:  {:.4f}".format(svm_test_set_score))
print('')
RF_test_set_score,RF_cross_validation_score,RF_best_n,RF_best_leaf_nodes = bestRFClassifier(X,y)
print("RF best n_estimators: ",RF_best_n)
print("RF best max_leaf_nodes: ",RF_best_leaf_nodes)
print("RF cross-validation accuracy: {:.4f}".format(RF_cross_validation_score))
print("RF test set accuracy: {:.4f}".format(RF_test_set_score))

SVM best C:  10.0000
SVM best gamma:  0.1000
SVM cross-validation accuracy:  0.9714
SVM test set accuracy:  0.9600

RF best n_estimators:  100
RF best max_leaf_nodes:  20
RF cross-validation accuracy: 0.9734
RF test set accuracy: 0.9657
