# breast cancer dataset

In [1]:
from sklearn.model_selection import train_test_split,cross_val_predict,cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler 

mm = MinMaxScaler()

df = pd.read_csv('breast_cancer.csv', index_col = 'id')
# print(df.head())
df['diagnosis'] = df['diagnosis'].map({'M': 0, 'B': 1})
y = df['diagnosis']
del df['Unnamed: 32']
del df['diagnosis']
X = df.copy()
X = mm.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75)




# 1. Decision Tree Classifier

## Training data size = 0.75 and hold out method

In [2]:
model = DecisionTreeClassifier(criterion = "gini")
model.fit(X_train, y_train)

y_predict = model.predict(X_test)

dtc_hom = accuracy_score(y_test, y_predict) * 100

cm = pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns = ['Predicted BENIGN', 'Predicted MALIGNANT'],
    index = ['True BENIGN', 'True MALIGNANT']
)

print('Accuracy Score :',dtc_hom) 
cm

Accuracy Score : 91.6083916083916


Unnamed: 0,Predicted BENIGN,Predicted MALIGNANT
True BENIGN,52,3
True MALIGNANT,9,79


## Training set = 0.75 and random subsampling

In [3]:
model = DecisionTreeClassifier(criterion = "gini",random_state = 1)
model.fit(X_train, y_train)

acc = 0
for i in range(0,15):
    y_predict = model.predict(X_test)
    acc = acc + accuracy_score(y_test,y_predict) * 100

dtc_rs = acc / 15

cm = pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns = ['Predicted BENIGN', 'Predicted MALIGNANT'],
    index = ['True BENIGN', 'True MALIGNANT']
)

print('Accuracy Score :',dtc_rs) 
cm

Accuracy Score : 90.20979020979023


Unnamed: 0,Predicted BENIGN,Predicted MALIGNANT
True BENIGN,49,6
True MALIGNANT,8,80


## Training set = 0.75 and cross-validation(k-fold)

In [4]:
model = DecisionTreeClassifier(criterion = "gini",random_state = 1)
y_pred = cross_val_predict(model,X,y,cv = 4)

accuracy_score(y, y_pred)

dtc_cv = accuracy_score(y_test, y_predict) * 100

cm = pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns = ['Predicted BENIGN', 'Predicted MALIGNANT'],
    index = ['True BENIGN', 'True MALIGNANT']
)

print('Accuracy Score :',dtc_cv) 
cm

Accuracy Score : 90.20979020979021


Unnamed: 0,Predicted BENIGN,Predicted MALIGNANT
True BENIGN,49,6
True MALIGNANT,8,80


# 2. Naive Bayes Classifier

## Training data size = 0.75 and hold out method

In [5]:
model = GaussianNB()
model.fit(X_train,y_train)

y_predict = model.predict(X_test)

nb_hom = accuracy_score(y_test,y_predict) * 100

cm = pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns = ['Predicted BENIGN', 'Predicted MALIGNANT'],
    index = ['True BENIGN', 'True MALIGNANT']
)

print('Accuracy Score :',nb_hom) 
cm

Accuracy Score : 93.00699300699301


Unnamed: 0,Predicted BENIGN,Predicted MALIGNANT
True BENIGN,49,6
True MALIGNANT,4,84


## Training data size = 0.75 and random subsampling


In [6]:
model = GaussianNB()
model.fit(X_train, y_train)

acc = 0
for i in range(0,15):
    y_predict = model.predict(X_test)
    acc = acc + accuracy_score(y_test,y_predict) * 100

nb_rs = acc / 15

cm = pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns = ['Predicted BENIGN', 'Predicted MALIGNANT'],
    index = ['True BENIGN', 'True MALIGNANT']
)

print('Accuracy Score :',nb_rs) 
cm

Accuracy Score : 93.00699300699304


Unnamed: 0,Predicted BENIGN,Predicted MALIGNANT
True BENIGN,49,6
True MALIGNANT,4,84


# Training data size = 0.75 and cross validation(k-fold)


In [7]:
model = GaussianNB()
y_pred = cross_val_predict(model,X,y,cv = 4)

accuracy_score(y, y_pred)

nb_cv = accuracy_score(y_test, y_predict) * 100

cm = pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns = ['Predicted BENIGN', 'Predicted MALIGNANT'],
    index = ['True BENIGN', 'True MALIGNANT']
)

print('Accuracy Score :',nb_cv) 
cm

Accuracy Score : 93.00699300699301


Unnamed: 0,Predicted BENIGN,Predicted MALIGNANT
True BENIGN,49,6
True MALIGNANT,4,84


# 3. K-nearest Neighbours classifier

## 1.Training data size = 0.75,hold out method

In [8]:
model = KNeighborsClassifier()
model.fit(X_train,y_train)
y_predict = model.predict(X_test)

knn_hom = accuracy_score(y_test,y_predict) * 100

cm = pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns = ['Predicted BENIGN', 'Predicted MALIGNANT'],
    index = ['True BENIGN', 'True MALIGNANT']
)

print('Accuracy Score :',knn_hom) 
cm

Accuracy Score : 96.5034965034965


Unnamed: 0,Predicted BENIGN,Predicted MALIGNANT
True BENIGN,52,3
True MALIGNANT,2,86


## 2.Training data set = 0.75,random subsampling

In [9]:
model = KNeighborsClassifier(n_neighbors = 7)
model.fit(X_train,y_train)

acc = 0
for i in range(0,15):
    y_predict = model.predict(X_test)
    acc = acc + accuracy_score(y_test,y_predict) * 100

knn_rs = acc / 15

cm = pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns = ['Predicted BENIGN', 'Predicted MALIGNANT'],
    index = ['True BENIGN', 'True MALIGNANT']
)

print('Accuracy Score :',knn_rs) 
cm

Accuracy Score : 97.20279720279721


Unnamed: 0,Predicted BENIGN,Predicted MALIGNANT
True BENIGN,53,2
True MALIGNANT,2,86


## 3. Training data set = 0.75,cross validation(k - fold)

In [10]:
model = KNeighborsClassifier()
y_pred = cross_val_predict(model,X,y,cv = 4)

accuracy_score(y, y_pred)

knn_cv = accuracy_score(y_test, y_predict) * 100

cm = pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns = ['Predicted BENIGN', 'Predicted MALIGNANT'],
    index = ['True BENIGN', 'True MALIGNANT']
)

print('Accuracy Score :',knn_cv) 
cm

Accuracy Score : 97.2027972027972


Unnamed: 0,Predicted BENIGN,Predicted MALIGNANT
True BENIGN,53,2
True MALIGNANT,2,86


In [11]:
method1 = pd.Series({'Decision tree': dtc_hom,
                        'Naive Bayes': nb_hom,
                        'KNN': knn_hom})
method2 = pd.Series({'Decision tree': dtc_rs,
                        'Naive Bayes': nb_rs,
                        'KNN': knn_rs})
method3 = pd.Series({'Decision tree': dtc_cv,
                        'Naive Bayes': nb_cv,
                        'KNN': knn_cv})
purchase = pd.DataFrame([method1, method2, method3], index = ['Hold Out Method', 'Random Subsampling', 'Cross validation'])
purchase

Unnamed: 0,Decision tree,Naive Bayes,KNN
Hold Out Method,91.608392,93.006993,96.503497
Random Subsampling,90.20979,93.006993,97.202797
Cross validation,90.20979,93.006993,97.202797
