# Setting

In [210]:
import warnings
warnings.filterwarnings(action='ignore')

In [273]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mglearn

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.neural_network import MLPClassifier

In [2]:
# cancer_data에 breast_cancer 데이터 담아주기
cancer_data = load_breast_cancer()
# 데이터 설명 확인
print(cancer_data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

# Preprocessing 

In [101]:
# 속성들은 cancer_data.data에, 악성/양성 데이터는 cancer_data.target에 저장
X = pd.DataFrame(cancer_data.data, columns = cancer_data.feature_names)
y = pd.DataFrame(cancer_data.target, columns = ['class'])

# train / test set 나눠주기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =5)
# 학습 중 경고메시지 나오지 않게 하기
y_yrain = y_train.values.ravel()

X_train.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
306,13.2,15.82,84.07,537.3,0.08511,0.05251,0.001461,0.003261,0.1632,0.05894,...,14.41,20.45,92.0,636.9,0.1128,0.1346,0.0112,0.025,0.2651,0.08385
410,11.36,17.57,72.49,399.8,0.08858,0.05313,0.02783,0.021,0.1601,0.05913,...,13.05,36.32,85.07,521.3,0.1453,0.1622,0.1811,0.08698,0.2973,0.07745
197,18.08,21.84,117.4,1024.0,0.07371,0.08642,0.1103,0.05778,0.177,0.0534,...,19.76,24.7,129.1,1228.0,0.08822,0.1963,0.2535,0.09181,0.2369,0.06558
376,10.57,20.22,70.15,338.3,0.09073,0.166,0.228,0.05941,0.2188,0.0845,...,10.85,22.82,76.51,351.9,0.1143,0.3619,0.603,0.1465,0.2597,0.12
244,19.4,23.5,129.1,1155.0,0.1027,0.1558,0.2049,0.08886,0.1978,0.06,...,21.65,30.53,144.9,1417.0,0.1463,0.2968,0.3458,0.1564,0.292,0.07614


# Check accuracy of breast cancer diagnosis via Classification model

## DecisionTree 

In [253]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)

tree_accuracy = accuracy_score(y_test, y_pred_tree)
print('DecisionTree accuracy(no_depth): ',tree_accuracy)

DecisionTree accuracy(no_depth):  0.9473684210526315


###  Bagging

In [254]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred_bag = bag_clf.predict(X_test)

bag_accuracy = accuracy_score(y_test, y_pred_bag)
print('Bagging accuracy: ',bag_accuracy)

Bagging accuracy:  0.9649122807017544


### RandomForest

In [255]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)
rnd_clf.fit(X_train, y_train)
y_pred_rnd = rnd_clf.predict(X_test)

rnd_accuracy = accuracy_score(y_test, y_pred_rnd)
print('Randomforest accuracy: ',rnd_accuracy)

Randomforest accuracy:  0.9649122807017544


### Adaboost 

In [256]:
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)
y_pred_ada = ada_clf.predict(X_test)

ada_accuracy = accuracy_score(y_test, y_pred_ada)
print('Adaboost accuracy: ',ada_accuracy)

Adaboost accuracy:  0.9736842105263158


### DecisionTree + GridsearchCV

In [296]:
params = {'max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30]}

tree_grid = GridSearchCV(tree_clf, param_grid=params, scoring='accuracy', cv=5)
tree_grid.fit(X_train, y_train)
y_pred_tree_grid = tree_grid.predict(X_test)
tree_grid_accuracy = accuracy_score(y_test, y_pred_tree_grid)

print('DecisionTree GridSearch CV accuracy: ',tree_grid_accuracy)

DecisionTree GridSearch CV accuracy:  0.956140350877193


## SVM 

In [258]:
svm_clf = svm.SVC(C=0.1,gamma='scale')
svm_clf.fit(X_train,y_train)
y_pred_svm = svm_clf.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print('SVM accuracy: ',svm_accuracy)

SVM accuracy:  0.9035087719298246


### SVM + K-fold CV

In [259]:
kfold = KFold(n_splits=5, shuffle=True,random_state=42)
svm_kfold2 = cross_val_score(svm_clf, X_train, y_train, cv= kfold, scoring='accuracy')

print('SVM K-fold CV accuracy: ', sum(svm_kfold)/5)

SVM K-fold CV accuracy:  0.8857142857142858


### SVM + StratifiedShuffle CV

In [260]:
stratified_shuffle_split = StratifiedShuffleSplit(train_size=0.7, test_size=0.3, n_splits=5, random_state=42)
svm_str = cross_val_score(svm_clf, X_train, y_train, cv= stratified_shuffle_split, scoring='accuracy')

print('SVM StratifiedShuffleSplit CV accuracy: ', sum(svm_str)/5)

SVM StratifiedShuffleSplit CV accuracy:  0.8715328467153285


### SVM + GridSearchCV

In [292]:
param_grid={'C':[1,10,100,0.1,0.01],'gamma':[1,10,0.1,0.01,0.001,0.0001]}
svm_grid=GridSearchCV(svm_clf,param_grid, cv=5)
svm_grid.fit(X_train, y_train)
y_pred_svm_grid = svm_grid.predict(X_test)
svm_grid_accuracy = accuracy_score(y_test, y_pred_svm_grid)

print('SVM GridSearch CV accuracy: ', svm_grid_accuracy)

SVM GridSearch CV accuracy:  0.9473684210526315


## LogisticRegression

In [251]:
lrn_clf = LogisticRegression()
lrn_clf.fit(X_train, y_train)
y_pred_lrn = lrn_clf.predict(X_test)

lrn_accuracy = accuracy_score(y_test, y_pred_lrn)
print('LogisticRegression accuracy: ', lrn_accuracy)

LogisticRegression accuracy:  0.956140350877193


### LogisticRegression + K-fold CV

In [250]:
kfold = KFold(n_splits=5, shuffle=True,random_state=42)
lrn_kfold = cross_val_score(lrn_clf, X_train, y_train, cv= kfold, scoring='accuracy')

print('LogisticRegression K-fold CV accuracy: ', sum(lrn_kfold)/5)

LogisticRegression K-fold CV accuracy:  0.9428571428571428


### LogisticRegression + StratifiedShuffle CV

In [249]:
stratified_shuffle_split = StratifiedShuffleSplit(train_size=0.7, test_size=0.3, n_splits=5, random_state=42)
lrn_str = cross_val_score(lrn_clf, X_train, y_train, cv= stratified_shuffle_split, scoring='accuracy')

print('LogisticRegression StratifiedShuffleSplit CV accuracy: ', sum(lrn_str)/5)

LogisticRegression StratifiedShuffleSplit CV accuracy:  0.9518248175182482


### LogisticRegression + GridSearchCV

In [248]:
param_grid={'C':np.logspace(-3,3,7), 'penalty':['l1','l2']}

lrn_grid=GridSearchCV(LogisticRegression(), param_grid=param_grid,scoring='accuracy',cv=5)
lrn_grid.fit(X_train,y_train)
y_pred_lrn_grid =lrn_grid.predict(X_test)
lrn_grid_accuracy = accuracy_score(y_test, y_pred_lrn_grid)

print('LogisticRegression GridSearch CV accuracy: ', lrn_grid_accuracy)

LogisticRegression GridSearch CV accuracy:  0.9649122807017544


## KNN

In [247]:
knn_clf = KNeighborsClassifier(weights='distance')
knn_clf.fit(X_train, y_train)
y_pred_knn = knn_clf.predict(X_test)

knn_accuracy = accuracy_score(y_test, y_pred_knn)
print('KNN accuracy: ', knn_accuracy)

KNN accuracy:  0.9385964912280702


### KNN+ K-fold CV

In [246]:
kfold = KFold(n_splits=5, shuffle=True,random_state=42)
knn_kfold = cross_val_score(knn_clf, X_train, y_train, cv= kfold, scoring='accuracy')

print('KNN K-fold CV accuracy: ', sum(knn_kfold)/5)

KNN K-fold CV accuracy:  0.9252747252747253


### KNN+ StratifiedShuffle CV

In [245]:
stratified_shuffle_split = StratifiedShuffleSplit(train_size=0.7, test_size=0.3, n_splits=5, random_state=42)
knn_str = cross_val_score(knn_clf, X_train, y_train, cv= stratified_shuffle_split, scoring='accuracy')

print('KNN StratifiedShuffleSplit CV accuracy: ', sum(knn_str)/5)

KNN StratifiedShuffleSplit CV accuracy:  0.927007299270073


### KNN+ GridSearchCV

In [244]:
param_grid = {'n_neighbors':(1, 3, 10), 'weights':('uniform', 'distance')}

knn_grid=GridSearchCV(KNeighborsClassifier(), param_grid=param_grid,scoring='accuracy',cv=5)
knn_grid.fit(X_train,y_train)
y_pred_knn_grid =lrn_grid.predict(X_test)
lrn_grid_accuracy = accuracy_score(y_test, y_pred_lrn_grid)

print('LogisticRegression GridSearch CV accuracy: ', lrn_grid_accuracy)

LogisticRegression GridSearch CV accuracy:  0.9649122807017544


## MLP

In [267]:
mlp_clf = MLPClassifier(random_state= 42)
mlp_clf.fit(X_train, y_train)
y_pred_mlp = mlp_clf.predict(X_test)

mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
print('MLP accuracy: ', mlp_accuracy)

MLP accuracy:  0.9035087719298246


### MLP with more layers (layers=1000)

In [262]:
mlp1000_clf = MLPClassifier(max_iter=1000,random_state= 0)
mlp1000_clf.fit(X_train, y_train)
y_pred_mlp1000 = mlp1000_clf.predict(X_test)

mlp1000_accuracy = accuracy_score(y_test, y_pred_mlp1000)
print('MLP accuracy: ', mlp1000_accuracy)

MLP accuracy:  0.9473684210526315


## Normalization

In [276]:
#정규분포 분포 만들기
X_mean = X_train.mean(axis=0)
X_std = X_train.std(axis=0)
X_train_norm = (X_train - X_mean) / X_std
X_test_norm = (X_test - X_mean) / X_std

### Models with no CV Normalization

In [303]:
models = {    
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42),
    'Bagging': BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=500,
                                           max_samples=100, bootstrap=True, n_jobs=-1, random_state=42),
    'AdaBoost': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", 
                                             learning_rate=0.5, random_state=42),
    'SVM': svm.SVC(gamma='scale'),
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=2000),
    'KNN': KNeighborsClassifier(weights='distance'),
    'MLP':MLPClassifier(random_state= 42),
    'MLP_1000':MLPClassifier(max_iter=1000,random_state= 0)
    }

for name, model in models.items():
    scores = model.fit(X_train_norm, y_train)
    y_pred_model = model.predict(X_test_norm)
    print(name,'Norm accuracy:', accuracy_score(y_test, y_pred_model))

DecisionTree Norm accuracy: 0.9473684210526315
RandomForest Norm accuracy: 0.9649122807017544
Bagging Norm accuracy: 0.9649122807017544
AdaBoost Norm accuracy: 0.9736842105263158
SVM Norm accuracy: 0.9649122807017544
LogisticRegression Norm accuracy: 0.9736842105263158
KNN Norm accuracy: 0.9649122807017544
MLP Norm accuracy: 0.9824561403508771
MLP_1000 Norm accuracy: 0.9824561403508771


### Models with K-fold CV Normalization

In [300]:
kfold = KFold(n_splits=5, shuffle=True,random_state=42)

svm_kfold_norm = cross_val_score(svm_clf, X_train_norm, y_train, cv= kfold, scoring='accuracy')
knn_kfold_norm = cross_val_score(knn_clf, X_train_norm, y_train, cv= kfold, scoring='accuracy')
lrn_kfold_norm = cross_val_score(lrn_clf, X_train_norm, y_train, cv= kfold, scoring='accuracy')

print('SVM kfold CV Norm accuracy : ', sum(svm_kfold_norm)/5)
print('LogisticRegression kfold CV Norm accuracy : ', sum(lrn_kfold_norm)/5)
print('KNN kfold CV Norm accuracy : ', sum(knn_kfold_norm)/5)

SVM kfold CV Norm accuracy :  0.9384615384615383
LogisticRegression kfold CV Norm accuracy :  0.9758241758241757
KNN kfold CV Norm accuracy :  0.9736263736263737


### Models with StratifiedShuffle CV Normalization

In [301]:
stratified_shuffle_split = StratifiedShuffleSplit(train_size=0.7, test_size=0.3, n_splits=5, random_state=42)

svm_str_norm = cross_val_score(svm_clf, X_train_norm, y_train, cv= stratified_shuffle_split, scoring='accuracy')
lrn_str_norm = cross_val_score(lrn_clf, X_train_norm, y_train, cv= stratified_shuffle_split, scoring='accuracy')
knn_str_norm = cross_val_score(knn_clf, X_train_norm, y_train, cv= stratified_shuffle_split, scoring='accuracy')

print('SVM StratifiedShuffle CV Norm accuracy : ', sum(svm_str_norm)/5)
print('LogisticRegression StratifiedShuffle CV Norm accuracy : ', sum(lrn_str_norm)/5)
print('KNN StratifiedShuffle CV Norm accuracy : ', sum(knn_str_norm)/5)

SVM StratifiedShuffle CV Norm accuracy :  0.9372262773722626
LogisticRegression StratifiedShuffle CV Norm accuracy :  0.983941605839416
KNN StratifiedShuffle CV Norm accuracy :  0.9737226277372264


### Models with GridSearch CV Normalization

In [302]:
tree_param = {'max_depth' : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30]}
tree_grid_norm = GridSearchCV(tree_clf, param_grid=tree_param, scoring='accuracy', cv=5)
tree_grid_norm.fit(X_train_norm, y_train)
y_pred_tree_grid_norm= tree_grid_norm.predict(X_test_norm)
print('Tree GridSearch CV Norm accuracy :', accuracy_score(y_test, y_pred_tree_grid_norm))

svm_param={'C':[1,10,100,0.1,0.01],'gamma':[1,10,0.1,0.01,0.001,0.0001]}
svm_grid_norm=GridSearchCV(svm_clf,svm_param, cv=5)
svm_grid_norm.fit(X_train_norm, y_train)
y_pred_svm_grid_norm = svm_grid_norm.predict(X_test_norm)
print('SVM GridSearch CV Norm accuracy :', accuracy_score(y_test, y_pred_svm_grid_norm))

lrn_param={'C':np.logspace(-3,3,7), 'penalty':['l1','l2']}
lrn_grid_norm=GridSearchCV(LogisticRegression(), lrn_param,scoring='accuracy',cv=5)
lrn_grid_norm.fit(X_train_norm,y_train)
y_pred_lrn_grid_norm =lrn_grid_norm.predict(X_test_norm)
print('LogisticRegression GridSearch CV Norm accuracy :', accuracy_score(y_test, y_pred_lrn_grid_norm))

knn_param = {'n_neighbors':(1, 3, 10), 'weights':('uniform', 'distance')}
knn_grid_norm=GridSearchCV(KNeighborsClassifier(), knn_param,scoring='accuracy',cv=5)
knn_grid_norm.fit(X_train_norm,y_train)
y_pred_knn_grid_norm =knn_grid_norm.predict(X_test_norm)
print('KNN GridSearch CV Norm accuracy :', accuracy_score(y_test, y_pred_knn_grid_norm))

Tree GridSearch CV Norm accuracy : 0.956140350877193
SVM GridSearch CV Norm accuracy : 0.9649122807017544
LogisticRegression GridSearch CV Norm accuracy : 0.9736842105263158
KNN GridSearch CV Norm accuracy : 0.9649122807017544
