# Breast_cancer Classifer
---
#### 주요 모듈 import

In [1]:
from sklearn.datasets import load_breast_cancer # load breast_cancer data
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix # Use confusion_matrix
from sklearn.tree import DecisionTreeClassifier # Use DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier # Use RandomForestClassifier
from sklearn import svm # Use Support Vector Machine(SVM)
from sklearn.linear_model import SGDClassifier # Use Stochastic Gradient Descent Classifier (SGDClassifier)
from sklearn.linear_model import LogisticRegression # Use LogisticRegression
from sklearn.metrics import accuracy_score # 정확도 확인
import pandas as pd

## 데이터 이해하기
---
- Feature Data 지정하기
- Label Data 지정하기
- Target Names 출력해 보기
- 데이터 Describe 해 보기

In [13]:
breast_cancer = load_breast_cancer() #Load Data
print(breast_cancer.keys()) #breast_cancer Data
breast_cancer_data = breast_cancer.data
print("breast_cancer_data.shape: {}".format(breast_cancer_data.shape)) 
breast_cancer_label = breast_cancer.target
print("breast_cancer_label.shape: {}".format(breast_cancer_label.shape))
print("breast_cancer.target_names: {}".format(breast_cancer.target_names)) 
# malignant 0일 때 악성종양 , benign 1일 때 양성 종양
#print(breast_cancer.DESCR) #정보 출력 -> 30개의 속성
print(breast_cancer.feature_names)
breast_cancer_df = pd.DataFrame(data=breast_cancer_data, columns=breast_cancer.feature_names)
breast_cancer_df["label"] = breast_cancer_label
print(breast_cancer_df)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])
breast_cancer_data.shape: (569, 30)
breast_cancer_label.shape: (569,)
breast_cancer.target_names: ['malignant' 'benign']
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474 

## train,test 데이터 분리
---
데이터 분리를 위해 sklearn.model_selection 사용, 추가로 shuffle 속성을 True로 함

In [9]:
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data, 
                                                    breast_cancer_label, 
                                                    test_size=0.2,
                                                    random_state=5,
                                                    shuffle=True)

## 다양한 모델로 학습시켜보기
---
- Decision Tree 사용해 보기
- Random Forest 사용해 보기
- SVM 사용해 보기
- SGD Classifier 사용해 보기
- Logistic Regression 사용해 보기

In [10]:
# Use DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
y_pred_DC = decision_tree.predict(X_test)
DC_accuary = accuracy_score(y_test,y_pred_DC)
print("DecisionTree accuarcy: {}".format(DC_accuary))

# Use RandomForestClassifier
random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)
y_pred_RF = random_forest.predict(X_test)
RF_accuary = accuracy_score(y_test,y_pred_RF)
print("RandomForestClassifier accuarcy: {}".format(RF_accuary))

# Use Support Vector Machine(SVM)
clf = svm.SVC(random_state=32)
clf.fit(X_train, y_train)
y_pred_SVM = clf.predict(X_test)
SVM_accuary = accuracy_score(y_test,y_pred_SVM)
print("Support Vector Machine accuarcy: {}".format(SVM_accuary))

# Use Stochastic Gradient Descent Classifier (SGDClassifier)
clf = SGDClassifier(loss="perceptron").fit(X_train, y_train)
y_pred_SGDC = clf.predict(X_test)
SGDC_accuary = accuracy_score(y_test,y_pred_SGDC)
print("SGDClassifier accuarcy: {}".format(SGDC_accuary))

# Use LogisticRegression
logistic_model = LogisticRegression(max_iter=3000)
clf = logistic_model.fit(X_train, y_train)
y_pred_LOGR = clf.predict(X_test)
LOGR_accuary = accuracy_score(y_test,y_pred_LOGR)
print("LogisticRegression accuarcy: {}".format(LOGR_accuary))

DecisionTree accuarcy: 0.9385964912280702
RandomForestClassifier accuarcy: 0.9736842105263158
Support Vector Machine accuarcy: 0.9385964912280702
SGDClassifier accuarcy: 0.8947368421052632
LogisticRegression accuarcy: 0.9649122807017544


## Using confusion_matrix 

In [14]:
print("Use DecisionTreeClassifier")
print(classification_report(y_test, y_pred_DC))
print("Use RandomForestClassifier")
print(classification_report(y_test, y_pred_RF))
print("Use Support Vector Machine(SVM)")
print(classification_report(y_test, y_pred_SVM))
print("Use Stochastic Gradient Descent Classifier (SGDClassifier)")
print(classification_report(y_test, y_pred_SGDC))
print("Use LogisticRegression")
print(classification_report(y_test, y_pred_LOGR))

Use DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.94      0.92      0.93        48
           1       0.94      0.95      0.95        66

    accuracy                           0.94       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.94      0.94      0.94       114

Use RandomForestClassifier
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        48
           1       0.96      1.00      0.98        66

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Use Support Vector Machine(SVM)
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        48
           1       0.90      1.00      0.95        66

    accuracy                           0.94       114
   macro avg       0.95      0.93      0.9

# Model evaluation
---
Breast_cancer Classifer 경우에서는 Precision과 Recall을 잘 판단해서 모델을 결정해야합니다.

암을 진단하는 경우 실제 malignant를 놓치지 않아야 하므로 Recall값이 높은 것이 매우 중요하다고 판단할 수 있습니다

    label 값이 0일 경우 -> malignant(암환자) 일 때는 recall 값을 중점적으로 분석해야 하고, label 값이 1일 경우 -> benign(정상) 일 때는 반대로 precision 값을 중점적으로 분석해야 합니다.
  
  |  | 암환자 | 정상 |
| :---: | :---: | :---: |
| Recall | 중요 | X |
| Precision | X | 중요 |
  
0 일때 recall과 1일때 precision이 가장 높은 SDG 모델이 실제 정확도는 0.89지만 암환자를 확진하는데에는 의외로 좋을 수도 있다고 생각합니다. 

SDG와 같이 특이한 값이 나옴을 제외하고는 RandomForestClassifier LogisticRegression 모델이 꾸준하게 준

수한 성능을 보이고 있습니다.

