## (1) 필요한 모듈 import하기

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## (2) 데이터 준비
load_digits 메서드를 사용합니다.

In [3]:
data = load_breast_cancer()

dir(data)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

## (3) 데이터 이해하기
지피지기면 백전불태! 다루어야 할 데이터를 자세히 살펴봅시다.

- Feature Data 지정하기
- Label Data 지정하기
- Target Names 출력해 보기
- 데이터 Describe 해 보기

In [4]:
import pandas as pd

feature_data = data.data
label_data = data.target

feature_data.shape, label_data.shape

((569, 30), (569,))

In [5]:
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [6]:
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

## (4) train, test 데이터 분리
모델 학습과 테스트용 문제지와 정답지를 준비해 봅시다.  
X_train, X_test, y_train, y_test를 생성하는 방법을 참고해 보세요.


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(feature_data, label_data,
                                                    test_size=0.2, stratify=label_data,
                                                    random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((455, 30), (114, 30), (455,), (114,))

## (5) 다양한 모델로 학습시켜보기
학습데이터 X_train, y_train 을 활용해 분류기 모델을 만들어 봅시다. 어떤 모델이 가장 좋은 성능을 보일까요?

- Decision Tree 사용해 보기
- Random Forest 사용해 보기
- SVM 사용해 보기
- SGD Classifier 사용해 보기
- Logistic Regression 사용해 보기

In [8]:
from sklearn.tree import DecisionTreeClassifier

# 학습
DecisionTreeClassifier_model = DecisionTreeClassifier()
DecisionTreeClassifier_model.fit(X_train, y_train)

# 평가
y_pred = DecisionTreeClassifier_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88        42
           1       0.96      0.89      0.92        72

    accuracy                           0.90       114
   macro avg       0.89      0.91      0.90       114
weighted avg       0.91      0.90      0.90       114



In [9]:
from sklearn.ensemble import RandomForestClassifier

# 학습
RandomForestClassifier_model = RandomForestClassifier()
RandomForestClassifier_model.fit(X_train, y_train)

# 평가
y_pred = RandomForestClassifier_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        42
           1       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [10]:
from sklearn.svm import SVC

# 학습
SVC_model = SVC(probability=True)
SVC_model.fit(X_train, y_train)

# 평가
y_pred = SVC_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90        42
           1       0.92      0.97      0.95        72

    accuracy                           0.93       114
   macro avg       0.93      0.91      0.92       114
weighted avg       0.93      0.93      0.93       114



In [11]:
from sklearn.linear_model import SGDClassifier

# 학습
SGDClassifier_model = SGDClassifier()
SGDClassifier_model.fit(X_train, y_train)

# 평가
y_pred = SGDClassifier_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.79      0.88        42
           1       0.89      1.00      0.94        72

    accuracy                           0.92       114
   macro avg       0.94      0.89      0.91       114
weighted avg       0.93      0.92      0.92       114



In [12]:
from sklearn.linear_model import LogisticRegression

# 학습
LogisticRegression_model = LogisticRegression()
LogisticRegression_model.fit(X_train, y_train)

# 평가
y_pred = LogisticRegression_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.90      0.94        42
           1       0.95      0.99      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


- 대체적으로 성능이 좋게 나왔다.
  - 그 중 RandomForest, LogisticRegression의 성능이 가장 높게 나왔다.

## (6) 모델을 평가해 보기
학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요? 모델의 성능을 평가하는 지표로는 무엇이 좋을까요?  
sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

- 발병율을 Recall의 예시로 들 때 자주 사용되는 예로,  
  발병 여부를 놓치면 현실에서 큰 문제가 될 수 있기 때문에 Recall을 사용하도록 하는 것이 좋을 것이다.

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

y_pred = RandomForestClassifier_model.predict(X_test)

print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
print(f"Precision Score: {precision_score(y_test, y_pred, average='weighted')}")
print(f"Recall Score: {recall_score(y_test, y_pred, average='weighted')}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted')}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred)}")

Accuracy Score: 0.956140350877193
Precision Score: 0.9560729421281235
Recall Score: 0.956140350877193
F1 Score: 0.9560273762928301
ROC AUC Score: 0.9503968253968255
