# 유방암 분류하기

## 1. 필요한 모듈 import

In [13]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import time

def get_item_count(li):
    """Count how many times each distinct item occurs in ``li``.

    Args:
        li: Iterable of hashable items (for example, an array of class labels).

    Returns:
        A plain ``dict`` mapping each item to its number of occurrences, with
        keys in order of first appearance.

    Raises:
        TypeError: If an item is unhashable.
    """
    counts = {}
    for item in li:
        # dict.get replaces the former bare ``except:``, which treated *any*
        # exception (even KeyboardInterrupt) as "key not seen yet".
        # A plain dict (not collections.Counter) is kept so that printing the
        # result still shows ``{...}`` exactly as before.
        counts[item] = counts.get(item, 0) + 1
    return counts

## 2. 데이터 로드

In [2]:
# Load the dataset bundle; its DESCR, data and target attributes are read afterwards.
breast_cancer = load_breast_cancer()

## 3. 데이터 확인

In [12]:
# Print the description text that ships with the dataset.
print(breast_cancer.DESCR)

# Keep the features and the labels under separate names for the train/test split.
breast_cancer_data = breast_cancer.data
breast_cancer_label = breast_cancer.target

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

## 4. 훈련/테스트용 데이터 분리


In [14]:
# Hold out 20% of the samples as the test set; the fixed random_state keeps the
# split identical from run to run.
# NOTE(review): no `stratify` argument is passed, so the class ratio of the two
# splits is not guaranteed to match — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data,
                                                    breast_cancer_label,
                                                    test_size=0.2,
                                                    random_state=35)
# Report the shape of each split, then the per-class sample counts of the training labels.
print(f'X_train: {X_train.shape}, y_train: {y_train.shape}\nX_test: {X_test.shape}, y_test: {y_test.shape}')
print(get_item_count(y_train))

X_train: (455, 30), y_train: (455,)
X_test: (114, 30), y_test: (114,)
{1: 284, 0: 171}


## 5. 학습

### 5.1. Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with default hyperparameters, timing only the fit call.
# NOTE(review): no random_state is passed, so results may differ between runs.
tree = DecisionTreeClassifier()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump when the
# system time is adjusted.
start = time.perf_counter()
tree.fit(X_train, y_train)
print("학습소요시간: %.3f초" % (time.perf_counter() - start))
y_pred = tree.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))

학습소요시간: 0.007초
              precision    recall  f1-score   support

           0       0.97      0.93      0.95        41
           1       0.96      0.99      0.97        73

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



### 5.2. Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with default hyperparameters, timing only the fit call.
# NOTE(review): no random_state is passed, so results may differ between runs.
forest = RandomForestClassifier()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump when the
# system time is adjusted.
start = time.perf_counter()
forest.fit(X_train, y_train)
print("학습소요시간: %.3f초" % (time.perf_counter() - start))
y_pred = forest.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))

학습소요시간: 0.204초
              precision    recall  f1-score   support

           0       0.97      0.93      0.95        41
           1       0.96      0.99      0.97        73

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



### 5.3. SVM

In [8]:
from sklearn.svm import SVC

# Fit a support vector classifier with default hyperparameters, timing only
# the fit call.
svc = SVC()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump when the
# system time is adjusted.
start = time.perf_counter()
svc.fit(X_train, y_train)
print("학습소요시간: %.3f초" % (time.perf_counter() - start))
y_pred = svc.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))

학습소요시간: 0.018초
              precision    recall  f1-score   support

           0       1.00      0.83      0.91        41
           1       0.91      1.00      0.95        73

    accuracy                           0.94       114
   macro avg       0.96      0.91      0.93       114
weighted avg       0.94      0.94      0.94       114



### 5.4. SGD Classifier

In [9]:
from sklearn.linear_model import SGDClassifier

# Fit a linear classifier trained with SGD (default hyperparameters), timing
# only the fit call.
# NOTE(review): no random_state is passed, so results may differ between runs.
# NOTE(review): the features are passed in unscaled; SGD-based estimators are
# documented as sensitive to feature scaling — consider standardizing first.
sgd = SGDClassifier()
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump when the
# system time is adjusted.
start = time.perf_counter()
sgd.fit(X_train, y_train)
print("학습소요시간: %.3f초" % (time.perf_counter() - start))
y_pred = sgd.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))

학습소요시간: 0.006초
              precision    recall  f1-score   support

           0       0.67      0.98      0.79        41
           1       0.98      0.73      0.83        73

    accuracy                           0.82       114
   macro avg       0.82      0.85      0.81       114
weighted avg       0.87      0.82      0.82       114



### 5.5. Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

# max_iter: upper bound on the number of iterations the solver may run while
# searching for a solution.
# NOTE(review): presumably raised to 2700 so the solver converges on these
# unscaled features — confirm.
logistic = LogisticRegression(max_iter=2700)
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and can jump when the
# system time is adjusted.
start = time.perf_counter()
logistic.fit(X_train, y_train)
print("학습소요시간: %.3f초" % (time.perf_counter() - start))
y_pred = logistic.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set.
print(classification_report(y_test, y_pred))

학습소요시간: 0.734초
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        41
           1       0.95      1.00      0.97        73

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114



## 6. 결과

### 6.1. 모델 평가
> 1. 클래스별로 훈련에 사용된 데이터의 개수가 크게 차이 나고,
> 2. 실제 유방암 양성(P) 환자가 음성(N)으로 잘못 분류되지 않는 것이 중요하기 때문에
> 3. $\text{Recall} = \dfrac{\text{실제 True를 True로 예측한 수 (TP)}}{\text{실제로 True인 데이터 수 (TP + FN)}}$ 로 정의되는
> 4. *Recall*(재현율)을 평가지표로 사용했다.

* SGD를 제외하고 0.95 정도의 높은 재현율을 가졌고, 그중 Decision Tree의 속도가 가장 빨랐다.
* SGD는 속도가 매우 빨랐지만 재현율이 0.9에 미치지 못했다.