# 필요한 모듈 Import 하기

In [31]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, recall_score
from sklearn.metrics import confusion_matrix

# 데이터 준비

In [35]:
# Load scikit-learn's bundled breast cancer (Wisconsin diagnostic) dataset object;
# later cells read its .data, .target, .target_names and .DESCR attributes.
breast_cancer = load_breast_cancer()

# 데이터 이해하기

In [36]:
# Pull the feature matrix and the label vector out of the loaded dataset.
breast_cancer_data, breast_cancer_label = breast_cancer.data, breast_cancer.target

# Show the class names first, then the dataset's full description text.
print(breast_cancer.target_names)
print(breast_cancer.DESCR)

['malignant' 'benign']
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instanc

# train, test 데이터 분리

In [37]:
# Hold out 20% of the samples as a test set; the fixed seed makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer_data,
    breast_cancer_label,
    test_size=0.2,
    random_state=7,
)

# 다양한 모델로 학습시켜보기

## 1. Decision Tree

In [43]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split and predict the held-out test split.
# The fixed seed keeps the fitted tree the same across runs.
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

# Per-class precision/recall/F1 table, followed by the headline scores.
print(classification_report(y_test, y_pred))
print('accuracy : ', accuracy_score(y_test, y_pred))
# NOTE: recall_score defaults to pos_label=1, which is 'benign' in this dataset
# (target_names is ['malignant' 'benign']). For malignant recall, read the
# class-0 row of the classification report.
print('recall : ', recall_score(y_test, y_pred))
# Kept as the final bare expression so the notebook displays the matrix
# as the cell result.
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114

accuracy :  0.9122807017543859
reacll :  0.9594594594594594


array([[33,  7],
       [ 3, 71]])

## 2. Random Forest

In [44]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split and predict the held-out test split.
# The fixed seed keeps the fitted ensemble the same across runs.
random_forest = RandomForestClassifier(random_state=64)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# Per-class precision/recall/F1 table, followed by the headline scores.
print(classification_report(y_test, y_pred))
print('accuracy : ', accuracy_score(y_test, y_pred))
# NOTE: recall_score defaults to pos_label=1, which is 'benign' in this dataset
# (target_names is ['malignant' 'benign']). For malignant recall, read the
# class-0 row of the classification report.
print('recall : ', recall_score(y_test, y_pred))
# Kept as the final bare expression so the notebook displays the matrix
# as the cell result.
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        40
           1       0.99      1.00      0.99        74

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

accuracy :  0.9912280701754386
reacll :  1.0


array([[39,  1],
       [ 0, 74]])

## 3. SVM

In [45]:
from sklearn import svm

# Fit a support vector classifier with default settings on the training split
# and predict the held-out test split.
# NOTE(review): the features are passed in unscaled; SVC is typically sensitive
# to feature scale, so standardizing first may change these results - confirm.
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Per-class precision/recall/F1 table, followed by the headline scores.
print(classification_report(y_test, y_pred))
print('accuracy : ', accuracy_score(y_test, y_pred))
# NOTE: recall_score defaults to pos_label=1, which is 'benign' in this dataset
# (target_names is ['malignant' 'benign']). For malignant recall, read the
# class-0 row of the classification report.
print('recall : ', recall_score(y_test, y_pred))
# Kept as the final bare expression so the notebook displays the matrix
# as the cell result.
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro avg       0.94      0.86      0.89       114
weighted avg       0.92      0.90      0.90       114

accuracy :  0.9035087719298246
reacll :  1.0


array([[29, 11],
       [ 0, 74]])

## 4. SGD Classifier

In [46]:
from sklearn.linear_model import SGDClassifier

# Fit a linear classifier trained with stochastic gradient descent and predict
# the held-out test split.
# A fixed seed is set because SGD shuffles the training data between epochs;
# without it the metrics below change from run to run, unlike the seeded
# tree and forest models in this notebook.
sgd_model = SGDClassifier(random_state=7)
sgd_model.fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

# Per-class precision/recall/F1 table, followed by the headline scores.
print(classification_report(y_test, y_pred))
print('accuracy : ', accuracy_score(y_test, y_pred))
# NOTE: recall_score defaults to pos_label=1, which is 'benign' in this dataset
# (target_names is ['malignant' 'benign']). For malignant recall, read the
# class-0 row of the classification report.
print('recall : ', recall_score(y_test, y_pred))
# Kept as the final bare expression so the notebook displays the matrix
# as the cell result.
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.70      0.95      0.81        40
           1       0.97      0.78      0.87        74

    accuracy                           0.84       114
   macro avg       0.84      0.87      0.84       114
weighted avg       0.87      0.84      0.85       114

accuracy :  0.8421052631578947
reacll :  0.7837837837837838


array([[38,  2],
       [16, 58]])

## 5. Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression model and predict the held-out test split.
# max_iter is raised well above the library default, presumably so the solver
# converges on these unscaled features - confirm.
logistic_model = LogisticRegression(max_iter=5000)
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

# Per-class precision/recall/F1 table, followed by the headline scores.
print(classification_report(y_test, y_pred))
print('accuracy : ', accuracy_score(y_test, y_pred))
# NOTE: recall_score defaults to pos_label=1, which is 'benign' in this dataset
# (target_names is ['malignant' 'benign']). For malignant recall, read the
# class-0 row of the classification report.
print('recall : ', recall_score(y_test, y_pred))
# Kept as the final bare expression so the notebook displays the matrix
# as the cell result.
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        40
           1       0.93      1.00      0.96        74

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114

accuracy :  0.9473684210526315
reacll :  1.0


array([[34,  6],
       [ 0, 74]])

# 모델 평가해보기

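유방암 데이터의 경우 악성 종양을 놓치는 것(False Negative)이 가장 위험하므로 Recall이 높아야 한다. 학습한 5개의 모델 중 RandomForest 모델이

accuracy : 0.99   
recall : 1

의 결과로 가장 적합함을 알 수 있다. 단, 위에서 출력한 recall 값은 `recall_score`의 기본 양성 클래스인 레이블 1(benign)에 대한 값이다. 악성(malignant, 레이블 0)에 대한 recall도 classification_report 기준으로 RandomForest가 0.97로 5개 모델 중 가장 높으므로(Decision Tree 0.82, SVM 0.72, SGD 0.95, Logistic Regression 0.85) 결론은 동일하다.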