In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Load the Wisconsin breast-cancer dataset that ships with scikit-learn and
# print the description text stored under its "DESCR" entry.
breast = load_breast_cancer()
print(breast["DESCR"])

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [None]:
# Unfitted decision tree with default settings.
# NOTE(review): `dtclf` is not referenced by any later cell visible here; the
# tree that actually gets trained is `decision_tree` — confirm before removing.
dtclf = DecisionTreeClassifier()

# Feature matrix and target vector taken from the loaded dataset.
data, label = breast.data, breast.target

In [None]:
# Show the shapes of the feature matrix and label vector, followed by the
# names of the target classes.
for caption, value in (
    ('feature: ', data.shape),
    ('label : ', label.shape),
    ('target: ', breast.target_names),
):
    print(caption, value)

feature:  (569, 30)
label :  (569,)
target:  ['malignant' 'benign']


In [None]:
# Split the features and labels into train/test sets at a 7:3 ratio.
# The fixed random_state makes the split repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    data,
    label,
    test_size=0.3,
    random_state=2201061240,
)

# Sanity-check the resulting sizes: row counts first, then full shapes.
print('xtrain의 수: ', xtrain.shape[0], 'xtest의 수: ', xtest.shape[0])
for train_part, test_part in ((xtrain, xtest), (ytrain, ytest)):
    print(train_part.shape, test_part.shape)

xtrain의 수:  398 xtest의 수:  171
(398, 30) (171, 30)
(398,) (171,)


In [None]:
# Decision tree: fit on the training split, predict the test split, and print
# per-class precision / recall / F1.
decision_tree = DecisionTreeClassifier(random_state=2201061240).fit(xtrain, ytrain)
ypred = decision_tree.predict(xtest)

print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90        55
           1       0.96      0.95      0.95       116

    accuracy                           0.94       171
   macro avg       0.92      0.93      0.93       171
weighted avg       0.94      0.94      0.94       171



In [None]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): this re-split uses test_size=0.2 and overwrites the earlier
# test_size=0.3 split, so any later cell that reads xtrain/xtest/ytrain/ytest
# is scored on a different test set than the decision tree — confirm this is
# intended before comparing accuracies across models.
xtrain, xtest, ytrain, ytest = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=2201061240,
)

# Fit on the new training split and report metrics on the new test split.
random_forest = RandomForestClassifier(random_state=2201061240).fit(xtrain, ytrain)
ypred = random_forest.predict(xtest)

print(classification_report(y_true=ytest, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.97      0.92      0.94        37
           1       0.96      0.99      0.97        77

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114



In [None]:
# SVM 모델
from sklearn import svm
svm_model = svm.SVC()

svm_model.fit(xtrain, ytrain)
ypred = svm_model.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       1.00      0.84      0.91        37
           1       0.93      1.00      0.96        77

    accuracy                           0.95       114
   macro avg       0.96      0.92      0.94       114
weighted avg       0.95      0.95      0.95       114



In [None]:
# SGD 모델
from sklearn.linear_model import SGDClassifier
sgd_model = SGDClassifier()

sgd_model.fit(xtrain, ytrain)
ypred = sgd_model.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88        37
           1       0.94      0.95      0.94        77

    accuracy                           0.92       114
   macro avg       0.91      0.91      0.91       114
weighted avg       0.92      0.92      0.92       114



In [None]:
# 로지스틱 회귀
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression()

logistic_model.fit(xtrain, ytrain)
ypred = logistic_model.predict(xtest)

print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88        37
           1       0.95      0.94      0.94        77

    accuracy                           0.92       114
   macro avg       0.91      0.91      0.91       114
weighted avg       0.92      0.92      0.92       114



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 제일 낮은 정확도는 92%(SGD, 로지스틱 회귀)이고, 제일 높은 정확도는 96%(랜덤 포레스트)임을 확인할 수 있음 (단, 의사결정나무는 test_size=0.3, 나머지 모델은 test_size=0.2 분할로 평가됨)