## 실습

UCI 머신러닝 저장소에서 제공하는 암세포 진단(breast cancer) 데이터셋을 사용

샘플 ID, 암세포 조직의 크기와 모양 등 종양 특성을 나타내는 열 9개, 악성 종양 여부(2: 양성, 4: 악성)를 나타내는 열 1개로 구성된다.
총 11개의 열에 699개의 샘플 데이터가 있다.

‘bare_nuclei’열의 데이터 중 ‘?’을 np.nan으로 바꿔주고, 해당 데이터가 들어 있는 행을 dropna()로 삭제한 후 정수형으로 변환한다.


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Download location of the raw breast-cancer-wisconsin data file (UCI repository).
uci_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

# Labels for the 11 columns. The spellings are kept exactly as they are,
# because the later cells select columns by these names.
column_names = [
    'id', 'clump', 'cell_size', 'cell_shape', 'adhesion',
    'epithlial', 'bare_nuclei', 'chromation', 'normal_nucleoli',
    'mitoses', 'class',
]

# header=None: the first line of the file is read as data, not as a header row.
df = pd.read_csv(uci_path, header=None)
df.columns = column_names

In [3]:
# DataFrame.info() prints its report itself and returns None, so passing the
# result to print() also emits a trailing "None" line.
info_result = df.info()
print(info_result)
# Summary statistics of the numeric columns.
print(df.describe())
# To list the distinct raw values of the column: print(df['bare_nuclei'].unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                 699 non-null int64
clump              699 non-null int64
cell_size          699 non-null int64
cell_shape         699 non-null int64
adhesion           699 non-null int64
epithlial          699 non-null int64
bare_nuclei        699 non-null object
chromation         699 non-null int64
normal_nucleoli    699 non-null int64
mitoses            699 non-null int64
class              699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.2+ KB
None
                 id       clump   cell_size  cell_shape    adhesion  \
count  6.990000e+02  699.000000  699.000000  699.000000  699.000000   
mean   1.071704e+06    4.417740    3.134478    3.207439    2.806867   
std    6.170957e+05    2.815741    3.051459    2.971913    2.855379   
min    6.163400e+04    1.000000    1.000000    1.000000    1.000000   
25%    8.706885e+05    2.000000    1.000000    1.000000    1.000

In [4]:
# Clean the 'bare_nuclei' column, in which missing values are marked with '?'.
#
# The replacement is assigned back to the column instead of calling
# replace(..., inplace=True) on df['bare_nuclei']: the in-place call acts on
# an intermediate object, triggers a chained-assignment warning in pandas 2.x
# and no longer updates df once copy-on-write is active.
df['bare_nuclei'] = df['bare_nuclei'].replace('?', np.nan)
# Drop every row whose 'bare_nuclei' value is missing.
df.dropna(subset=['bare_nuclei'], axis=0, inplace=True)
# With the '?' markers gone, the remaining values can be converted to integers.
df['bare_nuclei'] = df['bare_nuclei'].astype('int')
# Re-check the column types and summary statistics after cleaning.
# (info() returns None, so print() also emits a "None" line.)
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 683 entries, 0 to 698
Data columns (total 11 columns):
id                 683 non-null int64
clump              683 non-null int64
cell_size          683 non-null int64
cell_shape         683 non-null int64
adhesion           683 non-null int64
epithlial          683 non-null int64
bare_nuclei        683 non-null int32
chromation         683 non-null int64
normal_nucleoli    683 non-null int64
mitoses            683 non-null int64
class              683 non-null int64
dtypes: int32(1), int64(10)
memory usage: 61.4 KB
None
                 id       clump   cell_size  cell_shape    adhesion  \
count  6.830000e+02  683.000000  683.000000  683.000000  683.000000   
mean   1.076720e+06    4.442167    3.150805    3.215227    2.830161   
std    6.206440e+05    2.820761    3.065145    2.988581    2.864562   
min    6.337500e+04    1.000000    1.000000    1.000000    1.000000   
25%    8.776170e+05    2.000000    1.000000    1.000000    1.000000

In [5]:
# Independent (explanatory) variables: the nine tumour-feature columns.
feature_columns = [
    'clump', 'cell_size', 'cell_shape', 'adhesion',
    'epithlial', 'bare_nuclei', 'chromation', 'normal_nucleoli',
    'mitoses',
]
X = df[feature_columns]
# Dependent (target) variable: the 'class' column, selected with a list so it
# stays a one-column DataFrame.
Y = df[['class']]

In [6]:
# Standardise the explanatory variables.
# NOTE(review): the scaler is fitted on the full X before the train/test split
# performed later, so test rows contribute to the scaling statistics -
# confirm whether fitting on the training split only is preferred.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)

In [7]:
# Split the data into training and test sets at a 7:3 ratio.
from sklearn.model_selection import train_test_split
# random_state is fixed so the same split is produced on every run.
split_parts = train_test_split(X, Y, test_size=0.3, random_state=10)
X_train, X_test, Y_train, Y_test = split_parts
# Report the size of each feature matrix.
for features in (X_train, X_test):
    print(features.shape)

(478, 9)
(205, 9)


In [8]:
# Build and fit the decision tree classifier.

from sklearn import tree

# Entropy is used as the split criterion and the tree depth is capped at 5;
# criterion='gini' is the alternative that can be swapped in here.
# NOTE(review): no random_state is passed to the classifier - confirm whether
# run-to-run reproducibility of the fitted tree matters.
tree_model = tree.DecisionTreeClassifier(max_depth=5, criterion='entropy').fit(X_train, Y_train)

# Predicted class labels for the held-out test features.
y_predict = tree_model.predict(X_test)

In [16]:
# Show the first ten predicted labels.
predicted_head = y_predict[0:10]
print(predicted_head)
# Show the first actual label of the test set.
# NOTE(review): only one true label is printed against ten predictions -
# confirm whether [0:10] was intended here.
actual_head = Y_test.values[0:1]
print(actual_head)

[4 4 4 4 4 4 2 2 4 4]
[[4]]


In [17]:
from sklearn import metrics
# Confusion matrix of the true test labels against the predictions
# (rows are the true classes, columns the predicted classes).
tree_matrix = metrics.confusion_matrix(y_true=Y_test, y_pred=y_predict)
print(tree_matrix)

[[127   4]
 [  2  72]]


In [18]:
# Per-class precision, recall, F1-score and support for the test predictions.
tree_report = metrics.classification_report(y_true=Y_test, y_pred=y_predict)
print(tree_report)

              precision    recall  f1-score   support

           2       0.98      0.97      0.98       131
           4       0.95      0.97      0.96        74

    accuracy                           0.97       205
   macro avg       0.97      0.97      0.97       205
weighted avg       0.97      0.97      0.97       205

