### 머신러닝 프로세스 개요


1. Data Set 분할 - 학습 데이터를 랜덤으로 학습데이터셋/검증데이터 셋을 분할하고, 테스트 셋도 준비한다.
2. 데이터 전처리과정 - 특성치 변수의 정규화와 범주자료 원핫인코딩 과정이 핵심
- 정규화는 표준화 또는 Min-max 정규화가 대표적
3. 모델 학습 - 사이킷런 패키지
4. 하이퍼파라미터 탐색

In [2]:
#분석 데이터 검토

import pandas as pd
data = pd.read_csv('./practice/breastCancer.csv', encoding='utf-8')
data.head()

Unnamed: 0,id,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
data['class'].value_counts() # 2정상, 4유방암

2    458
4    241
Name: class, dtype: int64

In [4]:
print(data.shape)

(699, 11)


In [50]:
import numpy as np
data.mean(numeric_only=True)

id                   1.071704e+06
clump_thickness      4.417740e+00
size_uniformity      3.134478e+00
shape_uniformity     3.207439e+00
marginal_adhesion    2.806867e+00
epithelial_size      3.216023e+00
bland_chromatin      3.437768e+00
normal_nucleoli      2.866953e+00
mitoses              1.589413e+00
class                2.689557e+00
dtype: float64

In [51]:
print(data['bare_nucleoli'].value_counts())
data2 = data.copy()

data2['bare_nucleoli'] = np.where(data2['bare_nucleoli'] == '?', 1, data2['bare_nucleoli'])
data2['bare_nucleoli'].value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: bare_nucleoli, dtype: int64


1     402
10    132
2      30
5      30
3      28
8      21
4      19
1      16
9       9
7       8
6       4
Name: bare_nucleoli, dtype: int64

In [52]:
#특성(X)과 레이블(y) 나누기

X1 = data2[data2.columns[1:10]]
X1

Unnamed: 0,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1
695,2,1,1,1,2,1,1,1,1
696,5,10,10,3,7,3,8,10,2
697,4,8,6,4,3,4,10,6,1


In [53]:
print(X1.shape)

(699, 9)


In [54]:
y = data2[['class']]
y

Unnamed: 0,class
0,2
1,2
2,2
3,2
4,2
...,...
694,2
695,2
696,4
697,4


In [55]:
print(y.shape)

(699, 1)


In [56]:
# train-test 데이터셋 나누기
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X1, y, stratify=y, random_state=42)

In [57]:
print(y_train.mean())
print(y_test.mean())

class    2.69084
dtype: float64
class    2.685714
dtype: float64


In [58]:
# 정규화

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

scaler_minmax = MinMaxScaler()
scaler_standard = StandardScaler()

In [59]:
scaler_minmax.fit(X_train)
X_scaled_minmax_train = scaler_minmax.transform(X_train)

In [60]:
scaler_standard.fit(X_train)
X_scaled_standard_train = scaler_standard.transform(X_train)

In [61]:
pd.DataFrame(X_scaled_minmax_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0
mean,0.384012,0.238974,0.248516,0.202926,0.245759,0.278414,0.270992,0.207379,0.072519
std,0.315992,0.343795,0.330619,0.323274,0.243872,0.404895,0.268368,0.338869,0.201778
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.111111,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.0
50%,0.333333,0.0,0.111111,0.0,0.111111,0.0,0.222222,0.0,0.0
75%,0.555556,0.444444,0.444444,0.333333,0.333333,0.666667,0.333333,0.333333,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [62]:
pd.DataFrame(X_scaled_standard_train).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0,524.0
mean,1.423797e-16,2.7119950000000003e-17,-4.406992e-17,-6.779988e-18,-1.1864980000000002e-17,4.7459920000000007e-17,4.406992e-17,-2.3729960000000003e-17,-5.593490000000001e-17
std,1.000956,1.000956,1.000956,1.000956,1.000956,1.000956,1.000956,1.000956,1.000956
min,-1.216419,-0.6957698,-0.752386,-0.6283215,-1.008699,-0.6882774,-1.010743,-0.6125583,-0.3597434
25%,-0.8644566,-0.6957698,-0.752386,-0.6283215,-0.5526524,-0.6882774,-0.5963227,-0.6125583,-0.3597434
50%,-0.1605324,-0.6957698,-0.415995,-0.6283215,-0.5526524,-0.6882774,-0.1819021,-0.6125583,-0.3597434
75%,0.5433919,0.5982262,0.593178,0.4037802,0.3594416,0.9598141,0.2325184,0.3720446,-0.3597434
max,1.95124,2.215721,2.275133,2.467984,3.095724,1.78386,2.719042,2.341251,4.600929


In [63]:
X_scaled_minmax_test = scaler_minmax.transform(X_test)
pd.DataFrame(X_scaled_minmax_test).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0
mean,0.366984,0.231746,0.235556,0.194286,0.247619,0.269841,0.270476,0.207619,0.044444
std,0.303818,0.325331,0.329747,0.299346,0.2531,0.396062,0.279242,0.341531,0.150681
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.111111,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0
50%,0.333333,0.0,0.0,0.0,0.111111,0.0,0.222222,0.0,0.0
75%,0.444444,0.333333,0.444444,0.277778,0.333333,0.444444,0.444444,0.222222,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [64]:
X_scaled_standard_test = scaler_standard.transform(X_test)
pd.DataFrame(X_scaled_standard_test).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8
count,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0,175.0
mean,-0.053938,-0.021043,-0.039237,-0.026754,0.007634,-0.021193,-0.001925,0.000709,-0.139269
std,0.962391,0.947199,0.998314,0.926867,1.03883,0.97912,1.041512,1.008819,0.747478
min,-1.216419,-0.69577,-0.752386,-0.628321,-1.008699,-0.688277,-1.010743,-0.612558,-0.359743
25%,-0.864457,-0.69577,-0.752386,-0.628321,-0.552652,-0.688277,-1.010743,-0.612558,-0.359743
50%,-0.160532,-0.69577,-0.752386,-0.628321,-0.552652,-0.688277,-0.181902,-0.612558,-0.359743
75%,0.19143,0.274727,0.593178,0.231763,0.359442,0.41045,0.646939,0.043844,-0.359743
max,1.95124,2.215721,2.275133,2.467984,3.095724,1.78386,2.719042,2.341251,4.600929


In [65]:
#모델학습

from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_scaled_minmax_train,y_train)

  y = column_or_1d(y, warn=True)


In [66]:
pred_train = model.predict(X_scaled_minmax_train)
model.score(X_scaled_minmax_train,y_train)

0.9675572519083969

In [67]:
pred_test = model.predict(X_scaled_minmax_test)
model.score(X_scaled_minmax_test,y_test)

0.9428571428571428

In [71]:
#confusion matrix, 오차행렬

from sklearn.metrics import confusion_matrix
confusion_train =  confusion_matrix(y_train, pred_train)
print("훈련 데이터 오차행렬:\n", confusion_train)

훈련 데이터 오차행렬:
 [[336   7]
 [ 10 171]]


In [72]:
confusion_test =  confusion_matrix(y_test, pred_test)
print("테스트 데이터 오차행렬:\n", confusion_test)

테스트 데이터 오차행렬:
 [[112   3]
 [  7  53]]


In [74]:
from sklearn.metrics import classification_report
cfreport_train = classification_report(y_train, pred_train)
print("분류예측 레포트:\n", cfreport_train)

분류예측 레포트:
               precision    recall  f1-score   support

           2       0.97      0.98      0.98       343
           4       0.96      0.94      0.95       181

    accuracy                           0.97       524
   macro avg       0.97      0.96      0.96       524
weighted avg       0.97      0.97      0.97       524



In [75]:
cfreport_test = classification_report(y_test, pred_test)
print("분류예측 레포트:\n", cfreport_test)

분류예측 레포트:
               precision    recall  f1-score   support

           2       0.94      0.97      0.96       115
           4       0.95      0.88      0.91        60

    accuracy                           0.94       175
   macro avg       0.94      0.93      0.94       175
weighted avg       0.94      0.94      0.94       175



In [77]:
#ROC평가지표
from sklearn.metrics import roc_curve, auc
from sklearn import metrics

false_positive_rate, true_positive_Rate, thresholds = roc_curve(y_test,model.decision_function(X_scaled_minmax_test))
roc_auc = metrics.roc_auc_score(y_test,model.decision_function(X_scaled_minmax_test))
roc_auc

ValueError: y_true takes value in {2, 4} and pos_label is not specified: either make y_true take value in {0, 1} or {-1, 1} or pass pos_label explicitly.

In [78]:
#예측값 병합 및 저장

prob_train = model.predict_proba(X_scaled_minmax_train)
y_train[['y_pred']] = pred_train
y_train[['y_prob0','y_prob1']] = prob_train
y_train

ValueError: Columns must be same length as key