# 위스콘신 유방암 데이터셋
- 위스콘신 대학교에서 제공한 유방암 진단결과 데이터
- 암측정값들, 진단결과 컬럼들로 구성
    - 모든 Feature들은 연속형(continuous)이다.
- Scikit-learn에서 toy dataset으로 제공한다. 
    - load_breast_cancer() 함수 이용
- StandardScaler와 MinMaxScaler를 이용해 위스콘신 유방암 데이터셋의 Feature들을 Feature Scaling 처리한다.
- StandardScaler
    - X_train_scaled1, X_val_scaled1, X_test_scaled1
- MinMaxScaler    
    - X_train_scaled2, X_val_scaled2, X_test_scaled2

In [4]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Load the breast cancer dataset bundled with scikit-learn.
data = load_breast_cancer()
# Show which fields the returned object exposes.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [5]:
# Names of the target classes.
data.target_names

array(['malignant', 'benign'], dtype='<U9')

In [6]:
# Names of the feature columns.
data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

##### 데이터 나누기

In [7]:
# Raw feature values, before any splitting or scaling.
data.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [25]:
from sklearn.model_selection import train_test_split

# Step 1: hold out the test set -- 20% of the full dataset, stratified on the
# labels so that the class ratio is preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.2,
    random_state=0,
    stratify=data.target,
)

# Step 2: carve the validation set out of the remaining training data
# (20% of it), again stratified on the labels.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=0,
    stratify=y_train,
)

# Fixed: this line used to print X_validation, a name that is never defined
# (NameError). The validation split produced above is X_val.
print(X_train.shape, X_val.shape, X_test.shape)

(364, 30) (91, 30) (114, 30)


##### scaling 처리
 - train set으로 fit한 scaler를 이용해 train/validation/test set을 변환한다.
 
##### 표준화

In [16]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only.
s_scaler = StandardScaler().fit(X_train)

# Apply that single fitted scaler to the train, validation and test splits.
X_train_scaled1, X_val_scaled1, X_test_scaled1 = (
    s_scaler.transform(split) for split in (X_train, X_val, X_test)
)

##### 확인
- 평균, 표준편차 확인

In [17]:
# Per-feature mean of the unscaled training data, rounded to whole numbers.
X_train.mean(axis=0).round()

array([ 14.,  19.,  92., 661.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         1.,   3.,  40.,   0.,   0.,   0.,   0.,   0.,   0.,  16.,  26.,
       108., 888.,   0.,   0.,   0.,   0.,   0.,   0.])

In [18]:
# Per-feature mean of the standardized training data, rounded to whole numbers.
X_train_scaled1.mean(axis=0).round()

array([-0., -0., -0.,  0., -0.,  0.,  0., -0., -0.,  0., -0.,  0., -0.,
        0.,  0., -0.,  0., -0., -0.,  0., -0., -0., -0.,  0.,  0.,  0.,
        0., -0.,  0., -0.])

##### MinMax Scaling

In [21]:
from sklearn.preprocessing import MinMaxScaler
# Fit the per-feature min/max on the training split only, then reuse that same
# fitted scaler for every other split.
mm_scaler = MinMaxScaler()
X_train_scaled2 = mm_scaler.fit_transform(X_train)
# Fixed: this used to be fit_transform(X_val), which re-fit the scaler on the
# validation set. As a result X_val -- and X_test right after it -- were
# scaled with validation-set min/max instead of the training-set statistics.
X_val_scaled2 = mm_scaler.transform(X_val)
X_test_scaled2 = mm_scaler.transform(X_test)

##### 확인
- min, max값 확인

In [24]:
# Per-feature minimum and maximum of the min-max scaled training data.
print(X_train_scaled2.min(axis=0))
print(X_train_scaled2.max(axis=0))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]
