# 前処理の手法
1. MinMaxScalling

In [1]:
import warnings

warnings.filterwarnings("ignore")

## データの準備と確認

In [2]:
# データの準備
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

In [3]:
# データセットの分割
from sklearn.model_selection import train_test_split

X_train,  X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

In [4]:
print(X_train.shape, X_test.shape)

(426, 30) (143, 30)


### どんなデータなのか？
- 569サンプル
- 特徴量は30次元

In [5]:
# データセットの説明を確認する
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

## MinMaxScaling

In [6]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# 学習データに対してMinとMaxを計算
# あくまで X_train のみ(y_train)は不要！
scaler.fit(X_train)

MinMaxScaler(copy=True, feature_range=(0, 1))

### 変換後のデータを確認する(学習データ)

In [7]:
# 学習データをスケーリング
X_train_scaled = scaler.transform(X_train)

In [8]:
# 次元は変わらない
print(X_train_scaled.shape)

(426, 30)


In [9]:
# 各特徴量の最小値(変換前)
print(X_train.min(axis=0))

[6.981e+00 9.710e+00 4.379e+01 1.435e+02 5.263e-02 1.938e-02 0.000e+00
 0.000e+00 1.060e-01 4.996e-02 1.115e-01 3.628e-01 7.570e-01 7.228e+00
 1.713e-03 2.252e-03 0.000e+00 0.000e+00 7.882e-03 8.948e-04 7.930e+00
 1.202e+01 5.041e+01 1.852e+02 7.117e-02 2.729e-02 0.000e+00 0.000e+00
 1.565e-01 5.504e-02]


In [10]:
# 各特徴量の最小値(変換後)
print(X_train_scaled.min(axis=0))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]


In [11]:
# 各特徴量の最大値(変換前)
print(X_train.max(axis=0))

[2.811e+01 3.381e+01 1.885e+02 2.501e+03 1.447e-01 3.114e-01 4.268e-01
 2.012e-01 3.040e-01 9.744e-02 2.873e+00 4.885e+00 2.198e+01 5.422e+02
 2.333e-02 1.064e-01 3.960e-01 5.279e-02 6.146e-02 2.984e-02 3.604e+01
 4.954e+01 2.512e+02 4.254e+03 2.226e-01 1.058e+00 1.252e+00 2.903e-01
 6.638e-01 2.075e-01]


In [12]:
# 各特徴量の最大値(変換後)
print(X_train_scaled.max(axis=0))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


### 変換後のデータを確認する(テストデータ)

In [13]:
# テストデータをスケーリング
X_test_scaled = scaler.transform(X_test)

In [14]:
# 各特徴量の最小値(変換前)
print(X_test.min(axis=0))

[7.729e+00 1.072e+01 4.798e+01 1.788e+02 6.576e-02 3.398e-02 0.000e+00
 0.000e+00 1.203e-01 5.024e-02 1.144e-01 3.602e-01 7.714e-01 6.802e+00
 2.826e-03 3.746e-03 0.000e+00 0.000e+00 1.013e-02 1.217e-03 8.964e+00
 1.249e+01 5.717e+01 2.422e+02 8.409e-02 4.619e-02 0.000e+00 0.000e+00
 1.603e-01 5.865e-02]


In [15]:
# 各特徴量の最小値(変換後)
print(X_test_scaled.min(axis=0))

[ 0.03540158  0.04190871  0.02895446  0.01497349  0.14260888  0.04999658
  0.          0.          0.07222222  0.00589722  0.00105015 -0.00057494
  0.00067851 -0.0007963   0.05148726  0.01434497  0.          0.
  0.04195752  0.01113138  0.03678406  0.01252665  0.03366702  0.01400904
  0.08531995  0.01833687  0.          0.          0.00749064  0.02367834]


In [16]:
# 各特徴量の最大値(変換前)
print(X_test.min(axis=0))

[7.729e+00 1.072e+01 4.798e+01 1.788e+02 6.576e-02 3.398e-02 0.000e+00
 0.000e+00 1.203e-01 5.024e-02 1.144e-01 3.602e-01 7.714e-01 6.802e+00
 2.826e-03 3.746e-03 0.000e+00 0.000e+00 1.013e-02 1.217e-03 8.964e+00
 1.249e+01 5.717e+01 2.422e+02 8.409e-02 4.619e-02 0.000e+00 0.000e+00
 1.603e-01 5.865e-02]


In [17]:
# 各特徴量の最大値(変換後)
print(X_test_scaled.max(axis=0))

[0.76809125 1.22697095 0.75813696 0.64750795 1.20310633 1.11643038
 0.99906279 0.90606362 0.93232323 0.94903117 0.45573058 0.72623944
 0.48593507 0.31641282 1.36082713 1.2784499  0.36313131 0.77476795
 1.32643996 0.72672498 0.82106012 0.87553305 0.77887345 0.67803775
 0.78603975 0.87843331 0.93450479 1.0024113  0.76384782 0.58743277]


#### テストデータだと、最小値0  最小値1 になっていない！
- これは問題ない
- スケーリングのもととのなるのは、学習データのみ
- テストデータ は **未知のデータ** なわけだから、スケーリングできていいものではない