### 데이터 전처리

### 데이터 인코딩

* 레이블 인코딩(Label encoding)

#### One-hot encoding, Scaling

In [None]:
from sklearn.preprocessing import LabelEncoder

# Sample category values (some items appear more than once).
items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

encoder = LabelEncoder()
encoder.fit(items) # fit the label encoder on items

In [None]:
labels = encoder.transform(items) # run the encoding: each item becomes its integer label
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [None]:
# Label encoding rule: the position of each class in classes_ is its integer label
encoder.classes_

array(['TV', '냉장고', '믹서', '선풍기', '전자렌지', '컴퓨터'], dtype='<U4')

In [None]:
# Recover the original values from the encoded labels
encoder.inverse_transform(labels)

array(['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'], dtype='<U4')

### 원 핫 인코딩(One-Hot Encoding)  

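레이블 인코딩의 결과는 숫자이고 컴퓨터(모델)는 이를 숫자로만 이해하므로, '전자렌지(4)는 믹서(2)의 2배이다'와 같이 실제로는 존재하지 않는 대소·연산 관계를 학습할 수 있어 원하지 않는 부작용(side effect)이 생긴다.  
원-핫 인코딩은 각 범주를 0과 1로만 이루어진 별도의 열로 표현하여, 이런 부작용 없이 원하는 대로 범주형 데이터를 다루기 위한 인코딩 방식이다.  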

In [None]:
# Label-encode the strings first, then one-hot encode the resulting integer codes.

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Step 1: label encoding (fit and transform in one call). The result is
# reshaped into a single column because the one-hot encoder is given 2-D input.
encoder = LabelEncoder()
labels = encoder.fit_transform(items).reshape(-1, 1)

# Step 2: one-hot encoding of the integer codes; toarray() shows the dense form.
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(labels)
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [None]:
# Check the values recovered from the one-hot matrix.
# Note: oh_encoder was fitted on the integer label codes, so these are the codes,
# not the original item strings.
oh_encoder.inverse_transform(oh_labels)

array([[0],
       [1],
       [4],
       [5],
       [3],
       [3],
       [2],
       [2]])

In [None]:
# One-hot encoding with pandas (get_dummies()): first build a single-column DataFrame.
import pandas as pd

df = pd.DataFrame(
    ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'],
    columns=['items'],
)
df

Unnamed: 0,items
0,TV
1,냉장고
2,전자렌지
3,컴퓨터
4,선풍기
5,선풍기
6,믹서
7,믹서


In [None]:
# One-hot encode df with pandas. The dummy dtype is pinned to uint8 because
# pandas 2.0 changed the default dummy dtype to bool; pinning it keeps the
# 0/1 integer output the same across pandas versions.
result = pd.get_dummies(df, dtype='uint8')

In [None]:
# Show the dummy-encoded table as a plain NumPy array
result.values

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0]], dtype=uint8)

### 피처 스케일링과 정규화

* StandardScaler

In [2]:
from sklearn.datasets import load_iris

import pandas as pd

# Load the iris dataset and display its feature array
iris = load_iris()
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [3]:
# Wrap the raw feature array in a DataFrame (no column names given) and
# show per-column summary statistics. Note: this rebinds the name `df`.
df = pd.DataFrame(data=iris.data)
df.describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [4]:
iris_data = iris.data
# Same data, this time with columns labelled by the dataset's feature names
iris_df = pd.DataFrame(data=iris_data, columns=iris.feature_names)
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [7]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Standardize every feature column of iris_df; fit_transform fits the scaler
# and applies it to the same data in one call.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)

# Print arrays with 6 decimals and without scientific notation
# (this changes NumPy's print settings for the rest of the session).
np.set_printoptions(suppress=True, precision=6)
iris_scaled

array([[-0.900681,  1.019004, -1.340227, -1.315444],
       [-1.143017, -0.131979, -1.340227, -1.315444],
       [-1.385353,  0.328414, -1.397064, -1.315444],
       [-1.506521,  0.098217, -1.283389, -1.315444],
       [-1.021849,  1.249201, -1.340227, -1.315444],
       [-0.537178,  1.939791, -1.169714, -1.05218 ],
       [-1.506521,  0.788808, -1.340227, -1.183812],
       [-1.021849,  0.788808, -1.283389, -1.315444],
       [-1.748856, -0.362176, -1.340227, -1.315444],
       [-1.143017,  0.098217, -1.283389, -1.447076],
       [-0.537178,  1.479398, -1.283389, -1.315444],
       [-1.264185,  0.788808, -1.226552, -1.315444],
       [-1.264185, -0.131979, -1.340227, -1.447076],
       [-1.870024, -0.131979, -1.510739, -1.447076],
       [-0.052506,  2.169988, -1.453901, -1.315444],
       [-0.173674,  3.090775, -1.283389, -1.05218 ],
       [-0.537178,  1.939791, -1.397064, -1.05218 ],
       [-0.900681,  1.019004, -1.340227, -1.183812],
       [-0.173674,  1.709595, -1.169714, -1.18

In [8]:
# Wrap the scaled array in a DataFrame; no column names are passed here
iris_scaled_df = pd.DataFrame(iris_scaled)
iris_scaled_df

Unnamed: 0,0,1,2,3
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [9]:
# Summary statistics of the scaled features
iris_scaled_df.describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,-1.468455e-15,-1.823726e-15,-1.610564e-15,-9.473903e-16
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


In [10]:
# Set the pandas float display format to 5 decimal places, then show the
# summary again without scientific notation.
pd.options.display.float_format = '{:.5f}'.format
iris_scaled_df.describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,-0.0,-0.0,-0.0,-0.0
std,1.00335,1.00335,1.00335,1.00335
min,-1.87002,-2.43395,-1.56758,-1.44708
25%,-0.90068,-0.59237,-1.22655,-1.18381
50%,-0.05251,-0.13198,0.33648,0.13251
75%,0.6745,0.55861,0.76276,0.79067
max,2.49202,3.09078,1.78583,1.7121


* MinMaxScaler

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the iris features with MinMaxScaler, fitting and transforming in one call.
# Note: this rebinds `scaler`, replacing the scaler object created earlier.
scaler = MinMaxScaler()
scaler.fit_transform(iris.data)

array([[0.222222, 0.625   , 0.067797, 0.041667],
       [0.166667, 0.416667, 0.067797, 0.041667],
       [0.111111, 0.5     , 0.050847, 0.041667],
       [0.083333, 0.458333, 0.084746, 0.041667],
       [0.194444, 0.666667, 0.067797, 0.041667],
       [0.305556, 0.791667, 0.118644, 0.125   ],
       [0.083333, 0.583333, 0.067797, 0.083333],
       [0.194444, 0.583333, 0.084746, 0.041667],
       [0.027778, 0.375   , 0.067797, 0.041667],
       [0.166667, 0.458333, 0.084746, 0.      ],
       [0.305556, 0.708333, 0.084746, 0.041667],
       [0.138889, 0.583333, 0.101695, 0.041667],
       [0.138889, 0.416667, 0.067797, 0.      ],
       [0.      , 0.416667, 0.016949, 0.      ],
       [0.416667, 0.833333, 0.033898, 0.041667],
       [0.388889, 1.      , 0.084746, 0.125   ],
       [0.305556, 0.791667, 0.050847, 0.125   ],
       [0.222222, 0.625   , 0.067797, 0.083333],
       [0.388889, 0.75    , 0.118644, 0.083333],
       [0.222222, 0.75    , 0.084746, 0.083333],
       [0.305556, 0.

* Scaler를 이용하여 학습 데이터와 테스트 데이터에 fit(), transform(), fit_transform() 적용 시 유의사항. 