### 데이터 인코딩

* 레이블 인코딩(Label encoding)

In [3]:
from sklearn.preprocessing import LabelEncoder

# Appliance names to encode; repeated names must map to the same integer code.
items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# fit() learns the distinct classes and returns the encoder itself, so
# construction and fitting are chained; the trailing expression shows the
# fitted estimator as the cell result.
encoder = LabelEncoder().fit(items)
encoder

In [5]:
# Map each item name to the integer code assigned by the fitted encoder.
labels = encoder.transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [6]:
# Label encoding rule: the position of each class in classes_ is its integer code.
encoder.classes_

array(['TV', '냉장고', '믹서', '선풍기', '전자렌지', '컴퓨터'], dtype='<U4')

In [7]:
# Recover the original item names from the encoded labels.
encoder.inverse_transform(labels)

array(['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'], dtype='<U4')

* 원-핫 인코딩(One-Hot encoding) : 해당하는 값의 위치만 1(참)이고 나머지는 모두 0(거짓)인 벡터로 변환함

In [27]:
# Label-encode the strings first, then one-hot encode the integer codes.
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np

items = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Step 1: label encoding. transform() yields a 1-D array of integer codes,
# which is reshaped into a single-column 2-D array (one row per item) because
# the one-hot encoder needs 2-D input.
encoder = LabelEncoder().fit(items)
labels = encoder.transform(items).reshape(-1, 1)

# Step 2: one-hot encoding of the integer codes.
oh_encoder = OneHotEncoder().fit(labels)
oh_labels = oh_encoder.transform(labels)

# Materialize the encoded result as a regular array for display.
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [28]:
# Decode the one-hot rows back into the label-encoded column they were built from.
oh_encoder.inverse_transform(oh_labels)

array([[0],
       [1],
       [4],
       [5],
       [3],
       [3],
       [2],
       [2]])

In [29]:
# Integers 0 through 11 as a 1-D array (used to demonstrate reshape).
a2 = np.arange(12)
a2

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [30]:
# Reshape into 3 rows; -1 lets NumPy infer the column count (12 / 3 = 4).
a2.reshape(3, -1)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [32]:
# One-hot encoding with pandas: build the frame that get_dummies() will encode.
import pandas as pd

df = pd.DataFrame(
    ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'],
    columns=['items'],
)
df

Unnamed: 0,items
0,TV
1,냉장고
2,전자렌지
3,컴퓨터
4,선풍기
5,선풍기
6,믹서
7,믹서


In [33]:
# One-hot encode df with pandas. The dtype is pinned because get_dummies()
# defaults to bool on pandas >= 2.0 (uint8 before), which would display
# True/False instead of the 0/1 indicator values this example shows.
result = pd.get_dummies(df, dtype='uint8')
result

Unnamed: 0,items_TV,items_냉장고,items_믹서,items_선풍기,items_전자렌지,items_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


In [34]:
# Underlying NumPy array of the dummy-encoded frame (column labels dropped).
result.values

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0]], dtype=uint8)

### 피처 스케일링과 정규화

* StandardScaler

In [38]:
from sklearn.datasets import load_iris

import pandas as pd

# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns carry the feature names.
iris = load_iris()
iris_data = iris.data
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)

In [36]:
# Summary statistics of the raw (unscaled) features; no column names are
# given, so the columns are labelled 0-3.
df = pd.DataFrame(iris.data)
df.describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [44]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature to zero mean and unit variance.
scaler = StandardScaler()
scaler.fit(iris_df)
iris_scaled = scaler.transform(iris_df)

# Display settings: fixed-point NumPy output and 5-decimal pandas floats.
# NOTE: both settings are global and stay in effect for every later cell.
np.set_printoptions(precision=6, suppress=True)
pd.options.display.float_format = '{:.5f}'.format

# describe() reports the sample standard deviation (ddof=1), whereas
# StandardScaler normalizes with the population one (ddof=0), so std shows
# sqrt(150 / 149) ~= 1.00335 rather than exactly 1.
iris_scaled_df = pd.DataFrame(iris_scaled)
iris_scaled_df.describe()

Unnamed: 0,0,1,2,3
count,150.0,150.0,150.0,150.0
mean,-0.0,-0.0,-0.0,-0.0
std,1.00335,1.00335,1.00335,1.00335
min,-1.87002,-2.43395,-1.56758,-1.44708
25%,-0.90068,-0.59237,-1.22655,-1.18381
50%,-0.05251,-0.13198,0.33648,0.13251
75%,0.6745,0.55861,0.76276,0.79067
max,2.49202,3.09078,1.78583,1.7121


* MinMaxScaler

In [46]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to the [0, 1] range. fit() returns the scaler itself,
# so creating and fitting it are chained into one statement.
scaler = MinMaxScaler().fit(iris.data)
scaler.transform(iris.data)

array([[0.222222, 0.625   , 0.067797, 0.041667],
       [0.166667, 0.416667, 0.067797, 0.041667],
       [0.111111, 0.5     , 0.050847, 0.041667],
       [0.083333, 0.458333, 0.084746, 0.041667],
       [0.194444, 0.666667, 0.067797, 0.041667],
       [0.305556, 0.791667, 0.118644, 0.125   ],
       [0.083333, 0.583333, 0.067797, 0.083333],
       [0.194444, 0.583333, 0.084746, 0.041667],
       [0.027778, 0.375   , 0.067797, 0.041667],
       [0.166667, 0.458333, 0.084746, 0.      ],
       [0.305556, 0.708333, 0.084746, 0.041667],
       [0.138889, 0.583333, 0.101695, 0.041667],
       [0.138889, 0.416667, 0.067797, 0.      ],
       [0.      , 0.416667, 0.016949, 0.      ],
       [0.416667, 0.833333, 0.033898, 0.041667],
       [0.388889, 1.      , 0.084746, 0.125   ],
       [0.305556, 0.791667, 0.050847, 0.125   ],
       [0.222222, 0.625   , 0.067797, 0.083333],
       [0.388889, 0.75    , 0.118644, 0.083333],
       [0.222222, 0.75    , 0.084746, 0.083333],
       [0.305556, 0.

* Scaler를 이용하여 학습 데이터와 테스트 데이터에 fit(), transform(), fit_transform() 적용 시 유의사항. 