# 데이터 전처리 (Preprocessing)

## 데이터 인코딩

In [2]:
from sklearn.preprocessing import LabelEncoder
# Sample appliance names (with repeats) used as the categorical input for the
# encoders below.
items = [
    'TV', '냉장고', '전자렌지', '컴퓨터',
    '선풍기', '선풍기', '믹서', '믹서',
]

In [4]:
# Create a LabelEncoder, learn the categories with fit(), then map every item
# to its integer code with transform().
encoder = LabelEncoder().fit(items)
labels = encoder.transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [12]:
labels = encoder.fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2], dtype=int64)

In [17]:
encoder.classes_

array(['TV', '냉장고', '믹서', '선풍기', '전자렌지', '컴퓨터'], dtype='<U4')

In [14]:
encoder.inverse_transform([3,4,5,2,0,1])

array(['선풍기', '전자렌지', '컴퓨터', '믹서', 'TV', '냉장고'], dtype='<U4')

In [18]:
encoder.inverse_transform([3])[0]

'선풍기'

In [20]:
def get_name(model, num):
    """Return the original label that `model` decodes the single code `num` to.

    `model` is any fitted object exposing `inverse_transform` (here, a
    LabelEncoder); `num` is wrapped in a one-element list because
    `inverse_transform` is called with a sequence of codes.
    """
    decoded = model.inverse_transform([num])
    return decoded[0]

In [22]:
get_name(encoder, 4)

'전자렌지'

## One-hot encoding ==> 매우 중요

In [26]:
from sklearn.preprocessing import OneHotEncoder

In [28]:
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
labels.shape

(8,)

In [29]:
# Turn the 1-D label vector into a single-column 2-D array; -1 lets NumPy
# infer the row count. This is the shape handed to OneHotEncoder further down.
labels = labels.reshape(-1, 1)
labels.shape

(8, 1)

In [34]:
labels

array([[0],
       [1],
       [4],
       [5],
       [3],
       [3],
       [2],
       [2]], dtype=int64)

In [36]:
# Apply one-hot encoding. Author's note: it is inefficient, but one-hot
# encoding must be used for deep learning.
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(labels)
oh_labels.shape

(8, 6)

In [37]:

oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [41]:
# Commonly used, more Pythonic form: chain the calls instead of keeping the
# intermediate encoder objects around.
labels = LabelEncoder().fit_transform(items).reshape(-1,1)
oh_labels = OneHotEncoder().fit_transform(labels)
oh_labels.shape

(8, 6)

In [42]:
# Wrap the item list in a single-column pandas DataFrame.
import pandas as pd
df = pd.DataFrame(items, columns=['items'])
df

Unnamed: 0,items
0,TV
1,냉장고
2,전자렌지
3,컴퓨터
4,선풍기
5,선풍기
6,믹서
7,믹서


In [43]:
# One-hot encode the string column directly with pandas.
# dtype=int pins the dummy columns to 0/1 integers: pandas >= 2.0 defaults to
# boolean (True/False) dummies, which would not match the 0/1 table shown here.
pd.get_dummies(df, dtype=int)

Unnamed: 0,items_TV,items_냉장고,items_믹서,items_선풍기,items_전자렌지,items_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


## Feature Scaling and Normalization

In [45]:
from sklearn.datasets import load_iris

# Load the iris dataset and put its feature matrix into a DataFrame whose
# columns carry the dataset's feature names.
iris = load_iris()
iris_df = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [46]:
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


- 1) 표준화: 평균 0, 표준편차 1로 변환 (표준정규분포와 같은 스케일) - StandardScaler

In [48]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the iris features (fit() returns the fitted scaler itself),
# then apply the learned scaling in a separate transform() step.
scaler = StandardScaler().fit(iris_df)
iris_scaled = scaler.transform(iris_df)

In [49]:
type(iris_scaled)

numpy.ndarray

In [50]:
iris_scaled[:5, :]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

In [51]:
import numpy as np 

np.mean(iris_scaled, axis=0)

array([-1.69031455e-15, -1.84297022e-15, -1.69864123e-15, -1.40924309e-15])

In [52]:
np.std(iris_scaled, axis=0)

array([1., 1., 1., 1.])

- 2) 0~1 사이의 값으로 변환 - MinMaxScaler

In [53]:
from sklearn.preprocessing import MinMaxScaler

# Fit a MinMaxScaler on the iris features, then rescale them with it.
scaler = MinMaxScaler().fit(iris_df)
iris_scaled = scaler.transform(iris_df)

In [54]:
# Per-column maximum of the scaled features
np.max(iris_scaled, axis=0)

array([1., 1., 1., 1.])

In [57]:
# Per-column minimum of the scaled features
np.min(iris_scaled, axis=0)

array([0., 0., 0., 0.])

In [58]:
np.mean(iris_scaled, axis=0)

array([0.4287037 , 0.44055556, 0.46745763, 0.45805556])

In [56]:
np.std(iris_scaled, axis=0)

array([0.22925036, 0.18100457, 0.29820408, 0.31653859])