# 데이터 전처리 (Preprocessing)

## Data Encoding
머신러닝 알고리즘은 **문자열(string) 데이터** 속성을 입력으로 받지 않으므로, 모든 데이터는 숫자형으로 표현되어야 한다.

→ 문자열로 된 카테고리형 속성은 모두 숫자 값으로 변환(인코딩)되어야 한다.

### Label Encoding

In [14]:
from sklearn.preprocessing import LabelEncoder

# Sample categorical feature; repeated entries must map to the same integer code.
items = ["TV", "냉장고", "전자랜지", "컴퓨터", "선풍기", "선풍기", "믹서", "믹서"]

# fit() learns the unique classes; transform() maps each item to its class index.
encoder = LabelEncoder().fit(items)
labels = encoder.transform(items)
print(f"인코딩 변환값: {labels}")

# inverse_transform() maps integer codes back to the original strings.
# NOTE: these codes are an arbitrary hand-written sequence, not `labels` from above.
codes_to_decode = [4, 5, 2, 0, 1, 1, 3, 3]
decoded = encoder.inverse_transform(codes_to_decode)
print(f"디코딩 원본값: {decoded}")

인코딩 변환값: [0 1 4 5 3 3 2 2]
디코딩 원본값: ['전자랜지' '컴퓨터' '믹서' 'TV' '냉장고' '냉장고' '선풍기' '선풍기']


### One-Hot Encoding

피처 값의 고유 값마다 컬럼을 하나씩 만들고, 각 행에서 해당 값에 대응하는 컬럼에만 1을, 나머지 컬럼에는 0을 표시하는 방식이다.

In [15]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# OneHotEncoder needs 2-D input, so turn the flat list into a single-column array.
items = np.array(
    ["TV", "냉장고", "전자랜지", "컴퓨터", "선풍기", "선풍기", "믹서", "믹서"]
).reshape(-1, 1)

# fit_transform() is shorthand for fit() followed by transform() on the same data.
encoder = OneHotEncoder()
labels = encoder.fit_transform(items)

# The encoded result is converted to a dense array before printing.
dense_labels = labels.toarray()
print(dense_labels)
print(labels.shape)

[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
(8, 6)


In [16]:
import pandas as pd

# pandas can one-hot encode a plain list directly: one integer column per unique value.
items = ["TV", "냉장고", "전자랜지", "컴퓨터", "선풍기", "선풍기", "믹서", "믹서"]
labels = pd.get_dummies(data=items, dtype="int")
display(labels)


Unnamed: 0,TV,냉장고,믹서,선풍기,전자랜지,컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


## Feature scaling


In [8]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset and wrap its feature matrix in a DataFrame with named columns.
iris_data = load_iris()
iris_df = pd.DataFrame(data=iris_data.data, columns=iris_data.feature_names)
display(iris_df)

# Per-feature mean and variance of the raw (unscaled) data, for later comparison.
feature_means = iris_df.mean()
feature_vars = iris_df.var()
print(f"[feature 평균값]\n{feature_means}")
print(f"[feature 분산값]\n{feature_vars}")

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


[feature 평균값]
sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64
[feature 분산값]
sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64


- StandardScaler

In [13]:
from sklearn.preprocessing import StandardScaler

# fit_transform() learns each feature's mean/std and standardizes in one call
# (equivalent to fit() followed by transform() on the same data).
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)

# The scaler returns a NumPy ndarray, so restore the column labels for reporting.
scaled_df = pd.DataFrame(data=iris_scaled, columns=iris_data.feature_names)

# Means come out at ~0 (floating-point noise). pandas var() defaults to the
# sample variance (ddof=1) while StandardScaler uses the population std, so
# the variance prints as n / (n - 1) (about 1.0067 here) instead of exactly 1.
scaled_means = scaled_df.mean()
scaled_vars = scaled_df.var()
print(f"[feature 평균값]\n{scaled_means}")
print(f"[feature 분산값]\n{scaled_vars}")

[feature 평균값]
sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64
[feature 분산값]
sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64


- MinMaxScaler

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature using its observed minimum and maximum.
scaler = MinMaxScaler()
iris_scaled = scaler.fit_transform(iris_df)

# Rebuild a labelled DataFrame so min/max can be reported per feature.
scaled_df = pd.DataFrame(data=iris_scaled, columns=iris_data.feature_names)
scaled_min = scaled_df.min()
scaled_max = scaled_df.max()
print(f"[feature 최소값]\n{scaled_min}")
print(f"[feature 최대값]\n{scaled_max}")


[feature 최소값]
sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64
[feature 최대값]
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64


### fit, transform 사용 시 유의사항
- 학습 데이터와 테스트 데이터는 scale 척도가 같아야 한다.
  - fit을 실행할 때마다 scale 척도가 새로 설정되므로, 테스트 데이터에는 fit을 다시 호출하지 말고 학습 데이터로 fit한 scaler로 transform만 적용해야 한다.

In [22]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Scalers expect 2-D input, so reshape the 1-D ranges into single-column arrays.
train_array = np.arange(11).reshape(-1, 1)
test_array = np.arange(6).reshape(-1, 1)

scaler = MinMaxScaler()

# Fit on the training data (values 0..10) and scale it.
train_scaled = scaler.fit(train_array).transform(train_array)
print(f"[원본 train_array] {train_array.ravel()}")
print(f"[Scaled train_array] {train_scaled.ravel()}")

# Pitfall being demonstrated: refitting on the test data (values 0..5) replaces
# the learned scale, so the test values no longer share the training scale.
test_scaled = scaler.fit(test_array).transform(test_array)
print(f"\n[원본 test_array] {test_array.ravel()}")
print(f"[Scaled test_array] {test_scaled.ravel()}")


[원본 train_array] [ 0  1  2  3  4  5  6  7  8  9 10]
[Scaled train_array] [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]

[원본 test_array] [0 1 2 3 4 5]
[Scaled test_array] [0.  0.2 0.4 0.6 0.8 1. ]


In [24]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Scalers expect 2-D input, so reshape the 1-D ranges into single-column arrays.
train_array = np.arange(11).reshape(-1, 1)
test_array = np.arange(6).reshape(-1, 1)

scaler = MinMaxScaler()

# Fit on the training data (values 0..10) and scale it.
train_scaled = scaler.fit(train_array).transform(train_array)
print(f"[원본 train_array] {train_array.ravel()}")
print(f"[Scaled train_array] {train_scaled.ravel()}")

# Deliberately no second fit(): the test data is transformed with the scale
# learned from the training data, so both arrays share the same scale.
test_scaled = scaler.transform(test_array)
print(f"\n[원본 test_array] {test_array.ravel()}")
print(f"[Scaled test_array] {test_scaled.ravel()}")


[원본 train_array] [ 0  1  2  3  4  5  6  7  8  9 10]
[Scaled train_array] [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]

[원본 test_array] [0 1 2 3 4 5]
[Scaled test_array] [0.  0.1 0.2 0.3 0.4 0.5]
