### 데이터 전처리

#### 1. 레이블 인코딩

In [1]:
from sklearn.preprocessing import LabelEncoder

In [2]:
# Sample categorical values to encode; '선풍기' and '믹서' each appear twice.
items = [
    'TV', '냉장고', '전자렌지', '컴퓨터',
    '선풍기', '선풍기',
    '믹서', '믹서',
]

In [3]:
# Create the encoder object (not yet fitted to any data).
le = LabelEncoder()

In [4]:
# Scan the data so the encoder learns the set of distinct categories.
le.fit(items)

LabelEncoder()

In [5]:
# Run the encoding: each item is replaced by its integer label.
le.transform(items)

array([0, 1, 4, 5, 3, 3, 2, 2])

In [6]:
# In practice, fitting and transforming are done in a single call.
# The freshly created encoder (le2) is the one that must be used here;
# calling fit_transform on the earlier `le` left `le2` unused.
le2 = LabelEncoder()
labels = le2.fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [8]:
# Once comfortable, create the encoder and encode in one chained expression
# (the encoder object itself is discarded, so decoding needs another fitted encoder).
labels = LabelEncoder().fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [10]:
# Map integer labels back to the original category strings.
le.inverse_transform([2,4,3,0,5,1])     # decoding

array(['믹서', '전자렌지', '선풍기', 'TV', '컴퓨터', '냉장고'], dtype='<U4')

#### 2. One-hot Encoding
- ML에서는 잘 안 쓰이지만 DL에서는 자주 쓰임

In [11]:
from sklearn.preprocessing import OneHotEncoder
# Reshape the 1-D integer label vector into a single column (n_samples, 1)
# before handing it to the encoder.
label_column = labels.reshape(-1, 1)
ohe = OneHotEncoder()
oh_labels = ohe.fit_transform(label_column)

In [15]:
# Convert the encoder output to a dense array for display
# (the .toarray() call implies the result is a sparse matrix).
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [16]:
# Keras alternative: one-hot encode the integer labels directly.
from tensorflow.keras.utils import to_categorical
to_categorical(labels)

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

#### 3. 표준화
- 각 피처를 평균 0, 표준편차 1이 되도록 변환 (분포의 모양 자체가 가우시안 정규분포로 바뀌는 것은 아님)

In [17]:
# Load the iris dataset bundled with scikit-learn; later cells read
# iris.data, iris.target and iris.feature_names.
from sklearn.datasets import load_iris
iris = load_iris()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature column of the full iris feature matrix.
# NOTE(review): the scaler is fit on the whole dataset, before any train/test
# split; in real work fit it on the training split only to avoid leakage.
std_scaler = StandardScaler()
iris_std = std_scaler.fit_transform(iris.data)

In [24]:
import pandas as pd
# Summary statistics of the raw (unscaled) iris features.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [36]:
# Summary statistics of the standardized features, for comparison with the raw ones.
df2 = pd.DataFrame(data=iris_std, columns=iris.feature_names)
df2.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.690315e-15,-1.84297e-15,-1.698641e-15,-1.409243e-15
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


- Logistic Regression으로 표준화한 데이터와 그렇지 않은 데이터의 분류 결과 비교

In [28]:
# Logistic regression classifier with a fixed seed for reproducibility.
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state=2022)

In [26]:
# Split the raw (unscaled) features; stratify on the target so both splits
# keep the class proportions, and fix the seed so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, random_state=2022
)

In [29]:
# Fit on the unscaled training split; the recorded output shows the solver
# reaching its iteration limit (a convergence warning, not an exception).
lrc.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=2022)

In [30]:
# Split the standardized features (same stratification and seed as before).
# Training on standardized data lets the solver converge within the default
# iteration limit, so the convergence warning seen on raw data does not appear.
X_train, X_test, y_train, y_test = train_test_split(
    iris_std, iris.target, stratify=iris.target, random_state=2022
)

In [31]:
# Refit the same estimator object on the current training split.
lrc.fit(X_train, y_train)

LogisticRegression(random_state=2022)

In [32]:
# Evaluate the fitted classifier on the held-out test split.
lrc.score(X_test, y_test)

0.9473684210526315

#### 4. 정규화
- 최솟값 0, 최댓값 1로 변환

In [34]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every feature column of the full iris feature matrix.
# NOTE(review): as with standardization, the scaler is fit on the whole
# dataset before splitting; fit on the training split only in real work.
mm_scaler = MinMaxScaler()
iris_mm = mm_scaler.fit_transform(iris.data)

In [35]:
# Summary statistics of the min-max scaled features.
df3 = pd.DataFrame(data=iris_mm, columns=iris.feature_names)
df3.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


In [37]:
# Split the min-max normalized features (same stratification and seed).
# Training on normalized data lets the solver converge faster, so the
# convergence warning seen on raw data does not appear.
X_train, X_test, y_train, y_test = train_test_split(
    iris_mm, iris.target, stratify=iris.target, random_state=2022
)

In [38]:
# Create a fresh classifier, fit it on the current training split,
# and report its score on the held-out test split.
lrc = LogisticRegression(random_state=2022)
lrc.fit(X_train, y_train)
lrc.score(X_test, y_test)

0.9210526315789473