In [2]:
from sklearn.preprocessing import LabelEncoder

items = ['TV', '냉장고', '전자랜지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Learn the category -> integer mapping and encode the items in a single pass.
# The encoder stays fitted, so later cells can still use encoder.classes_ and
# encoder.inverse_transform().
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
print(labels)

[0 1 4 5 3 3 2 2]


In [3]:
# Inspect the original (pre-encoding) values through the encoder.classes_ attribute
encoder.classes_

array(['TV', '냉장고', '믹서', '선풍기', '전자랜지', '컴퓨터'], dtype='<U4')

In [4]:
# Decode integer codes back into their original string values
encoder.inverse_transform([3, 0, 2, 1])

array(['선풍기', 'TV', '믹서', '냉장고'], dtype='<U4')

In [5]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import numpy as np

items = ['TV', '냉장고', '전자랜지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# Step 1: turn each category string into an integer code with LabelEncoder.
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
labels
         

array([0, 1, 4, 5, 3, 3, 2, 2], dtype=int64)

In [6]:
# Step 2: convert the 1-D label array into 2-D data
labels = labels.reshape(-1, 1) # -1 lets NumPy infer the number of rows; the trailing 1 fixes a single column
labels # 8 rows x 1 column

array([[0],
       [1],
       [4],
       [5],
       [3],
       [3],
       [2],
       [2]], dtype=int64)

In [7]:
# Step 3: apply one-hot encoding to the 2-D integer codes.
one_encoder = OneHotEncoder()
one_labels = one_encoder.fit_transform(labels)
one_labels

<8x6 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [8]:
print(one_labels) # each printed line is a (row, column) coordinate followed by the value stored there, i.e. a 1.0 sits at (0, 0), (1, 1), ...

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 4)	1.0
  (3, 5)	1.0
  (4, 3)	1.0
  (5, 3)	1.0
  (6, 2)	1.0
  (7, 2)	1.0


In [11]:
# Print the encoded data as a dense 2-D array, preceded by a caption line
print('원-핫 인코딩 데이터', one_labels.toarray(), sep='\n')

원-핫 인코딩 데이터
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [12]:
# Print the (rows, columns) shape of the one-hot matrix, preceded by a caption line
print('원-핫 인코딩 데이터 차원', one_labels.shape, sep='\n')

원-핫 인코딩 데이터 차원
(8, 6)


In [13]:
import pandas as pd

# Single-column DataFrame ('item') holding the category strings, one per row.
df = pd.DataFrame(
    ['TV', '냉장고', '전자랜지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'],
    columns=['item'],
)
df


Unnamed: 0,item
0,TV
1,냉장고
2,전자랜지
3,컴퓨터
4,선풍기
5,선풍기
6,믹서
7,믹서


In [14]:
# One-hot encode the string column of df. The dtype is pinned to uint8 so the
# result is 0/1 integers on every pandas version (pandas >= 2.0 would otherwise
# default to True/False booleans).
pd.get_dummies(df, dtype='uint8')

Unnamed: 0,item_TV,item_냉장고,item_믹서,item_선풍기,item_전자랜지,item_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


In [15]:
# Convert the one-hot encoded pandas DataFrame into a NumPy array.
# The dtype is pinned to uint8 so the array holds 0/1 integers on every pandas
# version (pandas >= 2.0 would otherwise produce a boolean array).
pd.get_dummies(df, dtype='uint8').to_numpy()

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0]], dtype=uint8)

## 데이터 표준화 (StandardScaler)

In [19]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris measurements into a DataFrame labelled with the feature names.
iris = load_iris()
iris_data = iris.data
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)

# Per-feature mean and standard deviation before any scaling is applied.
print(iris_df.mean())
print(iris_df.std())

sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64
sepal length (cm)    0.828066
sepal width (cm)     0.435866
petal length (cm)    1.765298
petal width (cm)     0.762238
dtype: float64


In [23]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature and wrap the scaled array back into a DataFrame
# so the original column names are kept.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)
iris_scaled_df = pd.DataFrame(iris_scaled, columns=iris.feature_names)

# The means come out at ~0. The printed std is ~1.0034 rather than exactly 1
# because DataFrame.std() uses the sample estimator (ddof=1) while
# StandardScaler divides by the population standard deviation (ddof=0).
print(iris_scaled_df.mean())
print(iris_scaled_df.std())

sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64
sepal length (cm)    1.00335
sepal width (cm)     1.00335
petal length (cm)    1.00335
petal width (cm)     1.00335
dtype: float64


## 데이터 정규화(MinMaxScaler)

In [25]:
from sklearn.preprocessing import MinMaxScaler

# Fit a MinMaxScaler on the iris features, rescale them, and print the scaled array.
scaler = MinMaxScaler()
iris_scaled = scaler.fit_transform(iris_df)
print(iris_scaled)

[[0.22222222 0.625      0.06779661 0.04166667]
 [0.16666667 0.41666667 0.06779661 0.04166667]
 [0.11111111 0.5        0.05084746 0.04166667]
 [0.08333333 0.45833333 0.08474576 0.04166667]
 [0.19444444 0.66666667 0.06779661 0.04166667]
 [0.30555556 0.79166667 0.11864407 0.125     ]
 [0.08333333 0.58333333 0.06779661 0.08333333]
 [0.19444444 0.58333333 0.08474576 0.04166667]
 [0.02777778 0.375      0.06779661 0.04166667]
 [0.16666667 0.45833333 0.08474576 0.        ]
 [0.30555556 0.70833333 0.08474576 0.04166667]
 [0.13888889 0.58333333 0.10169492 0.04166667]
 [0.13888889 0.41666667 0.06779661 0.        ]
 [0.         0.41666667 0.01694915 0.        ]
 [0.41666667 0.83333333 0.03389831 0.04166667]
 [0.38888889 1.         0.08474576 0.125     ]
 [0.30555556 0.79166667 0.05084746 0.125     ]
 [0.22222222 0.625      0.06779661 0.08333333]
 [0.38888889 0.75       0.11864407 0.08333333]
 [0.22222222 0.75       0.08474576 0.08333333]
 [0.30555556 0.58333333 0.11864407 0.04166667]
 [0.22222222 

In [26]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the iris features with MinMaxScaler and wrap the result in a
# DataFrame that keeps the original column names.
scaler = MinMaxScaler()
iris_scaled = scaler.fit_transform(iris_df)
iris_scaled_df = pd.DataFrame(iris_scaled, columns=iris.feature_names)

# Print the per-feature minimum and maximum of the scaled data.
print(iris_scaled_df.min())
print(iris_scaled_df.max())

sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64
