# 데이터 인코딩

## label encoding
문자열(범주형) 피쳐의 각 고유값을 정수 코드로 변환해 주는 인코딩 방식

In [1]:
import pandas as pd

In [2]:
# Toy product table: one string feature (name) and one numeric feature (price).
df = pd.DataFrame(
    {
        'name': ['냉장고', 'TV', '세탁기'],
        'price': [13, 23, 12],
    }
)

In [3]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
 1   price   3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


In [4]:
from sklearn import preprocessing

In [5]:
le = preprocessing.LabelEncoder() # create the encoder object

In [7]:
# Build the label -> integer-code mapping table from the name column.
le.fit(df['name'])

LabelEncoder()

In [8]:
# Encode the name column with the mapping learned by fit().
# le.fit_transform(df.name) would do both steps in a single call; fit and
# transform are kept separate here so the fitted encoding can be reused.
le.transform(df['name'])

array([1, 0, 2])

In [11]:
# Labels learned by fit(); each label's position in this array is its integer code.
le.classes_

array(['TV', '냉장고', '세탁기'], dtype=object)

In [13]:
# Map integer codes back to their original labels.
le.inverse_transform([0,2,0])

array(['TV', '세탁기', 'TV'], dtype=object)

In [9]:
# Store the label-encoded name as a new integer column.
df['id'] = le.transform(df['name'])

In [10]:
# Display the frame, now including the id column.
df

Unnamed: 0,name,price,id
0,냉장고,13,1
1,TV,23,0
2,세탁기,12,2


In [None]:
# Copy of the label-encoded frame.
# NOTE(review): df2 is not used in the visible part of this notebook — confirm it is still needed.
df2 = df.copy()

## one-hot encoding

In [14]:
# Approach 1: OneHotEncoder

ohe = preprocessing.OneHotEncoder()

In [20]:
# reshape(-1, 1) turns the 1-D name values into a single-column 2-D array before
# encoding; .toarray() converts the encoder's result to a dense array for display.
ohe.fit_transform(df.name.values.reshape(-1,1)).toarray()

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [21]:
# Approach 2: LabelBinarizer

lb = preprocessing.LabelBinarizer()

In [23]:
# Fit on the name column and return its one-hot (0/1) matrix in one call.
lb.fit_transform(df['name'])

array([[0, 1, 0],
       [1, 0, 0],
       [0, 0, 1]])

In [24]:
# Easiest approach: pandas.get_dummies()
# Only the string column (name) is expanded into indicator columns; the numeric
# columns (price, id) pass through unchanged.
# dtype='uint8' pins the indicators to 0/1 integers: pandas >= 2.0 defaults to
# bool (True/False), while earlier versions defaulted to uint8.

pd.get_dummies(df, dtype='uint8')

Unnamed: 0,price,id,name_TV,name_냉장고,name_세탁기
0,13,1,0,1,0
1,23,0,1,0,0
2,12,2,0,0,1


In [25]:
# One-hot encode just the name column; the indicator columns are named after the labels.
# dtype='uint8' keeps the 0/1 integer output on pandas >= 2.0, where the default is bool.
pd.get_dummies(df['name'], dtype='uint8')

Unnamed: 0,TV,냉장고,세탁기
0,0,1,0
1,1,0,0
2,0,0,1


# 피쳐 스케일링과 정규화

In [26]:
from sklearn.datasets import load_iris

In [28]:
# Load the iris dataset and wrap its feature matrix in a DataFrame whose
# columns are the dataset's feature names.
iris = load_iris()

iris_df = pd.DataFrame(
    data=iris['data'],
    columns=iris['feature_names'],
)

In [29]:
# Summary statistics of the raw features, as a baseline before scaling.
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


## StandardScaler

In [30]:
# Create the StandardScaler object.
ss = preprocessing.StandardScaler()

In [32]:
# Fit the scaler on the iris features.
ss.fit(iris_df)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [36]:
# Apply the fitted StandardScaler and put the result back into a DataFrame
# that reuses the original column names.
iris_ss = pd.DataFrame(
    data=ss.transform(iris_df),
    columns=iris_df.columns,
)

In [37]:
# Check the result: every column's mean is ~0 and its std is ~1.
# NOTE(review): std prints as 1.00335 rather than exactly 1 — presumably because
# describe() reports the sample std (ddof=1) while StandardScaler divides by the
# population std; confirm against the library docs.
iris_ss.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.690315e-15,-1.84297e-15,-1.698641e-15,-1.409243e-15
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


## MinMaxScaler

In [38]:
# Create the MinMaxScaler object.
mms = preprocessing.MinMaxScaler()

In [39]:
# Fit the scaler on the iris features.
mms.fit(iris_df)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [40]:
# Apply the fitted MinMaxScaler and put the result back into a DataFrame
# that reuses the original column names.
iris_mms = pd.DataFrame(
    data=mms.transform(iris_df),
    columns=iris_df.columns,
)

In [41]:
# Check the result: every column now has min 0 and max 1.
iris_mms.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0
