## 라벨 인코딩

In [50]:
### 01. Prepare the data
import pandas as pd

# One column ("eng") holding four single-letter strings.
data = dict(eng=["b", "c", "a", "d"])
df = pd.DataFrame(data=data)
print(type(df))
df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,eng
0,b
1,c
2,a
3,d


In [56]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [48]:
# Learn the distinct letters first, then map every row to its integer code
# and store the codes in a new column next to the original values.
en_x = LabelEncoder()
en_x.fit(df['eng'])
df['라벨인코딩'] = en_x.transform(df['eng'])
df

Unnamed: 0,eng,라벨인코딩
0,b,1
1,c,2
2,a,0
3,d,3


## 원핫 인코딩

In [49]:
# Inspect the label-encoded column as a flat (1-D) NumPy array.
df['라벨인코딩'].to_numpy()

array([1, 2, 0, 3])

In [8]:
# Turn the flat code array into a column vector: one row per sample, one column.
val = df['라벨인코딩'].values.reshape((-1, 1))
val

array([[1],
       [2],
       [0],
       [3]])

In [10]:
onehot = OneHotEncoder()
# Reshape the codes into the 2-D column layout that OneHotEncoder is given here.
val = df['라벨인코딩'].values.reshape((-1, 1))
# Fit on the codes, encode them, then convert the encoded result to a dense array.
onehot.fit(val)
y = onehot.transform(val).toarray()
y

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])

In [9]:
# Shape demonstration: with reshape(-1, 2) the same four codes are laid out
# two per row instead of one per row. Note that this rebinds `val`.
val = df['라벨인코딩'].values.reshape((-1, 2))
val

array([[1, 2],
       [0, 3]])

In [11]:
# Wrap the dense one-hot array in a DataFrame of integers and print it as text.
onehot_val = pd.DataFrame(y, dtype=int)
print(onehot_val)

# Append the indicator columns to the right of the original frame (rows align on index).
df_new = pd.concat([df, onehot_val], axis="columns")
df_new

   0  1  2  3
0  0  1  0  0
1  0  0  1  0
2  1  0  0  0
3  0  0  0  1


Unnamed: 0,eng,라벨인코딩,0,1,2,3
0,b,1,0,1,0,0
1,c,2,0,0,1,0
2,a,0,1,0,0,0
3,d,3,0,0,0,1


### 실습 - 라벨 인코딩, 원핫 인코딩

In [38]:
# Exercise data: a single column of company names, with "Google" listed twice.
data = {
    "회사명": ["MS", "Apple", "Google", "Google"],
}
df1 = pd.DataFrame.from_dict(data)
df1

Unnamed: 0,회사명
0,MS
1,Apple
2,Google
3,Google


In [33]:
# Fit the encoder on the company names (fit returns the encoder itself),
# then store each row's integer code in a new column.
le = LabelEncoder().fit(df1['회사명'])
df1['라벨인코딩'] = le.transform(df1['회사명'])
df1

Unnamed: 0,회사명,라벨인코딩
0,MS,2
1,Apple,0
2,Google,1
3,Google,1


In [35]:
ohe = OneHotEncoder()
# Column vector of label codes: one row per company entry.
val = df1['라벨인코딩'].values.reshape((-1, 1))
# Fit, encode, and convert the encoded result to a dense array.
ohe.fit(val)
y = ohe.transform(val).toarray()
y

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [36]:
# Integer DataFrame view of the dense one-hot array, printed as plain text.
onehot_val = pd.DataFrame(y, dtype=int)
print(onehot_val)
# Append the indicator columns to the right of df1 (rows align on index).
df_new = pd.concat([df1, onehot_val], axis="columns")
df_new

   0  1  2
0  0  0  1
1  1  0  0
2  0  1  0
3  0  1  0


Unnamed: 0,회사명,라벨인코딩,0,1,2
0,MS,2,0,0,1
1,Apple,0,1,0,0
2,Google,1,0,1,0
3,Google,1,0,1,0


### OneHotEncoding 정리

In [37]:
### 01. Prepare the data
import pandas as pd

# Two numeric feature columns plus a string-valued target column.
data = dict(
    feature1=[2, 3, 8, 4],
    feature2=[22, 32, 82, 42],
    target=["b", "c", "a", "d"],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,feature1,feature2,target
0,2,22,b
1,3,32,c
2,8,82,a
3,4,42,d


In [39]:
from sklearn import preprocessing

# Fit the encoder on the target column, then write the integer codes to 'lbl_en'.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['target'])
df['lbl_en'] = label_encoder.transform(df['target'])
df

Unnamed: 0,feature1,feature2,target,lbl_en
0,2,22,b,1
1,3,32,c,2
2,8,82,a,0
3,4,42,d,3


In [40]:
# Reshape the integer codes into a column vector (one row per sample).
train_y = df['lbl_en'].values.reshape(-1, 1)

# The `sparse` constructor argument was deprecated in scikit-learn 1.2 and
# removed in 1.4, so OneHotEncoder(sparse=False) raises TypeError on current
# releases. Build the encoder with its defaults and densify the result with
# .toarray() instead; this works on old and new scikit-learn versions alike.
onehot_encoder = preprocessing.OneHotEncoder()
train_y_onehot = onehot_encoder.fit_transform(train_y).toarray()
print(train_y_onehot)
print(train_y_onehot.shape)

[[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]]
(4, 4)


### One Hot Encoding with Pandas

In [41]:
import pandas as pd
import os

In [42]:
# Single categorical column; the value '양말' appears twice.
demo_df = pd.DataFrame(['양말', '여우', '양말', '상자'], columns=["범주형_feature"])
display(demo_df)

Unnamed: 0,범주형_feature
0,양말
1,여우
2,양말
3,상자


In [43]:
# pandas >= 2.0 returns boolean (True/False) dummy columns by default.
# Request integers explicitly so the indicator columns are 0/1 on every
# pandas version.
onehot = pd.get_dummies(demo_df, dtype=int)
onehot

Unnamed: 0,범주형_feature_상자,범주형_feature_양말,범주형_feature_여우
0,0,1,0
1,0,0,1
2,0,1,0
3,1,0,0


In [44]:
# Put the original categorical column and its dummy columns side by side
# (rows align on index). Note that this rebinds `df`.
df = pd.concat([demo_df, onehot], axis="columns")
df

Unnamed: 0,범주형_feature,범주형_feature_상자,범주형_feature_양말,범주형_feature_여우
0,양말,0,1,0
1,여우,0,0,1
2,양말,0,1,0
3,상자,1,0,0
