## OneHotEncoder 방법 사용하기
### 1. scikit-learn OneHotEncoder
[주의사항]
- pandas의 시리즈가 아닌 numpy 행렬을 입력해야함 → values 이용(ex: goods.values.reshape(-1, 1))

- 벡터 입력을 허용하지 않음 → reshape을 이용해 Matrix로 변환 필요

### 2. pandas의 get_dummies()를 활용한 OneHotEncoder

### 라이브러리 임포트와 데이터 준비

In [164]:
import numpy as np
import pandas as pd
from random import randint
from sklearn.preprocessing import OneHotEncoder

In [165]:
# Lay the integers 1..14 out row by row as a 7x2 matrix.
data = np.reshape(np.arange(1, 15), (-1, 2))
data

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10],
       [11, 12],
       [13, 14]])

In [166]:
# Wrap the matrix in a two-column DataFrame, then append the product-code
# column that the one-hot examples below encode.
df = pd.DataFrame(data=data, columns=['판매수량', '재고수량'])
df.insert(df.shape[1], '상품', ['a001', 'a002', 'a001', 'a002', 'b002', 'b001', 'b001'])
df

Unnamed: 0,판매수량,재고수량,상품
0,1,2,a001
1,3,4,a002
2,5,6,a001
3,7,8,a002
4,9,10,b002
5,11,12,b001
6,13,14,b001


In [167]:
# List the distinct product codes in order of first appearance.
df.loc[:, '상품'].unique()

array(['a001', 'a002', 'b002', 'b001'], dtype=object)

### 1. scikit-learn OneHotEncoder 사용하기

In [168]:
from sklearn.preprocessing import OneHotEncoder
# By default OneHotEncoder returns a sparse matrix; ask for a dense NumPy
# array instead. The keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new
# spelling first and fall back to the old one on older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
ohe

OneHotEncoder(sparse=False)

In [169]:
# Take the product-code column as a Series and show both its values and its
# one-dimensional shape.
goods = df['상품']
print(goods, goods.shape, sep='\n')

0    a001
1    a002
2    a001
3    a002
4    b002
5    b001
6    b001
Name: 상품, dtype: object
(7,)


In [174]:
# The encoder needs a 2-D NumPy array as input, so turn the 1-D values into
# a single-column matrix with reshape().
goods_reshape = goods.values.reshape((-1, 1))
goods_reshape.shape

(7, 1)

In [175]:
# Learn the categories and encode the same column in a single call.
ohe_data = ohe.fit_transform(goods_reshape)
ohe_data

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [176]:
# The encoder emits floats; cast the indicator matrix to integers for display.
ohe_data = ohe_data.astype(dtype=int)
print(ohe_data)

[[1 0 0 0]
 [0 1 0 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 0 0 1]
 [0 0 1 0]
 [0 0 1 0]]


In [162]:
# Inspect the fitted categories: the attribute is a list, and here it holds
# a single array of labels for the one encoded column.
print(ohe.categories_.__class__)
ohe.categories_

<class 'list'>


[array(['a001', 'a002', 'b001', 'b002'], dtype=object)]

In [150]:
# Use the fitted category labels as the column names.
# `categories_` is a list with one array per encoded input column; passing the
# whole list would make pandas build MultiIndex columns, so take the single
# array for our one column to get a flat, ordinary column index.
ohe_df = pd.DataFrame(ohe_data, columns=ohe.categories_[0])
ohe_df

Unnamed: 0,a001,a002,b001,b002
0,1,0,0,0
1,0,1,0,0
2,1,0,0,0
3,0,1,0,0
4,0,0,0,1
5,0,0,1,0
6,0,0,1,0


### 2. Pandas의 get_dummies()를 활용한 one hot encoding

In [163]:
import pandas as pd
# One-hot encode the product codes directly with pandas.
# Request an integer dtype explicitly: pandas >= 2.0 returns boolean
# (True/False) indicator columns by default, whereas this walkthrough shows
# and compares 0/1 values.
ohe_df = pd.get_dummies(goods, dtype=int)
ohe_df

Unnamed: 0,a001,a002,b001,b002
0,1,0,0,0
1,0,1,0,0
2,1,0,0,0
3,0,1,0,0
4,0,0,0,1
5,0,0,1,0
6,0,0,1,0
