# 参考
- [Category Encodersでカテゴリ特徴量をストレスなく変換する - Qiita](https://qiita.com/Hyperion13fleet/items/afa49a84bd5db65ffc31)

In [1]:
import warnings

import category_encoders
import pandas as pd

# Suppress every warning category for the rest of the session
# (presumably to keep the cell outputs free of library warnings).
warnings.filterwarnings('ignore')

In [2]:
# Load only the columns that the encoding examples below work with.
train = pd.read_csv(
    '../input/train.csv',
    usecols=['PassengerId', 'Survived', 'Pclass', 'Sex', 'Cabin', 'Embarked'],
)

# OneHotEncoder

## その1: 素直にやる
- 欠損は別カラム扱い(デフォルト)
- カラム名は連番になるみたいね

In [3]:
# Columns to encode.
target_cols = ['Embarked']

# One-hot encode with the library defaults.
encoder = category_encoders.OneHotEncoder(cols=target_cols)
df = encoder.fit_transform(train)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Cabin,Embarked_1,Embarked_2,Embarked_3,Embarked_4
0,1,0,3,male,,1,0,0,0
1,2,1,1,female,C85,0,1,0,0
2,3,1,3,female,,1,0,0,0
3,4,1,1,female,C123,1,0,0,0
4,5,0,3,male,,1,0,0,0


In [4]:
# Show the rows whose fourth Embarked indicator column is set.
is_fourth_category = df['Embarked_4'] == 1
df[is_fourth_category]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Cabin,Embarked_1,Embarked_2,Embarked_3,Embarked_4
61,62,1,1,female,B28,0,0,0,1
829,830,1,1,female,B28,0,0,0,1


In [5]:
# Embarked_4 corresponds to the rows where Embarked is missing:
# look up the same two positions in the raw data.
train.iloc[[61, 829]]

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Cabin,Embarked
61,62,1,1,female,B28,
829,830,1,1,female,B28,


## その2: handle_missing
```
    handle_missing: str
        options are 'error', 'return_nan', 'value', and 'indicator'. The default is 'value'. Warning: if indicator is used,
        an extra column will be added in if the transform matrix has nan values.  This can cause
        unexpected changes in dimension in some cases.
```

In [6]:
# Columns to encode.
target_cols = ['Embarked']

# Same encoder as before, but with handle_missing switched to 'return_nan'.
encoder = category_encoders.OneHotEncoder(
    cols=target_cols,
    handle_missing='return_nan',
)
df = encoder.fit_transform(train)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Cabin,Embarked_1,Embarked_2,Embarked_3,Embarked_4
0,1,0,3,male,,1.0,0.0,0.0,0.0
1,2,1,1,female,C85,0.0,1.0,0.0,0.0
2,3,1,3,female,,1.0,0.0,0.0,0.0
3,4,1,1,female,C123,1.0,0.0,0.0,0.0
4,5,0,3,male,,1.0,0.0,0.0,0.0


## その3: カラム名のsuffixをどうにかする
- カラム名どうにかならんの？ Embarked_Sとかにさ。

In [7]:
# Columns to encode.
target_cols = ['Embarked']

# use_cat_names=True puts the category value into the generated column name
# instead of a running index.
encoder = category_encoders.OneHotEncoder(
    cols=target_cols,
    use_cat_names=True,
)
df = encoder.fit_transform(train)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Cabin,Embarked_S,Embarked_C,Embarked_Q,Embarked_nan
0,1,0,3,male,,1,0,0,0
1,2,1,1,female,C85,0,1,0,0
2,3,1,3,female,,1,0,0,0
3,4,1,1,female,C123,1,0,0,0
4,5,0,3,male,,1,0,0,0


## その4: 複数のカラムをエンコーディング
- 位置が変化しないのも面白いね
    - 対象のDFのカラム位置で決まるっぽいね

In [8]:
# Columns to encode (several at once this time).
target_cols = ['Embarked', 'Sex']

encoder = category_encoders.OneHotEncoder(
    cols=target_cols,
    use_cat_names=True,
)
df = encoder.fit_transform(train)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex_male,Sex_female,Cabin,Embarked_S,Embarked_C,Embarked_Q,Embarked_nan
0,1,0,3,1,0,,1,0,0,0
1,2,1,1,0,1,C85,0,1,0,0
2,3,1,3,0,1,,1,0,0,0
3,4,1,1,0,1,C123,1,0,0,0
4,5,0,3,1,0,,1,0,0,0


## その5: Int型に適用する
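- `use_cat_names=False`にするとイメージ通りの動き
    - ただしサフィックスは元の値ではなく出現順の連番になる点に注意（下の出力では Pclass=3 の行で `Pclass_1` が 1、Pclass=1 の行で `Pclass_2` が 1 になっている）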

In [9]:
# Columns to encode; Pclass is an integer column.
target_cols = ['Embarked', 'Pclass']

# Author's note: with use_cat_names=True the columns end up named
# like `Pclass_1.0`, so index-based names are used here.
encoder = category_encoders.OneHotEncoder(
    cols=target_cols,
    use_cat_names=False,
)
df = encoder.fit_transform(train)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass_1,Pclass_2,Pclass_3,Sex,Cabin,Embarked_1,Embarked_2,Embarked_3,Embarked_4
0,1,0,1,0,0,male,,1,0,0,0
1,2,1,0,1,0,female,C85,0,1,0,0
2,3,1,1,0,0,female,,1,0,0,0
3,4,1,0,1,0,female,C123,1,0,0,0
4,5,0,1,0,0,male,,1,0,0,0


# BinaryEncoder

In [10]:
# Columns to encode.
target_cols = ['Sex', 'Embarked']

# Binary-encode both columns in one pass.
encoder = category_encoders.BinaryEncoder(cols=target_cols)
df = encoder.fit_transform(train)

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex_0,Sex_1,Cabin,Embarked_0,Embarked_1,Embarked_2
0,1,0,3,0,1,,0,0,1
1,2,1,1,1,0,C85,0,1,0
2,3,1,3,1,0,,0,0,1
3,4,1,1,1,0,C123,0,0,1
4,5,0,3,0,1,,0,0,1


# OrdinalEncoder
- 木モデルならこれだなあ

In [11]:
# Columns to encode.
target_cols = ['Sex', 'Embarked']

# Replace each category with an integer code, keeping one column per feature.
encoder = category_encoders.OrdinalEncoder(cols=target_cols)
encoded = encoder.fit_transform(train)

encoded.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Cabin,Embarked
0,1,0,3,1,,1
1,2,1,1,2,C85,2
2,3,1,3,2,,1
3,4,1,1,2,C123,1
4,5,0,3,1,,1


# おわりに
- 参考記事にある通り、これらは強いな
    - 対象を指定できる
    - DFを渡すとそのままDFで返ってくる
- 位置が変わらないのも個人的には◎
- まだ見ぬEncodingと引数も調べたらよりGoodだね
- 参考記事は、sklearn.preprocessingと比較しているのがマジで偉い