In [1]:
import pandas as pd
import numpy as np

# Toy data shared by every encoding example below: two categorical columns
# ('label', 'binary') and one integer column ('num').
df = pd.DataFrame({
    'label': ['black', 'white', 'yellow', 'black'],
    'binary': ['yes', 'no', 'no', 'yes'],
    'num': [1, 2, 3, 4],
})
df

Unnamed: 0,label,binary,num
0,black,yes,1
1,white,no,2
2,yellow,no,3
3,black,yes,4


In [2]:
# Label Encoding: assign an integer code to each distinct label value.
from sklearn.preprocessing import LabelEncoder

df_a = df.copy()

# Fit a separate encoder per column and keep every one of them. A
# LabelEncoder only remembers the classes from its most recent fit, so
# re-fitting a single shared instance would discard the 'label' mapping
# (and the ability to inverse_transform it) once 'binary' is fitted.
label_encoders = {}
for col in ['label', 'binary']:
    le = LabelEncoder()
    df_a[col] = le.fit_transform(df_a[col])
    label_encoders[col] = le
df_a

Unnamed: 0,label,binary,num
0,0,1,1
1,1,0,2
2,2,0,3
3,0,1,4


In [3]:
# Label Encoding: assign an integer code to each distinct label value.
# OrdinalEncoder handles several columns in a single call, so binary
# categories can be encoded together with the rest.
from sklearn.preprocessing import OrdinalEncoder

df_b = df.copy()
oe = OrdinalEncoder()

# Encode every column of the frame, then attach the codes as new columns
# next to the originals.
encoded = oe.fit_transform(df_b)
encoded_cols = ['labelEncoded', 'binaryEncoded', 'numEncoded']
df_b.loc[:, encoded_cols] = encoded
df_b

Unnamed: 0,label,binary,num,labelEncoded,binaryEncoded,numEncoded
0,black,yes,1,0.0,1.0,0.0
1,white,no,2,1.0,0.0,1.0
2,yellow,no,3,2.0,0.0,2.0
3,black,yes,4,0.0,1.0,3.0


In [4]:
# One-hot Encoding: create one indicator feature per label value and set it
# to 1/0 (True/False) for each row.
from sklearn.preprocessing import OneHotEncoder

df_c = df.copy()
# drop='first' removes one indicator per feature to avoid multicollinearity.
# The dense/sparse switch is intentionally not passed: it was named `sparse`
# up to scikit-learn 1.1 and `sparse_output` from 1.2 on (`sparse` was removed
# in 1.4), so the default sparse result is densified with .toarray() instead,
# which works on every version.
ohe = OneHotEncoder(drop='first')
ohe.fit(df_c)

df_new = pd.DataFrame(ohe.transform(df_c).toarray(),
                      columns=ohe.get_feature_names_out(),
                      index=df_c.index,  # keep rows aligned with df_c for the concat below
                      dtype=np.int8)
df_c2 = pd.concat([df_c, df_new], axis=1)
df_c2

Unnamed: 0,label,binary,num,label_white,label_yellow,binary_yes,num_2,num_3,num_4
0,black,yes,1,0,0,1,0,0,0
1,white,no,2,1,0,0,1,0,0
2,yellow,no,3,0,1,0,0,1,0
3,black,yes,4,0,0,1,0,0,1


In [5]:
# Count Encoding: replace each label with the number of rows that carry it.
df_d = df.copy()

# Occurrence count per distinct label, looked up row by row.
label_counts = df_d['label'].value_counts()
df_d['label'] = df_d['label'].map(label_counts)
df_d

Unnamed: 0,label,binary,num
0,2,yes,1
1,1,no,2
2,1,no,3
3,2,yes,4


In [6]:
# Label Count (Count Rank) Encoding: replace each label with the rank of its
# occurrence count.
df_e = df.copy()

label_counts = df_e.groupby('label')['label'].count()
# ascending=False gives the most frequent label rank 1; labels with equal
# counts share an averaged rank (hence fractional values such as 2.5).
count_rank = label_counts.rank(ascending=False)
df_e['label'] = df_e['label'].map(count_rank)
df_e

Unnamed: 0,label,binary,num
0,1.0,yes,1
1,2.5,no,2
2,2.5,no,3
3,1.0,yes,4


In [7]:
# Target Encoding (Greedy Target Statistics): replace each label with the
# mean of the target variable over the rows sharing that label. The 'num'
# column plays the role of the target here.
df_f = df.copy()

target_dict = df_f.groupby('label')['num'].mean().to_dict()
# Hand the dict to Series.map directly: a label that is absent from the dict
# becomes NaN instead of raising KeyError as an explicit target_dict[x]
# lookup would, and the result stays index-aligned without going via .values.
df_f['label'] = df_f['label'].map(target_dict)

df_f

Unnamed: 0,label,binary,num
0,2.5,yes,1
1,2.0,no,2
2,3.0,no,3
3,2.5,yes,4


In [8]:
# # Target Encoding (Leave-one-out Target Statistics): assign each label the
# # mean of the target variable.
# # NOTE(review): `ce` is not imported in the cells shown here (presumably the
# # category_encoders package - confirm), and the df built at the top of this
# # notebook has columns label/binary/num with no 'target' column, so this
# # snippet needs adapting before it is re-enabled.
# te = ce.LeaveOneOutEncoder(cols=['label'])
# df_encoded = te.fit_transform(df['label'], df['target'])


In [9]:
# # Target Encoding (Ordered Target Statistics): assign each label the mean of
# # the target variable.
# # NOTE(review): `ce` is not imported in the cells shown here (presumably the
# # category_encoders package - confirm), and the df built at the top of this
# # notebook has columns label/binary/num with no 'target' column, so this
# # snippet needs adapting before it is re-enabled.
# te = ce.CatBoostEncoder(cols=['label'])
# df_encoded = te.fit_transform(df['label'], df['target'])
# print(df_encoded)