## Category Encoders

### Step 1. Import the necessary packages

In [27]:
import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Render every float in displayed frames with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Build a six-row toy frame: one categorical feature plus an integer target.
df = pd.DataFrame(
    {
        "color": list("acaabb"),
        "outcome": [1, 2, 0, 0, 0, 1],
    }
)
# Split into features (X) and target (y); both remain DataFrames.
X = df.drop(columns="outcome")
y = df.drop(columns="color")

In [4]:
# Display the feature frame (the 'color' column only at this point).
X

Unnamed: 0,color
0,a
1,c
2,a
3,a
4,b
5,b


In [5]:
# Display the full frame: feature and target side by side.
df

Unnamed: 0,color,outcome
0,a,1
1,c,2
2,a,0
3,a,0
4,b,0
5,b,1


## Classic Encoder

### Step 2. Binary Encoder

In [6]:
# Binary-encode the 'color' column: fit on (X, y), then transform X.
ce_binary = ce.BinaryEncoder(cols=['color'])
ce_binary.fit(X, y).transform(X)

Unnamed: 0,color_0,color_1,color_2
0,0,0,1
1,0,1,0
2,0,0,1
3,0,0,1
4,0,1,1
5,0,1,1


### Step 3. OrdinalEncoder

In [20]:
# Ordinal-encode 'color': each category is replaced by an integer code.
ce_ord = ce.OrdinalEncoder(cols=['color'])
ce_ord.fit(X).transform(X)

Unnamed: 0,color
0,1
1,2
2,1
3,1
4,3
5,3


#### sklearn LabelEncoder

In [25]:
# Label-encode 'color' with scikit-learn and store the codes as a new column.
# LabelEncoder operates on a 1-D array of labels, so it is fitted on the
# column directly instead of going through DataFrame.apply on a one-column
# frame (which also assigned a DataFrame, not a Series, to the new column).
le = LabelEncoder()
X['color_label'] = le.fit_transform(X['color'])
X

Unnamed: 0,color,color_label
0,a,0
1,c,2
2,a,0
3,a,0
4,b,1
5,b,1


### Step 4. OneHotEncoder

In [26]:
# One-hot encode 'color'; columns not listed in `cols` (here 'color_label')
# are carried through unchanged.
ce_one_hot = ce.OneHotEncoder(cols=['color'])
ce_one_hot.fit(X).transform(X)

Unnamed: 0,color_1,color_2,color_3,color_-1,color_label
0,1,0,0,0,0
1,0,1,0,0,2
2,1,0,0,0,0
3,1,0,0,0,0
4,0,0,1,0,1
5,0,0,1,0,1


#### Sklearn OneHot

In [46]:
# scikit-learn's one-hot encoder, created with its default settings.
ohc = OneHotEncoder()

In [50]:
# One-hot encode the integer label column (passed as a one-column frame);
# .toarray() converts the encoder output into the dense array shown below.
ohc.fit_transform(X[['color_label']]).toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

#### pandas OneHot

In [51]:
# pandas one-hot encoding: one indicator column per distinct 'color' value.
one_hot = pd.get_dummies(X['color'])

In [52]:
# Display the indicator columns produced by get_dummies.
one_hot

Unnamed: 0,a,b,c
0,1,0,0
1,0,0,1
2,1,0,0
3,1,0,0
4,0,1,0
5,0,1,0


In [None]:
# Encode the whole frame, dropping the first indicator level of each encoded
# column; this overwrites the previous `one_hot`.
one_hot = pd.get_dummies(
    X,
    drop_first=True,
    prefix_sep='_',
)

## Contrast Encoder

### Step 5.  Backward Difference Encoding

A feature with K categories (levels) usually enters a regression as a sequence of K-1 dummy variables. In backward difference coding, the mean of the dependent variable for a level is compared with the mean of the dependent variable for the prior level. This type of coding may be useful for a nominal or an ordinal variable.

In [54]:
# Display the feature frame, which now also holds the 'color_label' column.
X

Unnamed: 0,color,color_label
0,a,0
1,c,2
2,a,0
3,a,0
4,b,1
5,b,1


In [53]:
# Backward-difference contrast coding of 'color'; the result also carries an
# intercept column and leaves 'color_label' untouched.
ce_bde = ce.BackwardDifferenceEncoder(cols=['color'])
ce_bde.fit(X).transform(X)

Unnamed: 0,intercept,color_0,color_1,color_label
0,1,-0.67,-0.33,0
1,1,0.33,-0.33,2
2,1,-0.67,-0.33,0
3,1,-0.67,-0.33,0
4,1,0.33,0.67,1
5,1,0.33,0.67,1


### Step 6. Sum Encoding (with Treatment Coding for comparison)

In [61]:
from patsy.contrasts import Sum, Treatment

In [59]:
# Load the hsb2 dataset straight from the UCLA IDRE site (needs network access).
url = 'https://stats.idre.ucla.edu/stat/data/hsb2.csv'
hsb2 = pd.read_csv(url)

In [60]:
# Preview the first rows of the dataset.
hsb2.head()

Unnamed: 0,id,female,race,ses,schtyp,prog,read,write,math,science,socst
0,70,0,4,1,1,1,57,52,41,47,57
1,121,1,4,2,1,3,68,59,53,63,61
2,86,0,4,3,1,1,44,33,54,58,31
3,141,0,4,3,1,3,63,44,47,53,56
4,172,0,4,2,1,2,47,52,57,53,61


In [63]:
# Number of rows for each race code.
hsb2['race'].value_counts()

4    145
1     24
3     20
2     11
Name: race, dtype: int64

In [64]:
# The four race codes, and their Treatment (dummy) coding with the first
# level (index 0) as the reference category.
levels = list(range(1, 5))
contrast = Treatment(reference=0).code_without_intercept(levels)

In [65]:
# Show the coding matrix: one row per level, one column per contrast.
contrast.matrix

array([[0., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [66]:
from statsmodels.formula.api import ols

# Regress `write` on `race`, with race expanded through Treatment coding,
# then fit by ordinary least squares and show the summary tables.
mod = ols(formula="write ~ C(race, Treatment)", data=hsb2)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,write,R-squared:,0.107
Model:,OLS,Adj. R-squared:,0.093
Method:,Least Squares,F-statistic:,7.833
Date:,"Mon, 25 Mar 2019",Prob (F-statistic):,5.78e-05
Time:,19:18:56,Log-Likelihood:,-721.77
No. Observations:,200,AIC:,1452.0
Df Residuals:,196,BIC:,1465.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,46.4583,1.842,25.218,0.000,42.825,50.091
"C(race, Treatment)[T.2]",11.5417,3.286,3.512,0.001,5.061,18.022
"C(race, Treatment)[T.3]",1.7417,2.732,0.637,0.525,-3.647,7.131
"C(race, Treatment)[T.4]",7.5968,1.989,3.820,0.000,3.675,11.519

0,1,2,3
Omnibus:,10.487,Durbin-Watson:,1.779
Prob(Omnibus):,0.005,Jarque-Bera (JB):,11.031
Skew:,-0.551,Prob(JB):,0.00402
Kurtosis:,2.67,Cond. No.,8.25


In [67]:
# Sum (deviation) coding for the same levels, without an intercept column.
contrast = Sum().code_without_intercept(levels)
contrast.matrix

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.],
       [-1., -1., -1.]])

### Step 7. Helmert Coding

In [68]:
from patsy.contrasts import Helmert
# Helmert coding for the same levels, without an intercept column.
contrast = Helmert().code_without_intercept(levels)
contrast.matrix

array([[-1., -1., -1.],
       [ 1., -1., -1.],
       [ 0.,  2., -1.],
       [ 0.,  0.,  3.]])

### Step 8. Orthogonal Polynomial Coding

In [70]:
from patsy.contrasts import Poly

# Cut `read` into three equal-width bins (np.histogram returns the edges).
_, bins = np.histogram(hsb2.read, 3)
try:
    # right=True makes each interval closed on its right edge.
    readcat = np.digitize(hsb2.read, bins, right=True)
except TypeError:
    # Only fall back when this NumPy's digitize() does not accept `right`;
    # any other error is a real problem and must not be swallowed.
    readcat = np.digitize(hsb2.read, bins)
hsb2['readcat'] = readcat
# Mean `write` per reading category. Select the column before aggregating so
# only `write` is averaged (mid-cell expression, so it is not displayed).
hsb2.groupby('readcat')['write'].mean()
# Polynomial coding assumes ordered levels, so sort them instead of relying
# on order of first appearance; the resulting matrix is the same.
levels = np.sort(hsb2.readcat.unique()).tolist()
contrast = Poly().code_without_intercept(levels)
contrast.matrix

array([[-0.67082039,  0.5       , -0.2236068 ],
       [-0.2236068 , -0.5       ,  0.67082039],
       [ 0.2236068 , -0.5       , -0.67082039],
       [ 0.67082039,  0.5       ,  0.2236068 ]])

### Dict Vectorizer

In [55]:
from sklearn.feature_extraction import DictVectorizer

# Turn each row into a {column: value} dict, then vectorize the dicts into a
# dense array (sparse=False).
X_dict = X.to_dict('records')
dv_X = DictVectorizer(sparse=False)
X_encoded = dv_X.fit(X_dict).transform(X_dict)
X_encoded

array([[1., 0., 0., 0.],
       [0., 0., 1., 2.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 1.],
       [0., 1., 0., 1.]])

In [56]:
# Mapping from feature name to its column index in X_encoded.
dv_X.vocabulary_

{'color=a': 0, 'color=b': 1, 'color=c': 2, 'color_label': 3}