# Import

In [1]:
import warnings
import hashlib
import numpy as np
import pandas as pd
import category_encoders as ce
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# Ignore every warning category for the rest of the session (presumably to keep cell outputs clean).
# NOTE(review): this also hides deprecation warnings raised by the libraries used below —
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Formulas

$$\mathrm{enc}_i = \operatorname{mean}(y \mid x = i)$$

$$\mathrm{enc}_i = w_i \times \operatorname{mean}(y \mid x = i) + (1 - w_i) \times \operatorname{mean}(y)$$

## 1. Ordinal Encoding

In [3]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters', '1_High-School', '2_Bachelors'], name = 'x')

In [4]:
# Rank the distinct levels in sorted order: first level -> 1, second -> 2, ...
sorted_x = sorted(set(x))
# `map` is a plain per-element dictionary lookup and yields an integer Series directly;
# `replace` with a str -> int mapping relies on object-dtype downcasting that newer pandas deprecates.
ordinal_encoding = x.map({level: rank for rank, level in enumerate(sorted_x, start=1)})

In [5]:
# ensure that our output coincides with the one from category_encoders
# The encoder is fitted on the sorted values — presumably because it numbers levels in order of
# first appearance (TODO confirm). `Series.eq` aligns both operands on their index, so the
# different row order of the sorted input does not matter for the comparison.
assert ordinal_encoding.eq(ce.OrdinalEncoder().fit_transform(X = x.sort_values())['x']).all()

In [6]:
ordinal_encoding.name = 'OrdinalEncoding'
show = pd.concat([x, ordinal_encoding], axis = 1)
show

Unnamed: 0,x,OrdinalEncoding
0,2_Bachelors,2
1,1_High-School,1
2,4_PhD,4
3,3_Masters,3
4,1_High-School,1
5,2_Bachelors,2


## 2. Count Encoding

In [7]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters', '1_High-School', '2_Bachelors'], name = 'x')

In [8]:
# Replace every value by the number of times its level occurs in x.
# `map` does a direct lookup in the value_counts Series and returns integers; `replace` with a
# str -> int mapping relies on object-dtype downcasting that newer pandas deprecates.
count_encoding = x.map(x.value_counts())

In [9]:
# ensure that our output coincides with the one from category_encoders
assert count_encoding.eq(ce.CountEncoder().fit_transform(X = x.sort_values())['x']).all()

In [10]:
count_encoding.name = 'CountEncoding'
show = pd.concat([x, count_encoding], axis = 1)
show

Unnamed: 0,x,CountEncoding
0,2_Bachelors,2
1,1_High-School,2
2,4_PhD,1
3,3_Masters,1
4,1_High-School,2
5,2_Bachelors,2


## 3. One-Hot Encoding

In [11]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters', '1_High-School', '2_Bachelors'], name = 'x')

In [12]:
# Ordinal rank (1-based) of each level in sorted order; `map` is a plain lookup and avoids the
# object-dtype downcasting that `replace` relies on (deprecated in newer pandas).
levels = sorted(set(x))
ordinal_encoding = x.map({level: rank for rank, level in enumerate(levels, start=1)})
# Row k of the identity matrix is the one-hot vector of the level with rank k + 1.
# The matrix is built once and indexed for all rows at the same time instead of once per element.
one_hot_encoding = pd.DataFrame(
    np.eye(len(levels), dtype=int)[ordinal_encoding.to_numpy() - 1],
    index=x.index,
)

In [13]:
# ensure that our output coincides with the one from category_encoders
assert one_hot_encoding.eq(ce.OneHotEncoder().fit_transform(X = x.sort_values()).rename(lambda c: int(c[2:]) - 1, axis='columns')).all().all()

In [14]:
one_hot_encoding.columns = sorted(set(x))
show = pd.concat([x, ordinal_encoding, one_hot_encoding], axis = 1)
show.columns = [['x', 'OrdinalEncoding'] + ['OneHotEncoding'] * len(set(x)), [''] * 2 + list(one_hot_encoding.columns)]
show

Unnamed: 0_level_0,x,OrdinalEncoding,OneHotEncoding,OneHotEncoding,OneHotEncoding,OneHotEncoding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,1_High-School,2_Bachelors,3_Masters,4_PhD
0,2_Bachelors,2,0,1,0,0
1,1_High-School,1,1,0,0,0
2,4_PhD,4,0,0,0,1
3,3_Masters,3,0,0,1,0
4,1_High-School,1,1,0,0,0
5,2_Bachelors,2,0,1,0,0


### OLS

In [15]:
# Toy regression: one observation per level, indexed by the level label so that the
# encoding, the target and the fitted coefficients can be shown side by side.
x = pd.Series(['1_High-School', '2_Bachelors', '3_Masters', '4_PhD'], name = 'x')
x.index = x.to_list()
y = pd.Series([35, 45, 52, 68], index = x.index, name = 'y')

# One dummy column per level, relabelled with the level names (x is already in sorted order).
one_hot_encoding = ce.OneHotEncoder().fit_transform(X = x.sort_values()) #.rename(lambda c: int(c[2:]) - 1, axis='columns')
one_hot_encoding.columns = x.to_list()

# Regress y on an intercept plus all dummies. This design matrix is rank deficient
# (the dummies sum to the intercept column); the coefficients shown below (intercept 40 = sum(y) / 5)
# are consistent with a minimum-norm least-squares solution — presumably the solver's default, TODO confirm.
ols_coefs = sm.OLS(y, pd.concat([pd.Series(1, index = x.index, name = 'intercept'), one_hot_encoding], axis = 1)).fit().params
ols_coefs.index = ['intercept'] + x.to_list()

# Outer-join everything on the row labels; the 'intercept' row only carries a coefficient.
show = pd.concat([x, one_hot_encoding, y, ols_coefs], axis = 1).loc[['intercept'] + x.to_list(), :]
show.index = ['intercept'] + x.to_list()
# Two-level column header: group name on top, level name underneath.
show.columns = [['x'] + ['OneHotEncoding'] * len(set(x)) + ['y', 'ols_coefs'], [''] + list(one_hot_encoding.columns) + [''] * 2]
show.fillna('')

Unnamed: 0_level_0,x,OneHotEncoding,OneHotEncoding,OneHotEncoding,OneHotEncoding,y,ols_coefs
Unnamed: 0_level_1,Unnamed: 1_level_1,1_High-School,2_Bachelors,3_Masters,4_PhD,Unnamed: 6_level_1,Unnamed: 7_level_1
intercept,,,,,,,40.0
1_High-School,1_High-School,1.0,0.0,0.0,0.0,35.0,-5.0
2_Bachelors,2_Bachelors,0.0,1.0,0.0,0.0,45.0,5.0
3_Masters,3_Masters,0.0,0.0,1.0,0.0,52.0,12.0
4_PhD,4_PhD,0.0,0.0,0.0,1.0,68.0,28.0


## 4. Sum Encoding

In [16]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters', '1_High-School', '2_Bachelors'], name = 'x')

In [17]:
# Ordinal rank (1-based) of each level in sorted order; `map` is a plain lookup and avoids the
# object-dtype downcasting that `replace` relies on (deprecated in newer pandas).
levels = sorted(set(x))
ordinal_encoding = x.map({level: rank for rank, level in enumerate(levels, start=1)})
# One-hot matrix built from a single identity matrix instead of one per element.
one_hot_encoding = pd.DataFrame(
    np.eye(len(levels), dtype=int)[ordinal_encoding.to_numpy() - 1],
    index=x.index,
)
# Sum (deviation) coding: drop the last level's column and give rows of that level -1 in every
# remaining column. Subtracting the last level's indicator does exactly that: it is 0 for all
# other rows (left unchanged) and 1 for the last level (0 - 1 = -1 everywhere).
sum_encoding = one_hot_encoding.iloc[:, :-1].sub(one_hot_encoding.iloc[:, -1], axis=0)

In [18]:
# ensure that our output coincides with the one from category_encoders
assert sum_encoding.eq(ce.SumEncoder().fit_transform(X = x.sort_values()).drop('intercept', axis = 1).rename(lambda c: int(c[2:]), axis='columns')).all().all()

In [19]:
sum_encoding.columns = sorted(set(x))[:-1]
show = pd.concat([x, ordinal_encoding, sum_encoding], axis = 1)
show.columns = [['x', 'OrdinalEncoding'] + ['SumEncoding'] * (len(set(x)) - 1), [''] * 2 + sorted(set(x))[:-1]]
show

Unnamed: 0_level_0,x,OrdinalEncoding,SumEncoding,SumEncoding,SumEncoding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,1_High-School,2_Bachelors,3_Masters
0,2_Bachelors,2,0,1,0
1,1_High-School,1,1,0,0
2,4_PhD,4,-1,-1,-1
3,3_Masters,3,0,0,1
4,1_High-School,1,1,0,0
5,2_Bachelors,2,0,1,0


### OLS

In [20]:
x = pd.Series(['1_High-School', '2_Bachelors', '3_Masters', '4_PhD'], name = 'x')
x.index = x.to_list()
y = pd.Series([35, 45, 52, 68], index = x.index, name = 'y')

sum_encoding = ce.SumEncoder().fit_transform(X = x.sort_values()).drop('intercept', axis = 1)
sum_encoding.columns = x.to_list()[:-1]

ols_coefs = sm.OLS(y, pd.concat([pd.Series(1, index = sum_encoding.index, name = 'intercept'), sum_encoding], axis = 1)).fit().params

show = pd.concat([x, sum_encoding, y, ols_coefs], axis = 1).loc[['intercept'] + x.to_list()]
show.columns = [['x'] + ['SumEncoding'] * sum_encoding.shape[1] + ['y', 'ols_coefs'], [''] + list(sum_encoding.columns) + [''] * 2]
show.fillna('')

Unnamed: 0_level_0,x,SumEncoding,SumEncoding,SumEncoding,y,ols_coefs
Unnamed: 0_level_1,Unnamed: 1_level_1,1_High-School,2_Bachelors,3_Masters,Unnamed: 5_level_1,Unnamed: 6_level_1
intercept,,,,,,50.0
1_High-School,1_High-School,1.0,0.0,0.0,35.0,-15.0
2_Bachelors,2_Bachelors,0.0,1.0,0.0,45.0,-5.0
3_Masters,3_Masters,0.0,0.0,1.0,52.0,2.0
4_PhD,4_PhD,-1.0,-1.0,-1.0,68.0,


## 5. Backward-Difference Encoding

In [21]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters', '1_High-School', '2_Bachelors'], name = 'x')

In [22]:
# Ordinal rank (1-based) of each level in sorted order; `map` is a plain lookup and avoids the
# object-dtype downcasting that `replace` relies on (deprecated in newer pandas).
levels = sorted(set(x))
n_levels = len(levels)
ordinal_encoding = x.map({level: rank for rank, level in enumerate(levels, start=1)})
# For the level with rank `oe` the row holds n_levels - 1 entries:
#   the first oe - 1 are  1/n, 2/n, ..., (oe - 1)/n
#   the rest are         -(n - oe)/n, ..., -2/n, -1/n
backward_difference_encoding = ordinal_encoding.apply(
    lambda oe: pd.Series(
        [i / n_levels for i in range(1, oe)] + [- i / n_levels for i in range(n_levels - oe, 0, -1)]))

In [23]:
# ensure that our output coincides with the one from category_encoders
assert backward_difference_encoding.eq(ce.BackwardDifferenceEncoder().fit_transform(X = x.sort_values()).drop('intercept', axis = 1).rename(lambda c: int(c[2:]), axis='columns')).all().all()

In [24]:
backward_difference_encoding.columns = sorted(set(x))[1:]
show = pd.concat([x, ordinal_encoding, backward_difference_encoding], axis = 1)
show.columns = [['x', 'OrdinalEncoding'] + ['BackwardDifferenceEncoding'] * len(sorted(set(x))[1:]), [''] * 2 + sorted(set(x))[1:]]
show

Unnamed: 0_level_0,x,OrdinalEncoding,BackwardDifferenceEncoding,BackwardDifferenceEncoding,BackwardDifferenceEncoding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,2_Bachelors,3_Masters,4_PhD
0,2_Bachelors,2,0.25,-0.5,-0.25
1,1_High-School,1,-0.75,-0.5,-0.25
2,4_PhD,4,0.25,0.5,0.75
3,3_Masters,3,0.25,0.5,-0.25
4,1_High-School,1,-0.75,-0.5,-0.25
5,2_Bachelors,2,0.25,-0.5,-0.25


### OLS

In [25]:
x = pd.Series(['1_High-School', '2_Bachelors', '3_Masters', '4_PhD'], name = 'x')
x.index = x.to_list()
y = pd.Series([35, 45, 52, 68], index = x.index, name = 'y')

backward_difference_encoding = ce.BackwardDifferenceEncoder().fit_transform(X = x.sort_values()).drop('intercept', axis = 1)
backward_difference_encoding.columns = x.to_list()[1:]

ols_coefs = sm.OLS(y, pd.concat([pd.Series(1, index = backward_difference_encoding.index, name = 'intercept'), backward_difference_encoding], axis = 1)).fit().params

show = pd.concat([x, backward_difference_encoding, y, ols_coefs], axis = 1).loc[['intercept'] + x.to_list()]
show.columns = [['x'] + ['BackwardDifferenceEncoding'] * backward_difference_encoding.shape[1] + ['y', 'ols_coefs'], [''] + list(backward_difference_encoding.columns) + [''] * 2]
show.fillna('')

Unnamed: 0_level_0,x,BackwardDifferenceEncoding,BackwardDifferenceEncoding,BackwardDifferenceEncoding,y,ols_coefs
Unnamed: 0_level_1,Unnamed: 1_level_1,2_Bachelors,3_Masters,4_PhD,Unnamed: 5_level_1,Unnamed: 6_level_1
intercept,,,,,,50.0
1_High-School,1_High-School,-0.75,-0.5,-0.25,35.0,
2_Bachelors,2_Bachelors,0.25,-0.5,-0.25,45.0,10.0
3_Masters,3_Masters,0.25,0.5,-0.25,52.0,7.0
4_PhD,4_PhD,0.25,0.5,0.75,68.0,16.0


## 6. Helmert Encoding

In [26]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters', '1_High-School', '2_Bachelors'], name = 'x')

In [27]:
# this implementation is similar to https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/
# Ordinal rank (1-based) of each level in sorted order; `map` is a plain lookup and avoids the
# object-dtype downcasting that `replace` relies on (deprecated in newer pandas).
levels = sorted(set(x))
n_levels = len(levels)
ordinal_encoding = x.map({level: rank for rank, level in enumerate(levels, start=1)})
# Row for rank `oe`: zeros up to column oe - 2, then (oe - 1) in its own column (absent for the
# first level), then -1 in every later column. Column j is finally divided by j + 2, i.e. by the
# number of levels taking part in that contrast (the Series index 0, 1, ... aligns with the columns).
helmert_encoding = ordinal_encoding.apply(
    lambda oe: pd.Series([0] * (oe - 2) + ([oe - 1] if oe > 1 else []) + [-1] * (n_levels - oe))
).div(pd.Series(range(2, n_levels + 1)))

In [28]:
helmert_encoding.columns = sorted(set(x))[1:]
show = pd.concat([x, ordinal_encoding, helmert_encoding], axis = 1)
show.columns = [['x', 'OrdinalEncoding'] + ['HelmertEncoding'] * helmert_encoding.shape[1], [''] * 2 + sorted(set(x))[1:]]
show.round(2)

Unnamed: 0_level_0,x,OrdinalEncoding,HelmertEncoding,HelmertEncoding,HelmertEncoding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,2_Bachelors,3_Masters,4_PhD
0,2_Bachelors,2,0.5,-0.33,-0.25
1,1_High-School,1,-0.5,-0.33,-0.25
2,4_PhD,4,0.0,0.0,0.75
3,3_Masters,3,0.0,0.67,-0.25
4,1_High-School,1,-0.5,-0.33,-0.25
5,2_Bachelors,2,0.5,-0.33,-0.25


In [30]:
# In this case category_encoders returns a differently scaled matrix: its columns hold the raw
# integer contrasts (-1, 1, 2, 3, ...), whereas the implementation above divides column j by j + 2.
# The pattern of the contrasts is the same, so no equality assertion is made here.
ce.HelmertEncoder().fit_transform(X = x.sort_values()).drop('intercept', axis = 1).rename(lambda c: int(c[2:]), axis='columns')

Unnamed: 0,0,1,2
1,-1.0,-1.0,-1.0
4,-1.0,-1.0,-1.0
0,1.0,-1.0,-1.0
5,1.0,-1.0,-1.0
3,0.0,2.0,-1.0
2,0.0,0.0,3.0


### OLS

In [31]:
x = pd.Series(['1_High-School', '2_Bachelors', '3_Masters', '4_PhD'], name = 'x')
x.index = x.to_list()
y = pd.Series([35, 45, 52, 68], index = x.index, name = 'y')

ordinal_encoding = x.replace(dict(zip(sorted(set(x)), range(1, len(sorted(set(x))) + 1))))
helmert_encoding = ordinal_encoding.apply(
    lambda oe: pd.Series([0] * (oe - 2) + ([oe - 1] if oe > 1 else []) + [-1] * (len(set(x)) - oe))
).div(pd.Series(range(2,len(set(x)) + 1)))
helmert_encoding.columns = x.to_list()[1:]

ols_coefs = sm.OLS(y, pd.concat([pd.Series(1, index = helmert_encoding.index, name = 'intercept'), helmert_encoding], axis = 1)).fit().params

show = pd.concat([x, helmert_encoding, y, ols_coefs], axis = 1).loc[['intercept'] + x.to_list()]
show.columns = [['x'] + ['HelmertEncoding'] * helmert_encoding.shape[1] + ['y', 'ols_coefs'], [''] + list(helmert_encoding.columns) + [''] * 2]
show.fillna('')

Unnamed: 0_level_0,x,HelmertEncoding,HelmertEncoding,HelmertEncoding,y,ols_coefs
Unnamed: 0_level_1,Unnamed: 1_level_1,2_Bachelors,3_Masters,4_PhD,Unnamed: 5_level_1,Unnamed: 6_level_1
intercept,,,,,,50.0
1_High-School,1_High-School,-0.5,-0.333333,-0.25,35.0,
2_Bachelors,2_Bachelors,0.5,-0.333333,-0.25,45.0,10.0
3_Masters,3_Masters,0.0,0.666667,-0.25,52.0,12.0
4_PhD,4_PhD,0.0,0.0,0.75,68.0,24.0


## 7. Polynomial Encoding

In [32]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters', '1_High-School', '2_Bachelors'], name = 'x')

In [33]:
def do_polynomial_encoding(order, n_levels=None):
    """Return the orthogonal-polynomial contrast row of one ordinal level.

    The construction follows patsy's ``Poly`` contrast
    (https://github.com/pydata/patsy/blob/master/patsy/contrasts.py): the
    centred scores 0..n-1 are raised to the powers 0..n-1 and the resulting
    columns are orthogonalised with a QR decomposition.

    Parameters
    ----------
    order : int
        1-based rank of the level whose row is requested.
    n_levels : int, optional
        Number of distinct levels. When omitted it is taken from the
        notebook-level Series ``x`` (``len(set(x))``), which was the only
        behaviour of the original function.

    Returns
    -------
    numpy.ndarray
        Vector of length ``n_levels - 1`` holding the linear, quadratic, ...
        contrast values for that level (the constant column is dropped).

    Raises
    ------
    IndexError
        If ``order`` is smaller than 1 or larger than the number of levels.
    """
    if order < 1:
        # order - 1 would become a negative index and silently wrap around to the last rows.
        raise IndexError("order must be a 1-based rank, got {}".format(order))
    n = len(set(x)) if n_levels is None else n_levels
    scores = np.arange(n, dtype=float)
    scores -= scores.mean()
    # Column j holds the centred scores raised to the power j.
    raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1))
    q, r = np.linalg.qr(raw_poly)
    # Fix the sign ambiguity of the QR decomposition so that diag(r) is positive.
    q *= np.sign(np.diag(r))
    q /= np.sqrt(np.sum(q ** 2, axis=1))
    # Drop the constant column and return the row of the requested level.
    return q[order - 1, 1:]

# Ordinal rank (1-based) of each level in sorted order; `map` is a plain lookup and avoids the
# object-dtype downcasting that `replace` relies on (deprecated in newer pandas).
ordinal_encoding = x.map({level: rank for rank, level in enumerate(sorted(set(x)), start=1)})
# One contrast row per observation, selected by the observation's rank.
polynomial_encoding = ordinal_encoding.apply(lambda oe: pd.Series(do_polynomial_encoding(oe)))

In [34]:
# ensure that our output coincides with the one from category_encoders
# The values come out of a QR decomposition, so they are compared with a floating-point tolerance
# instead of bit-for-bit. The reference is fitted on sorted input, hence it is first re-ordered
# to our row and column labels (the alignment that `eq` used to perform implicitly).
expected = ce.PolynomialEncoder().fit_transform(X = x.sort_values()).drop('intercept', axis = 1).rename(lambda c: int(c[2:]), axis='columns')
assert np.allclose(
    polynomial_encoding.to_numpy(),
    expected.loc[polynomial_encoding.index, polynomial_encoding.columns].to_numpy(),
)

In [35]:
polynomial_encoding.columns = ['degree' + str(i) for i in range(1, polynomial_encoding.shape[1] + 1)]
show = pd.concat([x, ordinal_encoding, polynomial_encoding], axis = 1)
show.columns = [['x', 'OrdinalEncoding'] + ['PolynomialEncoding'] * polynomial_encoding.shape[1], 
                [''] * 2 + list(polynomial_encoding.columns)]
show.round(3)

Unnamed: 0_level_0,x,OrdinalEncoding,PolynomialEncoding,PolynomialEncoding,PolynomialEncoding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,degree1,degree2,degree3
0,2_Bachelors,2,-0.224,-0.5,0.671
1,1_High-School,1,-0.671,0.5,-0.224
2,4_PhD,4,0.671,0.5,0.224
3,3_Masters,3,0.224,-0.5,-0.671
4,1_High-School,1,-0.671,0.5,-0.224
5,2_Bachelors,2,-0.224,-0.5,0.671


### OLS

In [36]:
x = pd.Series(['1_High-School', '2_Bachelors', '3_Masters', '4_PhD'], name = 'x')
x.index = x.to_list()
y = pd.Series([35, 45, 52, 68], index = x.index, name = 'y')

polynomial_encoding = ce.PolynomialEncoder().fit_transform(X = x.sort_values()).drop('intercept', axis = 1)
polynomial_encoding.columns = ['degree' + str(i) for i in range(1, polynomial_encoding.shape[1] + 1)]

ols_coefs = sm.OLS(y, pd.concat([pd.Series(1, index = polynomial_encoding.index, name = 'intercept'), polynomial_encoding], axis = 1)).fit().params

ols_coefs

intercept    50.000000
degree1      23.702321
degree2       3.000000
degree3       2.683282
dtype: float64

## 8. Binary Encoding

In [37]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters', '1_High-School', '2_Bachelors'], name = 'x')

In [38]:
# Ordinal rank (1-based) of each level in sorted order; `map` is a plain lookup and avoids the
# object-dtype downcasting that `replace` relies on (deprecated in newer pandas).
levels = sorted(set(x))
ordinal_encoding = x.map({level: rank for rank, level in enumerate(levels, start=1)})
# Number of binary digits needed to write the largest rank; computed once, not per element.
n_bits = len(bin(len(levels))) - 2
# Binary representation of each rank, left-padded with zeros to a common width.
binary_base = ordinal_encoding.apply(lambda oe: bin(oe)[2:].zfill(n_bits))
# One column per binary digit, most significant digit first.
binary_encoding = binary_base.apply(lambda bb: pd.Series(list(bb))).astype(int)

In [39]:
# ensure that our output coincides with the one from category_encoders
assert binary_encoding.eq(ce.BinaryEncoder().fit_transform(X = x.sort_values()).rename(lambda c: int(c[2:]), axis='columns')).all().all()

In [40]:
binary_encoding.columns = ['dim' + str(i) for i in range(binary_encoding.shape[1], 0, -1)]
show = pd.concat([x, ordinal_encoding, binary_base, binary_encoding], axis = 1)
show.columns = [
    ['x', 'OrdinalEncoding', 'binary_base'] + ['BinaryEncoding'] * binary_encoding.shape[1], 
    [''] * 3 + list(binary_encoding.columns)
]
show

Unnamed: 0_level_0,x,OrdinalEncoding,binary_base,BinaryEncoding,BinaryEncoding,BinaryEncoding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,dim3,dim2,dim1
0,2_Bachelors,2,10,0,1,0
1,1_High-School,1,1,0,0,1
2,4_PhD,4,100,1,0,0
3,3_Masters,3,11,0,1,1
4,1_High-School,1,1,0,0,1
5,2_Bachelors,2,10,0,1,0


## 9. Base N Encoding

In [41]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters', '1_High-School', '2_Bachelors'], name = 'x')

In [42]:
def int2base(n, base):
    """Return the representation of a non-negative integer in the given base.

    Each digit is written as its decimal value, so the result is only
    unambiguous for bases up to 10 (e.g. 255 in base 16 gives '1515').

    Parameters
    ----------
    n : int
        Non-negative number to convert.
    base : int
        Target base, at least 2.

    Returns
    -------
    str
        Digits of ``n`` in ``base``, most significant first; '0' for ``n == 0``.

    Raises
    ------
    ValueError
        If ``base`` is smaller than 2 or ``n`` is negative; with floor
        division both cases would otherwise never reach zero and loop forever.
    """
    if base < 2:
        raise ValueError("base must be at least 2, got {}".format(base))
    if n < 0:
        raise ValueError("n must be non-negative, got {}".format(n))
    if n == 0:
        # The digit loop below would not run at all and return an empty string.
        return '0'
    digits = []
    while n:
        n, digit = divmod(n, base)
        digits.append(str(int(digit)))
    # Digits were produced least significant first.
    return ''.join(reversed(digits))

base = 3
# Recompute the ordinal ranks from this section's x instead of silently reusing the variable
# left behind by the Binary Encoding section. `map` is a plain lookup and avoids the
# object-dtype downcasting that `replace` relies on (deprecated in newer pandas).
ordinal_encoding = x.map({level: rank for rank, level in enumerate(sorted(set(x)), start=1)})
# Representation of every rank in the chosen base.
base_n = ordinal_encoding.apply(lambda oe: int2base(n = oe, base = base))
# Common number of digits, computed once rather than for every row.
n_digits = base_n.apply(len).max()
# One column per digit, most significant digit first.
base_n_encoding = base_n.apply(lambda bn: pd.Series(list(bn.zfill(n_digits)))).astype(int)

In [43]:
# ensure that our output coincides with the one from category_encoders
assert base_n_encoding.eq(ce.BaseNEncoder(base = base).fit_transform(X = x.sort_values()).drop('x_0', axis = 1).rename(lambda c: int(c[2:]) - 1, axis='columns')).all().all()

In [44]:
base_n_encoding.columns = ['dim' + str(i) for i in range(base_n_encoding.shape[1], 0, -1)]
show = pd.concat([x, ordinal_encoding, base_n, base_n_encoding], axis = 1)
show.columns = [
    ['x', 'ordinal_encoding', 'base_{}'.format(base)] + ['BaseNEncoding'] * base_n_encoding.shape[1], 
    [''] * 3 + list(base_n_encoding.columns)
]
show

Unnamed: 0_level_0,x,ordinal_encoding,base_3,BaseNEncoding,BaseNEncoding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,dim2,dim1
0,2_Bachelors,2,2,0,2
1,1_High-School,1,1,0,1
2,4_PhD,4,11,1,1
3,3_Masters,3,10,1,0
4,1_High-School,1,1,0,1
5,2_Bachelors,2,2,0,2


## 10. Hashing Encoding

In [45]:
x = pd.Series(['2_Bachelors', '1_High-School', '4_PhD', '3_Masters', '1_High-School', '2_Bachelors'], name = 'x')

In [46]:
def do_hash(string, output_dimension):
    """Hash a string with SHA-256 and fold the digest into a bucket index.

    Parameters
    ----------
    string : str
        Text to hash; it is encoded as UTF-8 first.
    output_dimension : int
        Number of buckets.

    Returns
    -------
    tuple
        ``(hex digest, digest as integer, digest modulo output_dimension)``.
    """
    digest_hex = hashlib.sha256(bytes(string, 'utf-8')).hexdigest()
    digest_int = int(digest_hex, 16)
    bucket = digest_int % output_dimension
    return digest_hex, digest_int, bucket

output_dimension = 11
# One row per observation with the hex digest, its integer value and the bucket index.
hashing = x.apply(
    lambda string: pd.Series(
        do_hash(string, output_dimension),
        index = ['x_hashed', 'x_hashed_int', 'x_hashed_int_remainder'],
    )
)
# The bucket index selects the row of the identity matrix that becomes the encoding.
hashing_encoding = (
    hashing['x_hashed_int_remainder']
    .apply(lambda rem: pd.Series(np.eye(output_dimension)[rem]))
    .astype(int)
)

In [47]:
# ensure that our output coincides with the one from category_encoders
assert hashing_encoding.eq(ce.HashingEncoder(hash_method = 'sha256', n_components = output_dimension).fit_transform(X = x).rename(lambda c: int(c[4:]), axis='columns')).all().all()

In [48]:
hashing_encoding.columns = ['dim' + str(i) for i in range(hashing_encoding.shape[1])]
show = pd.concat([x, hashing, hashing_encoding], axis = 1)
show.columns = [
    ['x', 'x_hashed', 'x_hashed_int', 'x_hashed_int_remainder'] + ['HashingEncoding'] * hashing_encoding.shape[1], 
    [''] * 4 + list(hashing_encoding.columns)
]
show

Unnamed: 0_level_0,x,x_hashed,x_hashed_int,x_hashed_int_remainder,HashingEncoding,HashingEncoding,HashingEncoding,HashingEncoding,HashingEncoding,HashingEncoding,HashingEncoding,HashingEncoding,HashingEncoding,HashingEncoding,HashingEncoding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,dim0,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,dim10
0,2_Bachelors,dfdf62292a10a136ef6c334ce42a2b9678bfaca95082ca...,1012604496104437404713436958132099208170126968...,0,1,0,0,0,0,0,0,0,0,0,0
1,1_High-School,246ef3f20c0e981d07dd9403889eda024e8cda9a607444...,1647929937606232701793386007505252070943301070...,7,0,0,0,0,0,0,0,1,0,0,0
2,4_PhD,05696fc5341308e512fca84cee1462028e38e2121dc07b...,2447854595156119084567771247529450695955408508...,2,0,0,1,0,0,0,0,0,0,0,0
3,3_Masters,87cfcc5fbb3c7208a37a6b217aecae0fe3e363ecc6753a...,6142938243831803099648826653448075878678926793...,3,0,0,0,1,0,0,0,0,0,0,0
4,1_High-School,246ef3f20c0e981d07dd9403889eda024e8cda9a607444...,1647929937606232701793386007505252070943301070...,7,0,0,0,0,0,0,0,1,0,0,0
5,2_Bachelors,dfdf62292a10a136ef6c334ce42a2b9678bfaca95082ca...,1012604496104437404713436958132099208170126968...,0,1,0,0,0,0,0,0,0,0,0,0


## 11. Target Encoding

In [50]:
x = pd.Series([
    '1_High-School', '1_High-School', '2_Bachelors', 
    '2_Bachelors', '2_Bachelors', '2_Bachelors',
    '3_Masters', '3_Masters', '3_Masters', '4_PhD', '4_PhD'
    ], name = 'x')
y = pd.Series([35, 38, 49, 45, 52, 55, 63, 47, 67, 51, 73], name = 'y')

In [51]:
# Per-row lookups of the level frequency and the level mean of y. `map` returns numeric Series
# directly; `replace` with a str -> number mapping relies on object-dtype downcasting that newer
# pandas deprecates (and np.exp below cannot work on object dtype).
count_encoding = x.map(x.value_counts())
y_grand_mean = y.mean()
y_level_mean = x.map(y.groupby(x).mean())
smoothing = 1
# Sigmoid of (count - 1) / smoothing: the more observations a level has, the more its own mean is trusted.
weight = 1 / (1 + np.exp(-(count_encoding - 1) / smoothing))
# Blend of the level mean and the grand mean.
target_encoding = y_level_mean * weight + y_grand_mean * (1 - weight)

In [52]:
# ensure that our output coincides with the one from category_encoders
# (floating-point results are compared with a tolerance rather than bit-for-bit;
# both sides are in the row order of x)
assert np.allclose(target_encoding, ce.TargetEncoder(smoothing = smoothing).fit_transform(X = x, y = y).iloc[:, 0])

In [53]:
# Per-row lookups of level size and level mean. `map` returns numeric Series directly; `replace`
# with a str -> number mapping relies on object-dtype downcasting that newer pandas deprecates.
count_encoding = x.map(y.groupby(x).count())
# Grand mean repeated for every row (needed as a column for the table below); the mean is
# computed once instead of once per row.
y_grand_mean = pd.Series(y.mean(), index = x.index, name = x.name)
y_level_mean = x.map(y.groupby(x).mean())
target_encoding = dict()
for smoothing in [0, 1, 10]:
    # NOTE: smoothing = 0 divides by zero. For levels with more than one observation this gives
    # -inf, exp(-inf) = 0 and therefore weight = 1 (pure level mean); a level with a single
    # observation would give 0 / 0 = NaN.
    weight = 1 / (1 + np.exp(-(count_encoding - 1) / smoothing))
    target_encoding[smoothing] = (y_level_mean * weight + y_grand_mean * (1 - weight)).round(2)

In [54]:
# Side-by-side table: inputs, the two means, and one encoded column per smoothing value.
# (The loop variable is no longer called `sm`, which is also the statsmodels alias.)
show = pd.concat([x, y, y_level_mean, y_grand_mean, *target_encoding.values()], axis = 1)
show.columns = [
    ['x', 'y', 'y_level_mean', 'y_grand_mean'] + len(target_encoding) * ['TargetEncoding'],
    4 * [''] + ['smoothing={}'.format(value) for value in target_encoding],
]
show.round(2)

Unnamed: 0_level_0,x,y,y_level_mean,y_grand_mean,TargetEncoding,TargetEncoding,TargetEncoding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,smoothing=0,smoothing=1,smoothing=10
0,1_High-School,35,36.5,52.27,36.5,40.74,43.99
1,1_High-School,38,36.5,52.27,36.5,40.74,43.99
2,2_Bachelors,49,50.25,52.27,50.25,50.35,51.11
3,2_Bachelors,45,50.25,52.27,50.25,50.35,51.11
4,2_Bachelors,52,50.25,52.27,50.25,50.35,51.11
5,2_Bachelors,55,50.25,52.27,50.25,50.35,51.11
6,3_Masters,63,59.0,52.27,59.0,58.2,55.97
7,3_Masters,47,59.0,52.27,59.0,58.2,55.97
8,3_Masters,67,59.0,52.27,59.0,58.2,55.97
9,4_PhD,51,62.0,52.27,62.0,59.38,57.38


## 12. MEstimate Encoding

In [55]:
x = pd.Series([
    '1_High-School', '1_High-School', '2_Bachelors', 
    '2_Bachelors', '2_Bachelors', '2_Bachelors',
    '3_Masters', '3_Masters', '3_Masters', '4_PhD', '4_PhD'
    ], name = 'x')
y = pd.Series([35, 38, 49, 45, 52, 55, 63, 47, 67, 51, 73], name = 'y')

In [56]:
m = 0
# Per-row lookups of level size and level mean. `map` returns numeric Series directly; `replace`
# with a str -> number mapping relies on object-dtype downcasting that newer pandas deprecates.
count_encoding = x.map(y.groupby(x).count())
y_mean = y.mean()
# Grand mean as a per-row Series. It is defined here so that this section no longer depends on
# the `y_grand_mean` left behind by the Target Encoding section; the cells below use it too.
y_grand_mean = pd.Series(y_mean, index = x.index, name = x.name)
y_level_mean = x.map(y.groupby(x).mean())
# n / (n + m): with m = 0 the level mean is used unchanged, larger m pulls towards the grand mean.
weight = count_encoding / (count_encoding + m)
m_estimate_encoding =  y_level_mean * weight + y_grand_mean * (1 - weight)

In [57]:
# ensure that our output coincides with the one from category_encoders
# (floating-point results are compared with a tolerance rather than bit-for-bit)
assert np.allclose(m_estimate_encoding, ce.MEstimateEncoder(m = m).fit_transform(X = x, y = y).iloc[:, 0])

In [58]:
m_estimate_encoding = dict()

for m in [0, 1, 10]:
    m_estimate_encoding[m] = ((y_level_mean * count_encoding + y_grand_mean * m) / (count_encoding + m)).round(2)

In [59]:
# ensure that our output coincides with the one from category_encoders
for m, te in m_estimate_encoding.items():
    assert (te == ce.MEstimateEncoder(m = m).fit_transform(X = x, y = y).iloc[:, 0].round(2)).all()

In [60]:
show = pd.concat([x, y, count_encoding, y_level_mean, y_grand_mean] + [m_estimate_encoding[i] for i in m_estimate_encoding.keys()], axis = 1)
show.columns = [
    ['x', 'y', 'CountEncoding', 'y_level_mean', 'y_grand_mean'] + ['MEstimateEncoding'] * len(m_estimate_encoding), 
    [''] * 5 + ['m={}'.format(m) for m in m_estimate_encoding.keys()]
]
show.round(2)

Unnamed: 0_level_0,x,y,CountEncoding,y_level_mean,y_grand_mean,MEstimateEncoding,MEstimateEncoding,MEstimateEncoding
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,m=0,m=1,m=10
0,1_High-School,35,2,36.5,52.27,36.5,41.76,49.64
1,1_High-School,38,2,36.5,52.27,36.5,41.76,49.64
2,2_Bachelors,49,4,50.25,52.27,50.25,50.65,51.69
3,2_Bachelors,45,4,50.25,52.27,50.25,50.65,51.69
4,2_Bachelors,52,4,50.25,52.27,50.25,50.65,51.69
5,2_Bachelors,55,4,50.25,52.27,50.25,50.65,51.69
6,3_Masters,63,3,59.0,52.27,59.0,57.32,53.83
7,3_Masters,47,3,59.0,52.27,59.0,57.32,53.83
8,3_Masters,67,3,59.0,52.27,59.0,57.32,53.83
9,4_PhD,51,2,62.0,52.27,62.0,58.76,53.89


## 13. James-Stein Encoding

In [61]:
x = pd.Series([
    '1_High-School', '1_High-School', '2_Bachelors', 
    '2_Bachelors', '2_Bachelors', '2_Bachelors',
    '3_Masters', '3_Masters', '3_Masters', '4_PhD', '4_PhD'
    ], name = 'x')
y = pd.Series([35, 38, 49, 45, 52, 55, 63, 47, 67, 51, 73], name = 'y')

In [62]:
# Per-row lookups of the level mean and level variance of y. `map` returns numeric Series
# directly; `replace` with a str -> number mapping relies on object-dtype downcasting that
# newer pandas deprecates.
y_level_mean = x.map(y.groupby(x).mean())
y_level_var = x.map(y.groupby(x).var())
y_var = y.var()
y_mean = y.mean()
n_levels = len(set(x))
# Shrinkage weight of the level mean: noisy levels (large within-level variance relative to the
# overall variance) are pulled harder towards the grand mean. The shrinkage term is clipped to [0, 1].
# NOTE: a single level (n_levels == 1) would divide by zero here.
weight = 1 - (y_level_var / (y_var + y_level_var) * (n_levels - 3) / (n_levels - 1)).clip(lower=0, upper=1)
james_stein_encoding = y_level_mean * weight + y_mean * (1 - weight)

In [63]:
# ensure that our output coincides with the one from category_encoders
# (floating-point results are compared with a tolerance rather than bit-for-bit)
assert np.allclose(james_stein_encoding, ce.JamesSteinEncoder().fit_transform(X = x, y = y).iloc[:, 0])

In [64]:
show = pd.concat([x, y, y_level_mean, y_level_var, pd.Series(y.mean(), index = x.index), pd.Series(y.var(), index = x.index), weight, james_stein_encoding], axis = 1)
show.columns = ['x', 'y', 'y_level_mean', 'y_level_var', 'y_mean', 'y_var', 'weight', 'JamesSteinEncoding']
show.round(2)

Unnamed: 0,x,y,y_level_mean,y_level_var,y_mean,y_var,weight,JamesSteinEncoding
0,1_High-School,35,36.5,4.5,52.27,136.42,0.99,36.67
1,1_High-School,38,36.5,4.5,52.27,136.42,0.99,36.67
2,2_Bachelors,49,50.25,18.25,52.27,136.42,0.96,50.33
3,2_Bachelors,45,50.25,18.25,52.27,136.42,0.96,50.33
4,2_Bachelors,52,50.25,18.25,52.27,136.42,0.96,50.33
5,2_Bachelors,55,50.25,18.25,52.27,136.42,0.96,50.33
6,3_Masters,63,59.0,112.0,52.27,136.42,0.85,57.99
7,3_Masters,47,59.0,112.0,52.27,136.42,0.85,57.99
8,3_Masters,67,59.0,112.0,52.27,136.42,0.85,57.99
9,4_PhD,51,62.0,242.0,52.27,136.42,0.79,59.93


## 14. GLMM Encoding

In [65]:
x = pd.Series([
    '1_High-School', '1_High-School', '2_Bachelors', 
    '2_Bachelors', '2_Bachelors', '2_Bachelors',
    '3_Masters', '3_Masters', '3_Masters', '4_PhD', '4_PhD'
    ], name = 'x')
y = pd.Series([35, 38, 49, 45, 52, 55, 63, 47, 67, 51, 73], name = 'y')

In [66]:
# Intercept-only mixed model with one random intercept per level of x.
model = smf.mixedlm(formula = 'y ~ 1', data = y.to_frame(), groups = x).fit()
intercept = model.params['Intercept']
# Each entry of random_effects holds a single value per level; take it out explicitly, because
# calling float() on a one-element Series is deprecated in newer pandas. The per-row lookup uses
# `map` rather than `replace`, which relies on deprecated object-dtype downcasting.
random_effect = x.map({k: float(np.asarray(v).ravel()[0]) for k, v in model.random_effects.items()})
# Fixed intercept plus the level's random deviation from it.
glmm_encoding = intercept + random_effect

In [67]:
# in this case, category_encoders coincides only with the random_effect, not with the glmm_encoding
# (the values come out of a numerical optimiser, so they are compared with a tolerance
# rather than bit-for-bit)
assert np.allclose(random_effect, ce.GLMMEncoder().fit_transform(X = x, y = y).iloc[:, 0])

In [68]:
show = pd.concat([x, y, pd.Series(intercept, index = x.index), random_effect, glmm_encoding], axis = 1)
show.columns = ['x', 'y', 'intercept', 'random_effect', 'GLMMEncoding']
show.round(2)

Unnamed: 0,x,y,intercept,random_effect,GLMMEncoding
0,1_High-School,35,52.05,-10.82,41.24
1,1_High-School,38,52.05,-10.82,41.24
2,2_Bachelors,49,52.05,-1.48,50.57
3,2_Bachelors,45,52.05,-1.48,50.57
4,2_Bachelors,52,52.05,-1.48,50.57
5,2_Bachelors,55,52.05,-1.48,50.57
6,3_Masters,63,52.05,5.38,57.43
7,3_Masters,47,52.05,5.38,57.43
8,3_Masters,67,52.05,5.38,57.43
9,4_PhD,51,52.05,6.92,58.97


## 15. WOE Encoding

In [69]:
x = pd.Series([
    '1_High-School', '1_High-School', '2_Bachelors', 
    '2_Bachelors', '2_Bachelors', '2_Bachelors',
    '3_Masters', '3_Masters', '3_Masters', '4_PhD', '4_PhD'
    ], name = 'x')
y = pd.Series([1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1], name = 'y')

In [70]:
# Number of ones / zeros of y within each row's level. `map` returns integer Series directly;
# `replace` with a str -> int mapping relies on object-dtype downcasting that newer pandas deprecates.
y_level_ones = x.map(y.groupby(x).apply(lambda l: (l == 1).sum()))
y_level_zeros = x.map(y.groupby(x).apply(lambda l: (l == 0).sum()))
# Overall number of ones / zeros.
y_ones = (y == 1).sum()
y_zeros = (y == 0).sum()
# Share of all ones (resp. zeros) that falls into the row's level.
nominator = y_level_ones / y_ones
denominator = y_level_zeros / y_zeros
# Weight of evidence: log of the ratio of the two shares.
# NOTE: a level without ones or without zeros yields -inf / +inf because no regularization is applied.
woe_encoder = np.log(nominator / denominator)

In [71]:
# ensure that our output coincides with the one from category_encoders
# (floating-point results are compared with a tolerance rather than bit-for-bit)
assert np.allclose(woe_encoder, ce.WOEEncoder(regularization = 0).fit_transform(X = x, y = y).iloc[:, 0])

In [72]:
show = pd.concat([x, y, y_level_ones, y_level_zeros, pd.Series(y_ones, index = x.index), pd.Series(y_zeros, index = x.index), nominator, denominator, woe_encoder], axis = 1)
show.columns = ['x', 'y', 'y_level_ones', 'y_level_zeros', 'y_ones', 'y_zeros','nominator', 'denominator', 'WOEEncoding']
show

Unnamed: 0,x,y,y_level_ones,y_level_zeros,y_ones,y_zeros,nominator,denominator,WOEEncoding
0,1_High-School,1,1,1,4,7,0.25,0.142857,0.559616
1,1_High-School,0,1,1,4,7,0.25,0.142857,0.559616
2,2_Bachelors,0,1,3,4,7,0.25,0.428571,-0.538997
3,2_Bachelors,0,1,3,4,7,0.25,0.428571,-0.538997
4,2_Bachelors,1,1,3,4,7,0.25,0.428571,-0.538997
5,2_Bachelors,0,1,3,4,7,0.25,0.428571,-0.538997
6,3_Masters,0,1,2,4,7,0.25,0.285714,-0.133531
7,3_Masters,1,1,2,4,7,0.25,0.285714,-0.133531
8,3_Masters,0,1,2,4,7,0.25,0.285714,-0.133531
9,4_PhD,0,1,1,4,7,0.25,0.142857,0.559616


## 16. Leave One Out Encoding

In [73]:
x = pd.Series([
    '1_High-School', '1_High-School', '2_Bachelors', 
    '2_Bachelors', '2_Bachelors', '2_Bachelors',
    '3_Masters', '3_Masters', '3_Masters', '4_PhD', '4_PhD'
    ], name = 'x')
y = pd.Series([35, 38, 49, 45, 52, 55, 63, 47, 67, 51, 73], name = 'y')

In [74]:
# For every row: the targets of all other rows that share its level (the row itself is excluded).
y_level_except_self = pd.Series(
    [y[(x == level) & (x.index != label)].to_list() for label, level in x.items()],
    index = x.index,
)
# The encoding is the mean of those remaining targets.
leave_one_out_encoding = y_level_except_self.apply(np.mean)

In [75]:
# ensure that our output coincides with the one from category_encoders
# (floating-point results are compared with a tolerance rather than bit-for-bit)
assert np.allclose(leave_one_out_encoding, ce.LeaveOneOutEncoder().fit_transform(X = x, y = y).iloc[:, 0])

In [76]:
show = pd.concat([x, y, y_level_except_self, leave_one_out_encoding], axis = 1)
show.columns = ['x', 'y', 'y_level_except_self', 'LeaveOneOutEncoding']
show['LeaveOneOutEncoding'] = show['LeaveOneOutEncoding'].round(2)
show

Unnamed: 0,x,y,y_level_except_self,LeaveOneOutEncoding
0,1_High-School,35,[38],38.0
1,1_High-School,38,[35],35.0
2,2_Bachelors,49,"[45, 52, 55]",50.67
3,2_Bachelors,45,"[49, 52, 55]",52.0
4,2_Bachelors,52,"[49, 45, 55]",49.67
5,2_Bachelors,55,"[49, 45, 52]",48.67
6,3_Masters,63,"[47, 67]",57.0
7,3_Masters,47,"[63, 67]",65.0
8,3_Masters,67,"[63, 47]",55.0
9,4_PhD,51,[73],73.0


## 17. CatBoost Encoding

In [77]:
x = pd.Series([
    '1_High-School', '1_High-School', '2_Bachelors', 
    '2_Bachelors', '2_Bachelors', '2_Bachelors',
    '3_Masters', '3_Masters', '3_Masters', '4_PhD', '4_PhD'
    ], name = 'x')
y = pd.Series([35, 38, 49, 45, 52, 55, 63, 47, 67, 51, 73], name = 'y')

In [78]:
# Weight of the prior (the grand mean) in the running average.
a = 1
y_mean = y.mean()
# For every row: the targets of the rows of the same level that come earlier in the index.
y_level_before_self = pd.Series(
    [y[(x == level) & (y.index < label)].to_list() for label, level in x.items()],
    index = x.index,
)


def _running_mean_with_prior(previous_targets):
    # (sum of earlier targets + a * grand mean) / (number of earlier targets + a)
    return (sum(previous_targets) + y_mean * a) / (len(previous_targets) + a)


catboost_encoding = y_level_before_self.apply(_running_mean_with_prior)

In [79]:
# ensure that our output coincides with the one from category_encoders
# (floating-point results are compared with a tolerance rather than bit-for-bit)
assert np.allclose(catboost_encoding, ce.CatBoostEncoder().fit_transform(X = x, y = y).iloc[:, 0])

In [80]:
show = pd.concat([x, y, pd.Series(y.mean(), index = x.index), y_level_before_self, catboost_encoding], axis = 1)
show.columns = ['x', 'y', 'y_mean', 'y_level_before_self', 'CatBoostEncoding']
show.round(2)

Unnamed: 0,x,y,y_mean,y_level_before_self,CatBoostEncoding
0,1_High-School,35,52.27,[],52.27
1,1_High-School,38,52.27,[35],43.64
2,2_Bachelors,49,52.27,[],52.27
3,2_Bachelors,45,52.27,[49],50.64
4,2_Bachelors,52,52.27,"[49, 45]",48.76
5,2_Bachelors,55,52.27,"[49, 45, 52]",49.57
6,3_Masters,63,52.27,[],52.27
7,3_Masters,47,52.27,[63],57.64
8,3_Masters,67,52.27,"[63, 47]",54.09
9,4_PhD,51,52.27,[],52.27
