## Mean Encodings

  1. KFold scheme
  2. Leave-one-out scheme
  3. Smoothing scheme
  4. Expanding mean scheme

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw sales records from the training CSV.
train_csv_path = './data/input/sales_train.csv'
sales_df = pd.read_csv(train_csv_path)

In [3]:
# Row-level revenue: item_price multiplied by item_cnt_day.
sales_df['revenue'] = sales_df['item_price'].mul(sales_df['item_cnt_day'])

# Sum item_cnt_day per (shop_id, item_id, date_block_num) and expose the
# total as a regular column named item_cnt_month.
full_df = (
    sales_df
    .groupby(['shop_id', 'item_id', 'date_block_num'])['item_cnt_day']
    .sum()
    .rename('item_cnt_month')
    .reset_index()
)

In [4]:
# Zero-fill the grid: for every month (date_block_num 0 through 33), build all
# (shop_id, item_id) combinations from the shops and items present in that
# month, and add a row with item_cnt_month = 0 for each combination that has
# no existing record.
zero_frames = []
for dbn in range(0, 34):
    # Filter once per month instead of three times.
    month_df = full_df[full_df['date_block_num'] == dbn]

    # All possible (shop_id, item_id) pairs for this month.
    m_index = pd.MultiIndex.from_product(
        [month_df['shop_id'].drop_duplicates(), month_df['item_id'].drop_duplicates()],
        names=['shop_id', 'item_id'],
    )
    val_df = pd.DataFrame([], index=m_index).reset_index()
    val_df['date_block_num'] = dbn

    # Drop the pairs that already have a record: after the left merge they are
    # the rows whose item_cnt_month is not null.
    origin_df = month_df[['item_id', 'shop_id', 'item_cnt_month']]
    val_df = val_df.merge(origin_df, how='left', on=['item_id', 'shop_id'])
    # .copy() makes the filtered rows an independent frame, so the assignment
    # below does not write into a slice (no SettingWithCopyWarning).
    val_df = val_df[val_df['item_cnt_month'].isnull()].copy()

    # A missing record means nothing was sold: count is 0.
    val_df['item_cnt_month'] = 0
    zero_frames.append(val_df)

# Concatenate once at the end rather than inside the loop, which would re-copy
# the ever-growing frame on every iteration. Row order is unchanged (original
# rows first, then the zero rows month by month). ignore_index=True replaces
# the otherwise duplicated index labels with a fresh 0..n-1 index.
full_df = pd.concat([full_df] + zero_frames, ignore_index=True)

In [5]:
# Expose the monthly count under the name 'target'; the encoding cells below
# read full_df['target'].
full_df['target'] = full_df['item_cnt_month']

In [6]:
# Fallback value that the encoding cells use to fill missing (NaN) encodings.
# NOTE(review): hard-coded; presumably a rounded value of the disabled
# computation on the next line — TODO confirm before changing the data.
# globalmean = full_df['target'].mean()
globalmean = 0.3343

## Mean encoding without regularization

对每个`item_id`，使用其在全部数据上的target均值作为encoding，不进行任何regularization

这种方式会出现过拟合的现象

In [7]:
# Method 1: compute the mean of target per item_id, then map it onto every row
# through the row's item_id.
item_id_target_mean = full_df.groupby('item_id').target.mean()
# Fill missing encodings by assigning the result back to the column. Calling
# fillna(..., inplace=True) on full_df['item_target_enc'] is chained in-place
# mutation: it may act on a temporary and leave full_df unchanged.
full_df['item_target_enc'] = full_df['item_id'].map(item_id_target_mean).fillna(globalmean)

# Print correlation between the target and the encoded feature.
encoded_feature = full_df['item_target_enc'].values
print(np.corrcoef(full_df['target'].values, encoded_feature)[0][1])

0.48303869886213624


In [8]:
# Method 2: same encoding as the map-based variant, computed in one step with
# groupby(...).transform('mean'), which returns one value per row.
# The filled Series is assigned back to the column. Calling
# fillna(..., inplace=True) on full_df['item_target_enc'] is chained in-place
# mutation: it may act on a temporary and leave full_df unchanged.
full_df['item_target_enc'] = (
    full_df.groupby('item_id')['target'].transform('mean').fillna(globalmean)
)

# Print correlation between the target and the encoded feature.
encoded_feature = full_df['item_target_enc'].values
print(np.corrcoef(full_df['target'].values, encoded_feature)[0][1])

0.48303869886213624


接下来使用regularization的方法来减轻过拟合的状况

## 1. KFold scheme

  对原始数据进行KFold，对于每个fold，使用训练集得到mean_encoding，再赋值到验证集上

In [9]:
from sklearn.model_selection import KFold

kf = KFold(5, shuffle=True, random_state=20180717)

# Only these two columns are needed inside the loop; slicing them once avoids
# copying the whole frame for every fold.
item_target_df = full_df[['item_id', 'target']]

# Out-of-fold encodings, written by row position. Starting from NaN means the
# result does not depend on an 'item_target_enc' column left by earlier cells.
oof_enc = np.full(len(full_df), np.nan)
for trn_idx, val_idx in kf.split(full_df['target'].values):
    # Per-item target mean estimated on the training part of the fold only...
    item_id_target_mean = item_target_df.iloc[trn_idx].groupby('item_id')['target'].mean()
    # ...and applied to the held-out rows. Items absent from the training part
    # map to NaN.
    oof_enc[val_idx] = item_target_df['item_id'].iloc[val_idx].map(item_id_target_mean).values

# Fill missing values in the encoding only. A frame-wide
# full_df.fillna(globalmean) would also overwrite NaNs in unrelated columns.
full_df['item_target_enc'] = np.where(np.isnan(oof_enc), globalmean, oof_enc)

# Correlation between the target and the encoded feature.
encoded_feature = full_df['item_target_enc'].values
corr = np.corrcoef(full_df['target'].values, encoded_feature)[0][1]
print(corr)

0.4803001456002725


## 2. Leave-one-out scheme

  对原始数据进行Leave-one-out：对于每一行，使用同一`item_id`下除当前行以外的所有行计算target均值，作为当前行的mean_encoding

In [10]:
# Leave-one-out: per-item sum and count of target, with the current row's own
# target removed from the sum and one removed from the count.
item_target = full_df.groupby('item_id')['target']
target_sum = item_target.transform('sum')
target_count = item_target.transform('count')

others_sum = target_sum.sub(full_df['target'])
others_count = target_count.sub(1)
# An item with a single row gives 0 / 0 = NaN, which falls back to globalmean.
encoded_feature = others_sum.div(others_count).fillna(globalmean)

corr = np.corrcoef(full_df['target'].values, encoded_feature)[0][1]
print(corr)

0.4803848311292604


## 3. Smoothing scheme

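  使用以下公式进行平滑：$$\frac{\mathrm{mean}(\mathrm{target}) \cdot \mathrm{nrows} + \mathrm{globalmean} \cdot \alpha}{\mathrm{nrows} + \alpha}$$

  其中nrows是该`item_id`的行数，$\alpha$（代码中的`alpha`）控制平滑强度：$\alpha$越大，encoding越接近globalmean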

In [11]:
# Smoothing strength: weight given to globalmean relative to the per-item rows.
alpha = 100

per_item_target = full_df.groupby('item_id')['target']
target_mean = per_item_target.transform('mean')
target_count = per_item_target.transform('count')

# Weighted blend of the per-item mean (weight: row count) and globalmean
# (weight: alpha).
weighted_sum = target_mean * target_count + globalmean * alpha
total_weight = target_count + alpha
encoded_feature = (weighted_sum / total_weight).fillna(globalmean)

corr = np.corrcoef(full_df['target'].values, encoded_feature)[0][1]
print(corr)

0.4818198797096877


## 4. Expanding mean scheme

  使用到当前的累计求和除以累计计数得到mean_encoding（不包括当前行）

In [12]:
# Expanding mean: for each row, the mean of target over the earlier rows of
# the same item_id. "Earlier" means earlier in full_df's current row order, so
# the result depends on how the frame is sorted.
per_item_target = full_df.groupby('item_id')['target']
# Cumulative sum includes the current row, so subtract its own target.
previous_sum = per_item_target.cumsum() - full_df['target']
# cumcount() is the number of preceding rows in the group (0 for the first).
previous_count = per_item_target.cumcount()
# The first row of each item gives 0 / 0 = NaN, which falls back to globalmean.
encoded_feature = (previous_sum / previous_count).fillna(globalmean)

corr = np.corrcoef(full_df['target'].values, encoded_feature)[0][1]
print(corr)

0.4540235160517741
