In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
from collections import defaultdict

In [2]:
nrows = 200000
df1 = pd.DataFrame({'Var1' : np.random.randint(low=0, high=12, size=nrows),
                    'Var2' : np.random.randint(low=0, high=6, size=nrows),
                    'y' : np.random.randint(low=0, high=2, size=nrows)})

In [3]:
 def KFoldTargetEncoding(x,
                         inner_splits,
                         group_col,
                         target_col,
                         n_col_name,
                         alpha,
                         noise_std):
    """KFold Target Encoding. 
       For each fold, fill-in values in `group_col` using other folds.
       :math:`\frac{local\_mean \cdot nrows + global\_mean \cdot \alpha}{nrows + \alpha} + \mathcal{N}(0, std^{2})`

       
    Args:
      x : pandas data frame.
      inner_splits : list. Indices for each fold.
      group_col : str. Name of column for which the average target response
                       will be calculated.
      target_col : str. Name of target column.
      n_col_name : str. Name of new column.
      alpha : float. Regularisation parameter which regulates trade-off
                     between local (within-group) mean and global mean.
      noise_std: float. St. dev in `N(0, std)` noise.
      
    Returns:
      Pandas Series of the same length as `x` containing encoded target.
    """
    ## initialise new column
    x[n_col_name] = 0.0
    ## iterate over inner folds
    for j in range(len(inner_splits)):
        ## calculate new column values on all except for j
        fill_idx = inner_splits[j]
        ## at which idx to calculate
        calc_idx = np.concatenate(inner_splits[:j] + inner_splits[(j + 1):])

        x.loc[fill_idx, n_col_name] = targetEncoding(x.loc[calc_idx, [group_col, target_col]],
                                                     x.loc[fill_idx, [group_col]],
                                                     group_col,
                                                     target_col,
                                                     alpha,
                                                     noise_std)
    return x[n_col_name]

In [4]:
def targetEncoding(x_calc,
                   x_fill,
                   group_col,
                   target_col,
                   alpha,
                   noise_std):
    """Target Encoding.
       Fill-in values for values of `group_col` for `x_fill` from `x_calc`.
       :math:`\frac{local\_mean \cdot nrows + global\_mean \cdot \alpha}{nrows + \alpha} + \mathcal{N}(0, std^{2})`

       
    Args:
      x_calc : pd data frame. Used for calculating target statistics.
      x_fill : pd data frame. Used for filling-in statistics.
      group_col : str. Name of column for which the average target response
                       will be calculated.
      target_col : str. Name of target column.
      alpha : float. Regularisation parameter which regulates trade-off
                     between local (within-group) mean and global mean.
      noise_std: float. St. dev in `N(0, std)` noise.
      
    Returns:
      Pandas Series of the same length as `x_fill`.
    """
    ## global mean
    global_mean = x_calc[target_col].mean()
    ## dictionary: if key is not presented, replace by global mean
    calc_dict = defaultdict(lambda : global_mean)      
    ## update dictionary
    calc_dict.update(x_calc
                     .groupby(group_col)
                     .apply(lambda x: (((np.mean(x[target_col]) * len(x)) +
                                        alpha * global_mean) /
                                        (len(x) + alpha)))
                     .to_dict())
    return (x_fill
            .loc[:, group_col]
            .apply(lambda x: calc_dict[x]) +
            np.random.normal(0, noise_std, size=len(x_fill))
           )

In [5]:
def HypeNKFoldCV(x,
                 group_cols,
                 target_col,
                 clf,
                 nfolds,
                 kfolds,
                 alpha,
                 noise_std,
                 scorer):
    """Hype NKFold Cross-Validation.
       Performs target encoding for each of `group_cols`,
       and evaluate the performance using two-staged folding.
       :math:`\frac{local\_mean \cdot nrows + global\_mean \cdot \alpha}{nrows + \alpha} + \mathcal{N}(0, std^{2})`
       
    Args:
      x : input data frame. Must contain all `group_cols` and `target_col`.
          During training, we use all columns but `target_col` for training.
      group_cols : list of str. Names of columns for which the average target response
                   will be calculated.
      target_col : str. Name of target column.
      clf : classifier object. Must have `fit` or `train` methods, 
                               `predict` or `test` methods.
      nfolds : int. Number of outer folds.
      kfolds : int. Number of inner folds.
      alpha : float. Regularisation parameter which regulates trade-off
                     between local (within-group) mean and global mean.
      noise_std: float. St. dev in `N(0, std)` noise.
      scorer : function. Evaluation metric; must take two arguments:
               a vector of predictions and a vector of ground truth values.
      
    Returns:
      A list of `N` scores.
    """
    ## all indices
    all_idx = x.copy().index.values
    ## will shuffle indices for randomisation
    np.random.shuffle(all_idx)
    ## outer splits indices
    outer_splits = np.array_split(all_idx, nfolds)
    ## scorer results
    scores_val = []
    ## outer cycle
    for i in range(nfolds):
        ## keep `i`-th fold for validation
        val_idx = outer_splits[i]
        x_val = x.loc[val_idx].copy()
        ## choose all but `i`-th split
        inner_idx = np.concatenate(outer_splits[:i] + outer_splits[(i + 1):])
        ## further randomise training indices
        np.random.shuffle(inner_idx)
        ## split others further
        inner_splits = np.array_split(inner_idx, kfolds)
        ## training data frame
        x_train = x.loc[inner_idx].copy()
        ## iterate over group cols
        for group_col in group_cols:
            n_col_name = '_'.join([group_col, target_col])
            ## encode using division into KFolds
            x_train.loc[:, n_col_name] = KFoldTargetEncoding(x_train[[group_col, target_col]].copy(),
                                                             inner_splits,
                                                             group_col,
                                                             target_col,
                                                             n_col_name,
                                                             alpha,          
                                                             noise_std)
            ## filling in the same column on val
            ## using whole `x_train`
            x_val.loc[:, n_col_name] = targetEncoding(x_train.loc[:, [group_col, target_col]],
                                                      x_val.loc[:, [group_col]],
                                                      group_col,
                                                      target_col,
                                                      alpha,
                                                      noise_std)
        
        ## will train on x_train
        ## will validate on x_val
        if 'fit' in dir(clf):
            clf.fit(x_train.drop(target_col, axis=1), x_train[target_col])
            preds_val = clf.predict(x_val.drop(target_col, axis=1))
        elif 'train' in dir(clf):
            clf.train(x_train.drop(target_col, axis=1), x_train[target_col])
            preds_val = clf.test(x_val.drop(target_col, axis=1)).argmax(axis=1)
        else:
            raise Exception("`clf` must contain either (`fit` and `predict`) or"
                            " (`train` and `test`) methods")
        scores_val.append(scorer(x_val[target_col], preds_val))
    return scores_val

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [7]:
clf = LogisticRegression()

In [8]:
HypeNKFoldCV(df1,
             ['Var1', 'Var2'],
             'y',
             clf,
             5,
             4,
             10,
             0.00001,
             f1_score)

[0.27686113807110252,
 0.2912010185316169,
 0.41574988939684415,
 0.33520224269122945,
 0.32202352621200786]

In [246]:
%%timeit -n 5
HypeNKFoldCV(df1,
             ['Var1', 'Var2'],
             'y',
             clf,
             5,
             4,
             10,
             0.00001,
             f1_score)

5 loops, best of 3: 4.19 s per loop


In [231]:
x1_10 = df1.copy().loc[:10]
x2_10 = df1.copy().loc[10:20]

In [232]:
x1_10

Unnamed: 0,Var1,Var2,y
0,2,3,1
1,11,4,0
2,3,4,1
3,4,5,0
4,11,0,0
5,4,3,1
6,10,0,0
7,11,4,1
8,6,0,0
9,11,0,1


In [233]:
x2_10

Unnamed: 0,Var1,Var2,y
10,5,1,0
11,0,5,1
12,7,1,1
13,4,0,0
14,5,0,1
15,7,1,0
16,1,0,1
17,7,1,1
18,3,0,1
19,9,2,1


In [235]:
targetEncoding(x1_10,
               x2_10,
               'Var1',
               'y',
               10,
               0.0)

10    0.413223
11    0.454545
12    0.454545
13    0.462121
14    0.413223
15    0.454545
16    0.454545
17    0.454545
18    0.504132
19    0.454545
Name: Var1, dtype: float64

In [155]:
targetEncoding(x1_10,
               x2_10,
               'Var2',
               'y',
               10,
               0.01)

10    0.293447
11    0.305036
12    0.329155
13    0.307064
14    0.318288
15    0.434477
16    0.407655
17    0.324360
18    0.419025
19    0.420123
Name: Var2, dtype: float64