In [1]:
import pandas as pd
import numpy as np
# Seed NumPy's legacy global RNG so the np.random.* draws made later in the
# notebook are repeatable on a fresh kernel run.
np.random.seed(42)

In [2]:
def smoothed_mean(X, y, kf, alpha, global_mean=None, columns=None, verbose=True):
    """Add out-of-fold smoothed target-mean columns to ``X``.

    For every ``(train, fill)`` split produced by ``kf`` and every selected
    column, each distinct value found in the fill rows is encoded as::

        (mean_y * n_rows + global_mean * alpha) / (n_rows + alpha)

    where ``n_rows`` is the number of train rows holding that value and
    ``mean_y`` is the mean target over those rows (0 when there are none).
    The encoding is written to a new column named ``<column>_agr``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is modified in place: one ``<column>_agr`` column is
        added per encoded column. Its index is never touched.
    y : array-like
        Targets, aligned with the rows of ``X`` by position. A pandas Series
        is accepted; its index labels are ignored.
    kf : object
        Splitter exposing ``split(X, y)`` that yields pairs of positional
        index sequences ``(train_indexes, fill_indexes)``.
    alpha : number
        Smoothing strength: the weight given to ``global_mean``.
    global_mean : float, optional
        Prior mean used for smoothing. Defaults to ``np.mean(y)``.
    columns : iterable of str, optional
        Columns to encode. Defaults to every column of ``X``.
    verbose : bool, default True
        When true, print the per-fold / per-value debugging trace.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object, with the ``*_agr`` columns filled in.
    """
    # Everything below is positional, so neither X's index nor y's index (if
    # y is a Series) can cause a label/position mix-up.
    y_values = np.asarray(y)

    if global_mean is None:
        global_mean = np.mean(y_values)

    if columns is None:
        columns = X.columns
    # Snapshot the names before the *_agr columns are appended to X.
    columns = list(columns)

    for column in columns:
        X[column + '_agr'] = np.nan

    for train_indexes, fill_indexes in kf.split(X, y):
        if verbose:
            print("TRAIN IND:", train_indexes, "FILL IND:", fill_indexes)
        train_indexes = np.asarray(train_indexes)
        fill_indexes = np.asarray(fill_indexes)
        y_train = y_values[train_indexes]

        for column in columns:
            if verbose:
                print("COL:", column)

            source_pos = X.columns.get_loc(column)
            target_pos = X.columns.get_loc(column + '_agr')
            train_col = X.iloc[train_indexes, source_pos]
            fill_col = X.iloc[fill_indexes, source_pos]

            for value in set(fill_col):
                train_mask = (train_col == value).to_numpy()
                if verbose:
                    print("VALUE:", value)
                    # Label the mask with row positions for the trace.
                    print("SLICE TRAIN\n",
                          pd.Series(train_mask, index=train_indexes, name=column))

                n_rows = train_mask.sum()
                mean_y = y_train[train_mask].mean() if n_rows > 0 else 0

                weight = n_rows + alpha
                if weight == 0:
                    # Unseen value and no smoothing: fall back to the prior
                    # instead of evaluating 0 / 0.
                    encoded = global_mean
                else:
                    encoded = (mean_y * n_rows + global_mean * alpha) / weight

                fill_mask = (fill_col == value).to_numpy()
                filled_positions = fill_indexes[fill_mask]
                X.iloc[filled_positions, target_pos] = encoded
                if verbose:
                    print("FILLED IND:", filled_positions.tolist())
        if verbose:
            print('------')

    return X

In [3]:
# Toy dataset: two independently drawn splits with the same layout.
# Features are integers sampled from [0, 10); targets are 0/1 draws with
# success probability 0.7. The draw order (train features, train target,
# test features, test target) determines which values each split receives
# from the global NumPy RNG.
number_rows = 15

train_X = pd.DataFrame(
    np.random.randint(0, 10, size=(number_rows, 5)),
    columns=list('abcde'),
)
train_y = np.random.binomial(n=1, p=0.7, size=number_rows)

test_X = pd.DataFrame(
    np.random.randint(0, 10, size=(number_rows, 5)),
    columns=list('abcde'),
)
test_y = np.random.binomial(n=1, p=0.7, size=number_rows)

In [13]:
from sklearn.model_selection import KFold

# Smoothing strength and prior for the target encoding. Note that the prior
# is the mean of the full train_y, i.e. it also includes the rows that each
# outer fold holds out.
alpha = 1000
global_mean = train_y.mean()
columns = ['a']

# kf drives the outer train/validation split; kf_fill is the inner splitter
# handed to smoothed_mean for the out-of-fold fill.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf_fill = KFold(n_splits=3, shuffle=True, random_state=42)
for train_index, test_index in kf.split(train_X, train_y):

    print(train_index)
    print(test_index)

    print('---------------------------------------------------')

    # KFold yields row positions, so select with .iloc (DataFrame.ix no longer
    # exists in pandas >= 1.0). The explicit copy gives smoothed_mean an
    # independent frame to add its *_agr columns to.
    X_train_new = smoothed_mean(X=train_X.iloc[train_index].copy(),
                                y=train_y[train_index],
                                kf=kf_fill,
                                alpha=alpha,
                                global_mean=global_mean,
                                columns=columns)
    # Only the first outer fold is processed in this walkthrough.
    break
    

[ 1  2  3  4  5  6  7  8 10 12 13 14]
[ 0  9 11]
---------------------------------------------------
TRAIN IND: [ 1  2  3  4  5  6  7 11] FILL IND: [ 0  8  9 10]
COL: ['a']
VALUE: 9
SLICE TRAIN
 1     False
2     False
3     False
4     False
5     False
6     False
7     False
11    False
Name: a, dtype: bool
FILLED IND: [0, 10]
VALUE: 2
SLICE TRAIN
 1     False
2     False
3     False
4     False
5     False
6     False
7     False
11    False
Name: a, dtype: bool
FILLED IND: [8]
VALUE: 3
SLICE TRAIN
 1      True
2     False
3     False
4     False
5     False
6     False
7     False
11    False
Name: a, dtype: bool
FILLED IND: [9]
------
TRAIN IND: [ 0  3  4  6  7  8  9 10] FILL IND: [ 1  2  5 11]
COL: ['a']
VALUE: 8
SLICE TRAIN
 0     False
3     False
4     False
6     False
7      True
8     False
9     False
10    False
Name: a, dtype: bool
FILLED IND: [5]
VALUE: 3
SLICE TRAIN
 0     False
3     False
4     False
6     False
7     False
8     False
9      True
10    False
Name: 

In [14]:
# Display the frame returned by smoothed_mean for the first outer fold,
# including the added 'a_agr' column.
X_train_new

Unnamed: 0,a,b,c,d,e,a_agr
1,9,2,6,7,4,0.533333
2,3,7,7,2,5,0.532801
3,4,1,7,5,1,0.532269
4,4,0,9,5,8,0.5338
5,0,9,2,6,3,0.533333
6,8,2,4,2,6,0.5338
7,4,8,6,1,3,0.5338
8,8,1,9,8,9,0.532801
10,2,0,3,1,7,0.533333
12,3,5,1,9,1,0.5338


In [10]:
# Display the mean of train_y that is passed to smoothed_mean as the
# smoothing prior.
global_mean

0.53333333333333333