In [1]:
import numpy as np
import pandas as pd

In [2]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [3]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [4]:
%%timeit
target_mean_v1(data, 'y', 'x')

33.7 s ± 2.62 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [8]:
%%timeit
target_mean_v2(data, 'y', 'x')

416 ms ± 35.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [44]:
from collections import defaultdict

y = np.random.randint(2, size=(10, 1))
x = np.random.randint(10, size=(10, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

def target_mean_v3(data, y_name, x_name):
    n = data.shape[0]
    
    ndarray_y = data[y_name].values
    ndarray_x = data[x_name].values
    
    value_dict = defaultdict(lambda : 0)
    count_dict = defaultdict(lambda : 0)
    
    for x, y in zip(ndarray_x, ndarray_y):
        value_dict[x] += ndarray_y
        count_dict[x] += 1
    
    result = [(value_dict[x] - y) / (count_dict[x] - 1) for x, y in zip(ndarray_x, ndarray_y)]
    
    return result

[[0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]]
[[0]
 [2]
 [6]
 [2]
 [9]
 [1]
 [0]
 [1]
 [6]
 [3]]
   y  x
0  0  0
1  1  2
2  1  6
3  1  2
4  1  9
5  0  1
6  0  0
7  0  1
8  0  6
9  1  3


In [45]:
%%timeit
target_mean_v3(data, 'y', 'x')

0., 2., 2., 2., 2., 0., 0., 0., 0., 2.]), array([-inf,  nan,  nan,  nan,  nan, -inf, -inf, -inf, -inf,  nan])]
[0 1 1 1 1 0 0 0 0 1]
[0 2 6 2 9 1 0 1 6 3]
defaultdict(<function target_mean_v3.<locals>.<lambda> at 0x7fd5c70391f0>, {})
defaultdict(<function target_mean_v3.<locals>.<lambda> at 0x7fd5c7039b80>, {})
0 0
[0 1 1 1 1 0 0 0 0 1]
1
2 1
[0 1 1 1 1 0 0 0 0 1]
1
6 1
[0 1 1 1 1 0 0 0 0 1]
1
2 1
[0 2 2 2 2 0 0 0 0 2]
2
9 1
[0 1 1 1 1 0 0 0 0 1]
1
1 0
[0 1 1 1 1 0 0 0 0 1]
1
0 0
[0 2 2 2 2 0 0 0 0 2]
2
1 0
[0 2 2 2 2 0 0 0 0 2]
2
6 0
[0 2 2 2 2 0 0 0 0 2]
2
3 1
[0 1 1 1 1 0 0 0 0 1]
1
[array([0., 2., 2., 2., 2., 0., 0., 0., 0., 2.]), array([-1.,  1.,  1.,  1.,  1., -1., -1., -1., -1.,  1.]), array([-1.,  1.,  1.,  1.,  1., -1., -1., -1., -1.,  1.]), array([-1.,  1.,  1.,  1.,  1., -1., -1., -1., -1.,  1.]), array([-inf,  nan,  nan,  nan,  nan, -inf, -inf, -inf, -inf,  nan]), array([0., 2., 2., 2., 2., 0., 0., 0., 0., 2.]), array([0., 2., 2., 2., 2., 0., 0., 0., 0., 2.]), array([0., 2.

In [50]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [52]:
%%cython --cplus

import cython
cimport cython
import numpy as np
cimport numpy as np
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)

cpdef target_mean_v4(int[:] x, int [:] y):
    cdef int feature_size = 10
    cdef int n = x.shape[0]
    cdef int[:] data_count = np.zeros(feature_size).astype(np.intc)
    cdef int[:] data_sum = np.zeros(feature_size).astype(np.intc)
    cdef double[:] result = np.zeros(n)
    
    cdef int i = 0

    for i in prange(n, nogil = True):
        data_count[x[i]] += 1
        data_sum[x[i]] += y[i]
        
    for i in prange(n, nogil = True):
        result[i] = (data_sum[x[i]] - y[i]) / (data_count[x[i]] - 1)
    
    return result


In [46]:
%%timeit
target_mean_v4(data['x'].values.astype(np.intc), data['y'].values.astype(np.intc))

75 µs ± 11.9 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
