In [1]:
import numpy as np
import pandas as pd

def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result



In [2]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])


In [4]:
%%timeit
result_1 = target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 24.7 s per loop


In [5]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [146]:
%%timeit
target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 260 ms per loop


In [149]:
# 验证正确性
result_2 = target_mean_v2(data, 'y', 'x')
np.linalg.norm(result_2 - result_1)

0.0

In [32]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [155]:
%%cython

import numpy as np
cimport numpy as np

cpdef target_mean_by_cython(data, y_name, x_name):
    # 注意np.ndarray[long, ndim=2, mode='fortran']此处转换的类型 ValueError: Buffer dtype mismatch, expected 'int' but got 'long'
    cdef np.ndarray[long, ndim=2, mode='fortran'] arg = np.asfortranarray(data, dtype=int)
    cdef list result = [0] * data.shape[0]
    value_dict = dict()
    count_dict = dict()
    keys = value_dict.keys()
    for i in range(arg.shape[0]):
        if arg[i, 1] not in keys:
            value_dict[arg[i, 1]] = arg[i, 0]
            count_dict[arg[i, 1]] = 1
        else:
            value_dict[arg[i, 1]] += arg[i, 0]
            count_dict[arg[i, 1]] += 1
    for i in range(arg.shape[0]):
        result[i] = (value_dict[arg[i, 1]] - arg[i, 0]) / (count_dict[arg[i, 1]] - 1)
    return result

In [156]:
%%timeit
result_3 = target_mean_by_cython(data, 'y', 'x')

1000 loops, best of 3: 1.2 ms per loop


In [157]:
# 验证正确性
result_3 = target_mean_by_cython(data, 'y', 'x')
print(np.linalg.norm(result_3 - result_2))

0.0


In [165]:
%%cython
# 不使用pthon中的dict
import numpy as np
cimport numpy as np
import cython
cimport cython
from cython.parallel import prange
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_by_no_dict(data, y_name, x_name):

    cdef int shape = data.shape[0]
    cdef np.ndarray[double] result = np.zeros(shape, dtype = np.float)
    cdef np.ndarray[double] values = np.zeros(shape, dtype = np.float)
    cdef np.ndarray[double] counts = np.zeros(shape, dtype = np.float)

    cdef np.ndarray[long] x_values = data[x_name].values
    cdef np.ndarray[long] y_values = data[y_name].values

    cdef int i=0

    for i in range(shape):
        values[x_values[i]] += y_values[i]
        counts[x_values[i]] += 1

    for i in range(shape):
        result[i] = (values[x_values[i]] - y_values[i]) / (counts[x_values[i]] -1 )
    return result

In [166]:
%%timeit
result_4 = target_mean_by_no_dict(data, 'y', 'x')

The slowest run took 19.57 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 36.8 µs per loop


In [167]:
# 验证正确性
result_4 = target_mean_by_no_dict(data, 'y', 'x')
print(np.linalg.norm(result_4 - result_2))

0.0


In [194]:
%%cython

import numpy as np
cimport numpy as np
import cython
cimport cython
from cython.parallel import prange


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_by_openmp(data, y_name, x_name):
    cdef int shape = data.shape[0]
    cdef np.ndarray[double] result = np.zeros(shape, dtype = np.float64)
    cdef np.ndarray[double] values = np.zeros(shape, dtype = np.float64)
    cdef np.ndarray[double] counts = np.zeros(shape, dtype = np.float64)
    cdef np.ndarray[long] x_values = data[x_name].values
    cdef np.ndarray[long] y_values = data[y_name].values

    cdef int i=0
    # 可能存在race condition
    for i in prange(shape, nogil=True):
        values[x_values[i]] += y_values[i]
        counts[x_values[i]] += 1

    for i in prange(shape, nogil=True):
        result[i] = (values[x_values[i]] - y_values[i]) / (counts[x_values[i]] -1 )
    return result


In [195]:
%%timeit
result_5 = target_mean_by_openmp(data, 'y', 'x')

The slowest run took 54.42 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 43.1 µs per loop


In [196]:
# 验证正确性
result_5 = target_mean_by_no_dict(data, 'y', 'x')
print(np.linalg.norm(result_5 - result_2))

0.0
