## The Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
dn = np.concatenate([y, x], axis=1)
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

## The Python Edition

In [3]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [5]:
%%timeit
result_1 = target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 24.9 s per loop


In [6]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [8]:
%%timeit
result_2 = target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 310 ms per loop


In [9]:
diff = np.linalg.norm(result_2 - result_1)
print(diff)

0.0


使用setdefault方法减少if-else结构

In [19]:
def target_mean_v3(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        value_dict.setdefault(data.loc[i, x_name], 0.0)
        value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
        count_dict.setdefault(data.loc[i, x_name], 0)
        count_dict[data.loc[i, x_name]] += 1       
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [20]:
%%timeit
result_3 = target_mean_v3(data, 'y', 'x')

1 loop, best of 3: 352 ms per loop


In [16]:
diff = np.linalg.norm(result_3 - result_1)
print(diff)

0.0


改用defaultdict，这个是群里同学发的版本。不急于求结果的话，可以只返回生成器

In [21]:
from collections import defaultdict

def target_mean_v4(data, y_name, x_name):
    X = data[x_name].values
    Y = data[y_name].values   
    value_dict = defaultdict(lambda:0.0)
    count_dict = defaultdict(lambda:0)    
    for x, y in zip(X, Y):
        value_dict[x] += y
        count_dict[x] += 1     
    result = [(value_dict[x] - y) / (count_dict[x] - 1) for x, y in zip(X, Y)]
    return result

In [27]:
%%timeit
result_4 = target_mean_v4(data, 'y', 'x')

100 loops, best of 3: 10.5 ms per loop


In [26]:
diff = np.linalg.norm(result_4 - result_1)
print(diff)

0.0


这里测试一下itertools中的groupby方法

In [30]:
from itertools import groupby
from operator import itemgetter
def target_mean_v5(data, y_name, x_name):
    X = data[x_name].values
    Y = data[y_name].values 
    value_dict = defaultdict(lambda:0.0)
    count_dict = defaultdict(lambda:0)  
    for k, g in groupby(sorted(zip(X, Y),key=lambda z:z[0]),key=itemgetter(0)):
        y_by_group = list(item[1] for item in g)
        value_dict[k] = sum(y_by_group)
        count_dict[k] = len(y_by_group)
    result = [(value_dict[x] - y) / (count_dict[x] - 1) for x, y in zip(X, Y)]        
    return result

In [33]:
%%timeit
result_5 = target_mean_v5(data, 'y', 'x')

100 loops, best of 3: 10.2 ms per loop


In [32]:
diff = np.linalg.norm(result_5 - result_1)
print(diff)

0.0


## The Cython Edition

In [34]:
%load_ext Cython

使用cnp.ndarray

In [35]:
%%cython

import cython
import numpy as np
import pandas as pd
cimport numpy as cnp

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cnp.ndarray[double] target_mean_v6(cnp.ndarray[int, ndim=2] data):
    cdef cnp.ndarray[int] x = data[:,1]
    cdef cnp.ndarray[int] y = data[:,0]
    cdef int n = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(n)
    cdef cnp.ndarray[int] value_dict = np.zeros(n).astype(np.intc)
    cdef cnp.ndarray[int] count_dict = np.zeros(n).astype(np.intc)
    cdef int i
    for i in range(n):
        value_dict[x[i]] += y[i]
        count_dict[x[i]] += 1
    for i in range(n):
        result[i] = (value_dict[x[i]] - y[i]) / (count_dict[x[i]] - 1)
    return result

In [38]:
%%timeit
result_6 = target_mean_v6(data.values.astype(np.intc))

The slowest run took 12.51 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 51.2 µs per loop


In [37]:
diff = np.linalg.norm(result_6 - result_1)
print(diff)

0.0


测试一下并行，[cython中的并行](https://cython.readthedocs.io/en/latest/src/userguide/parallelism.html)目前只支持OpenMP，在这里运行意义不大。

In [39]:
%%cython

import cython
import numpy as np
import pandas as pd
cimport numpy as cnp
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cnp.ndarray[double] target_mean_v7(cnp.ndarray[int, ndim=2] data):
    cdef cnp.ndarray[int] x = data[:,1]
    cdef cnp.ndarray[int] y = data[:,0]
    cdef int n = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(n)
    cdef cnp.ndarray[int] value_dict = np.zeros(n).astype(np.intc)
    cdef cnp.ndarray[int] count_dict = np.zeros(n).astype(np.intc)
    cdef int i
    for i in prange(n, nogil=True):
        value_dict[x[i]] += y[i]

        count_dict[x[i]] += 1
    for i in prange(n, nogil=True):
        result[i] = (value_dict[x[i]] - y[i]) / (count_dict[x[i]] - 1)
    return result

In [42]:
%%timeit
result_7 = target_mean_v7(data.values.astype(np.intc))

The slowest run took 75.19 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 57.9 µs per loop


In [41]:
diff = np.linalg.norm(result_7 - result_1)
print(diff)

0.0


使用[Typed Memoryviews](https://cython.readthedocs.io/en/latest/src/userguide/memoryviews.html#typed-memoryviews)

In [43]:
%%cython

import cython
import numpy as np
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef double[:] target_mean_v8(int[:, :] data):
    cdef int[:] x = data[:,1]
    cdef int[:] y = data[:,0]
    cdef int n = data.shape[0]
    cdef double[:] result = np.zeros(n).astype(np.double)
    cdef int[:] value_dict = np.zeros(n).astype(np.intc)
    cdef int[:] count_dict = np.zeros(n).astype(np.intc)
    cdef int i
    for i in prange(n, nogil=True):
        value_dict[x[i]] += y[i]

        count_dict[x[i]] += 1
    for i in prange(n, nogil=True):
        result[i] = (value_dict[x[i]] - y[i]) / (count_dict[x[i]] - 1)
    return result

In [57]:
%%timeit
result_8 = target_mean_v8(data.values.astype(np.intc))

The slowest run took 6.67 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 56.2 µs per loop


当然直接传numby数组的话会更快一些

In [60]:
%%timeit
result_8 = target_mean_v8(dn.astype(np.intc))

The slowest run took 20.15 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 47.2 µs per loop


In [45]:
diff = np.linalg.norm(result_8 - result_1)
print(diff)

0.0


## The Numba Edition

Numba是个jit编译器，简单高效，支持并行，使用时只需要添加装饰器即可。

In [50]:
from numba import njit, prange
import numpy as np

@njit(parallel=True)
def target_mean_v9(data):
    X = data[:,1]
    Y = data[:,0] 
    n = X.shape[0]
    value_dict = np.zeros(n)
    count_dict = np.zeros(n) 
    result = np.zeros(n)   
    for i in range(n):
        value_dict[X[i]] += Y[i]
        count_dict[X[i]] += 1
    for i in range(n):
        result[i] = (value_dict[X[i]] - Y[i]) / (count_dict[X[i]] - 1)  
    return result

In [58]:
%%timeit
result_9 = target_mean_v9(data.values)

The slowest run took 5.33 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 50.1 µs per loop


In [62]:
%%timeit
result_9 = target_mean_v9(dn)

The slowest run took 10.56 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 40.2 µs per loop


In [52]:
diff = np.linalg.norm(result_9 - result_1)
print(diff)

0.0


## References



*   6 ways to significantly speed up Pandas with a couple lines of code [Part 1](https://medium.com/swlh/6-ways-to-significantly-speed-up-pandas-with-a-couple-lines-of-code-part-1-2c2dfb0de230) [Part 2](https://medium.com/swlh/6-ways-to-significantly-speed-up-pandas-with-a-couple-lines-of-code-part-2-7a9e41ba76dc)
*   其他一些python加速工具：
[Numba](http://numba.pydata.org/)
[Dask](https://dask.org/)
[Weld](https://www.weld.rs/)
