# HomeWork
请将target_mean_v1代码改为 cython 代码并比较速度区别
（如可以实现并行可加分）。

In [2]:
import numpy as np
import pandas as pd


def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result


def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

def main():
    y = np.random.randint(2, size=(5000, 1))
    x = np.random.randint(10, size=(5000, 1))
    data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])
    result_1 = target_mean_v1(data, 'y', 'x')
    result_2 = target_mean_v2(data, 'y', 'x')

    diff = np.linalg.norm(result_1 - result_2)
    print(diff)

In [1]:
%load_ext Cython

In [11]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [55]:
%%cython
cimport cython
import numpy as np
cimport numpy as np
from libc.math cimport log as clog

# "ctypedef" assigns a corresponding compile-time type to DTYPE_t. For
# every type in the numpy module there's a corresponding compile-time
# type with a _t-suffix.
ctypedef np.int_t DTYPE_t

@cython.boundscheck(False)
@cython.wraparound(False)
cdef np.ndarray[float] c_target_mean_v3(np.ndarray[long] col_x, np.ndarray[long] col_y):
    
    assert (col_x.dtype == np.long 
            and col_y.dtype == np.long 
            and len(col_x) == len(col_y))
    
    # 声明并初始化循环使用的变量
    cdef int i = 0, n_rows = len(col_x)
    
    # 声明返回数据并初始化为0
    cdef np.ndarray[double] result = np.empty(n_rows)
    # 只有一个数据的情况
    if n_rows == 1:
        result[0] = col_y[i]
        return result
    
    map_dict = dict()
    for i in range(n_rows):
        if col_x[i] not in map_dict.keys():
            map_dict[col_x[i]] = list([col_y[i],1])
        else:
            map_dict[col_x[i]][0] += col_y[i]
            map_dict[col_x[i]][1] += 1
    cdef long long x, sum_y_by_x, count_x
    for i in range(n_rows):
        x = col_x[i]
        sum_y_by_x = map_dict[x][0]-col_y[i]
        count_x = map_dict[x][1]
        result[i] = sum_y_by_x/(count_x-1)
    return result

# 封装
cpdef np.ndarray[float] target_mean_v3(data, y_name, x_name):
    return c_target_mean_v3(data[x_name].to_numpy(),data[y_name].to_numpy())

In [56]:
#result_1 = target_mean_v1(data, 'y', 'x')
result_2 = target_mean_v2(data, 'y', 'x')
result_3 = target_mean_v2(data, 'y', 'x')
diff = np.linalg.norm(result_2 - result_3)
print(diff)

0.0


In [52]:
%%timeit
target_mean_v1(data, 'y', 'x')

18.5 s ± 626 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [57]:
%%timeit
target_mean_v3(data, 'y', 'x')

972 µs ± 170 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
