# Cython homework

Check how fast cython can be.

In [33]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [34]:
import numpy as np
import pandas as pd

y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])
print(data)

      y  x
0     0  7
1     1  6
2     1  0
3     1  8
4     1  8
...  .. ..
4995  0  1
4996  1  0
4997  0  6
4998  1  7
4999  1  8

[5000 rows x 2 columns]


In [35]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [36]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [37]:
%%cython
import timeit
import numpy as np
cimport numpy as cnp
cimport cython
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v3(data, str y_name, str x_name):
    cdef int data_len = data.shape[0]
    cdef double[:] result = np.zeros(data_len)
    cdef long[:] x_data = data[x_name].values
    cdef long[:] y_data = data[y_name].values
    cdef long[:] values = np.zeros(10).astype(long)
    cdef long[:] counts = np.zeros(10).astype(long)
    
    cdef int i = 0
    # prange() can only be used without the GIL
    for i in prange(data_len, nogil=True):
        values[x_data[i]] += y_data[i]
        counts[x_data[i]] += 1

    for i in prange(data_len, nogil=True):
        result[i] = (values[x_data[i]] - y_data[i]) / (counts[x_data[i]] - 1)

    return result

In [41]:
%%timeit
result_1 = target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 22 s per loop


In [42]:
%%timeit
result_2 = target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 257 ms per loop


In [43]:
%%timeit
result_3 = target_mean_v3(data, 'y', 'x')

The slowest run took 4.89 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 41 µs per loop


In [47]:
diff = np.linalg.norm(result_3 - result_2)
print(diff)

0.0
