Jupyter notebookでcythonを使う
==
http://qiita.com/kenmatsu4/items/7c08a85e41741e95b9ba

Cython関数
--

In [3]:
%%cython -n test_cython_code
def fib(int n):
    cdef int i
    cdef double a=0.0, b=1.0

    for i in range(n):
        a, b = a+b, a
    return a

def primes(int kmax):
    cdef int n, k, i
    cdef int p[1000]
    result = []

    if kmax > 1000:
        kmax = 1000

    k = 0
    n = 2
    while k < kmax:
        i = 0
        while i < k and n % p[i] != 0:
            i += 1

        if i == k:
            p[k] = n
            k += 1
            result.append(n)
        n += 1
    return result

Python関数
--

In [4]:
# 性能比較用　Python関数
def pyfib(n):
    a, b = 0.0, 1.0
    for i in range(n):
        a, b = a+b, a
    return a

def pyprimes(kmax):
    p = np.zeros(1000)
    result = []

    # 最大個数は1000個
    if kmax > 1000:
        kmax = 1000

    k = 0
    n = 2
    while k < kmax:
        i = 0
        while i < k and n % p[i] != 0:
            i += 1

        if i == k:
            p[k] = n
            k += 1
            result.append(n)
        n += 1
    return result

計算時間を比較
--

In [5]:
%timeit fib(1000)
%timeit pyfib(1000)

1000000 loops, best of 3: 1.03 µs per loop
10000 loops, best of 3: 52.1 µs per loop


numpy.ndarryをつかったCython関数
--

In [23]:
%%cython -n sample_calc 
cimport numpy as np

cpdef np.ndarray[double] sample_calc(np.ndarray col_a, np.ndarray col_b, np.ndarray col_c):
    # 各列の型チェック
    assert (col_a.dtype == np.float and col_b.dtype == np.float and col_c.dtype == np.float)

    # 各列のサイズが同じであることをチェック
    cdef Py_ssize_t n = len(col_c)
    assert (len(col_a) == len(col_b) == n)
    cdef np.ndarray[double] res = np.empty(n)

    # (a-b)/c という計算をする
    for i in range(n):
        res[i] = (col_a[i] - col_b[i])/col_c[i]
    return res

In [24]:
np.random.seed(71)
n_data = 10**5
X = pd.DataFrame(np.random.normal(size=3*n_data).reshape((n_data,3)), columns=["a", "b", "c"])
print(X.shape)
print(X.head())

(100000, 3)
          a         b         c
0 -0.430603 -1.193928 -0.444299
1  0.489412 -0.451557  0.585696
2  1.177320 -0.965009  0.218278
3 -0.866144 -0.323006  1.412919
4 -0.712651 -1.362191 -1.705966


In [25]:
sample_calc(X.a.values, X.b.values, X.c.values)

array([-1.71804336,  1.60658332,  9.81468496, ..., -0.44683095,
        0.46970409, -0.28352272])

In [26]:
# 比較用
def pysample_calc(col_a, col_b, col_c):
    # 各列の型チェック
    assert (col_a.dtype == np.float and col_b.dtype == np.float and col_c.dtype == np.float)

    # 各列のサイズが同じであることをチェック
    n = len(col_c)
    assert (len(col_a) == len(col_b) == n)
    res = np.empty(n)

    # (a-b)/c という計算をする
#     for i in range(n):
#         res[i] = (col_a[i] - col_b[i])/col_c[i]
    res = (col_a - col_b) / col_c
    return res

In [27]:
%timeit sample_calc(X.a.values, X.b.values, X.c.values)
%timeit pysample_calc(X.a.values, X.b.values, X.c.values)

10 loops, best of 3: 22 ms per loop
1000 loops, best of 3: 602 µs per loop
