In [1]:
%load_ext cython

In [1]:
import time
import numpy as np

In [18]:
%%cython

# Compiled (Cython) cell: NumPy is brought in twice -- `import` for the
# Python-level API and `cimport` for the C-level declarations.
import numpy as np
cimport numpy as np
import cython
# NOTE(review): prange is used by mydot below. The %%cython magic that opens
# this cell passes no OpenMP compile/link flags; per the Cython parallelism
# docs the loop then runs serially. Confirm whether flags such as
# `--compile-args=-fopenmp --link-args=-fopenmp` (GCC/Clang) were intended.
from cython.parallel import prange

# C-level alias for NumPy's double type; not referenced elsewhere in this cell.
ctypedef np.double_t cDOUBLE
# Python-level NumPy dtype object for 64-bit floats.
DOUBLE = np.float64


@cython.wraparound(False)
@cython.boundscheck(False)
@cython.nonecheck(False)
def mydot(double[:,::1] a not None, double[:,::1] b not None):
    """Return the matrix product of two C-contiguous float64 matrices.

    Parameters
    ----------
    a : double[:, ::1]
        Left operand, shape (M, N).
    b : double[:, ::1]
        Right operand, shape (N, K).

    Returns
    -------
    double[:, ::1]
        Typed memoryview over a newly allocated (M, K) float64 array.
        Wrap it in ``np.asarray`` if an ndarray is required.

    Raises
    ------
    ValueError
        If ``a.shape[1] != b.shape[0]``.
    TypeError
        If either argument is None.

    Notes
    -----
    Rows of the result are distributed over threads with ``prange``; each
    iteration writes only its own output row, so no synchronisation is needed.
    NOTE(review): the loop only runs in parallel if the extension was built
    with OpenMP enabled -- confirm the build flags.
    """
    cdef double[:,::1] c
    cdef Py_ssize_t i, j, k, M, N, K
    cdef double *ai
    cdef double *ci
    cdef double *bb
    cdef double *bj

    M = a.shape[0]
    N = a.shape[1]
    K = b.shape[1]

    # Bounds checking is disabled and the kernel uses raw pointers, so a
    # mismatched inner dimension would read past the end of b instead of
    # raising. Validate explicitly before touching any memory.
    if b.shape[0] != N:
        raise ValueError(
            'shapes ({},{}) and ({},{}) not aligned: {} (dim 1) != {} (dim 0)'.format(
                M, N, b.shape[0], K, N, b.shape[0]))

    c = np.zeros((M, K), dtype=np.float64)
    if M == 0 or N == 0 or K == 0:
        # Nothing to accumulate; also avoids forming &x[0,0] on an empty buffer.
        return c

    # Loop-invariant base pointer of b; only read inside the parallel region.
    bb = &b[0,0]
    for i in prange(M, schedule='static', nogil=True):
        ai = &a[i,0]
        ci = &c[i,0]
        for j in range(N):
            # Row j of b; walking k over it keeps the inner loop unit-stride
            # in both b and c.
            bj = bb + j*K
            for k in range(K):
                ci[k] += ai[j]*bj[k]
    return c


@cython.wraparound(False)
@cython.boundscheck(False)
@cython.nonecheck(False)
cdef void multiply(double *a, double *b, double *c, Py_ssize_t N, Py_ssize_t K) nogil:
    # Accumulate into c: for every k in [0, K), c[k] += sum_j a[j] * b[j*K + k].
    # a is indexed over [0, N), c over [0, K), and b is addressed as a flat
    # buffer with a stride of K elements per step of j. c is added to, not
    # overwritten, so the caller controls its initial contents.
    cdef Py_ssize_t row, col
    cdef double *b_row
    for row in range(N):
        # Start of the K-element slice of b that pairs with a[row].
        b_row = b + row * K
        for col in range(K):
            c[col] += a[row] * b_row[col]



In [19]:
# Compare np.dot with the Cython mydot on the same random inputs and check
# that both produce the same product.
a = np.random.random((10000,500))
b = np.random.random((500,2000))

# perf_counter is the clock meant for measuring elapsed intervals; unlike
# time.time it is unaffected by system clock adjustments.
start = time.perf_counter()
c = np.dot(a, b)
dot_seconds = time.perf_counter() - start
print('finished dot: {} s'.format(dot_seconds))

start = time.perf_counter()
c2 = mydot(a, b)
mydot_seconds = time.perf_counter() - start
print('finished mydot: {} s'.format(mydot_seconds))

# Print the verdict as before, then fail loudly so a wrong kernel cannot be
# mistaken for a merely slow one when the notebook is run end to end.
passed = np.allclose(c, c2)
print('Passed test:', passed)
assert passed, 'mydot result differs from np.dot'

finished dot: 0.2402336597442627 s
finished mydot: 7.9641923904418945 s
Passed test: True


In [5]:
# Micro-benchmark: inner product of two 1-D arrays of one million random
# values, comparing the `@` operator with a direct call to np.dot.
# Note: this rebinds `a` and `b`, replacing any arrays of the same name
# created by other cells.
a = np.random.rand(1000000)
b = np.random.rand(1000000)

# Module-level alias so the timed statement calls `dot` directly rather than
# going through the `np.dot` attribute lookup.
dot = np.dot

# The timed statements are kept on their own lines, with no trailing
# comments, so that only the expression itself is measured.
%timeit c=a @ b
%timeit c=dot(a,b)


999 µs ± 37.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.02 ms ± 88.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
