In [1]:
%load_ext cython

In [2]:
%%cython --compile-args=-fopenmp --link-args=-fopenmp
#cython: language_level=3, boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True
cimport cython
cimport libc.math as cmath

from cython.parallel import parallel, prange
from mlgrad.inventory cimport _dot

import numpy as np
cimport numpy as np

cpdef copy(double[::1] to, double[::1] src):
    cdef Py_ssize_t i
    for i in range(to.shape[0]):
        to[i] = src[i] + cmath.sin(i)
    
cpdef copy_openmp(double[::1] to, double[::1] src):
    cdef Py_ssize_t i
    for i in prange(to.shape[0], nogil=True, schedule='static', num_threads=4):
        to[i] = src[i] + cmath.sin(i)

cdef void _matrix_dot(const double *A, const double *x, const Py_ssize_t m, const Py_ssize_t n, double *y):
    cdef Py_ssize_t j

    for j in prange(m, nogil=True, schedule='static', num_threads=4):
        y[j] = _dot(A+j*n, x, n)
    
cpdef matrix_dot(double[:,::1] A, double[::1] x, double[::1] y):
    _matrix_dot(&A[0,0], &x[0], A.shape[0], A.shape[1], &y[0])


Content of stderr:
In file included from /home/intellimath/.local/lib/python3.11/site-packages/numpy/core/include/numpy/ndarraytypes.h:1948,
                 from /home/intellimath/.local/lib/python3.11/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
                 from /home/intellimath/.local/lib/python3.11/site-packages/numpy/core/include/numpy/arrayobject.h:5,
                 from /home/intellimath/.cache/ipython/cython/_cython_magic_376da53109574eeac8bab26cab5700a2c2b83a55.c:1184:
      |  ^~~~~~~

In [3]:
A = np.full(1000, 1, 'd')
B = np.full(1000, -1, 'd')
%timeit copy(A, B)
%timeit copy_openmp(A, B)

13.1 µs ± 324 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
6.13 µs ± 15 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [4]:
A = np.random.random((1000,10))
x = np.random.random(10)
y = np.zeros(10, 'd')

%timeit A @ x
%timeit matrix_dot(A,x,y)


9.93 µs ± 1.14 µs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
5.26 µs ± 363 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
