# Rules to Speed up Your Python Code
- Decrease the use of `for loop`
> this is the RULE NO.1.
- Use the List Comprehensions
> ```[i for i in range (1, 1000) if i%3 == 0]``` is faster than `for loop + append()` 
- Use the Built-In Functions
> Such as ```map(), sum(), mean(), median(), max(), min()```
- Function Calls Are Expensive
> It is better to iterate inside a function than to iterate and call a function each iteration.
- Use Numpy and Scipy
> This is RULE NO.2. Search a solution in Numpy and Scipy first before you code it up.
- Avoid Lazy Module Importing
>```python
>from math import sqrt # but not from math import *
>val = sqrt(60)
>```
- Be Careful with Bulky Libraries
> Reduce the number and size of dependencies, keep your code simple.
- Avoid Global Variables
> Local Variables lead to less memory usage and higher performance, it is simply best to avoid global variables when possible.
- Adopt Proper Algorithms and Data Structures
> - Algorithms are like science, while optimization is like technique. When the current technique can not take you to a galaxy far far away, try to find a worm whole.
> - Data structure provides the right way to organize information in the digital space. Consider using `FITS`, `HDF5`, `Pandas`, and so on, before start your astronomical research.

# Ways to Speed Up your Python code
- Use Exsiting Packages: Numpy/Scipy/Numba
- Python-C/C++/Fortran Integration
- Mutiprocessing/MPI4py/Ctypes+OpenMP

# Benchmark
Simpson Integration with Python

∫^𝑥𝑖+1_𝑥𝑖−1𝑓(𝑥)𝑑𝑥=ℎ3(𝑓(𝑥𝑖−1)+4𝑓(𝑥𝑖)+𝑓(𝑥𝑖+1))+𝑂(ℎ5).


- 1.Native Python
- 2.Numba
- 3.Numpy and Scipy
- 4.Ctypes/Cython
- 5.OpenMP+Ctypes
- 6.Multiprocessing
- 7.MPI4py

In [1]:
# with native python
# if N % 2 == 1:
#     raise ValueError("N must be an even integer.")

from math import sin, pi
def func_python(x): 
    return sin(x)

def simpson_python(f,a,b,N=100000000):
    dx = (b-a)/N
    S = 0
    for i in range(1, N, 2):
        x_m1 = a+(i-1)*dx
        x_i =  a+(i)*dx
        x_p1 = a+(i+1)*dx
        y_m1 = f(x_m1)
        y_i = f(x_i)
        y_p1 = f(x_p1)
        S += dx/3 * (y_m1 + 4.0*y_i + y_p1)
    return S

In [2]:
%%time

simpson_python(func_python, 0.0, pi/2)

CPU times: user 26.2 s, sys: 36.3 ms, total: 26.2 s
Wall time: 26.3 s


0.9999999999997551

In [2]:
# with Numpy
import numpy as np

def func_numpy(x): 
    return np.sin(x)

def simpson_numpy(f,a,b,N=100000000):
    dx = (b-a)/N
    x = np.linspace(a,b,N+1)
    y = f(x)
    S = dx/3 * np.sum(y[0:-1:2] + 4*y[1::2] + y[2::2])
    return S

In [4]:
%%time

simpson_numpy(func_numpy, 0.0, np.pi/2)

CPU times: user 1.38 s, sys: 501 ms, total: 1.88 s
Wall time: 1.88 s


0.9999999999999988

In [4]:
%%time

from scipy import integrate as spi
spi.quad(func_numpy, 0, np.pi/2)

CPU times: user 91.8 ms, sys: 48.7 ms, total: 141 ms
Wall time: 212 ms


(0.9999999999999999, 1.1102230246251564e-14)

In [3]:
%%time

N_arr = 100000001; # is this number even or odd? 
a_arr = 0.0; 
b_arr = np.pi/2;
x_arr = np.linspace(a_arr, b_arr, N_arr)
y_arr = func_numpy(x_arr)

CPU times: user 1.13 s, sys: 325 ms, total: 1.45 s
Wall time: 1.5 s


In [4]:
def simpson_python_arr(y, x):
    N = len(x)
    dx = x[1] - x[0]
    S = 0
    for i in range(1, N, 2):
        S += dx/3 * (y[i-1] + 4.0*y[i] + y[i+1])
    return S

In [5]:
%%time

print(simpson_python_arr(y_arr, x_arr))

0.9999999999997551
CPU times: user 31.3 s, sys: 42.9 ms, total: 31.4 s
Wall time: 31.4 s


In [6]:
%%time

def simpson_numpy_arr(y, x):
    S = (x[1]-x[0])/3 * np.sum(y[0:-1:2] + 4*y[1::2] + y[2::2])
    return S

print(simpson_numpy_arr(y_arr, x_arr))

0.9999999999999988
CPU times: user 355 ms, sys: 118 ms, total: 473 ms
Wall time: 477 ms


In [7]:
%%time
# with Scipy.integrate.simps
from scipy import integrate as spi

print(spi.simps(y_arr, x_arr))

0.9999999999999976
CPU times: user 1.93 s, sys: 1.99 s, total: 3.92 s
Wall time: 4.4 s


In [10]:
%%time
# with Numba

from numba import jit

@jit(nopython=True)
def simpson_numba(y, x):
    N = len(x)
    dx = x[1] - x[0]
    S = 0
    for i in range(1, N, 2):
        S += dx/3 * (y[i-1] + 4.0*y[i] + y[i+1])
    return S

print(simpson_numba(y_arr, x_arr))

0.9999999999997551
CPU times: user 530 ms, sys: 114 ms, total: 644 ms
Wall time: 804 ms


In [12]:
# gcc-11 -O2 --shared example.c -o example.so -Wall
# gcc-11 -O2 --shared example.c -o example.so -fopenmp  -Wall

In [8]:
import ctypes as ct
ctypes_example = ct.CDLL("./example.so")

In [9]:
ctypes_example.simpson_c.argtypes = [np.ctypeslib.ndpointer(dtype = ct.c_double), \
                                     np.ctypeslib.ndpointer(dtype = ct.c_double), \
                                     ct.c_long]
ctypes_example.simpson_c.restype  = ct.c_double

def call_simpson_c(y, x):
    N_in = ct.c_long(len(x))
    y_in = np.array(y, dtype=ct.c_double)
    x_in = np.array(x, dtype=ct.c_double)
    res = ctypes_example.simpson_c(y_in, x_in, N_in)
    return res

In [10]:
ctypes_example.simpson_c_omp.argtypes = [np.ctypeslib.ndpointer(dtype = ct.c_double), \
                                         np.ctypeslib.ndpointer(dtype = ct.c_double), \
                                         ct.c_long, ct.c_int]
ctypes_example.simpson_c_omp.restype  = ct.c_double

def call_simpson_c_omp(y, x, nThreads=4):
    N_in = ct.c_long(len(x))
    y_in = np.array(y, dtype=ct.c_double)
    x_in = np.array(x, dtype=ct.c_double)
    nThreads_in = ct.c_int(nThreads)
    res = ctypes_example.simpson_c_omp(y_in, x_in, N_in, nThreads_in)
    return res

In [16]:
%%time

print(call_simpson_c_omp(y_arr, x_arr, nThreads=8))

1.0000000000000182
CPU times: user 688 ms, sys: 391 ms, total: 1.08 s
Wall time: 831 ms


In [12]:
%%time

print(call_simpson_c(y_arr, x_arr))

1.000000005235743
CPU times: user 488 ms, sys: 403 ms, total: 892 ms
Wall time: 891 ms


# with mpi4py

>```bash
>(base) [nanli@tulip ~]$ time mpirun -np 4 python mpi4py_integrate_arrays.py
>
>With n =  100000001 bins, 
>integral of sine(x) from 0.0 to 1.5707963267948966 = 1.0000000000000338 
>real    0m13.090s
>user    0m48.931s
>sys     0m2.418s
>```

```python
import numpy as np

def simpson_local_arr(y, x, i_start, i_end):
    dx = x[1] - x[0]
    S = 0
    if i_end >= len(x):
        i_end = int(len(x))
    for i in range(i_start, i_end, 2):
        S += dx/3 * (y[i-1] + 4.0*y[i] + y[i+1])
    return S

from mpi4py import MPI
if __name__ == '__main__':
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    dest=0
    total=-1.0

    N_arr = 100000001; # is this number even or odd? 
    a_arr = 0.0; 
    b_arr = np.pi/2;
    x_arr = np.linspace(a_arr, b_arr, N_arr)
    y_arr = np.sin(x_arr)

    local_n = int(N_arr/size)
    i_s = 1+rank*local_n
    i_e = i_s + local_n
    
    integral = simpson_local_arr(y_arr, x_arr, i_s, i_e)

    if rank == 0:
        total = integral
        for source in range(1,size):
            integral = comm.recv(source=source)
            print("PE ", rank, "<-", source, ",", integral, "\n")
            total = total + integral
    else:
        print("PE ", rank, "->", dest, ",", integral, "\n")
        comm.send(integral, dest=0)

    if (rank == 0):
        print("\n")
        print("With n = ", N_arr, "bins, \n")
        print("integral of sine(x) from", a_arr, "to", b_arr, "=", total, "\n")
    MPI.Finalize
```

# mpi4py 2
>
>```bash
>(base) [nanli@tulip ~]$ time mpirun -np 4 python mpi4py_Sum_arrays.py 
>
>S =  [1.00000001]
>
>real    0m22.751s
>user    1m24.098s
>sys     0m5.683s
>```
>

```python
import numpy as np

def simpson_local_arr(y, x, i_start, i_end):
    dx = x[1] - x[0]
    S = 0
    if i_end >= len(x):
        i_end = int(len(x))
    for i in range(i_start, i_end, 2):
        S += dx/3 * (y[i-1] + 4.0*y[i] + y[i+1])
    return S

from mpi4py import MPI
if __name__ == '__main__':
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    
    dest=0
    total=-1.0

    N_arr = 100000001; # is this number even or odd? 
    a_arr = 0.0; 
    b_arr = np.pi/2;
    x_arr = np.linspace(a_arr, b_arr, N_arr)
    y_arr = np.sin(x_arr)

    comm.Bcast([y_arr, MPI.DOUBLE], root=0)
    comm.Bcast([x_arr, MPI.DOUBLE], root=0)

    if rank == 0:
        sendbuf = np.arange(N_arr)
        ave, res = divmod(len(sendbuf), size)
        count = [ave + 1 if p < res else ave for p in range(size)]
        count = np.array(count)
        displ = [sum(count[:p]) for p in range(size)]
        displ = np.array(displ)
    else:
        sendbuf = None
        count = np.zeros(size, dtype=int)
        displ = np.zeros(size, dtype=int)
    comm.Bcast(count, root=0)
    comm.Bcast(displ, root=0)
    recvbuf = np.zeros(count[rank])
    comm.Scatterv([sendbuf, count, displ, MPI.DOUBLE], recvbuf, root=0)
    
    S_loc = np.array([0.0])
    comm.Bcast([S_loc, MPI.DOUBLE], root=0)    
    i_s = displ[rank]
    i_e = displ[rank]+count[rank]
    S_loc[0] = simpson_local_arr(y_arr, x_arr, i_s, i_e)
    comm.Barrier()
    
    if rank==0:
        S_tot = np.array([0.0])
    else:
        S_tot = None

    comm.Reduce(
        [S_loc, MPI.DOUBLE],
        [S_tot, MPI.DOUBLE],
        op=MPI.SUM,
        root=0)

    if rank==0:
        print("S = ", S_tot)
```


In [16]:
def evaluate(name, func):
    st = time.time()
    out = [func(x) for x in range(10)]
    print('%s[%f]' % (name, time.time() - st))
    return out

Ctypes[0.000017]
Python[0.009069]
numpy[0.001311]


In [36]:
def simpson_tmp_arr(y):
    N = int(len(y))
    dx = x_arr[1] - x_arr[0]
    S = 0
    for i in range(1, N, 2):
        S += dx/3 * (y[i-1] + 4.0*y[i] + y[i+1])
    return S

In [37]:
cores = 8

y_parts = np.array_split(y_arr, cores)
x_parts = np.array_split(x_arr, cores)

with Pool(cores) as p:
    parts = p.map(simpson_tmp_arr, y_parts)

Process SpawnPoolWorker-100:
Traceback (most recent call last):
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'simpson_tmp_arr' on <module '__main__' (built-in)>
Process SpawnPoolWorker-101:
Traceback (most recent call last):
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    

KeyboardInterrupt: 

In [None]:
def multiprocess(y, x, cores=8):    
    y_parts = np.array_split(y, cores)
    x_parts = np.array_split(x, cores)

    with Pool(cores) as p:
        parts = p.map(simpson_tmp_arr, [y_parts, x_parts])
    return np.concatenate(parts)

In [None]:
        
multiprocess(y_arr, x_arr)

Process SpawnPoolWorker-76:
Traceback (most recent call last):
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/queues.py", line 368, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'simpson_tmp_arr' on <module '__main__' (built-in)>
Process SpawnPoolWorker-77:
Traceback (most recent call last):
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/uranus/Applications/miniconda3/lib/python3.9/multiprocessing/process.py", line 108, in run
    se