<a href="https://colab.research.google.com/github/lhuang-pvamu/Parallel-Computing-Code/blob/master/Python/Numba_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Numba tutorial

In [None]:
# install profilers: cProfile, line_profiler, timeit
!pip install line_profiler

Collecting line_profiler
  Downloading line_profiler-3.5.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (67 kB)
     |████████████████████████████████| 67 kB 4.1 MB/s 
Installing collected packages: line-profiler
Successfully installed line-profiler-3.5.1


In [None]:
import numpy as np
from time import sleep

def bad_call():
  """Stand in for a slow call by blocking the caller for half a second."""
  pause_seconds = 0.5
  sleep(pause_seconds)

def worse_call():
  """Stand in for an even slower call by blocking the caller for one second."""
  pause_seconds = 1
  sleep(pause_seconds)

def sumulate(nums):
  """Run a mix of costly steps so the profilers below have something to measure.

  The steps, in order: build a random 4000x4000 matrix, multiply it by itself,
  add up the integers in ``range(nums)`` with a Python loop (the total is
  discarded), then call ``bad_call`` and ``worse_call``.

  Args:
    nums: upper bound (exclusive) of the integer range summed by the loop.

  Returns:
    The 4000x4000 matrix product.
  """
  matrix = np.random.random((4000, 4000))
  product = np.matmul(matrix, matrix)  # same operation as `matrix @ matrix`

  # Pure-Python accumulation; the result is intentionally unused.
  running_total = 0
  for value in range(nums):
    running_total = running_total + value

  bad_call()    # sleeps for 0.5 second
  worse_call()  # sleeps for 1 second
  return product

In [None]:
%time b=sumulate(1000)

CPU times: user 6.77 s, sys: 246 ms, total: 7.01 s
Wall time: 5.11 s


In [None]:
b.shape

(4000, 4000)

## cProfile

In [None]:
import cProfile

In [None]:
cProfile.run('sumulate(1000)')

         9 function calls in 5.091 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    3.441    3.441    5.090    5.090 <ipython-input-33-6cdc5c1c2d98>:10(sumulate)
        1    0.000    0.000    0.501    0.501 <ipython-input-33-6cdc5c1c2d98>:4(bad_call)
        1    0.000    0.000    1.001    1.001 <ipython-input-33-6cdc5c1c2d98>:7(worse_call)
        1    0.001    0.001    5.091    5.091 <string>:1(<module>)
        1    0.000    0.000    5.091    5.091 {built-in method builtins.exec}
        2    1.502    0.751    1.502    0.751 {built-in method time.sleep}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        1    0.148    0.148    0.148    0.148 {method 'random' of 'numpy.random.mtrand.RandomState' objects}




## line_profiler

In [None]:
%load_ext line_profiler

In [None]:
%lprun -f bad_call -f worse_call sumulate(1000)

In [None]:
%lprun -T timings.txt -f sumulate sumulate(1000)


*** Profile printout saved to text file 'timings.txt'. 


## timeit

In [None]:
%timeit x=9

10000000 loops, best of 5: 20.6 ns per loop


In [None]:
%time x=9

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs


In [None]:
%%timeit
x=1
y=2
z=x+y

The slowest run took 21.10 times longer than the fastest. This could mean that an intermediate result is being cached.
10000000 loops, best of 5: 51.7 ns per loop


In [None]:
a = %timeit -qo x=5

In [None]:
a

<TimeitResult : 10000000 loops, best of 5: 20.4 ns per loop>

## Numba Jit

In [None]:
def sum_array(input): # assume the input is a 2D array
  """Add up every element of a 2D array using explicit nested loops.

  Elements are visited row by row, left to right within each row.

  Args:
    input: a 2D array exposing ``.shape`` and ``[row, col]`` indexing.

  Returns:
    The sum of all elements; the integer 0 when the array has no elements.
  """
  n_rows, n_cols = input.shape

  total = 0
  for row in range(n_rows):
    for col in range(n_cols):
      total = total + input[row, col]

  return total

In [None]:
a = np.random.random((4000,4000))

In [None]:
%time sum_array(a)

CPU times: user 3.4 s, sys: 5.98 ms, total: 3.4 s
Wall time: 3.41 s


7999963.631036575

In [None]:
seq = %timeit -o sum_array(a)

1 loop, best of 5: 3.36 s per loop


In [None]:
seq

<TimeitResult : 1 loop, best of 5: 3.36 s per loop>

In [None]:
%time a.sum()

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 13.6 ms


7999963.631036352

In [None]:
nptime = %timeit -o a.sum()

100 loops, best of 5: 10.8 ms per loop


In [None]:
# regular sequential Python vs. Numpy
speedup = seq.best / nptime.best
speedup

311.68619397171955

In [None]:
from numba import jit

In [None]:
sum_array_numba = jit()(sum_array)

In [None]:
%time sum_array_numba(a)

CPU times: user 25.8 ms, sys: 948 µs, total: 26.8 ms
Wall time: 27.7 ms


7999963.631036575

In [None]:
nbtime = %timeit -o sum_array_numba(a)

10 loops, best of 5: 25.6 ms per loop


In [None]:
seq.best / nbtime.best

130.90951091556957

In [None]:
nbtime.best/nptime.best

2.3809285650203247

In [None]:
@jit  # decorator
def sum_array(input): # assume the input is a 2D array
  """Sum all elements of a 2D array with explicit row/column loops.

  Args:
    input: a 2D array exposing ``.shape`` and ``[row, col]`` indexing.

  Returns:
    The sum of all elements, accumulated row by row.
  """
  height, width = input.shape

  acc = 0
  for r in range(height):
    for c in range(width):
      acc = acc + input[r, c]

  return acc

In [None]:
%timeit sum_array(a)

The slowest run took 5.75 times longer than the fastest. This could mean that an intermediate result is being cached.
10 loops, best of 5: 25.7 ms per loop


# How does Numba JIT work?

In [None]:
@jit
def add(a, b):
  """Return the result of ``a + b`` for whatever operand types are passed in."""
  result = a + b
  return result

In [None]:
add(1,2)

3

In [None]:
add.inspect_types()

add (int64, int64)
--------------------------------------------------------------------------------
# File: <ipython-input-70-04b5d46e4ce0>
# --- LINE 1 --- 

@jit

# --- LINE 2 --- 

def add(a, b):

  # --- LINE 3 --- 
  # label 0
  #   a = arg(0, name=a)  :: int64
  #   b = arg(1, name=b)  :: int64
  #   $6binary_add.2 = a + b  :: int64
  #   del b
  #   del a
  #   $8return_value.3 = cast(value=$6binary_add.2)  :: int64
  #   del $6binary_add.2
  #   return $8return_value.3

  return a+b




In [None]:
add(1.0, 2.1)

3.1

In [None]:
add.inspect_types()

add (int64, int64)
--------------------------------------------------------------------------------
# File: <ipython-input-70-04b5d46e4ce0>
# --- LINE 1 --- 

@jit

# --- LINE 2 --- 

def add(a, b):

  # --- LINE 3 --- 
  # label 0
  #   a = arg(0, name=a)  :: int64
  #   b = arg(1, name=b)  :: int64
  #   $6binary_add.2 = a + b  :: int64
  #   del b
  #   del a
  #   $8return_value.3 = cast(value=$6binary_add.2)  :: int64
  #   del $6binary_add.2
  #   return $8return_value.3

  return a+b


add (float64, float64)
--------------------------------------------------------------------------------
# File: <ipython-input-70-04b5d46e4ce0>
# --- LINE 1 --- 

@jit

# --- LINE 2 --- 

def add(a, b):

  # --- LINE 3 --- 
  # label 0
  #   a = arg(0, name=a)  :: float64
  #   b = arg(1, name=b)  :: float64
  #   $6binary_add.2 = a + b  :: float64
  #   del b
  #   del a
  #   $8return_value.3 = cast(value=$6binary_add.2)  :: float64
  #   del $6binary_add.2
  #   return $8return_value.3

  retur

In [None]:
for k, v in add.inspect_llvm().items():
  print(k, v)

In [None]:
%%time
# C++ PI program takes 1.5 seconds
sum = 0
num_steps = 100000000  # 100 million
step = 1.0/num_steps
for i in range(num_steps):
    x = (i-0.5)*step
    sum = sum + 4.0/(1.0+x*x)
pi = step * sum

CPU times: user 29.5 s, sys: 64.4 ms, total: 29.6 s
Wall time: 29.6 s


In [None]:
pi

3.1415926735904267

In [None]:
@jit
def pi_cal(num_steps):
  """Approximate pi by midpoint-rule integration of 4/(1+x^2) over [0, 1].

  Args:
    num_steps: number of equal-width sub-intervals; must be a positive integer.

  Returns:
    The floating-point approximation of pi.
  """
  step = 1.0 / num_steps
  total = 0.0  # float accumulator from the start; avoids shadowing builtin sum
  for i in range(num_steps):
    # range() starts at 0, so the midpoint of sub-interval i is (i + 0.5) * step.
    # Using (i - 0.5) shifts every sample one step to the left and leaves an
    # error of roughly 2 * step in the result.
    x = (i + 0.5) * step
    total += 4.0 / (1.0 + x * x)
  return step * total

In [None]:
%timeit pi = pi_cal(1000000000)

1 loop, best of 5: 1.58 s per loop


In [None]:
pi

3.1415926735904267

In [None]:
%lprun -T timings.txt -f pi_cal pi_cal(10000000)


*** Profile printout saved to text file 'timings.txt'. 


In [None]:
cProfile.run('pi_cal(1000000000)')

         4 function calls in 1.572 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    1.572    1.572    1.572    1.572 <ipython-input-3-cf5078cde8aa>:1(pi_cal)
        1    0.001    0.001    1.572    1.572 <string>:1(<module>)
        1    0.000    0.000    1.572    1.572 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}




## Comparison of C++/OpenMP, plain Python, NumPy, and Numba (CPU and NVIDIA CUDA GPU)
## Conduct the performance analysis