In [1]:
%pylab inline
# Keep the numba import after %pylab: the float32/float64 names used inside the
# @jit functions below are then the ones imported from numba.
from numba import jit, vectorize, float32, float64
from smartFormat import numFmt2, sepThreeTens, lowPrec, simpleFormat
import pandas as pd


Populating the interactive namespace from numpy and matplotlib


# Test speed of tensor operations

## Create test data
The input is equivalent to the $\nu_e$ and $\nu_\mu$ fluxes, each binned in $400\;E \times 400\;\cos\theta$ bins, concatenated along a new first dimension.

The resulting *input* array then has shape $2\times400\times400$, and the transform that must (effectively) left-multiply it has shape $3\times2\times400\times400$.

This hopefully represents a realistic scenario for performing an accurate oscillation calculation.

In [2]:
# Seed NumPy's global generator so every run draws the same test arrays.
np.random.seed(0)

# Double-precision operands.  The transform is drawn first, then the inputs;
# the draw order determines the values, so it must not be swapped.
xform = np.random.random_sample((3, 2, 400, 400)).astype(np.float64)
inputs = np.random.random_sample((2, 400, 400)).astype(np.float64)

# Single-precision copies of the very same values.
xform_fp32 = xform.astype(np.float32)
inputs_fp32 = inputs.astype(np.float32)

# Views onto the two (400, 400) planes of the float64 input.
input0 = inputs[0, ...]
input1 = inputs[1, ...]


## Numpy using einsum

### Float64 math on float64 inputs/transforms

In [3]:
# Baseline: einsum on the float64 operands with the calculation dtype set to float64.
# %timeit flags: -r 10 = ten repeats, -q = quiet, -o = return the result object
# (its .all_runs and .loops are read in the next cell).
ein_64m_64op = %timeit -r 10 -q -o np.einsum('ij..., j...', xform, inputs, dtype=np.float64, casting='unsafe');

In [4]:
# Median over the repeats, divided by the loop count -> seconds per call.
# This value is the baseline every later percentage is taken against.
ein_64m_64op_med = np.median(ein_64m_64op.all_runs) / ein_64m_64op.loops
print(' '.join([
    'Median time, einsum FP64 math / FP64 operands:',
    simpleFormat(ein_64m_64op_med) + ' sec',
]))


Median time, einsum FP64 math / FP64 operands: 8.983e-3 sec


In [5]:
# Reference result that the Python and Numba outputs are compared against.
# No explicit output subscripts are given; the printed shape shows the two
# bin axes first and the 3-component axis last.
output_einsum = np.einsum('ij..., j...', xform, inputs)
print(output_einsum.shape)

(400, 400, 3)


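Check that `einsum` does what is intended: for a single bin (indices 1, 10), the output should equal the $3\times2$ transform slice multiplied by the 2-element input vector.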

In [6]:
# 3x2 transform belonging to the single bin at indices (1, 10).
x = xform[:,:,1,10]
x

array([[ 0.01560606,  0.18573089],
       [ 0.10818345,  0.34420619],
       [ 0.74177861,  0.28394148]])

In [7]:
# 2-element input vector for the same bin (1, 10).
i = inputs[:,1,10]
i

array([ 0.89478297,  0.46917306])

In [8]:
# Expected result for that bin: an ordinary matrix-vector product.
o = np.dot(x, i)
o

array([ 0.10110397,  0.25829298,  0.79694856])

In [9]:
# What einsum produced for bin (1, 10); it should show the same three numbers as `o`.
output_einsum[1,10,:]

array([ 0.10110397,  0.25829298,  0.79694856])

In [10]:
# Exact element-wise equality (no tolerance) between the hand-computed and einsum values.
np.all(o == output_einsum[1,10,:])

True

### Float32 math on float64 inputs/transforms

In [11]:
# Same float64 operands as the baseline, but with the einsum calculation dtype
# set to float32; casting='unsafe' is what permits the float64 -> float32 downcast.
ein_32m_64op = %timeit -r 10 -q -o np.einsum('ij..., j...', xform, inputs, dtype=np.float32, casting='unsafe');

In [12]:
# Median over the repeats, divided by the loop count -> seconds per call.
ein_32m_64op_med = np.median(ein_32m_64op.all_runs) / ein_32m_64op.loops
print(' '.join([
    'Median time, einsum FP32 math / FP64 operands:',
    simpleFormat(ein_32m_64op_med) + ' sec',
]))
# Same timing expressed as a percentage of the einsum FP64/FP64 baseline.
print(simpleFormat(ein_32m_64op_med / ein_64m_64op_med * 100) + '% of ein64_64')


Median time, einsum FP32 math / FP64 operands: 6.057e-3 sec
67.42% of ein64_64


### Float32 math on float32 inputs/transforms

In [13]:
# float32 operands with a float32 calculation dtype; casting='no' forbids any
# dtype conversion, so this only runs because the operands are already float32.
ein_32m_32op = %timeit -r 10 -q -o np.einsum('ij..., j...', xform_fp32, inputs_fp32, dtype=np.float32, casting='no');

In [14]:
# Median over the repeats, divided by the loop count -> seconds per call.
ein_32m_32op_med = np.median(ein_32m_32op.all_runs) / ein_32m_32op.loops
print(' '.join([
    'Median time, einsum FP32 math / FP32 operands:',
    simpleFormat(ein_32m_32op_med) + ' sec',
]))
# Same timing expressed as a percentage of the einsum FP64/FP64 baseline.
print(simpleFormat(ein_32m_32op_med / ein_64m_64op_med * 100) + '% of ein64_64')

Median time, einsum FP32 math / FP32 operands: 2.983e-3 sec
33.2% of ein64_64


## Python looping

In [15]:
def apply_python(inputs, transform):
    """Apply a per-bin 3x2 transform to a 2-component input using plain Python loops.

    Parameters
    ----------
    inputs : array indexed as ``[j, k, l]`` with ``j`` in {0, 1}
    transform : array indexed as ``[i, j, k, l]`` with ``i`` in {0, 1, 2}
        and ``j`` in {0, 1}

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(inputs.shape[1], inputs.shape[2], 3)`` with
        ``out[k, l, i] = transform[i, 0, k, l]*inputs[0, k, l]
        + transform[i, 1, k, l]*inputs[1, k, l]``.
    """
    n_rows, n_cols = inputs.shape[1], inputs.shape[2]
    result = np.empty((n_rows, n_cols, 3), np.float64)
    for row in range(n_rows):
        for col in range(n_cols):
            # The three output components are written out explicitly (no
            # innermost loop), mirroring the Numba versions of this kernel.
            result[row, col, 0] = (transform[0, 0, row, col] * inputs[0, row, col]
                                   + transform[0, 1, row, col] * inputs[1, row, col])
            result[row, col, 1] = (transform[1, 0, row, col] * inputs[0, row, col]
                                   + transform[1, 1, row, col] * inputs[1, row, col])
            result[row, col, 2] = (transform[2, 0, row, col] * inputs[0, row, col]
                                   + transform[2, 1, row, col] * inputs[1, row, col])
    return result

### Float64 operands

In [16]:
# Time the pure-Python loop on the float64 operands (only 5 repeats here,
# versus 10 for the einsum runs).
py_64m_64op = %timeit -r 5 -q -o apply_python(inputs, xform);

In [17]:
# Median over the repeats, divided by the loop count -> seconds per call.
py_64m_64op_med = np.median(py_64m_64op.all_runs) / py_64m_64op.loops
print(' '.join([
    'Median time, Python FP64 math / FP64 operands:',
    simpleFormat(py_64m_64op_med) + ' sec',
]))
# Same timing expressed as a percentage of the einsum FP64/FP64 baseline.
print(simpleFormat(py_64m_64op_med / ein_64m_64op_med * 100) + '% of ein64_64')

Median time, Python FP64 math / FP64 operands: 7.632e-1 sec
8496% of ein64_64


In [18]:
# Correctness check: the pure-Python result must equal the einsum reference
# exactly (element-wise ==, no tolerance).
output_python = apply_python(inputs, xform)
np.all(output_python == output_einsum)

True

### Float32 operands

In [19]:
# Time the same pure-Python loop, now fed the float32 operands.
# NOTE(review): apply_python allocates a float64 output, but with float32
# operands the NumPy scalar products and sums are presumably evaluated in
# float32 before being stored -- so the "FP64 math" label used for this case
# may be inaccurate; confirm.
py_64m_32op = %timeit -r 5 -q -o apply_python(inputs_fp32, xform_fp32);

In [20]:
# Median over the repeats, divided by the loop count -> seconds per call.
py_64m_32op_med = np.median(py_64m_32op.all_runs) / py_64m_32op.loops
print(' '.join([
    'Median time, Python FP64 math / FP32 operands:',
    simpleFormat(py_64m_32op_med) + ' sec',
]))
# Same timing expressed as a percentage of the einsum FP64/FP64 baseline.
print(simpleFormat(py_64m_32op_med / ein_64m_64op_med * 100) + '% of ein64_64')

Median time, Python FP64 math / FP32 operands: 7.971e-1 sec
8873% of ein64_64


## Numba

### Float64 math on float64 operands

In [21]:
# nopython=True (was False): if Numba cannot compile this loop natively it now
# raises instead of silently falling back to object mode, which would make the
# timing meaningless and would also void nogil=True.  This matches the flags
# used for apply_numba_fp32.
@jit("float64[:,:,:](float64[:,:,:], float64[:,:,:,:])", nopython=True, nogil=True, cache=True)
def apply_numba_fp64(inputs, transform):
    """Numba-compiled float64 version of the per-bin 3x2 transform.

    Parameters
    ----------
    inputs : float64 3-D array indexed as ``[j, k, l]`` with ``j`` in {0, 1}
    transform : float64 4-D array indexed as ``[i, j, k, l]`` with ``i`` in
        {0, 1, 2} and ``j`` in {0, 1}

    Returns
    -------
    float64 array of shape ``(inputs.shape[1], inputs.shape[2], 3)`` with
    ``out[k, l, i] = transform[i, 0, k, l]*inputs[0, k, l]
    + transform[i, 1, k, l]*inputs[1, k, l]``.
    """
    N_k = inputs.shape[1]
    N_l = inputs.shape[2]
    # `float64` is the name imported from numba at the top of the notebook.
    output = np.empty((N_k, N_l, 3), float64)
    for k in range(N_k):
        for l in range(N_l):
            # The three output components are unrolled by hand, exactly as in
            # the pure-Python reference, so the two timings compare like with like.
            output[k,l,0] = (
                transform[0,0,k,l]*inputs[0,k,l] +
                transform[0,1,k,l]*inputs[1,k,l]
            )
            output[k,l,1] = (
                transform[1,0,k,l]*inputs[0,k,l] +
                transform[1,1,k,l]*inputs[1,k,l]
            )
            output[k,l,2] = (
                transform[2,0,k,l]*inputs[0,k,l] +
                transform[2,1,k,l]*inputs[1,k,l]
            )
    return output

In [22]:
# Time the Numba float64 kernel on the float64 operands (10 repeats, like the einsum runs).
nu_64m_64op = %timeit -r 10 -q -o apply_numba_fp64(inputs, xform)

In [23]:
# Median over the repeats, divided by the loop count -> seconds per call.
nu_64m_64op_med = np.median(nu_64m_64op.all_runs) / nu_64m_64op.loops
print(' '.join([
    'Median time, Numba FP64 math / FP64 operands:',
    simpleFormat(nu_64m_64op_med) + ' sec',
]))
# Same timing expressed as a percentage of the einsum FP64/FP64 baseline.
print(simpleFormat(nu_64m_64op_med / ein_64m_64op_med * 100) + '% of ein64_64')

Median time, Numba FP64 math / FP64 operands: 4.731e-3 sec
52.66% of ein64_64


In [24]:
# Correctness check: the Numba float64 result must equal the einsum reference
# exactly (element-wise ==, no tolerance).
output_numba = apply_numba_fp64(inputs, xform)
np.all(output_numba == output_einsum)

True

### Float32 math on float32 operands

In [25]:
@jit("float32[:,:,:](float32[:,:,:], float32[:,:,:,:])", nopython=True, nogil=True, cache=True)
def apply_numba_fp32(inputs, transform):
    """Numba-compiled float32 version of the per-bin 3x2 transform.

    Parameters
    ----------
    inputs : float32 3-D array indexed as ``[j, k, l]`` with ``j`` in {0, 1}
    transform : float32 4-D array indexed as ``[i, j, k, l]`` with ``i`` in
        {0, 1, 2} and ``j`` in {0, 1}

    Returns
    -------
    float32 array of shape ``(inputs.shape[1], inputs.shape[2], 3)`` with
    ``out[k, l, i] = transform[i, 0, k, l]*inputs[0, k, l]
    + transform[i, 1, k, l]*inputs[1, k, l]``.
    """
    n_rows, n_cols = inputs.shape[1], inputs.shape[2]
    # `float32` is the name imported from numba at the top of the notebook.
    result = np.empty((n_rows, n_cols, 3), float32)
    for row in range(n_rows):
        for col in range(n_cols):
            # Components unrolled by hand, same operation order as the other kernels.
            result[row, col, 0] = (transform[0, 0, row, col] * inputs[0, row, col]
                                   + transform[0, 1, row, col] * inputs[1, row, col])
            result[row, col, 1] = (transform[1, 0, row, col] * inputs[0, row, col]
                                   + transform[1, 1, row, col] * inputs[1, row, col])
            result[row, col, 2] = (transform[2, 0, row, col] * inputs[0, row, col]
                                   + transform[2, 1, row, col] * inputs[1, row, col])
    return result

In [26]:
# Time the Numba float32 kernel on the float32 operands (10 repeats).
nu_32m_32op = %timeit -r 10 -q -o apply_numba_fp32(inputs_fp32, xform_fp32)

In [27]:
# Median over the repeats, divided by the loop count -> seconds per call.
nu_32m_32op_med = np.median(nu_32m_32op.all_runs) / nu_32m_32op.loops
print(' '.join([
    'Median time, Numba FP32 math / FP32 operands:',
    simpleFormat(nu_32m_32op_med) + ' sec',
]))
# Same timing expressed as a percentage of the einsum FP64/FP64 baseline.
print(simpleFormat(nu_32m_32op_med / ein_64m_64op_med * 100) + '% of ein64_64')

Median time, Numba FP32 math / FP32 operands: 2.161e-3 sec
24.06% of ein64_64


# Show summary of timing results

## Timings in seconds

In [28]:
# (label, median seconds per call) pairs in display order.  Plain tuples replace
# the former one-key dicts, whose `.values()[0]` / `.keys()[0]` lookups only
# work where dict methods return lists (Python 2).
timing_items = [
    ('Python FP64math FP64op', py_64m_64op_med),
    ('Python FP64math FP32op', py_64m_32op_med),
    ('einsum FP64math FP64op', ein_64m_64op_med),
    ('einsum FP32math FP64op', ein_32m_64op_med),
    ('einsum FP32math FP32op', ein_32m_32op_med),
    ('Numba FP64math FP64op', nu_64m_64op_med),
    ('Numba FP32math FP32op', nu_32m_32op_med),
]
# One-row table: a Series indexed by label, turned into a frame and transposed
# so that each measurement becomes a column.
timings = pd.DataFrame(pd.Series(
    [seconds for _, seconds in timing_items],
    index=[label for label, _ in timing_items],
)).T
timings

Unnamed: 0,Python FP64math FP64op,Python FP64math FP32op,einsum FP64math FP64op,einsum FP32math FP64op,einsum FP32math FP32op,Numba FP64math FP64op,Numba FP32math FP32op
0,0.763191,0.797091,0.008983,0.006057,0.002983,0.004731,0.002161


## Timings as fraction of np.einsum 64 / 64 (baseline)

In [29]:
# Divide every timing by the einsum FP64/FP64 baseline, so the baseline column becomes 1.
timings / timings['einsum FP64math FP64op'].values

Unnamed: 0,Python FP64math FP64op,Python FP64math FP32op,einsum FP64math FP64op,einsum FP32math FP64op,einsum FP32math FP32op,Numba FP64math FP64op,Numba FP32math FP32op
0,84.957015,88.730703,1,0.674207,0.332019,0.52661,0.240554


# Computer used for test

In [30]:
# Record the CPU of the machine the timings were taken on; `!!` runs the shell
# command and returns its output as a list of lines.
!!lscpu

['Architecture:          x86_64',
 'CPU op-mode(s):        32-bit, 64-bit',
 'Byte Order:            Little Endian',
 'CPU(s):                2',
 'On-line CPU(s) list:   0,1',
 'Thread(s) per core:    1',
 'Core(s) per socket:    2',
 'Socket(s):             1',
 'NUMA node(s):          1',
 'Vendor ID:             GenuineIntel',
 'CPU family:            6',
 'Model:                 23',
 'Model name:            Intel(R) Core(TM)2 Duo CPU     P8600  @ 2.40GHz',
 'Stepping:              10',
 'CPU MHz:               2394.000',
 'CPU max MHz:           2394.0000',
 'CPU min MHz:           798.0000',
 'BogoMIPS:              4778.16',
 'Virtualization:        VT-x',
 'L1d cache:             32K',
 'L1i cache:             32K',
 'L2 cache:              3072K',
 'NUMA node0 CPU(s):     0,1']