In [None]:
# explanation
''' Hey friends.. I'm writing some programs and they work but gosh
do they run slowly. I've been flirting with ways to increase the 
speed of some of the simpler operations. For example, I'm doing a lot
of iteration over 3d matrices (neuroimages of course) that I feel like
could be sped up.

I know this is possible because I drastically sped up one of my
functions by just doing it a bit differently. I will show you
what I mean with a very simple example: I'll just add one to
every element in the matrix (the actual code was obviously more 
complicated)'''

In [2]:
import numpy as np
import math
# a tiny 3d matrix to practice on: the integers 0..17 arranged as shape (3, 3, 2)
mtx = np.arange(18).reshape(3,3,2)

In [17]:
# the original (slow, basic bitch) way
def spd_tst_old(mtx):
    """Add one to every element of a 3-d array, in place, with nested index loops.

    This is the plain-Python baseline: one element is read and written per
    innermost iteration. The array that was passed in is modified and
    returned (no copy is made).
    """
    n_rows, n_cols, n_deep = mtx.shape
    for r in range(n_rows):
        for c in range(n_cols):
            for d in range(n_deep):
                mtx[r, c, d] = mtx[r, c, d] + 1
    return mtx

In [18]:
# working with a flattened matrix
def spd_tst_flat(mtx):
    """Add one to every element of ``mtx`` in place, walking it as a flat sequence.

    Iterating ``for i in mtx.flat`` and rebinding ``i`` only changes the loop
    variable and leaves the array untouched, so each new value is written
    back through the flat iterator by position instead.

    Parameters
    ----------
    mtx : numpy.ndarray
        Array of any shape; modified in place.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in.
    """
    flat = mtx.flat
    for idx in range(mtx.size):
        # Assigning through ``flat`` writes into mtx itself, not a copy.
        flat[idx] = flat[idx] + 1
    return mtx

In [19]:
# making use of nditer

def spd_tst_elp(mtx):
    """Add one to every element of ``mtx`` in place using ``np.nditer``.

    With the ``external_loop`` flag the iterator hands back 1-d chunks
    rather than single elements, so the addition itself runs inside NumPy.
    The input array is modified and returned.
    """
    chunk_iter = np.nditer(mtx, flags=['external_loop'], op_flags=['readwrite'])
    for chunk in chunk_iter:
        # ``chunk[...] =`` writes into the underlying array; a bare
        # ``chunk = ...`` would only rebind the loop variable.
        chunk[...] = chunk + 1
    return mtx

In [None]:
# now we test their speeds!

In [20]:
%%timeit
spd_tst_old(mtx)

The slowest run took 883.14 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 31.5 µs per loop


In [21]:
%%timeit
spd_tst_flat(mtx)

The slowest run took 2874.58 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 5.69 µs per loop


In [22]:
%%timeit
spd_tst_elp(mtx)

The slowest run took 2802.23 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 8.68 µs per loop


In [None]:
# You can see that the latter two methods sped things up by 3-5x. Actually,
# I ended up using the principles from the nditer method and it 
# increased the speed of my function by more like 10-50x. 

In [8]:
# My next issue, however, required not just indexing points in a matrix, but
# rather, slices. In other words, in a 3d matrix with shape x,y,z, iterating 
# over x*y 1d slices and performing a function on them. In order to find a 
# better method, I again started to experiment with a simple example, where
# I do this and take the mean. Here are a few things I have tried:

In [12]:
# Using basic python iteration

def arr_tst_old(mtx):
    """Return the mean of each last-axis slice of a 3-d array as a flat list.

    Baseline version: the first two axes are walked with plain Python index
    loops and ``np.mean`` is called once per slice. Results are ordered with
    the second axis varying fastest.
    """
    n_rows, n_cols, _ = mtx.shape
    return [
        np.mean(mtx[r][c][:])
        for r in range(n_rows)
        for c in range(n_cols)
    ]

In [2]:
# iterating on converted 2d matrix
def arr_tst_reshp(mtx):
    """Return the mean of each last-axis slice of a 3-d array as a flat list.

    The first two axes are collapsed into one with ``reshape`` so that a
    single loop over rows replaces the nested index loops.
    """
    n_rows, n_cols, depth = mtx.shape
    rows = mtx.reshape(n_rows * n_cols, depth)
    return [np.mean(row) for row in rows]

In [3]:
# Trying to use nditer
def arr_tst_nditer(data,axis):
    """Return the mean of every 1-d slice of ``data`` taken along ``axis``.

    A dummy array shaped like ``data`` with ``axis`` removed is iterated
    with ``np.nditer`` purely to enumerate its multi-indices; each one is
    expanded into a full index for ``data`` by inserting ``slice(None)`` at
    position ``axis``.

    Parameters
    ----------
    data : numpy.ndarray
        Input array.
    axis : int
        Axis along which each slice is taken. Negative values count from
        the last axis.

    Returns
    -------
    list
        One mean per slice, in the iteration order of the dummy array.
    """
    # Normalise a negative axis first: list.insert(-1, ...) would place the
    # slice *before* the last index rather than at the end.
    if axis < 0:
        axis += data.ndim
    mnz = []
    yshape = list(data.shape)
    del yshape[axis]
    # Only the multi_index of this array is used, never its values.
    y = np.zeros(yshape, data.dtype)
    it = np.nditer(y, flags=['multi_index'])
    while not it.finished:
        xindex = list(it.multi_index)
        xindex.insert(axis, slice(None))
        # Index with a tuple: a list that contains a slice is not accepted
        # as a multi-dimensional index by current NumPy.
        mnz.append(np.mean(data[tuple(xindex)]))
        it.iternext()
    return mnz

In [4]:
# Trying to use ndindex
def arr_tst_nindx(mtx):
    """Return the mean of each last-axis slice of ``mtx`` using ``np.ndindex``.

    ``np.ndindex`` enumerates every index tuple over all axes except the
    last. Indexing with that tuple directly selects the whole trailing-axis
    slice, so the function works for any number of leading axes rather than
    only for 3-d input.

    Parameters
    ----------
    mtx : numpy.ndarray
        Input array with at least one axis.

    Returns
    -------
    list
        One mean per trailing-axis slice, in C (row-major) index order.
    """
    return [np.mean(mtx[tup]) for tup in np.ndindex(mtx.shape[:-1])]

In [10]:
%%timeit 
arr_tst_old(mtx)

10000 loops, best of 3: 154 µs per loop


In [11]:
%%timeit
arr_tst_reshp(mtx)

10000 loops, best of 3: 147 µs per loop


In [13]:
%%timeit
arr_tst_nditer(mtx,2)

1000 loops, best of 3: 180 µs per loop


In [14]:
%%timeit
arr_tst_nindx(mtx)

1000 loops, best of 3: 227 µs per loop


In [None]:
# You can see that none of the methods is giving me a good speed-up.
# This is really disappointing -- I KNOW there's a way to do this. I
# guess a good option is cython but I'm really afraid of it. But you
# guys work with shit like this all the time so I was wondering if 
# maybe you had some suggestions??

# <3 <3 <3 <3 and edam cheese!
# -- Jake (or as the Dutch call me, phonetically, Yakob Foggghhhel)

In [None]:
# Below are some things I tried with the suggestions I got....

In [6]:
def arr_tst_map(mtx):
    """Return the mean of each last-axis slice of a 3-d array as a flat list.

    The array is flattened to ``(x*y, z)`` and ``np.mean`` is applied to
    each row through ``map``.
    """
    n_rows, n_cols, depth = mtx.shape
    flat_rows = mtx.reshape(n_rows * n_cols, depth)
    return list(map(np.mean, flat_rows))

In [8]:
def arr_tst_map2(mtx):
    """Return the mean of each last-axis slice, computed as row sum / row length.

    The array is flattened to ``(x*y, z)``, ``np.sum`` is mapped over the
    rows, and each total is divided by ``z`` as a float.
    """
    n_rows, n_cols, depth = mtx.shape
    flat_rows = mtx.reshape(n_rows * n_cols, depth)
    means = []
    for row_total in map(np.sum, flat_rows):
        means.append(row_total / float(depth))
    return means

In [7]:
%%timeit
arr_tst_map(mtx)

10000 loops, best of 3: 135 µs per loop


In [10]:
%%timeit 
arr_tst_map2(mtx)

The slowest run took 243.73 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 58.9 µs per loop


In [13]:
# sanity check: the plain-loop and map-based versions should return the same list of means
a = arr_tst_old(mtx)
b = arr_tst_map(mtx)
a == b

True

In [16]:
def sd_tst_np(mtx):
    """Return the standard deviation of each last-axis slice of a 3-d array.

    The array is flattened to ``(x*y, z)`` and ``np.std`` is called once
    per row; the results are returned as a flat list.
    """
    n_rows, n_cols, depth = mtx.shape
    flat_rows = mtx.reshape(n_rows * n_cols, depth)
    return [np.std(row) for row in flat_rows]

In [53]:
def sd_tst_map(mtx):
    """Return the population standard deviation of each last-axis slice.

    The array is flattened to ``(x*y, z)``. Row means are computed as
    sum / z, subtracted from each row, and the square root of the mean
    squared deviation is taken per row.

    Parameters
    ----------
    mtx : numpy.ndarray
        3-d input array of shape ``(x, y, z)``.

    Returns
    -------
    list of float
        ``x*y`` standard deviations, second axis varying fastest.
    """
    x,y,z = mtx.shape
    mtx_rsl = mtx.reshape(x*y,z)
    means = [i/float(z) for i in map(np.sum, mtx_rsl)]
    # Column vector of row means, sized from the input (not a fixed 9) so
    # that it broadcasts against mtx_rsl for any x and y.
    mean_col = np.array(means).reshape(x*y,1)
    sq_dev = np.subtract(mtx_rsl,mean_col)**2
    sds = [math.sqrt(i) for i in map(np.mean,sq_dev)]
    return sds
        

In [88]:
def sd_tst_map2(mtx):
    """Return per-slice means and population standard deviations as 2-d arrays.

    The array is flattened to ``(x*y, z)``. Means are row sum / z; variances
    are the row sum of squared deviations / z.

    Parameters
    ----------
    mtx : numpy.ndarray
        3-d input array of shape ``(x, y, z)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(means, sds)``, each of shape ``(x, y)``.
    """
    x,y,z = mtx.shape
    mtx_rsl = mtx.reshape(x*y,z)
    means = np.array([i/float(z) for i in map(np.sum, mtx_rsl)]).reshape(x,y)
    # Column vector of row means, sized from the input (not a fixed 9) so
    # that it broadcasts against mtx_rsl for any x and y.
    sq_dev = np.subtract(mtx_rsl,means.reshape(x*y,1))**2
    var = [i/float(z) for i in map(np.sum,sq_dev)]
    sds = np.array([math.sqrt(k) for k in var]).reshape(x,y)
    return(means,sds)

In [106]:
def sd_tst_map3(mtx):
    """Return per-slice means and population standard deviations with a trailing unit axis.

    The array is flattened to ``(x*y, z)``. Means are row sum / z; variances
    are the row sum of squared deviations / z.

    Parameters
    ----------
    mtx : numpy.ndarray
        3-d input array of shape ``(x, y, z)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(means, sds)``, each of shape ``(x, y, 1)``.
    """
    x,y,z = mtx.shape
    mtx_rsl = mtx.reshape(x*y,z)
    means = np.array([i/float(z) for i in map(np.sum, mtx_rsl)]).reshape(x,y,1)
    # Column vector of row means, sized from the input (not a fixed 9) so
    # that it broadcasts against mtx_rsl for any x and y.
    sq_dev = np.subtract(mtx_rsl,means.reshape(x*y,1))**2
    var = [i/float(z) for i in map(np.sum,sq_dev)]
    sds = np.array([math.sqrt(k) for k in var]).reshape(x,y,1)
    return(means,sds)

In [116]:
def sd_tst_map4(mtx):
    """Return per-slice means and population standard deviations with a trailing unit axis.

    Same computation as the ``np.sum``-based variants, but the row totals
    are taken with Python's built-in ``sum``.

    Parameters
    ----------
    mtx : numpy.ndarray
        3-d input array of shape ``(x, y, z)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(means, sds)``, each of shape ``(x, y, 1)``.
    """
    x,y,z = mtx.shape
    mtx_rsl = mtx.reshape(x*y,z)
    means = np.array([i/float(z) for i in map(sum, mtx_rsl)]).reshape(x,y,1)
    # Column vector of row means, sized from the input (not a fixed 9) so
    # that it broadcasts against mtx_rsl for any x and y.
    sq_dev = np.subtract(mtx_rsl,means.reshape(x*y,1))**2
    var = [i/float(z) for i in map(sum,sq_dev)]
    sds = np.array([math.sqrt(k) for k in var]).reshape(x,y,1)
    return(means,sds)

In [35]:
%%timeit
sd_tst_np(mtx)

1000 loops, best of 3: 400 µs per loop


In [55]:
%%timeit
sd_tst_map(mtx)

1000 loops, best of 3: 200 µs per loop


In [101]:
%%timeit
sd_tst_map2(mtx)

1000 loops, best of 3: 200 µs per loop


In [107]:
%%timeit
sd_tst_map3(mtx)

1000 loops, best of 3: 203 µs per loop


In [117]:
%%timeit
sd_tst_map4(mtx)

10000 loops, best of 3: 135 µs per loop


In [113]:
def tst_mnsd_old(mtx):
    """Return per-slice means and standard deviations using nested index loops.

    Baseline version: ``np.mean`` and ``np.std`` are called once for every
    last-axis slice of the 3-d input.

    Parameters
    ----------
    mtx : numpy.ndarray
        3-d input array of shape ``(x, y, z)``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(means, sds)``, each a float array of shape ``(x, y)``.
    """
    x,y,z = mtx.shape
    means = np.full((x,y),np.nan)
    sds = np.full((x,y),np.nan)
    for i in range(x):
        for j in range(y):
            vals = mtx[i][j][:]
            means[i][j] = np.mean(vals)
            sds[i][j] = np.std(vals)
    # ndarray.reshape returns a new array and does not modify its receiver,
    # so the former bare ``means.reshape(x,y,1)`` / ``sds.reshape(x,y,1)``
    # statements had no effect and were dropped. The (x, y) shape callers
    # already receive is kept; use ``means[..., np.newaxis]`` at the call
    # site if a trailing unit axis is needed.
    return means,sds

In [90]:
%%timeit
tst_mnsd_old(mtx)

1000 loops, best of 3: 573 µs per loop


In [92]:
%%timeit
sd_tst_map2(mtx)

1000 loops, best of 3: 197 µs per loop


In [114]:
%%timeit
tst_mnsd_old(mtx)

1000 loops, best of 3: 569 µs per loop


In [115]:
%%timeit
sd_tst_map3(mtx)

1000 loops, best of 3: 204 µs per loop


In [None]:
''' Okay, now for something different. I want to speed up a different
function now and will test some other ways of speeding things up'''

In [3]:
# a larger test array: the integers 0..999 arranged as shape (10, 10, 10)
bigr_mtx = np.arange(1000).reshape(10,10,10)

In [10]:
# random 0/1 array of shape (10, 10, 10): 0 with probability 1/5, 1 with probability 4/5
# NOTE(review): no random seed is set here, so r_msk differs on every run
r_msk = np.random.choice([0, 1], size=(10,10,10), p=[1./5, 4./5])
# masked_array hides entries whose mask is True, so logical_not masks the positions where r_msk is 0
m_mtx = np.ma.masked_array(bigr_mtx,np.logical_not(r_msk))

In [36]:
def iter_tst(mtx):
    """Add 10 to every element of a 3-d array, one flattened row at a time.

    The input is viewed as ``(x*y, z)``; each row is shifted by 10 and the
    rows are stacked back into a new ``(x*y, z)`` array, which is returned.
    """
    n_rows, n_cols, depth = mtx.shape
    shifted_rows = []
    for row in mtx.reshape(n_rows * n_cols, depth):
        shifted_rows.append(row + 10)
    return np.array(shifted_rows).reshape(n_rows * n_cols, depth)

In [50]:
%%timeit
omtx = iter_tst(m_mtx.data)

The slowest run took 5.21 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 262 µs per loop


In [52]:
# NOTE(review): omtx is only assigned inside %%timeit cells in this notebook, and names
# assigned there are presumably not kept in the notebook namespace -- confirm where this
# omtx comes from. Row 1 of iter_tst(bigr_mtx) would be 20..29, which is not what the
# recorded output shows, so this looks like leftover state from an earlier run.
print(omtx[1][:])

[10 11 22 23 24 25 26 27 28 29]


In [51]:
%%timeit
omtx = iter_tst(bigr_mtx)

1000 loops, best of 3: 258 µs per loop
