In [1]:
import time
from math import sqrt, ceil
import numpy as np
import scipy


#### a) Create Data

In [93]:
# Benchmark dimensions: X has n rows and d columns; sample_size rows are drawn from it.
n = int(11e6)
d = 28
sample_size = int(10e6)

# np.random.rand already returns a freshly allocated C-ordered (row-major)
# array, so wrapping it in np.array(...) would only add a second full copy.
X = np.random.rand(n, d)

# Same shape, but converted to Fortran (column-major) memory layout so the
# effect of memory order on row sampling can be measured.
X2 = np.array(np.random.rand(n, d), order='F')

In [18]:
timeit np.random.choice(range(n), sample_size, replace=False)

1 loop, best of 3: 3.18 s per loop


In [19]:
timeit np.random.permutation(n)[0:sample_size]

1 loop, best of 3: 581 ms per loop


In [96]:
# Integer index: the first sample_size entries of a random permutation of
# 0..n-1, i.e. sample_size distinct row numbers in random order.
int_idx = np.random.permutation(n)[:sample_size]

# Boolean mask of length n that is True exactly at the sampled row numbers.
bool_idx = np.zeros(n, dtype=bool)
bool_idx[int_idx] = True


#### b) Sample rows

In [102]:
# Number of timed repetitions per benchmark variant.
runs = 30

##### 1. np.take / np.compress

- boolean indexing (np.compress) is faster than integer indexing (np.take)

In [103]:
# Benchmark row sampling along axis 0 of the row-major array X:
# np.take with integer indices vs. np.compress with a boolean mask.
# time.clock() was removed in Python 3.8; time.perf_counter() is used instead.

# --- integer indices via np.take ---
time_l = []
for i in range(runs):
    # Allocate a result-sized array before the timed region ("preinitialization");
    # it is rebound by the timed call below and not included in the measurement.
    _X = np.zeros((sample_size, d))
    t_0 = time.perf_counter()
    _X = np.take(X, int_idx, axis=0)
    t_1 = time.perf_counter()
    time_l.append(t_1 - t_0)
print('### integer indexing:')
print ('avg= ', (sum(time_l))/runs)
print ('sd= ', np.std(time_l))

# --- boolean mask via np.compress ---
time_l = []
for i in range(runs):
    _X = np.zeros((sample_size, d))
    t_0 = time.perf_counter()
    _X = np.compress(bool_idx, X, axis=0)
    t_1 = time.perf_counter()
    time_l.append(t_1 - t_0)
print('### boolean indexing:')
print ('avg= ', (sum(time_l))/runs)
print ('sd= ', np.std(time_l))



### integer indexing:
avg=  2.223499169067812
sd=  0.158276333685
### boolean indexing:
avg=  1.2602577272118956
sd=  0.128531334833


- without preinitialization (allocating `_X` with np.zeros before the timed call): slower

In [104]:
# Benchmark np.compress with the boolean mask on X, this time WITHOUT
# allocating a result-sized array before the timed region.
# time.clock() was removed in Python 3.8; time.perf_counter() is used instead.
time_l = []
for i in range(runs):
    # Preinitialization deliberately disabled for this variant:
    #_X=np.zeros((sample_size,d))
    t_0 = time.perf_counter()
    _X = np.compress(bool_idx, X, axis=0)
    t_1 = time.perf_counter()
    time_l.append(t_1 - t_0)
print('### boolean indexing w/o preinitialization:')
print ('avg= ', (sum(time_l))/runs)
print ('sd= ', np.std(time_l))



### boolean indexing w/o preinitialization:
avg=  2.265117617727022
sd=  1.62556371877


- accessing rows in a column-major array is slower

In [53]:
# Benchmark np.compress with the boolean mask on X2, the column-major
# (order='F') copy of the data, to measure the cost of selecting rows when
# rows are not contiguous in memory.
# time.clock() was removed in Python 3.8; time.perf_counter() is used instead.
time_l = []
for i in range(runs):
    # Allocate a result-sized array before the timed region ("preinitialization").
    _X = np.zeros((sample_size, d))
    t_0 = time.perf_counter()
    _X = np.compress(bool_idx, X2, axis=0)
    t_1 = time.perf_counter()
    time_l.append(t_1 - t_0)
print('### boolean indexing column major:')
print ('avg= ', (sum(time_l))/runs)
print ('sd= ', np.std(time_l))



### boolean indexing column major:
avg=  7.155123254208775
sd=  2.13494011109


##### 2. Fancy Indexing


- slower than np.take / np.compress


In [45]:
# Benchmark row sampling with NumPy fancy indexing on X:
# X[int_idx, :] (integer index array) vs. X[bool_idx, :] (boolean mask).
# time.clock() was removed in Python 3.8; time.perf_counter() is used instead.

# --- integer index array ---
time_l = []
for i in range(runs):
    # Allocate a result-sized array before the timed region ("preinitialization").
    _X = np.zeros((sample_size, d))
    t_0 = time.perf_counter()
    _X = X[int_idx, :]
    t_1 = time.perf_counter()
    time_l.append(t_1 - t_0)
print('### integer indexing:')
print ('avg= ', (sum(time_l))/runs)
print ('sd= ', np.std(time_l))

# --- boolean mask ---
time_l = []
for i in range(runs):
    _X = np.zeros((sample_size, d))
    t_0 = time.perf_counter()
    _X = X[bool_idx, :]
    t_1 = time.perf_counter()
    time_l.append(t_1 - t_0)
print('### boolean indexing:')
print ('avg= ', (sum(time_l))/runs)
print ('sd= ', np.std(time_l))



### integer indexing:
avg=  3.0003420948268817
sd=  0.338203225645
### boolean indexing:
avg=  1.8057687839254413
sd=  0.0920401465134
