# GEMV (Ryzen16)

In [1]:
import iarray as ia
import numpy as np
import scipy.io
import scipy.sparse

## Dense Arrays

### In-memory

#### ironArray

In [2]:
# Global ironArray defaults for the dense benchmarks. The result is bound to
# `_` so the returned object is not echoed as cell output.
_ = ia.set_config(
    dtype=np.float32,
    codec=ia.Codec.LZ4HC,
    clevel=9,
    btune=False,
    contiguous=False,
    seed=0,
    nthreads=8,
)

In [3]:
# Matrix dimensions and the chunk/block partition used to store it.
# (Alternative partition tried earlier: chunks=(4096, 2048), blocks=(64, 2048).)
shape = (25000, 13859)
chunks = (8192, 4096)
blocks = (128, 256)

# Random matrix; the positional 3 and 2 are presumably the mean and standard
# deviation of the normal distribution -- confirm against the iarray docs.
a_ia = ia.random.normal(
    shape,
    3,
    2,
    chunks=chunks,
    blocks=blocks,
    fp_mantissa_bits=4,
)
a_ia.info

0,1
type,IArray
shape,"(25000, 13859)"
chunks,"(8192, 4096)"
blocks,"(128, 256)"
cratio,4.60


In [4]:
# Vector operand: one entry per matrix column, partitioned with the same
# chunk/block sizes as the matrix's second axis.
b_ia = ia.linspace(
    (a_ia.shape[1],),
    2,
    10,
    chunks=(a_ia.chunks[1],),
    blocks=(a_ia.blocks[1],),
)

In [5]:
%%timeit

# Benchmark only: in-memory ironArray matrix-vector product. The output is
# partitioned like the matrix's first axis. The same call is repeated in the
# next cell to keep `c_ia` for validation.
c_ia = ia.opt_gemv(a_ia, b_ia, chunks=(a_ia.chunks[0],), blocks=(a_ia.blocks[0],))

30.5 ms ± 596 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# Same product as the benchmark cell above, kept so `c_ia` can be validated
# against the NumPy result.
c_ia = ia.opt_gemv(
    a_ia,
    b_ia,
    chunks=(a_ia.chunks[0],),
    blocks=(a_ia.blocks[0],),
)

#### NumPy

In [7]:
# NumPy-side copy of the matrix (`.data` presumably decompresses the whole
# ironArray container into an ndarray -- confirm against the iarray docs).
a_np = a_ia.data

In [8]:
# NumPy-side copy of the vector operand.
b_np = b_ia.data

In [9]:
%%timeit

# Benchmark only: NumPy baseline for the same matrix-vector product.
c_np = a_np.dot(b_np)

50.3 ms ± 38.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
# Reference result used to validate the ironArray output.
c_np = a_np.dot(b_np)

In [11]:
# ironArray must agree with NumPy; float32 data gets the looser tolerance.
np.testing.assert_allclose(
    c_np,
    c_ia.data,
    rtol=1e-5 if a_ia.dtype == np.float32 else 1e-12,
)

### On-disk

#### ironArray

In [12]:
# Write a copy of the matrix to "dense.iarray" with the same partition
# (mode="w" presumably overwrites an existing file -- confirm), then reopen it
# so `a_ia2` is backed by the on-disk container.
a_ia.copy(
    chunks=chunks,
    blocks=blocks,
    urlpath="dense.iarray",
    mode="w",
)
a_ia2 = ia.open("dense.iarray")
a_ia2.info

0,1
type,IArray
shape,"(25000, 13859)"
chunks,"(8192, 4096)"
blocks,"(128, 256)"
cratio,4.60


In [13]:
%%timeit

# Benchmark only: product with the matrix opened from "dense.iarray"; the
# vector operand `b_ia` is still the in-memory one.
c_ia2 = ia.opt_gemv(a_ia2, b_ia, chunks=(a_ia.chunks[0],), blocks=(a_ia.blocks[0],))

43.8 ms ± 1.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# Same product as the benchmark cell above, kept so `c_ia2` can be validated.
c_ia2 = ia.opt_gemv(
    a_ia2,
    b_ia,
    chunks=(a_ia.chunks[0],),
    blocks=(a_ia.blocks[0],),
)

#### NumPy

In [15]:
# np.save appends the ".npy" extension, producing the "dense.npy" file that
# the on-disk NumPy benchmark loads.
np.save("dense", a_np)

In [16]:
%%timeit

# Benchmark only: every timed iteration reads "dense.npy" from disk and then
# performs the matrix-vector product.
a_np2 = np.load("dense.npy")
c_np2 = a_np2.dot(b_np)

243 ms ± 7.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Reload the matrix from "dense.npy" so the result validated below really
# comes from the on-disk copy. Names assigned inside the %%timeit cell above
# are not kept in the notebook namespace, so `a_np2` has to be bound here.
a_np2 = np.load("dense.npy")
c_np2 = a_np2.dot(b_np)

In [18]:
# On-disk ironArray result must agree with the NumPy on-disk result;
# float32 data gets the looser tolerance.
np.testing.assert_allclose(
    c_np2,
    c_ia2.data,
    rtol=1e-5 if a_ia.dtype == np.float32 else 1e-12,
)

### Bonus: High compression ratio

#### ironArray

In [19]:
# Rebinds `a_ia` to a regularly spaced matrix with the same shape and
# partition as before; the info table below reports its compression ratio.
a_ia = ia.linspace(
    shape,
    3,
    45.5,
    chunks=chunks,
    blocks=blocks,
)
a_ia.info

0,1
type,IArray
shape,"(25000, 13859)"
chunks,"(8192, 4096)"
blocks,"(128, 256)"
cratio,53.31


In [20]:
# Vector operand rebuilt against the new `a_ia` (one entry per column, same
# chunk/block sizes as the matrix's second axis).
b_ia = ia.linspace(
    (a_ia.shape[1],),
    2,
    10,
    chunks=(a_ia.chunks[1],),
    blocks=(a_ia.blocks[1],),
)

In [21]:
%%timeit

# Benchmark only: ironArray product on the linspace-generated matrix.
c_ia = ia.opt_gemv(a_ia, b_ia, chunks=(a_ia.chunks[0],), blocks=(a_ia.blocks[0],))

20.2 ms ± 125 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [22]:
# Same product as the benchmark cell above, kept so `c_ia` can be validated.
c_ia = ia.opt_gemv(
    a_ia,
    b_ia,
    chunks=(a_ia.chunks[0],),
    blocks=(a_ia.blocks[0],),
)

#### NumPy

In [23]:
# Rebind the NumPy-side matrix to the contents of the new `a_ia`.
a_np = a_ia.data

In [24]:
# Rebind the NumPy-side vector to the contents of the new `b_ia`.
b_np = b_ia.data

In [25]:
%%timeit

# Benchmark only: NumPy baseline on the linspace-generated matrix.
c_np = a_np.dot(b_np)

50 ms ± 187 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
# Reference result used to validate the ironArray output.
c_np = a_np.dot(b_np)

In [27]:
# Note the float32 tolerance here is 1e-4, looser than the 1e-5 used for the
# random-matrix checks.
np.testing.assert_allclose(
    c_np,
    c_ia.data,
    rtol=1e-4 if a_ia.dtype == np.float32 else 1e-12,
)

## Sparse Arrays

In [28]:
# Global ironArray defaults for the sparse benchmarks: unlike the dense
# section, this uses float64 data and 16 threads.
_ = ia.set_config(
    dtype=np.float64,
    codec=ia.Codec.LZ4HC,
    clevel=9,
    btune=False,
    contiguous=False,
    seed=0,
    nthreads=16,
)

### In-memory

#### SciPy

In [29]:
# Matrix Market file, given relative to the notebook's working directory.
urlpath = "./human_gene1/human_gene1.mtx"

a_sp = scipy.io.mmread(urlpath)

# NOTE: the printed value is stored entries / total entries, i.e. the fill
# ratio (density) of the matrix, even though the label reads "Sparsity".
print(f"Sparsity: {a_sp.getnnz() / np.prod(a_sp.shape)}")

Sparsity: 0.049683884579713435


In [30]:
# Rebind `shape` (previously the dense matrix shape) to the sparse matrix's
# dimensions and display it.
shape = a_sp.shape
shape

(22283, 22283)

In [31]:
# Dense vector operand: one evenly spaced value in [0, 1] per matrix column,
# using the same dtype as the sparse matrix.
b_sp = np.linspace(
    0,
    1,
    num=shape[1],
    dtype=a_sp.dtype,
)

In [32]:
%%timeit

# Benchmark only: SciPy sparse matrix times dense vector, in memory.
c_sp = a_sp.dot(b_sp)

42.6 ms ± 45.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [33]:
# Reference result used to validate the ironArray output.
c_sp = a_sp.dot(b_sp)

#### ironArray

In [34]:
# Same partition as in the dense benchmarks.
chunks, blocks = (8192, 4096), (128, 256)

# The sparse matrix is densified with toarray() before being stored as a
# compressed ironArray container.
a_ia = ia.numpy2iarray(
    a_sp.toarray(),
    chunks=chunks,
    blocks=blocks,
)
a_ia.info

0,1
type,IArray
shape,"(22283, 22283)"
chunks,"(8192, 4096)"
blocks,"(128, 256)"
cratio,9.72


In [35]:
# ironArray copy of the vector operand, partitioned with the chunk/block
# sizes of the matrix's second axis.
b_ia = ia.numpy2iarray(
    b_sp,
    chunks=(chunks[1],),
    blocks=(blocks[1],),
)

In [36]:
%%timeit

# Benchmark only: ironArray product on the densified sparse matrix.
c_ia = ia.opt_gemv(a_ia, b_ia, chunks=(chunks[0],), blocks=(blocks[0],))

72.9 ms ± 2.56 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
# Same product as the benchmark cell above, kept so `c_ia` can be validated
# against the SciPy result.
c_ia = ia.opt_gemv(
    a_ia,
    b_ia,
    chunks=(chunks[0],),
    blocks=(blocks[0],),
)

In [38]:
# ironArray must agree with SciPy; the tolerance is chosen from the dtype of
# `a_ia` (1e-5 for float32, 1e-12 otherwise).
np.testing.assert_allclose(
    c_sp,
    c_ia.data,
    rtol=1e-5 if a_ia.dtype == np.float32 else 1e-12,
)

### On-disk

#### SciPy

In [39]:
%%time

a_sp2 = scipy.io.mmread(urlpath)
c_sp2 = a_sp.dot(b_sp)

CPU times: user 13.4 s, sys: 405 ms, total: 13.8 s
Wall time: 13.4 s


In [40]:
# Recompute from `a_sp2`, the matrix loaded from disk in the %%time cell
# above, so this result reflects the on-disk path rather than the in-memory
# `a_sp`.
c_sp2 = a_sp2.dot(b_sp)

#### ironArray

In [41]:
# Write a copy of the matrix to "sparse.iarray" with the same partition
# (mode="w" presumably overwrites an existing file -- confirm), then reopen it
# so `a_ia2` is backed by the on-disk container.
a_ia.copy(
    chunks=chunks,
    blocks=blocks,
    urlpath="sparse.iarray",
    mode="w",
)
a_ia2 = ia.open("sparse.iarray")
a_ia2.info

0,1
type,IArray
shape,"(22283, 22283)"
chunks,"(8192, 4096)"
blocks,"(128, 256)"
cratio,9.72


In [42]:
%%timeit

# Benchmark only: product with the matrix opened from "sparse.iarray"; the
# vector operand `b_ia` is still the in-memory one.
c_ia2 = ia.opt_gemv(a_ia2, b_ia, chunks=(a_ia.chunks[0],), blocks=(a_ia.blocks[0],))

86.7 ms ± 1.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [43]:
# Same product as the benchmark cell above, kept so `c_ia2` stays available
# in the notebook namespace.
c_ia2 = ia.opt_gemv(
    a_ia2,
    b_ia,
    chunks=(a_ia.chunks[0],),
    blocks=(a_ia.blocks[0],),
)