## Let us do some tests related to the efficiency of numpy and scipy functions

In [1]:
import numpy as np

import scipy.sparse

from scipy.sparse import csr_matrix

### Generate sparse random data

In [2]:
# Problem size: A is (n x dim), x is a (dim x 1) column vector.
n = 5000
dim = 2000

# Fixed seeds so that Restart & Run All regenerates exactly the same matrices
# and every timing below is measured on identical data across runs.
SEED = 0

# density=0.01 is scipy's default; it is spelled out because it drives every
# timing in this notebook. format="csr" makes scipy hand back csr directly
# instead of the default coo result that then had to be converted.
A = scipy.sparse.random(n, dim, density=0.01, format="csr", random_state=SEED)
x = scipy.sparse.random(dim, 1, density=0.01, format="csr", random_state=SEED + 1)

# The same data in column-compressed (csc) storage for the format comparison.
A_ = A.tocsc()
x_ = x.tocsc()

### Compare csc_matrix and csr_matrix

In [3]:
# Baseline: csr matrix (A) times csr column vector (x).
%timeit A.dot(x)

321 µs ± 10.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [4]:
# csr matrix (A) times csc column vector (x_).
%timeit A.dot(x_)

364 µs ± 15.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [5]:
# csc matrix (A_) times csr column vector (x).
%timeit A_.dot(x)

152 µs ± 4.91 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [6]:
# csc matrix (A_) times csc column vector (x_).
%timeit A_.dot(x_)

104 µs ± 1.57 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


#### Let's check that the results match.

In [7]:
# Sum of absolute entry-wise differences between the product with the csc
# vector and the product with the csr vector; 0.0 means they agree exactly.
abs(A @ x_ - A @ x).sum()

0.0

#### Is NumPy faster? (dense matrix–vector product)

In [8]:
x = x.toarray().squeeze()
A = A.toarray()

In [9]:
%timeit A @ x

4.37 ms ± 137 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### Now let us sample rows (used in SGD)

In [10]:
def fake_stochastic_gradient(A, x):
    """Return the product of one randomly chosen row of ``A`` with ``x``.

    The row index is drawn uniformly from ``range(A.shape[0])`` using NumPy's
    global random state (``np.random.choice``), so results depend on the
    current seed. The product itself is whatever ``A[i].dot(x)`` yields for
    the given container types.
    """
    row_index = np.random.choice(A.shape[0])
    sampled_row = A[row_index]
    return sampled_row.dot(x)

In [11]:
# One random row of A times x.
# NOTE(review): confirm that A and x are still the csr objects at this point;
# if an earlier cell rebound them to dense arrays, this times dense NumPy, not csr.
%timeit fake_stochastic_gradient(A, x)

5.96 µs ± 153 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [12]:
# One random row of A times the csc column vector x_.
# NOTE(review): confirm that A is still the csr matrix here and has not been
# rebound to a dense array by an earlier cell.
%timeit fake_stochastic_gradient(A, x_)

70.7 ms ± 1.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [13]:
# One random row of the csc matrix A_ times x.
# NOTE(review): confirm that x is still the csr column vector here and has not
# been rebound to a dense array by an earlier cell.
%timeit fake_stochastic_gradient(A_, x)

222 µs ± 3.45 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [14]:
# One random row of the csc matrix A_ times the csc column vector x_.
%timeit fake_stochastic_gradient(A_, x_)

337 µs ± 19.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Conclusions:
### 1. Use csc for deterministic (full batch) gradient computation
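### 2. Use csr if stochastic gradients are required (caveat: the 5.96 µs timing recorded above was taken with `A` as a dense NumPy array, so row sampling from a csr matrix still needs to be measured directly)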