In [2]:
import numpy
import cupy

import scipy
import cupyx

In [3]:
# Upper bound for CuPy's default device-memory pool.
GPU_MEMORY_LIMIT_BYTES = 3 * 1024**3  # 3 GiB

mempool = cupy.get_default_memory_pool()

# set_limit applies to the current device, so make device 0 current first.
with cupy.cuda.Device(0):
    mempool.set_limit(size=GPU_MEMORY_LIMIT_BYTES)

# Dense matrices performance

Low-D, numpy, no correlation

In [5]:
numpy.random.seed(42)
G = numpy.random.randn(100, 10000)
m = numpy.ones((G.shape[1], 1))
d0 = 0.2 * numpy.ones((G.shape[0], 1))

# pre-transpose
Gt = G.T
d0t = d0.T

covariance = 1.2 * numpy.ones_like(d0)
sigma = covariance ** 0.5

# pre-multiply
GtG = Gt @ numpy.diag(1.0 / covariance[:, 0]) @ G
Gtd0 = Gt @ numpy.diag(1.0 / covariance[:, 0]) @ d0
d0td0 = (d0.T @ numpy.diag(1.0 / covariance[:, 0]) @ d0).item()

print(0.5 * numpy.linalg.norm((G @ m - d0) / sigma) ** 2)
print(0.5 * ((G @ m - d0).T @ ((G @ m - d0) / covariance)).item())
print(0.5 * ((m.T @ Gt - d0t) @ ((G @ m - d0) / covariance)).item())
print(0.5 * (m.T @ (GtG @ m - 2 * Gtd0) + d0td0).item())

%timeit -n 50 0.5 * numpy.linalg.norm((G @ m - d0) / sigma) ** 2 # Winner for wide
%timeit -n 50 0.5 * ((G @ m - d0).T @ ((G @ m - d0) / covariance)).item()
%timeit -n 50 0.5 * ((m.T @ Gt - d0t) @ ((G @ m - d0) / covariance)).item()
%timeit -n 50 0.5 * (m.T @ (GtG @ m - 2 * Gtd0) + d0td0).item() # Winner for tall

395198.59447737667
395198.59447737667
395198.59447737667
395198.5944773766
219 µs ± 68.2 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
The slowest run took 8.86 times longer than the fastest. This could mean that an intermediate result is being cached.
715 µs ± 780 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
311 µs ± 34.3 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
33.3 ms ± 758 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


Low-D, numpy, correlation

In [6]:
covariance = (
    1.2 * numpy.eye(d0.size)
    - 0.6 * numpy.eye(d0.size, k=-1)
    - 0.6 * numpy.eye(d0.size, k=1)
)
inv_covariance = numpy.linalg.inv(covariance)
cholesky_upper_inv_covariance = numpy.linalg.cholesky(inv_covariance).T

# pre-multiply
GtG = Gt @ inv_covariance @ G
Gtd0 = Gt @ inv_covariance @ d0
d0td0 = (d0.T @ inv_covariance @ d0).item()

print(0.5 * numpy.linalg.norm(cholesky_upper_inv_covariance @ (G @ m - d0)) ** 2)
print(0.5 * ((G @ m - d0).T @ inv_covariance @ ((G @ m - d0))).item())
print(0.5 * ((m.T @ Gt - d0t) @ inv_covariance @ ((G @ m - d0))).item())

%timeit -n 50 0.5 * numpy.linalg.norm(cholesky_upper_inv_covariance @ (G @ m - d0)) ** 2 # Winner for wide
%timeit -n 50 0.5 * ((G @ m - d0).T @ inv_covariance @ ((G @ m - d0))).item() 
%timeit -n 50 0.5 * ((m.T @ Gt - d0t) @ inv_covariance @ ((G @ m - d0))).item()
%timeit -n 50 0.5 * (m.T @ (GtG @ m - 2 * Gtd0) + d0td0).item() # Winner for tall

16721368.323694183
16721368.323694177
16721368.323694177
172 µs ± 3.74 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
325 µs ± 21 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
The slowest run took 10.10 times longer than the fastest. This could mean that an intermediate result is being cached.
745 µs ± 947 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
32.4 ms ± 633 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


Low-D, cupy, no correlation

In [27]:
G_gpu = cupy.array(G)
d0_gpu = cupy.array(d0)

Gt_gpu = G_gpu.T
d0t_gpu = d0_gpu.T

covariance_gpu = 1.2 * cupy.ones_like(d0_gpu)
sigma_gpu = covariance_gpu ** 0.5

print(0.5 * cupy.linalg.norm((G_gpu @ cupy.array(m) - d0_gpu) / sigma_gpu) ** 2)
print(
    0.5
    * (
        (G_gpu @ cupy.array(m) - d0_gpu).T
        @ ((G_gpu @ cupy.array(m) - d0_gpu) / covariance_gpu)
    ).item()
)
print(
    0.5
    * (
        (cupy.array(m).T @ Gt_gpu - d0t_gpu)
        @ ((G_gpu @ cupy.array(m) - d0_gpu) / covariance_gpu)
    ).item()
)

%timeit -n 50 0.5 * cupy.linalg.norm((G_gpu @ cupy.array(m) - d0_gpu) / sigma_gpu).item() ** 2 # Winner for all
%timeit -n 50 0.5 * ((G_gpu @ cupy.array(m) - d0_gpu).T @ ((G_gpu @ cupy.array(m) - d0_gpu) / covariance_gpu)).item()
%timeit -n 50 0.5 * ((cupy.array(m).T @ Gt_gpu - d0t_gpu) @ ((G_gpu @ cupy.array(m) - d0_gpu) / covariance_gpu)).item()

45613.27902699428
45613.27902699427
45613.27902699427
226 µs ± 56.5 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
194 µs ± 32.2 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
186 µs ± 20.1 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


Low-D, cupy, correlation

In [29]:
covariance_gpu = (
    1.2 * cupy.eye(d0.size)
    - 0.6 * cupy.eye(d0.size, k=-1)
    - 0.6 * cupy.eye(d0.size, k=1)
)
inv_covariance_gpu = cupy.asarray(inv_covariance)
cholesky_upper_inv_covariance_gpu = cupy.asarray(cholesky_upper_inv_covariance)

print(0.5 * cupy.linalg.norm(cholesky_upper_inv_covariance_gpu @ (G_gpu @ cupy.array(m) - d0_gpu)) ** 2)
print(0.5 * ((G_gpu @ cupy.array(m) - d0_gpu).T @ inv_covariance_gpu@ ((G_gpu @ cupy.array(m) - d0_gpu))).item())
print(0.5 * ((cupy.array(m).T @ Gt_gpu - d0t_gpu) @ inv_covariance_gpu@ ((G_gpu @ cupy.array(m) - d0_gpu) )).item())

%timeit -n 50 0.5 * cupy.linalg.norm(cholesky_upper_inv_covariance_gpu @ (G_gpu @ cupy.array(m) - d0_gpu)) ** 2 # Winner for all
%timeit -n 50 0.5 * ((G_gpu @ cupy.array(m) - d0_gpu).T @ inv_covariance_gpu@ ((G_gpu @ cupy.array(m) - d0_gpu))).item()
%timeit -n 50 0.5 * ((cupy.array(m).T @ Gt_gpu - d0t_gpu) @ inv_covariance_gpu@ ((G_gpu @ cupy.array(m) - d0_gpu) )).item()

517908.4648489458
517908.46484894573
517908.46484894573
199 µs ± 34.7 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
165 µs ± 8.52 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
159 µs ± 2.7 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


## Gradients

In [48]:
numpy.random.seed(42)
G = numpy.random.randn(2000, 100)
m = numpy.ones((G.shape[1], 1))
d0 = 0.2 * numpy.ones((G.shape[0], 1))

# pre-transpose
Gt = G.T
d0t = d0.T

covariance = 1.2 * numpy.ones_like(d0)
sigma = covariance ** 0.5

# pre-multiply
GtG = Gt @ numpy.diag(1.0 / covariance[:, 0]) @ G
Gtd0 = Gt @ numpy.diag(1.0 / covariance[:, 0]) @ d0

print((GtG @ m - Gtd0).T @ m)
print((G.T @ ((G @ m - d0) / covariance)).T @ m)
print((Gt @ ((G @ m - d0) / covariance)).T @ m)

%timeit -n 50 (GtG @ m - Gtd0) # Winner for tall
%timeit -n 50 G.T @ ((G @ m - d0) / covariance)
%timeit -n 50 Gt @ ((G @ m - d0) / covariance) # Winner for wide

[[171655.5852016]]
[[171655.5852016]]
[[171655.5852016]]
The slowest run took 7.74 times longer than the fastest. This could mean that an intermediate result is being cached.
26.4 µs ± 29.9 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
The slowest run took 6.50 times longer than the fastest. This could mean that an intermediate result is being cached.
219 µs ± 189 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
119 µs ± 40.2 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


In [13]:
covariance = (
    1.2 * numpy.eye(d0.size)
    - 0.6 * numpy.eye(d0.size, k=-1)
    - 0.6 * numpy.eye(d0.size, k=1)
)
inv_covariance = numpy.linalg.inv(covariance)
cholesky_lower_inv_covariance = numpy.linalg.cholesky(inv_covariance).T
cholesky_upper_inv_covariance = cholesky_lower_inv_covariance.T

# pre-multiply
GtG = Gt @ inv_covariance @ G
Gtd0 = Gt @ inv_covariance @ d0

# partOp = cholesky_upper_inv_covariance @ G
# partVec = - Gt @ inv_covariance @ d0

print((GtG @ m - Gtd0).T @ m)
print((G.T @ inv_covariance @ (G @ m - d0) ).T @ m)
print((Gt @ inv_covariance @ ((G @ m - d0))).T @ m)
# print((partOp.T @ partOp @ m + partVec).T @ m)

GtinvCov = Gt @ inv_covariance @ d0
preffac = cholesky_upper_inv_covariance @G
preffacT = preffac.T

%timeit -n 50 GtG @ m - Gtd0 # Winner for tall
%timeit -n 50 G.T @ inv_covariance @ (G @ m - d0)
%timeit -n 50 Gt @ inv_covariance @ ((G @ m - d0)) # Winner for wide
# %timeit -n 50 partOp.T @ partOp @ m + partVec

[[33071413.41200218]]
[[33071413.41200218]]
[[33071413.41200218]]
5.59 ms ± 858 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
5.87 ms ± 323 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


KeyboardInterrupt: 

In [52]:
G_gpu = cupy.array(G)
d0_gpu = cupy.array(d0)

Gt_gpu = G_gpu.T
d0t_gpu = d0_gpu.T

covariance_gpu = 1.2 * cupy.ones_like(d0_gpu)
sigma_gpu = covariance_gpu ** 0.5

# pre-multiply
GtG_gpu = Gt_gpu @ numpy.diag(1.0 / covariance_gpu[:, 0]) @ G_gpu
Gtd0_gpu = Gt_gpu @ numpy.diag(1.0 / covariance_gpu[:, 0]) @ d0_gpu

print((GtG_gpu @ cupy.asarray(m) - Gtd0_gpu).T @ cupy.asarray(m))
print(
    (G_gpu.T @ ((G_gpu @ cupy.array(m) - d0_gpu) / covariance_gpu)).T @ cupy.asarray(m)
)
print(
    (Gt_gpu @ ((G_gpu @ cupy.array(m) - d0_gpu) / covariance_gpu)).T @ cupy.asarray(m)
)

%timeit -n 50 (GtG_gpu @ cupy.asarray(m) - Gtd0_gpu) # Winner for tall
%timeit -n 50 (G_gpu.T @ ((G_gpu @ cupy.array(m) - d0_gpu) / covariance_gpu))
%timeit -n 50 (Gt_gpu @ ((G_gpu @ cupy.array(m) - d0_gpu) / covariance_gpu)) # Winner for wide

[[171655.5852016]]
[[171655.5852016]]
[[171655.5852016]]
55.8 µs ± 3.95 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
74.5 µs ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
73.6 µs ± 1.82 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


In [53]:
covariance_gpu = (
    1.2 * cupy.eye(d0_gpu.size)
    - 0.6 * cupy.eye(d0_gpu.size, k=-1)
    - 0.6 * cupy.eye(d0_gpu.size, k=1)
)
inv_covariance_gpu = cupy.linalg.inv(covariance_gpu)
cholesky_upper_inv_covariance_gpu = cupy.linalg.cholesky(inv_covariance_gpu).T

# pre-multiply
GtG_gpu = Gt_gpu @ inv_covariance_gpu @ G_gpu
Gtd0_gpu = Gt_gpu @ inv_covariance_gpu @ d0_gpu

print((GtG_gpu @ cupy.asarray(m) - Gtd0_gpu).T @ cupy.asarray(m))
print((G_gpu.T @ inv_covariance_gpu @ (G_gpu @ cupy.asarray(m) - d0_gpu) ).T @ cupy.asarray(m))
print((Gt_gpu @ inv_covariance_gpu @ ((G_gpu @ cupy.asarray(m) - d0_gpu))).T @ cupy.asarray(m))

%timeit -n 50 (GtG_gpu @ cupy.asarray(m) - Gtd0_gpu) # Winner for all
%timeit -n 50 (G_gpu.T @ inv_covariance_gpu @ (G_gpu @ cupy.asarray(m) - d0_gpu) )
%timeit -n 50 (Gt_gpu @ inv_covariance_gpu @ ((G_gpu @ cupy.asarray(m) - d0_gpu)))

[[5633777.08538338]]
[[5633777.08538341]]
[[5633777.08538341]]
51.9 µs ± 3.07 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
The slowest run took 255.85 times longer than the fastest. This could mean that an intermediate result is being cached.
4.76 ms ± 7.44 ms per loop (mean ± std. dev. of 7 runs, 50 loops each)
18 ms ± 306 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


# Conclusions
For high-dimensional problems, use CuPy whenever a GPU is available.