<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#my_timeit" data-toc-modified-id="my_timeit-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>my_timeit</a></span></li><li><span><a href="#$|x|_2$" data-toc-modified-id="$|x|_2$-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>$|x|_2$</a></span></li><li><span><a href="#$\sum_j-X_{ij}-Y_{jk}$" data-toc-modified-id="$\sum_j-X_{ij}-Y_{jk}$-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>$\sum_j X_{ij} Y_{jk}$</a></span></li><li><span><a href="#$\sum_j-X_{ij}-Y_{kj}$" data-toc-modified-id="$\sum_j-X_{ij}-Y_{kj}$-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>$\sum_j X_{ij} Y_{kj}$</a></span></li><li><span><a href="#$\sum_i-X_{ij}-Y_{ij}$" data-toc-modified-id="$\sum_i-X_{ij}-Y_{ij}$-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>$\sum_i X_{ij} Y_{ij}$</a></span></li><li><span><a href="#$\sum_j-X_{ij}-Y_{ij}$" data-toc-modified-id="$\sum_j-X_{ij}-Y_{ij}$-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>$\sum_j X_{ij} Y_{ij}$</a></span></li><li><span><a href="#solve-vs-lstsq" data-toc-modified-id="solve-vs-lstsq-7"><span class="toc-item-num">7&nbsp;&nbsp;</span><code>solve</code> vs <code>lstsq</code></a></span></li></ul></div>

In [28]:
import numpy as np
import scipy as sp
import scipy.linalg
import sys
# Record the interpreter and library versions the timings below were taken with.
print(sys.version)
print(np.__version__)
print(sp.__version__)

# Best-effort CPU report. The cpuinfo module is treated as optional: a missing
# package or a failed probe must not stop the notebook, but the reason is
# printed instead of being silently discarded (a bare ``except`` would also
# swallow KeyboardInterrupt and SystemExit).
try:
    import cpuinfo
    info = cpuinfo.get_cpu_info()
    # The CPU name is read from "brand" if present, otherwise from "brand_raw".
    if "brand" in info:
        print(info['brand'])
    elif "brand_raw" in info:
        print(info['brand_raw'])
    # Cache sizes are printed largest level first, only when reported.
    for key in ("l3_cache_size", "l2_cache_size", "l1_cache_size"):
        if key in info:
            print(key, info[key])
except ImportError:
    print("cpuinfo module not available; skipping CPU details")
except Exception as exc:
    print("could not query CPU details:", exc)

3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
1.19.2
1.5.2
AMD Ryzen 7 3700X 8-Core Processor
l3_cache_size 33554432
l2_cache_size 4194304


## my_timeit

In [2]:
import timeit

# Placeholder bindings: my_timeit forwards these module-level names to the
# timed statement, so they must exist before the benchmark cells rebind them.
X = x = Y = A = y = None


def my_timeit(stat):
    """Time the statement string ``stat`` and print its per-loop duration.

    The statement is handed to ``timeit.Timer`` with only ``sp``, ``np``,
    ``X``, ``x``, ``Y``, ``y`` and ``A`` visible; their values are read from
    the module globals at call time.  ``Timer.autorange()`` picks the loop
    count, the measured total is divided by that count, and the result is
    printed after the left-aligned statement in µs, ms or s.

    Returns nothing.
    """
    namespace = {"sp":sp, "np": np, "X": X, "x": x, "Y": Y, "y":y, "A": A}
    loops, total = timeit.Timer(stat, globals=namespace).autorange()
    per_loop = total / loops

    # Choose the display unit from the per-loop time, then print once.
    if per_loop < 1E-3:
        template, shown = "%-50s [%.1f µs]", per_loop * 1E6
    elif per_loop < 1:
        template, shown = "%-50s [%.1f ms]", per_loop * 1E3
    else:
        template, shown = "%-50s [%.1f s]", per_loop
    print(template % (stat, shown))

## $|x|_2$

In [3]:
N = 1000
x = np.random.randn(N)

# Time four ways of reducing x against itself, in the order listed.
for stmt in (
    # np.dot / np.inner reduce x with itself without a temporary array
    # (both yield the squared norm; no square root is taken).
    "np.dot(x,x)",
    "np.inner(x,x)",
    # Original author's note: np.linalg.norm evaluates the vector 2-norm as
    # np.sum(conj(x)*x) -- NOTE(review): not re-verified against NumPy here.
    "np.linalg.norm(x, ord=2)",
    # Original author's note: sp.linalg.norm uses BLAS for the vector 2-norm.
    "sp.linalg.norm(x, ord=2, check_finite=False)",
):
    my_timeit(stmt)

np.dot(x,x)                                        [1.2 µs]
np.inner(x,x)                                      [1.2 µs]
np.linalg.norm(x, ord=2)                           [4.2 µs]
sp.linalg.norm(x, ord=2, check_finite=False)       [2.3 µs]


## $\sum_j X_{ij} Y_{jk}$

In [4]:
N, M = 1000, 999
X = np.random.randn(N, M)
Y = np.random.randn(M, N)

# Sanity check: the einsum spelling must agree with the plain matrix product.
assert np.allclose(
    X.dot(Y),
    np.einsum("ij,jk->ik", X, Y),
)

# Show the contraction list computed for the optimized einsum; it is printed
# twice, exactly as in the original cell.
# NOTE(review): einsum_call=True does not appear in the documented
# np.einsum_path signature -- confirm it is accepted by the NumPy in use.
# (Alternative kept from the original: np.einsum_path("ij,jk->ik", X, Y)[1])
for attempt in range(2):
    print(np.einsum_path("ij,jk->ik", X, Y, optimize=True, einsum_call=True)[1])

for stmt in (
    # Original note: memory order is not good here (the summed index j runs
    # along the first axis of Y).
    "X.dot(Y)",
    # The broadcast form
    #   np.sum(X[:,:,np.newaxis]*Y[np.newaxis,:,:], axis=1)
    # is deliberately not timed: it needs a huge temporary array.
    'np.einsum("ij,jk->ik", X, Y)',
    'np.einsum("ij,jk->ik", X, Y, optimize=True)',
):
    my_timeit(stmt)


[((1, 0), {'j'}, 'jk,ij->ik', ['ik'], True)]
[((1, 0), {'j'}, 'jk,ij->ik', ['ik'], True)]
X.dot(Y)                                           [28.3 ms]
np.einsum("ij,jk->ik", X, Y)                       [417.7 ms]
np.einsum("ij,jk->ik", X, Y, optimize=True)        [28.6 ms]


## $\sum_j X_{ij} Y_{kj}$

In [5]:
N = 1000
M = 999
X = np.random.randn(N, M)
Y = np.random.randn(N, M)

# Sanity check on the full arrays: einsum must agree with X @ Y.T.
assert np.allclose(X.dot(Y.T), np.einsum("ij,kj->ik", X, Y))

# Cross-check against the explicit broadcast-and-sum definition on a small
# slice only.  On the full arrays the broadcast product has shape (N, N, M),
# i.e. 1000*1000*999 float64 values (about 8 GB) -- the "huge temp. memory"
# that also keeps this form out of the timings below.
X_small, Y_small = X[:50], Y[:50]
assert np.allclose(
    np.sum(X_small[:, np.newaxis, :] * Y_small[np.newaxis, :, :], axis=-1),
    np.einsum("ij,kj->ik", X_small, Y_small),
)

# Original note: memory order is not good for this product.
my_timeit("X.dot(Y.T)")
# Not timed because of the huge temporary array:
#   np.sum(X[:,np.newaxis,:]*Y[np.newaxis,:,:], axis=-1)
my_timeit('np.einsum("ij,kj->ik",X, Y, optimize=False)')
my_timeit('np.einsum("ij,kj->ik",X, Y, optimize=True)')


X.dot(Y.T)                                         [67.8 ms]
np.einsum("ij,kj->ik",X, Y, optimize=False)        [412.8 ms]
np.einsum("ij,kj->ik",X, Y, optimize=True)         [63.8 ms]


## $\sum_i X_{ij} Y_{ij}$

In [6]:
N, M = 1000, 999
X = np.random.randn(N, M)
Y = np.random.randn(N, M)

# Sanity check: summing the elementwise product over axis 0 must match the
# "ij,ij->j" einsum.
assert np.allclose((X * Y).sum(axis=0), np.einsum("ij,ij->j", X, Y))

# np.sum first materialises X*Y as a temporary array; einsum avoids it.
for stmt in (
    "np.sum(X*Y, axis=0)",
    'np.einsum("ij,ij->j",X, Y)',
):
    my_timeit(stmt)

np.sum(X*Y, axis=0)                                [2.5 ms]
np.einsum("ij,ij->j",X, Y)                         [725.5 µs]


## $\sum_j X_{ij} Y_{ij}$

In [7]:
N, M = 1000, 999
X = np.random.randn(N, M)
Y = np.random.randn(N, M)

# Sanity check: summing the elementwise product over the last axis must match
# the "ij,ij->i" einsum.
assert np.allclose((X * Y).sum(axis=-1), np.einsum("ij,ij->i", X, Y))

# np.sum first materialises X*Y as a temporary array; einsum avoids it.
for stmt in (
    "np.sum(X*Y, axis=1)",
    'np.einsum("ij,ij->i",X, Y)',
):
    my_timeit(stmt)

np.sum(X*Y, axis=1)                                [2.9 ms]
np.einsum("ij,ij->i",X, Y)                         [620.3 µs]


## `solve` vs `lstsq`

In [8]:
N = 1000
A = np.random.randn(N, N)
y = np.random.randn(N)

# Compare ways of solving the square system A @ sol = y.
for stmt in (
    "np.linalg.solve(A, y)",
    "np.linalg.lstsq(A, y, rcond=None)",
    # SVD on its own, then SVD followed by applying V * (1/S) * U^T to y.
    "U,S,VT=np.linalg.svd(A)",
    "U,S,VT=np.linalg.svd(A); VT.T.dot(1/S*U.T.dot(y))",
):
    my_timeit(stmt)

np.linalg.solve(A, y)                              [10.0 ms]
np.linalg.lstsq(A, y, rcond=None)                  [161.4 ms]
U,S,VT=np.linalg.svd(A)                            [261.9 ms]
U,S,VT=np.linalg.svd(A); VT.T.dot(1/S*U.T.dot(y))  [276.4 ms]
