# Scratch work and preliminary performance notes for `swe-python` with QTT.

In [1]:
import os
import time
import numpy as np
import netCDF4 as nc
import argparse

from stb import strtobool

from msh import load_mesh, load_flow, \
                sort_mesh, sort_flow
from ops import trsk_mats

from _dx import HH_TINY, UU_TINY
from _dx import invariant, diag_vars, tcpu
from _dt import step_eqns

import torch as tn
import torchtt as tt

from sympy.ntheory import factorint
import math

from temp_init import init_file
from timer import Timer

## Set up

Here, we make a "fake" `cnfg` object with defaults since we don't have access to `argparse` from the notebook.

In [2]:
class base: pass

cnfg = base()

cnfg.save_freq = 100
cnfg.stat_freq = 100

cnfg.integrate = 'RK44'
cnfg.operators = 'TRSK-CV'
cnfg.equations = 'SHALLOW-WATER'
cnfg.ke_upwind = 'AUST-CONST'
cnfg.ke_scheme = 'CENTRE'
cnfg.pv_upwind = 'AUST-ADAPT'
cnfg.pv_scheme = 'UPWIND'

cnfg.du_damp_4 = 0.0
cnfg.vu_damp_4 = 0.0

cnfg.iteration = 0
cnfg.no_rotate = False

# testing with simple, quasi-linear
# test case on low resolution mesh
name = 'io/ltc1_cvt_5.nc'
path, file = os.path.split(name)
save = os.path.join(path, "out_" + file)

Load configs, init file, and build the RHS TRiSK operators.

The `flow` object stores the current model state, and the `mesh` object stores the mesh itself.

The RHS operators are all stored in the `trsk` object -- these are built as `scipy.sparse.csr_matrix` objects (compressed sparse row matricies). Note that we often see lines of the form
```
trsk.<operator> * <state_vector>
```
This performs a matrix-vector multiply, using the soon-to-be-depreciated `*` symbol, overloaded with the proper sparse matrix multiplication routine.
The modern standard is to use `@` as in `numpy` -- at present, both `*` and `@` do the same thing, with `@` to replace `*` in the next version of `scipy`.

In [3]:
print("Loading input assets...")

# load mesh + init. conditions
mesh = load_mesh(name)
flow = load_flow(name, None, lean=True)


print("Creating output file...")

init_file(name, cnfg, save, mesh, flow)


print("Reordering mesh data...")

mesh = sort_mesh(mesh, True)
flow = sort_flow(flow, mesh, lean=True)

u0_edge = flow.uu_edge[-1, :, 0]
uu_edge = u0_edge
ut_edge = u0_edge * 0.0

h0_cell = flow.hh_cell[-1, :, 0]
hh_cell = h0_cell
ht_cell = h0_cell * 0.0

hh_cell = np.maximum(HH_TINY, hh_cell)


print("Forming coefficients...")

# set sparse spatial operators
trsk = trsk_mats(mesh)

# remap fe,fc is more accurate?
flow.ff_edge = trsk.edge_stub_sums * flow.ff_vert
flow.ff_edge = \
    (flow.ff_edge / mesh.edge.area)

flow.ff_cell = trsk.cell_kite_sums * flow.ff_vert
flow.ff_cell = \
    (flow.ff_cell / mesh.cell.area)

flow.ff_cell *= (not cnfg.no_rotate)
flow.ff_edge *= (not cnfg.no_rotate)
flow.ff_vert *= (not cnfg.no_rotate)

kp_sums = np.zeros((
    cnfg.iteration // cnfg.stat_freq + 1), dtype=float)
en_sums = np.zeros((
    cnfg.iteration // cnfg.stat_freq + 1), dtype=float)


print('Done.')

Loading input assets...
Creating output file...
Reordering mesh data...
Forming coefficients...
Done.


## QTT representations by padding the data

Let's look at `trsk.edge_flux_perp`.
This has shape `(nedges, nedges)` and multiplies `uu_edge` (normal velocity at cell edges) to get the tangential velocity at cell edges, e.g.
```
vv_edge = trsk.edge_flux_perp @ uu_edge
```
For our performance tests, we will work with a random `uu_edge`.

In order to fold our data and operator tensors into a QTT compatable shape, they need to have sizes that are a power of 2. At present, `uu_edge` has size `30720`.
Luckly, not far off from from a power of 2, `2**15 = 32768`.
We will try padding the vector and operator matrix with zeros to see if that gets us anywhere vs. gettins TTs of shapes depending on a prime factorization of `30720`.

In [4]:
#uu_edge = np.random.rand(mesh.edge.size)
uu_edge = np.ones(mesh.edge.size)
print(f'nedges = {mesh.edge.size}')

npad = 2**15 - mesh.edge.size
print(f'npad = {npad}')

nedges = 30720
npad = 2048


In [5]:
uu_edge_pad = np.pad(uu_edge, [(0, npad)])
print(f'uu_edge_pad.shape = {uu_edge_pad.shape}')


# need to get dense version of the operator
dense_edge_flux_perp = trsk.edge_flux_perp.todense()

dense_edge_flux_perp_pad = np.pad(dense_edge_flux_perp, [(0, npad), (0, npad)])
print(f'dense_edge_flux_perp_pad.shape = {dense_edge_flux_perp_pad.shape}')

uu_edge_pad.shape = (32768,)
dense_edge_flux_perp_pad.shape = (32768, 32768)


Now that we have the padded vector and operator, we know that their QTT shapes will be `qtt_tens_shape = [2] * L` and `qtt_op_shape = [(2, 2)] * L`, where `nedges = 2**L`.

In [6]:
# shapes for QTT representations
L = int(math.log(uu_edge_pad.shape[0], 2))
qtt_tens_shape = [2] * L
qtt_op_shape = [(2, 2)] * L


print(f'qtt_tens_shape = {qtt_tens_shape}')
print(f'qtt_op_shape = {qtt_op_shape}')

qtt_tens_shape = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
qtt_op_shape = [(2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2)]


Now, let's form the TT operator and corresponding TT tensor, which will have shapes `qtt_op_shape` and `qtt_tens_shape` respectivly.

Note that:
- The time to compute and the compression is not any better here with QTT, versus the TT with shapes defined by the prime factorization of `nedges`.
    - Actually slightly worse, but the initial vectors are larger so this is fine. 

In [7]:
qtt_op_timer = Timer()
qtt_round_op_timer = Timer()

qtt_tens_timer = Timer()

# form the tt tensor
qtt_tens_timer.start()
qtt_uu_edge = tt.TT(uu_edge_pad, qtt_tens_shape)
qtt_tens_timer.stop()

print(f'qtt_uu_edge = {qtt_uu_edge}')
print(f'time to tt.TT(uu_edge_pad) = {qtt_tens_timer.get_time()}')


# form the tt operator
qtt_op_timer.start()
qtt_edge_flux_perp = tt.TT(dense_edge_flux_perp_pad, qtt_op_shape)
qtt_op_timer.stop()

print(f'\nqtt_edge_flux_perp = {qtt_edge_flux_perp}')
print(f'time to tt.TT(dense_edge_flux_perp_pad) = {qtt_op_timer.get_time()}')

qtt_uu_edge = TT with sizes and ranks:
N = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
R = [1, 2, np.int64(2), np.int64(2), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), 1]

Device: cpu, dtype: torch.float64
#entries 46 compression 0.00140380859375

time to tt.TT(uu_edge_pad) = 0.0018210411071777344

qtt_edge_flux_perp = TT-matrix with sizes and ranks:
M = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
N = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
R = [1, 4, np.int64(10), np.int64(22), np.int64(43), np.int64(88), np.int64(178), np.int64(564), np.int64(1532), np.int64(2900), 1024, 256, 64, 16, 4, 1]
Device: cpu, dtype: torch.float64
#entries 34708472 compression 0.03232478350400925

time to tt.TT(dense_edge_flux_perp_pad) = 164.0189368724823


CSR gives us a compression rate on the order of 1e-4.
Two orders of magnitude better than TT.

Let's look at multiplication speeds.
First, at our TT multiply.
Since linear algebra operations on TTs cause the rank of the result to increase, we generally round after each `@` (note the awful compression before rounding).
We will time these two perations seperately, but understand that the total time is what is important.

In [8]:
qtt_mult_timer = Timer()
qtt_round_timer = Timer()

qtt_mult_timer.start()
qtt_mult_result = qtt_edge_flux_perp @ qtt_uu_edge
qtt_mult_timer.stop()

print(f'qtt_mult_result (before round) = {qtt_mult_result}')
print(f'time to @ = {qtt_mult_timer.get_time()}')


qtt_round_timer.start()
qtt_mult_result = qtt_mult_result.round()
qtt_round_timer.stop()

print(f'\ntt_mult_result (after round) = {qtt_mult_result}')
print(f'time to round = {qtt_round_timer.get_time()}')

print(f'\ntotal time = {qtt_mult_timer.get_time() + qtt_round_timer.get_time()}')

qtt_mult_result (before round) = TT with sizes and ranks:
N = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
R = [1, 8, 20, 44, 43, 88, 178, 564, 1532, 2900, 1024, 256, 64, 16, 4, 1]

Device: cpu, dtype: torch.float64
#entries 17357696 compression 529.71484375

time to @ = 0.020586013793945312

tt_mult_result (after round) = TT with sizes and ranks:
N = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
R = [1, 2, 4, 8, 15, 30, 60, 120, 128, 64, 32, 16, 8, 4, 2, 1]

Device: cpu, dtype: torch.float64
#entries 71788 compression 2.1907958984375

time to round = 0.15366911888122559

total time = 0.3485102653503418


This seems pretty slow, let's look at the CSR multiply and the ratio of the two times.
Also, look at the compression for `qtt_mult_result` -- extremely bad even after rounding.

In [9]:
csr_mult_timer = Timer()

csr_mult_timer.start()
csr_mult_result = trsk.edge_flux_perp @ uu_edge
csr_mult_timer.stop()

print(f'csr mult time = {csr_mult_timer.get_time()}')


print(f'qtt mult time / csr mult time = {(qtt_mult_timer.get_time() + qtt_round_timer.get_time()) / csr_mult_timer.get_time()}')

csr mult time = 0.0009119510650634766
qtt mult time / csr mult time = 286.6192156862745


CSR multiply is over 200x faster than the TT multiply + rounding.
We could do these TT operations on a GPU, but getting 200x back will be a challenge, and even then we are comparing apples to oranges and dealing with CPU-GPU communication in the full model.

In summary, it does not seem that QTT is helping us versus "standard" TT.

In [10]:
shape = [2] * (L * 2)
edge_flux_perp_pad_tens = dense_edge_flux_perp_pad.reshape(shape)

permute = tuple( np.arange(2 * L).reshape([2, 15]).transpose().flatten() )
edge_flux_perp_pad_tens = tn.permute(tn.from_numpy(edge_flux_perp_pad_tens), permute)

edge_flux_perp_pad_tens = edge_flux_perp_pad_tens.reshape([4] * L)

flat = edge_flux_perp_pad_tens.flatten().numpy()
flat

array([ 0.        ,  0.21793531, -0.21793362, ...,  0.        ,
        0.        ,  0.        ])

In [12]:
with open('dense_edge_flux_perp.tsv', 'w') as f:
    print('opened file')
    for i, x in enumerate(flat):
        if x != 0:
            f.write(f'{i}\t{x:f}\n')
    # END for
# END with

opened file
