In [1]:
import os
import time
import numpy as np
import netCDF4 as nc
import argparse

from stb import strtobool

from msh import load_mesh, load_flow, \
                sort_mesh, sort_flow
from ops import trsk_mats

from _dx import HH_TINY, UU_TINY
from _dx import invariant, diag_vars, tcpu
from _dt import step_eqns

#import torch
import scorch as torch
import torchtt as tt

from sympy.ntheory import factorint
import math
import pickle

from temp_init import init_file
from timer import Timer

In [2]:
class base:
    """Bare attribute holder; settings are attached to an instance below."""


# all run settings, attached to a fresh namespace object in one pass
# (keyword order below fixes the attribute order on the object)
cnfg = base()
vars(cnfg).update(
    # save / stat frequencies
    save_freq=100,
    stat_freq=100,
    # integrator, operators, equation set and scheme selectors
    integrate='RK44',
    operators='TRSK-CV',
    equations='SHALLOW-WATER',
    ke_upwind='AUST-CONST',
    ke_scheme='CENTRE',
    pv_upwind='AUST-ADAPT',
    pv_scheme='UPWIND',
    # damping coefficients, both zero
    du_damp_4=0.0,
    vu_damp_4=0.0,
    # iteration count and rotation switch
    iteration=0,
    no_rotate=False,
)

# testing with simple, quasi-linear
# test case on low resolution mesh
name = 'io/ltc1_cvt_5.nc'
path = os.path.dirname(name)
file = os.path.basename(name)
# output file sits next to the input, file name prefixed with "out_"
save = os.path.join(path, "out_" + file)

In [3]:
# Setup cell: read the mesh and initial flow from `name`, create the output
# file, reorder mesh/flow, pull out the starting state vectors and build the
# sparse operators plus the remapped ff_* fields.
# Statement order matters here: each step consumes the result of the
# previous one.
print("Loading input assets...")

# load mesh + init. conditions
mesh = load_mesh(name)
flow = load_flow(name, None, lean=True)


print("Creating output file...")

# NOTE: called with mesh/flow exactly as loaded, i.e. before the
# reordering performed below.
init_file(name, cnfg, save, mesh, flow)


print("Reordering mesh data...")

# flow is re-sorted against the already re-sorted mesh
mesh = sort_mesh(mesh, True)
flow = sort_flow(flow, mesh, lean=True)

# last entry along axis 0, index 0 along axis 2
# (presumably last stored time level / first layer -- TODO confirm)
u0_edge = flow.uu_edge[-1, :, 0]
# NOTE: plain alias of u0_edge, not a copy
uu_edge = u0_edge
# multiply by 0.0 to get a zero-valued companion of u0_edge
ut_edge = u0_edge * 0.0

h0_cell = flow.hh_cell[-1, :, 0]
hh_cell = h0_cell
ht_cell = h0_cell * 0.0

# floor hh_cell at HH_TINY (rebinds hh_cell to a new array;
# h0_cell keeps the unclipped values)
hh_cell = np.maximum(HH_TINY, hh_cell)


print("Forming coefficients...")

# set sparse spatial operators
trsk = trsk_mats(mesh)

# remap fe,fc is more accurate?
# ff_vert is pushed through trsk.edge_stub_sums and then divided by
# mesh.edge.area (presumably an area-weighted average -- TODO confirm)
flow.ff_edge = trsk.edge_stub_sums * flow.ff_vert
flow.ff_edge = \
    (flow.ff_edge / mesh.edge.area)

# same pattern for cells, using trsk.cell_kite_sums and mesh.cell.area
flow.ff_cell = trsk.cell_kite_sums * flow.ff_vert
flow.ff_cell = \
    (flow.ff_cell / mesh.cell.area)

# multiply by a bool: values are unchanged when cnfg.no_rotate is False
# and zeroed when it is True
flow.ff_cell *= (not cnfg.no_rotate)
flow.ff_edge *= (not cnfg.no_rotate)
flow.ff_vert *= (not cnfg.no_rotate)

# zero-filled 1-D arrays with one slot per stat_freq iterations, plus one
# (a single slot while cnfg.iteration == 0)
kp_sums = np.zeros((
    cnfg.iteration // cnfg.stat_freq + 1), dtype=float)
en_sums = np.zeros((
    cnfg.iteration // cnfg.stat_freq + 1), dtype=float)


print('Done.')

Loading input assets...
Creating output file...
Reordering mesh data...
Forming coefficients...
Done.


In [4]:
# Load the pre-built operator cores from disk; they are contracted against
# the vector cores, one index at a time, in the mat-vec cell below.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only open 'stt.pkl' if it was produced locally or comes from a trusted
# source.
with open('stt.pkl', 'rb') as pkl:
    sqtt_mat_cores = pickle.load(pkl)
# END with

In [5]:
# Test vector: all ones on every edge. This rebinds uu_edge, replacing the
# flow-derived values assigned earlier in the notebook.
#uu_edge = np.random.rand(mesh.edge.size)
uu_edge = np.ones(mesh.edge.size)
print(f'nedges = {mesh.edge.size}')

# Zero-pad at the end up to the next power of two, so the vector can be
# folded into a [2, 2, ..., 2] tensor. The target length is derived from
# the mesh instead of being fixed at 2**15: a fixed target gives a negative
# pad width (and np.pad raises) as soon as the mesh has more than 32768
# edges. For the current mesh the result is unchanged (32768).
# NOTE(review): the operator cores loaded from 'stt.pkl' presumably have
# to be built for the same padded length -- confirm when changing meshes.
npad = (2 ** max(int(mesh.edge.size) - 1, 0).bit_length()) - mesh.edge.size
print(f'npad = {npad}')

uu_edge_pad = np.pad(uu_edge, [(0, npad)])
print(f'uu_edge_pad.shape = {uu_edge_pad.shape}')

nedges = 30720
npad = 2048
uu_edge_pad.shape = (32768,)


In [6]:
# shapes for QTT representations
L = int(math.log(uu_edge_pad.shape[0], 2))
qtt_tens_shape = [2] * L
qtt_op_shape = [(2, 2)] * L


print(f'qtt_tens_shape = {qtt_tens_shape}')
print(f'qtt_op_shape = {qtt_op_shape}')

qtt_uu_edge = tt.TT(uu_edge_pad, qtt_tens_shape)
qtt_vec_cores = qtt_uu_edge.cores

print(qtt_uu_edge)

qtt_tens_shape = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
qtt_op_shape = [(2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2)]
TT with sizes and ranks:
N = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
R = [1, 2, np.int64(2), np.int64(2), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), 1]

Device: cpu, dtype: torch.float64
#entries 46 compression 0.00140380859375



In [38]:
def matvec_core(mat_core, vec_core):
    """
    Contract one operator core with one vector core and merge the ranks.

    mat_core is indexed as 'ijkl' and vec_core as 'mkp'. The shared
    index k is summed out; the product, ordered 'imjlp', is then
    flattened to [i*m, j, l*p].

    The einsum and the reshape are each bracketed by start()/stop() on
    the module-level mult_timer, which must exist before the first call.

    Returns the reshaped product core.
    """
    # both operands are cast to torch.float first
    # (why this is required is not understood)
    mat_f = mat_core.type(torch.float)
    vec_f = vec_core.type(torch.float)

    # the vector core is handed to einsum in sparse form
    # (why this is required is not understood either)
    vec_sp = vec_f.to_sparse()

    mult_timer.start()
    contracted = torch.einsum('ijkl,mkp->imjlp', mat_f, vec_sp)
    mult_timer.stop()

    # scorch's STensor doesn't like to be reshaped directly
    dense = contracted.to_torch()

    mult_timer.start()
    # fuse (i, m) into the leading rank and (l, p) into the trailing rank
    merged_shape = [
        mat_f.shape[0] * vec_sp.shape[0],
        mat_f.shape[1],
        mat_f.shape[3] * vec_sp.shape[2],
    ]
    merged = dense.reshape(merged_shape)
    mult_timer.stop()

    return merged
# END matvec_core()

# module-level stopwatch; matvec_core() starts and stops it internally
mult_timer = Timer()

# multiply operator and vector cores index by index, for all L positions
result_cores = []
for i in range(L):
    result_core = matvec_core(sqtt_mat_cores[i], qtt_vec_cores[i])
    result_cores += [result_core]
# END for

print('Run this cell twice for timer results --',
      'Scorch needs to compile a custom kernel for the first einsum.',
      sep='\n')

# NOTE(review): the recorded total (~3e-06 s) looks very small for L
# einsum + reshape pairs -- confirm that Timer accumulates over repeated
# start()/stop() calls rather than keeping only the last interval.
print(f'\ntime to mult and reshape all cores = {mult_timer.get_time_reset()}')

Run this cell twice for timer results --
Scorch needs to compile a custom kernel for the first einsum.

time to mult and reshape all cores = 3.0994415283203125e-06


In [39]:
# assemble the product cores into a tensor train and show it un-rounded
result_tt = tt.TT(result_cores)
print(f'result_tt = {result_tt}')

# time only the rounding step; in the recorded output the rounded train
# has all ranks equal to 1
round_timer = Timer()
round_timer.start()
result_tt_round = result_tt.round()
round_timer.stop()

print(f'result_tt_round = {result_tt_round}',
      f'time to round = {round_timer.get_time_reset()}',
      sep='\n')

result_tt = TT with sizes and ranks:
N = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
R = [1, 8, 20, 44, 43, 88, 178, 564, 16331, 4096, 1024, 256, 64, 16, 4, 1]

Device: cpu, dtype: torch.float32
#entries 161398328 compression 4925.486083984375

result_tt_round = TT with sizes and ranks:
N = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
R = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Device: cpu, dtype: torch.float32
#entries 30 compression 0.00091552734375

time to round = 0.18793869018554688
