# Fast tensor contraction

## Block-sparse tensor construction

In [1]:
using ITensors: Tensor, BlockSparseTensor, DiagBlockSparseTensor, Block, contract

In [2]:
# N = 3
# 1st dimension => 3 blocks of sizes 2, 2, 3
# 2nd dimension => 2 blocks of sizes 4, 3
# 3rd dimension => 2 blocks of sizes 3, 4
bst_dims = ([2, 2, 3], [4, 3], [3, 4])
# Multi-indices of two non-vanishing blocks
bst_blocks = [(1, 1, 1), (3, 2, 2)]

# Construct a block-sparse tensor with zero-initialized memory
bst = BlockSparseTensor(ComplexF64, bst_blocks, bst_dims)

Dim 1: [2, 2, 3]
Dim 2: [4, 3]
Dim 3: [3, 4]
NDTensors.BlockSparse{ComplexF64, Vector{ComplexF64}, 3}
 7×7×7
Block(1, 1, 1)
 [1:2, 1:4, 1:3]
[:, :, 1] =
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im

[:, :, 2] =
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im

[:, :, 3] =
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im

Block(3, 2, 2)
 [5:7, 5:7, 4:7]
[:, :, 1] =
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im

[:, :, 2] =
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im

[:, :, 3] =
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im

[:, :, 4] =
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.0im
 0.0 + 0.0im  0.0 + 0.0im  0.0 + 0.

## Access to individual blocks

In [3]:
bst[Block(1, 1, 1)] = 2.0;
bst[Block(3, 2, 2)] = 3.0;

@show bst

bst = Dim 1: [2, 2, 3]
Dim 2: [4, 3]
Dim 3: [3, 4]
NDTensors.BlockSparse{ComplexF64, Vector{ComplexF64}, 3}
 7×7×7
Block(1, 1, 1)
 [1:2, 1:4, 1:3]
[:, :, 1] =
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im

[:, :, 2] =
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im

[:, :, 3] =
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im

Block(3, 2, 2)
 [5:7, 5:7, 4:7]
[:, :, 1] =
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im

[:, :, 2] =
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im

[:, :, 3] =
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im

[:, :, 4] =
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.

Dim 1: [2, 2, 3]
Dim 2: [4, 3]
Dim 3: [3, 4]
NDTensors.BlockSparse{ComplexF64, Vector{ComplexF64}, 3}
 7×7×7
Block(1, 1, 1)
 [1:2, 1:4, 1:3]
[:, :, 1] =
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im

[:, :, 2] =
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im

[:, :, 3] =
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im
 2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im  2.0 + 0.0im

Block(3, 2, 2)
 [5:7, 5:7, 4:7]
[:, :, 1] =
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im

[:, :, 2] =
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im

[:, :, 3] =
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im

[:, :, 4] =
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.0im
 3.0 + 0.0im  3.0 + 0.0im  3.0 + 0.

## Test simple contractions

In [4]:
A = BlockSparseTensor(ComplexF64, [(1, 1), (2, 2)], ([2, 2], [2, 2]));
A11 = [1 2; 3 4];
A22 = [5 6; 7 8];
A[Block(1, 1)] = A11
A[Block(2, 2)] = A22

B = BlockSparseTensor(ComplexF64, [(1, 1), (2, 2)], ([2, 2], [2, 2]));
B11 = [9 10; 11 12];
B22 = [13 14; 15 16];
B[Block(1, 1)] = B11;
B[Block(2, 2)] = B22;

Outer product

In [5]:
res = contract(A, (1, 2), B, (3, 4), (1, 2, 3, 4))
@assert res[Block(1, 1, 1, 1)] == [A11[i, j] * B11[k, l] for i=1:2, j=1:2, k=1:2, l=1:2]
@assert res[Block(1, 1, 2, 2)] == [A11[i, j] * B22[k, l] for i=1:2, j=1:2, k=1:2, l=1:2]
@assert res[Block(2, 2, 1, 1)] == [A22[i, j] * B11[k, l] for i=1:2, j=1:2, k=1:2, l=1:2]
@assert res[Block(2, 2, 2, 2)] == [A22[i, j] * B22[k, l] for i=1:2, j=1:2, k=1:2, l=1:2]

Matrix product

In [6]:
res = contract(A, (1, 2), B, (2, 3), (1, 3))
@assert res[Block(1, 1)] == A11 * B11
@assert res[Block(2, 2)] == A22 * B22

Trace of matrix product

In [7]:
using LinearAlgebra: tr

res = contract(A, (1, 2), B, (2, 1), ())
@assert res[] == tr(A11 * B11) + tr(A22 * B22)

## Contraction FLOPS cost and peak memory consumption

In [8]:
using ITensors.NDTensors: inds,
                          nzblocks,
                          blockdims,
                          contract_inds,
                          contract_labels,
                          contract_blocks,
                          are_blocks_contracted,
                          ValLength

In [9]:
"""
FLOPS cost of a single tensor pair contraction.
"""
function flops_cost(blocks1, inds1, labels1, blocks2, inds2, labels2, labelsR)
    labels1_to_labels2, labels1_to_labelsR, labels2_to_labelsR = contract_labels(labels1, labels2, labelsR)
    cost = 0
    for block1 in blocks1
        for block2 in blocks2
            if are_blocks_contracted(block1, block2, labels1_to_labels2)
                dims1 = blockdims(inds1, block1)
                dims2 = blockdims(inds2, block2)
                cost += prod(dims1) * prod(dims2[a2] for (a2, aR) in enumerate(labels2_to_labelsR) if aR != 0)
            end
        end
    end
    return cost
end

"""
Cost of a single tensor pair contraction.
"""
function flops_cost(T1::BlockSparseTensor, labels1, T2::BlockSparseTensor, labels2, labelsR)
    return flops_cost(nzblocks(T1), inds(T1), labels1,
                      nzblocks(T2), inds(T2), labels2,
                      labelsR
    )
end;

In [10]:
"""Memory occupied by a tensor"""
mem(blocks, inds) = sum(prod(blockdims(inds, b)) for b in blocks)

"""
Memory required to perform a single tensor pair contraction.
"""
function peak_memory(blocks1, inds1, labels1, blocks2, inds2, labels2, labelsR)
    labels1_to_labels2, labels1_to_labelsR, labels2_to_labelsR = contract_labels(labels1, labels2, labelsR)
    blocksR = []
    for block1 in blocks1
        for block2 in blocks2
            if are_blocks_contracted(block1, block2, labels1_to_labels2)
                push!(blocksR, contract_blocks(block1, labels1_to_labelsR, block2, labels2_to_labelsR, ValLength(labelsR)))
            end
        end
    end
    indsR = contract_inds(inds1, labels1, inds2, labels2, labelsR)
    return mem(blocks1, inds1) + mem(blocks2, inds2) + mem(blocksR, indsR)
end

"""
Memory required to perform a single tensor pair contraction.
"""
function peak_memory(T1::BlockSparseTensor, labels1, T2::BlockSparseTensor, labels2, labelsR)
    return peak_memory(nzblocks(T1), inds(T1), labels1,
                       nzblocks(T2), inds(T2), labels2,
                       labelsR
    )
end;

Test `flops_cost()` and `peak_memory()`

In [11]:
T1 = BlockSparseTensor(ComplexF64, [(1, 1, 2, 3, 1),
                                    (2, 2, 3, 2, 1),
                                    (3, 2, 1, 2, 1)],
                                    ([2, 3, 4], [3, 2, 3], [4, 5, 2], [5, 1, 3], [6, 2, 7]));
T2 = BlockSparseTensor(ComplexF64, [(1, 1, 1, 2, 2),
                                    (2, 3, 1, 2, 1),
                                    (1, 2, 3, 1, 1)],
                                    ([10, 2, 3], [5, 1, 3], [2, 3, 4], [7, 4, 1], [8, 2, 2]));

# Contraction over 2 labels, '1' and '4'
labels1 = (1, 2, 3, 4, 5)
labels2 = (6, 4, 1, 9, 10)
labelsR = (2, 3, 5, 6, 10, 9)

R = contract(T1, labels1, T2, labels2, labelsR)

cost = flops_cost(T1, labels1, T2, labels2, labelsR)
memory = peak_memory(T1, labels1, T2, labels2, labelsR)

@assert cost == (2*3*5*3*6)*(2*4*8) + # Block(1, 1, 2, 3, 1) ∘ Block(2, 3, 1, 2, 1)
                (4*2*4*1*6)*(10*7*8)  # Block(3, 2, 1, 2, 1) ∘ Block(1, 2, 3, 1, 1)
@assert memory == length(T1.storage.data) + length(T2.storage.data) + length(R.storage.data)

Binary tree of pairwise contractions

In [12]:
using AbstractTrees

mutable struct ContractionTree
    "Left tensor in pair contraction"
    left::Union{ContractionTree, Nothing}
    "Right tensor in pair contraction"
    right::Union{ContractionTree, Nothing}
    "Position of the tensor in the tensor network input list (valid only for leaves)"
    pos::Union{Int, Nothing}
    "List of non-zero blocks of the tensor"
    blocks::Vector{Block}
    "Block sizes of the tensor"
    inds::NTuple
    "Labels carried by the tensor"
    labels::NTuple
end

isleaf(tree::ContractionTree) = (tree.left === nothing) && (tree.right === nothing);
AbstractTrees.children(tree::ContractionTree) = isleaf(tree) ? [] : [tree.left, tree.right];
AbstractTrees.printnode(io::IO, tree::ContractionTree) = print(io, tree.labels);

"Make a leaf of a contraction tree"
ContractionTree(T, pos, labels) = ContractionTree(nothing, nothing, pos, nzblocks(T), inds(T), labels);

Test the contraction tree

In [13]:
let t1 = ContractionTree(nothing, nothing, 1, [], (5, 5, 5), (1, 2, 3)),
    t2 = ContractionTree(nothing, nothing, 2, [], (5, 5, 5), (3, 2, 4)),
    t12 = ContractionTree(t1, t2, nothing, [], (5, 5), (1, 4))
    print_tree(t1)
    print_tree(t2)
    print_tree(t12)
end

(1, 2, 3)
(3, 2, 4)
(1, 4)
├─ (1, 2, 3)
└─ (3, 2, 4)


Cost of a tensor network contraction

In [14]:
"""
FLOPS cost of a tensor network contraction.
"""
function flops_cost(tree::ContractionTree)
    isleaf(tree) && return 0

    cost_left = flops_cost(tree.left)
    cost_right = flops_cost(tree.right)
    cost = flops_cost(tree.left.blocks, tree.left.inds, tree.left.labels,
                      tree.right.blocks, tree.right.inds, tree.right.labels,
                      tree.labels)
    
    return cost_left + cost_right + cost
end;

In [15]:
"""
Peak memory required to store intermediate results of a tensor network contraction.
"""
function peak_memory(tree::ContractionTree)
    isleaf(tree) && return 0

    memory_left = peak_memory(tree.left)
    memory_right = peak_memory(tree.right)
    memory = peak_memory(tree.left.blocks, tree.left.inds, tree.left.labels,
                         tree.right.blocks, tree.right.inds, tree.right.labels,
                         tree.labels)

    return max(memory_left, memory_right, memory)
end;

## Construct a tensor network describing a given $\Sigma$ topology

`Topology` type from QInchworm

In [16]:
using Combinatorics: levicivita

const PairVector = Vector{Pair{Int,Int}}

struct Topology
    "Topology order ``n``"
    order::Int
    "List of pairs ``\\{(\\pi(1), \\pi(2)), ..., (\\pi(2n-1), \\pi(2n))\\}``"
    pairs::PairVector
    "Parity of the permutation ``\\pi``"
    parity::Int

    function Topology(pairs::PairVector, parity::Int)
        return new(length(pairs), pairs, parity)
    end
end

function Topology(pairs::PairVector)
    p = levicivita(collect(Iterators.flatten(pairs)))
    return Topology(pairs, p)
end;

Construct a tensor network describing a PP self-energy diagram

In [73]:
function make_Σ_tensor_network(top::Topology, P::Vector, O, pair_ints::Vector)
    n = top.order
    @assert length(P) == 2n - 1
    @assert length(pair_ints) == n
    
    # First, define the PP backbone
    # Labels of PP-indices are assigned in the contour order
    # Labels of interaction indices are assigned in the contour order of the respective operators O
    network = []
    int_label = 1
    for pos in 1:(4n - 1)
        if isodd(pos)
            # Add an interaction vertex
            labels = (4n + int_label, pos + 1, pos)
            push!(network, O => labels)
            int_label += 1
        else
            # Add a PP propagator
            labels = (pos + 1, pos)
            push!(network, P[div(pos, 2)] => labels)
        end
    end

    # Translate positions of interaction arc ends and sort the arcs
    # according to the order their heads appear in a configuration
    arcs = sort([(2n + 1 - p[2], 2n + 1 - p[1]) for p in top.pairs], lt=(p1, p2) -> p1[1] < p2[1])
    
    # Add the interaction lines
    for (pair_int, arc) in zip(pair_ints, arcs)
        labels = (4n + arc[2], 4n + arc[1])
        push!(network, pair_int => labels)
    end
    
    return network
end;

Construct a similar tensor network without PP propagators sandwiched between vertices

In [98]:
function make_Σ_tensor_network_wo_P(top::Topology, O, pair_ints::Vector)
    n = top.order
    @assert length(P) == 2n - 1
    @assert length(pair_ints) == n
    
    # First, define the PP backbone
    # Labels of PP-indices are assigned in the contour order
    # Labels of interaction indices are assigned in the contour order of the respective operators O
    network = []
    #int_label = 1
    for pos in 1:2n
        labels = (2n + 1 + pos, pos + 1, pos)
        push!(network, O => labels)
    end

    # Translate positions of interaction arc ends and sort the arcs
    # according to the order their heads appear in a configuration
    arcs = sort([(2n + 1 - p[2], 2n + 1 - p[1]) for p in top.pairs], lt=(p1, p2) -> p1[1] < p2[1])
    
    # Add the interaction lines
    for (pair_int, arc) in zip(pair_ints, arcs)
        labels = (2n + 1 + arc[2], 2n + 1 + arc[1])
        push!(network, pair_int => labels)
    end
    
    return network
end;

## Contraction of a chain tensor network ($n = 3$)

In [99]:
# Propagators: 2 dimension, 3 square blocks per each dimension
P_block_dims = ([2, 3, 4], [2, 3, 4])
P_dims = sum.(P_block_dims)
# The only non-zero blocks are diagonal
P_blocks = [(i, i) for i in 1:length(first(P_block_dims))]

P = [BlockSparseTensor(ComplexF64, P_blocks, P_block_dims) for i in 1:5];

Test contraction of propagators alone

In [100]:
using Random: MersenneTwister, rand
rng = MersenneTwister(12345678)

for i in 1:5
    for b in 1:length(first(P_block_dims))
        P[i][Block(b, b)] = rand(rng, P_block_dims[1][b], P_block_dims[2][b])
    end
end

# Labels are attached to propagators as follows:
# P5_{6,5} P4_{5,4} P3_{4,3} P2_{3,2} P1_{2,1}

# Test contraction order: (P5 * (P4 * P3)) * (P2 * P1)
R43 = contract(P[4], (5, 4), P[3], (4, 3), (5, 3));
R543 = contract(P[5], (6, 5), R43, (5, 3), (6, 3));
R21 = contract(P[2], (3, 2), P[1], (2, 1), (3, 1));
R = contract(R543, (6, 3), R21, (3, 1), (6, 1))

for b in 1:length(first(P_block_dims))
    bl = Block(b, b)
    R_mat = convert(Matrix{ComplexF64}, R[bl])
    R_mat_ref = convert(Matrix{ComplexF64}, P[5][bl]) *
                convert(Matrix{ComplexF64}, P[4][bl]) *
                convert(Matrix{ComplexF64}, P[3][bl]) *
                convert(Matrix{ComplexF64}, P[2][bl]) *
                convert(Matrix{ComplexF64}, P[1][bl])
    @assert isapprox(R_mat, R_mat_ref)
end

In [136]:
# Interaction lines
N_int = 6 # Number of pair interactions
#Δ = [BlockSparseTensor(ComplexF64, [(n, n) for n in 1:N_int], (ones(Int, N_int), ones(Int, N_int))) for i=1:3];
Δ = [DiagBlockSparseTensor(ComplexF64(i), [Block(1, 1)], ([N_int], [N_int])) for i=1:3];

In [137]:
# Interaction vertices: 3 dimensions
# There is always only one block along the 1st dimenstion (interaction index)
O_block_dims = ([N_int], P_block_dims...)
O_blocks = sort([(1, 1, 2),
                 (1, 2, 1),
                 (1, 2, 3),
                 (1, 3, 2),
                 (1, 1, 3),
                 (1, 3, 1)])

O = BlockSparseTensor(ComplexF64, O_blocks, O_block_dims);

Define tensor network corresponding to the $n=3$ chain diagram

![Chain diagram n=3](chain_n3.jpg)

Test `make_Σ_tensor_network()` and `make_Σ_tensor_network_wo_P()`

In [138]:
top = Topology([1 => 3, 2 => 5, 4 => 6])

tensor_network = make_Σ_tensor_network(top, P, O, Δ);
tensor_network_ref = Any[
    # PP-backbone
    O    => (13, 2, 1),
    P[1] => (3, 2),
    O    => (14, 4, 3),
    P[2] => (5, 4),
    O    => (15, 6, 5),
    P[3] => (7, 6),
    O    => (16, 8, 7),
    P[4] => (9, 8),
    O    => (17, 10, 9),
    P[5] => (11, 10),
    O    => (18, 12, 11),
    # Interaction lines
    Δ[1] => (15, 13),
    Δ[2] => (17, 14),
    Δ[3] => (18, 16)
];
@assert tensor_network == tensor_network_ref

tensor_network_wo_P = make_Σ_tensor_network_wo_P(top, O, Δ);
tensor_network_wo_P_ref = Any[
    # PP-backbone
    O    => (8, 2, 1),
    O    => (9, 3, 2),
    O    => (10, 4, 3),
    O    => (11, 5, 4),
    O    => (12, 6, 5),
    O    => (13, 7, 6),
    # Interaction lines
    Δ[1] => (10, 8),
    Δ[2] => (12, 9),
    Δ[3] => (13, 11)
];
@assert tensor_network_wo_P == tensor_network_wo_P_ref

## Study various contraction orders

In [139]:
function contraction_tree_orig_order(network)::ContractionTree
    n = div(length(network) + 1, 5)
    T1, labels1 = first(network)
    pos = 1
    tree = ContractionTree(T1, pos, labels1)
    for (T, labels) in network[2:5n-1]
        pos += 1
        nextleaf = ContractionTree(T, pos, labels)

        labelsR = Tuple(symdiff(nextleaf.labels, tree.labels))
     
        labels1_to_labels2, labels1_to_labelsR, labels2_to_labelsR = contract_labels(nextleaf.labels, tree.labels, labelsR)
        blocksR = [contract_blocks(b1, labels1_to_labelsR, b2, labels2_to_labelsR, ValLength(labelsR)) for b1 in nextleaf.blocks for b2 in tree.blocks]
        indsR = contract_inds(nextleaf.inds, nextleaf.labels, tree.inds, tree.labels, labelsR)
        
        tree = ContractionTree(nextleaf, tree, 0, blocksR, indsR, labelsR)
    end
    return tree
end;

In [148]:
Δ = [BlockSparseTensor(ComplexF64, [(n, n) for n in 1:N_int], (ones(Int, N_int), ones(Int, N_int))) for i=1:3];
tensor_network_wo_P = make_Σ_tensor_network_wo_P(top, O, Δ);

tree = contraction_tree_orig_order(tensor_network_wo_P)
print_tree(tree, maxdepth=20)
println("FLOPs cost: ", Float64(flops_cost(tree)))
println("Peak memory: ", Float64(peak_memory(tree)))

(7, 1)
├─ (13, 11)
└─ (13, 7, 11, 1)
   ├─ (12, 9)
   └─ (13, 7, 12, 11, 9, 1)
      ├─ (10, 8)
      └─ (13, 7, 12, 11, 10, 9, 8, 1)
         ├─ (13, 7, 6)
         └─ (12, 6, 11, 10, 9, 8, 1)
            ├─ (12, 6, 5)
            └─ (11, 5, 10, 9, 8, 1)
               ├─ (11, 5, 4)
               └─ (10, 4, 9, 8, 1)
                  ├─ (10, 4, 3)
                  └─ (9, 3, 8, 1)
                     ├─ (9, 3, 2)
                     └─ (8, 2, 1)
FLOPs cost: 2.0054470536e10
Peak memory: 2.0135236614e10


In [149]:
Δ = [DiagBlockSparseTensor(ComplexF64(i), [Block(1, 1)], ([N_int], [N_int])) for i=1:3];
tensor_network_wo_P = make_Σ_tensor_network_wo_P(top, O, Δ);

tree = contraction_tree_orig_order(tensor_network_wo_P)
print_tree(tree, maxdepth=20)
println("FLOPs cost: ", Float64(flops_cost(tree)))
println("Peak memory: ", Float64(peak_memory(tree)))

(7, 1)
├─ (13, 11)
└─ (13, 7, 11, 1)
   ├─ (12, 9)
   └─ (13, 7, 12, 11, 9, 1)
      ├─ (10, 8)
      └─ (13, 7, 12, 11, 10, 9, 8, 1)
         ├─ (13, 7, 6)
         └─ (12, 6, 11, 10, 9, 8, 1)
            ├─ (12, 6, 5)
            └─ (11, 5, 10, 9, 8, 1)
               ├─ (11, 5, 4)
               └─ (10, 4, 9, 8, 1)
                  ├─ (10, 4, 3)
                  └─ (9, 3, 8, 1)
                     ├─ (9, 3, 2)
                     └─ (8, 2, 1)
FLOPs cost: 3.9554812296e10
Peak memory: 2.0135236644e10
