In [1]:
ENV["JULIA_REVISE_POLL"] = "1"
using Revise

In [2]:
using Pkg
Pkg.activate("..")

[32m[1m  Activating[22m[39m environment at `~/.julia/dev/SymArrays/Project.toml`


In [3]:
using SymArrays
using BenchmarkTools
using TensorOperations
using Random
using CUDA
CUDA.allowscalar(false)

In [4]:
# Reference check for the general contract! on a SymArray{(3,)} of size N×N×N:
# contract the vector A with the first index of S and compare against a dense
# @tensor contraction of the same data.
N = 300
A = CUDA.rand(Float64, N)

# S has its underlying data set to 1, 2, …, length(S); B is a plain CuArray built from S
# and serves as the operand of the reference contraction.
S = SymArray{(3,), Float64}(CuArray, N, N, N)
S.data .= 1:length(S)
B = CuArray(collect(collect(S)))

# Output containers: C1 for the general contract!, C2 (a copy of C1) for the
# hand-written variant that is currently disabled below.
C1 = SymArray{(2,), Float64}(CuArray, N, N)
C2 = copy(C1)

# Reference result, followed by the contraction under test.
@tensor C3[j, k] := A[i] * B[i, j, k]
contract!(C1, A, S, Val(1), Val(1))

# The "hand-written" method (which requires a 1D A) is disabled here:
#contract!(C2,A,S,Val(1))
#@assert C1 ≈ C2
@assert collect(C1) ≈ collect(C3)

In [41]:
# Benchmark the dense @tensor contraction against contract! on the arrays currently
# bound to A, B, S, C1 and C3.
# NOTE(review): the globals are not interpolated with `$` in these @btime calls, unlike
# the `$res`, `$A`, `$S` form used in the Ndim scaling loop of this notebook; consider
# interpolating here as well so all timings are measured the same way.
@btime CUDA.@sync @tensor C3[j,k] = A[i]*B[i,j,k]
@btime CUDA.@sync contract!(C1,A,S,Val(1),Val(1))
# Hand-written variant, currently disabled:
# @btime CUDA.@sync contract!(C1,A,S,Val(1));

  952.482 μs (38 allocations: 17.23 KiB)
  1.562 ms (26 allocations: 1.62 KiB)


In [42]:
# Contract a length-N vector A with the first index of a SymArray{(2,1)} of size N×N×M
# and check the result against a dense @tensor contraction.
N, M = 30, 40
A = CUDA.rand(Float64, N)
S = SymArray{(2,1),Float64}(CuArray, N, N, M)
# Set S's underlying data to 1, 2, …, length(S). The `[:]` is kept on purpose: it assigns
# through a flattened view of `S.data`.
S.data[:] .= 1:length(S)
# Plain CuArray built from S; operand of the reference contraction.
B = CuArray(collect(collect(S)))
C1 = SymArray{(1,1),Float64}(CuArray, N, M)
# C2 must be a SymArray of the same type as C1 (as in the other setup cells of this
# notebook) so that it can serve as the output of the hand-written contract! below.
C2 = copy(C1)
@tensor C3[j,k] := A[i]*B[i,j,k]
contract!(C1, A, S, Val(1), Val(1))
# this is the "hand-written" version where A has to be 1D
# contract!(C2,A,S,Val(1))
# @assert C1 ≈ C2
@assert collect(C1) ≈ collect(C3)

In [43]:
# Benchmark the dense @tensor contraction against contract! (first-index contraction)
# on the arrays currently bound to A, B, S, C1 and C3.
# NOTE(review): globals are not interpolated with `$` in these @btime calls, unlike the
# Ndim scaling loop of this notebook; consider interpolating for consistent measurements.
@btime CUDA.@sync @tensor C3[j,k] = A[i]*B[i,j,k]
@btime CUDA.@sync contract!(C1,A,S,Val(1),Val(1))
# Hand-written variant, currently disabled:
# @btime CUDA.@sync contract!(C2,A,S,Val(1));

  31.209 μs (38 allocations: 17.23 KiB)
  25.419 μs (26 allocations: 1.62 KiB)


In [31]:
# Contract over the last index of S (size M) instead of the first one, and cross-check
# three implementations: the general contract!, the hand-written contract!, and @tensor.
N, M = 30, 40
A = CUDA.rand(Float64, M)

# S has its underlying data set to 1, 2, …, length(S); B is a plain CuArray built from S.
S = SymArray{(2, 1), Float64}(CuArray, N, N, M)
S.data[:] .= 1:length(S)
B = CuArray(collect(collect(S)))

# One output container per contract! method.
C1 = SymArray{(2,), Float64}(CuArray, N, N)
C2 = copy(C1)

@tensor C3[i, j] := A[k] * B[i, j, k]
contract!(C1, A, S, Val(1), Val(3))
# "Hand-written" method; A has to be 1D for this one.
contract!(C2, A, S, Val(3))

@assert collect(C1) ≈ collect(C2)
@assert collect(C1) ≈ collect(C3)

In [44]:
# Benchmark the last-index contraction three ways: dense @tensor, general contract!,
# and the hand-written contract!.
# NOTE(review): the recorded run of this cell failed with a DimensionMismatch. The
# execution counters suggest that cell In [42] (which rebinds A to length N = 30) ran
# after In [31] and before this cell, so A presumably no longer matched the size-M = 40
# third index of B and S. Re-run the setup cell directly above before benchmarking.
@btime CUDA.@sync @tensor C3[i,j] = B[i,j,k]*A[k]
@btime CUDA.@sync contract!(C1,A,S,Val(1),Val(3))
# this is the "hand-written" version where A has to be 1D
@btime CUDA.@sync contract!(C2,A,S,Val(3));

LoadError: DimensionMismatch("non-matching sizes in contracted dimensions")

In [47]:
# SymArray{(3,2,1)} with index sizes (N1,N1,N1,N2,N2,N3), contracted with the individual
# indices of an N1×N2×N3 array A.
N1, N2, N3 = 10, 12, 13
A = CUDA.rand(Float64, N1, N2, N3)
S = SymArray{(3, 2, 1), Float64}(CuArray, N1, N1, N1, N2, N2, N3)
rand!(S.data)

# First index of A against the 1st, 2nd and 3rd index of S: the three results are
# required to be equal.
C11 = SymArray{(1, 1, 2, 2, 1), Float64}(CuArray, N2, N3, N1, N1, N2, N2, N3)
C12 = copy(C11)
C13 = copy(C11)
contract!(C11, A, S, Val(1), Val(1))
contract!(C12, A, S, Val(1), Val(2))
contract!(C13, A, S, Val(1), Val(3))
@assert C11 == C12
@assert C11 == C13

# Second index of A against the 4th and 5th index of S: the two results are
# required to be equal.
C24 = SymArray{(1, 1, 3, 1, 1), Float64}(CuArray, N1, N3, N1, N1, N1, N2, N3)
contract!(C24, A, S, Val(2), Val(4))
C25 = SymArray{(1, 1, 3, 1, 1), Float64}(CuArray, N1, N3, N1, N1, N1, N2, N3)
contract!(C25, A, S, Val(2), Val(5))
@assert C24 == C25

# Third index of A against the 6th (last) index of S.
C36 = SymArray{(1, 1, 3, 2), Float64}(CuArray, N1, N2, N1, N1, N1, N2, N2)
contract!(C36, A, S, Val(3), Val(6));

In [48]:
# Validate the (2,4) contraction against a dense @tensor reference computed from a
# plain CuArray built from S.
contract!(C24, A, S, Val(2), Val(4))
B = CuArray(collect(collect(S)))
@tensor C24_AB[i, k, l, m, n, o, p] := A[i, j, k] * B[l, m, n, j, o, p]
@assert collect(C24) ≈ collect(C24_AB)

In [49]:
# Validate the (3,6) contraction against a dense @tensor reference computed from a
# plain CuArray built from S.
contract!(C36, A, S, Val(3), Val(6))
B = CuArray(collect(collect(S)))
@tensor C36_AB[i, j, l, m, n, o, p] := A[i, j, k] * B[l, m, n, o, p, k]
@assert collect(C36) ≈ collect(C36_AB)

In [50]:
# Build the dense reference for the (1,1) contraction, then time contract! against the
# equivalent in-place @tensor contraction. B is whatever plain CuArray is currently
# bound to that name (the preceding check cells build it from S).
@tensor C11_AB[j,k,l,m,n,o,p] := A[i,j,k] * B[i,l,m,n,o,p]
@btime CUDA.@sync contract!(C11,A,S,Val(1),Val(1))
# Correctness check between the two timings: C11 has to match the dense result.
@assert collect(C11) ≈ collect(C11_AB)
@btime CUDA.@sync @tensor C11_AB[j,k,l,m,n,o,p] = A[i,j,k] * B[i,l,m,n,o,p];

  4.671 ms (27 allocations: 1.64 KiB)
  5.532 ms (36 allocations: 17.39 KiB)


In [51]:
# Time the (3,6) contraction: contract! first, then the in-place dense @tensor version,
# and finally check that both outputs agree.
@btime CUDA.@sync contract!(C36,A,S,Val(3),Val(6));
@btime CUDA.@sync @tensor C36_AB[i,j,l,m,n,o,p] = A[i,j,k] * B[l,m,n,o,p,k];
@assert collect(C36) ≈ collect(C36_AB)

  260.905 μs (39 allocations: 17.45 KiB)
  2.239 ms (36 allocations: 17.39 KiB)


In [55]:
"""
    benchtensor(res_B, A, B::AbstractArray{T,Ndim}, ::Val{mm})

Generate and run the `@tensor` statement

    res_B[i, <labels of B without slot mm>] = A[i, j] * B[i1, …, j, …, iNdim]

where the contracted label `j` sits in slot `mm` of `B`. The index labels are built from
the type parameters `Ndim` and `mm`, so one method body is generated per combination.
"""
@generated function benchtensor(res_B, A, B::AbstractArray{T,Ndim}, ::Val{mm}) where {T,Ndim,mm}
    # Labels i1, i2, …, iNdim for B; slot `mm` is then overwritten with the contracted label.
    labels_B = [Symbol(:i, k) for k in 1:Ndim]
    labels_B[mm] = :j
    # Result labels: the free index of A first, then B's remaining labels in order.
    kept = [labels_B[k] for k in 1:Ndim if k != mm]
    labels_res = (:i, kept...)
    return :(@tensor res_B[$(labels_res...)] = A[i, j] * B[$(labels_B...)])
end

# Scaling benchmark: for a SymArray{(Ndim,)} with NN entries per index, compare contract!
# against the dense @tensor contraction generated by benchtensor, for
# Ndim = 2, 4, …, maxNdim.
NN = 4
maxNdim = 12
for Ndim in 2:2:maxNdim
    # Keep the work variables local to the loop body. In a notebook/REPL (soft scope) a
    # plain assignment to `S`, `B` or `A` inside a top-level loop would overwrite the
    # globals of the same name that the earlier cells set up and benchmark, and would
    # leave the arrays of the last (largest) iteration bound globally.
    local S, B, A, res, res_B, mm

    S = SymArray{(Ndim,),Float64}(CuArray, ntuple(_ -> NN, Ndim)...)
    rand!(S.data)
    # Prints Ndim, length(S) and the element count of the full array, prod(size(S)).
    println(Ndim," ",length(S)," ",prod(size(S)))
    # Plain CuArray built from S; operand of the dense reference contraction.
    B = CuArray(collect(collect(S)))

    A = CUDA.rand(Float64, NN, NN)
    res = SymArray{(1,Ndim-1),Float64}(CuArray, size(S)...)
    res_B = CuArray{Float64,Ndim}(undef, size(res))

    # Contract the second index of A with index Ndim÷2 of S (and of B, respectively).
    mm = Val(Ndim÷2)
    @btime CUDA.@sync contract!($res, $A, $S, Val(2), $mm)
    @btime CUDA.@sync benchtensor($res_B, $A, $B, $mm)

    # Both implementations have to agree on the result.
    @assert collect(res) ≈ collect(res_B)
end

2 10 16
  12.041 μs (25 allocations: 1.58 KiB)
  38.986 μs (36 allocations: 17.17 KiB)
4 35 256
  26.390 μs (25 allocations: 1.58 KiB)
  55.958 μs (36 allocations: 17.23 KiB)
6 84 4096
  38.143 μs (25 allocations: 1.58 KiB)
  64.833 μs (36 allocations: 17.33 KiB)
8 165 65536
  61.052 μs (25 allocations: 1.58 KiB)
  114.848 μs (36 allocations: 17.39 KiB)
10 286 1048576
  83.978 μs (25 allocations: 1.58 KiB)
  951.944 μs (36 allocations: 17.48 KiB)
12 455 16777216
  104.330 μs (25 allocations: 1.58 KiB)
  10.487 ms (39 allocations: 17.92 KiB)
