In [1]:
println(Threads.nthreads())

32


In [6]:
using BenchmarkTools
n = 10000

# A = rand(Float32, n, n) #T
# A_T = rand(Float32, n, n)' #T'
# B = rand(Float32, n, n) #n
# B_T = rand(Float32, n, n)'
# a = rand(Float32, n) #R
# b = rand(Float32, n) #W;
# c = rand(Float32, n) #S_F, L_F
# d = rand(Float32, n) #m
# dR_1 = zeros(Float32, n)
# dR_2 = zeros(Float32, n)
# dW = zeros(Float32, n)
# S_bar = 1.0;

A = rand(Float64, n, n) #T
A_T = rand(Float64, n, n)' #T'
B = rand(Float64, n, n) #n
B_T = rand(Float64, n, n)'
a = rand(Float64, n) #R
b = rand(Float64, n) #W;
c = rand(Float64, n) #S_F, L_F
d = rand(Float64, n) #m
dR_1 = zeros(Float64, n)
dR_2 = zeros(Float64, n)
dW = zeros(Float64, n)
S_bar = 1.0;

In [13]:
@time dR_1 .= S_bar .- b .* sum(A .* (a' .* B), dims=2) .- c .* d;

  0.217787 seconds (15 allocations: 763.016 MiB, 16.10% gc time)


In [8]:
#列を参照
@time begin
for i in 1:n
    Ln = 0
    for j in 1:n
        Ln += A_T[i, j] * a[j] * B_T[i, j]
    end
    dW[i] = S_bar - b[i] * Ln - c[i] * d[i]
end
end

 17.192987 seconds (789.89 M allocations: 13.261 GiB, 1.84% gc time)


In [15]:
#列を参照
@time begin
for i in 1:n
    Ln = sum(@views A_T[i, :] .* a .* B_T[i, :])
    dW[i] = S_bar - b[i] * Ln - c[i] * d[i]
end
end

  0.333425 seconds (226.94 k allocations: 768.538 MiB, 17.17% gc time)


In [4]:
#行を参照
@time begin
for j in 1:n
    Ln = 0
    for i in 1:n
        Ln += A[i, j] * a[i] * B[i, j]
    end
    dW[j] = S_bar - b[j] * Ln - c[j] * d[j]
end
end

 17.193576 seconds (789.89 M allocations: 13.261 GiB, 1.56% gc time, 0.02% compilation time)


In [5]:
#行を参照
@time begin
for j in 1:n
    Ln = sum(@views A[:, j] .* a .* B[:, j])
    dW[j] = S_bar - b[j] * Ln - c[j] * d[j]
end
end

  0.463563 seconds (457.35 k allocations: 784.252 MiB, 12.59% gc time, 27.49% compilation time)


In [146]:
for i in 1:n
    Sn = sum(A[i, :] .* b .* B[i, :])  # ベクトル演算
    dR_2[i] = S_bar - a[i] * Sn - c[i] * d[i]
end

In [147]:
maximum(dR_1 - dR_2)

2.4158453015843406e-13

In [148]:
minimum(dR_1 - dR_2)

-1.8474111129762605e-13

In [5]:
A .= [1, 2, 3]

LoadError: DimensionMismatch: array could not be broadcast to match destination

In [2]:
n = 10000

A = collect(1:n)
B = collect(1:n)

C = zeros(n, n);

In [3]:
using Base.Threads

@time Threads.@threads for i in 1:n
    for j in 1:n
        C[i, j] = A[i] * B[j]
    end
end

  4.577352 seconds (769.38 M allocations: 12.957 GiB, 9.33% gc time, 19.71% compilation time)


In [4]:
@time for i in 1:n
    for j in 1:n
        C[i, j] = A[i] * B[j]
    end
end

 27.639449 seconds (579.60 M allocations: 10.127 GiB, 8.59% gc time)


In [5]:
# 外積を用いた行列計算
@time C .= A * B';

  0.419369 seconds (406.52 k allocations: 790.436 MiB, 9.05% gc time, 42.73% compilation time)


In [119]:
using Tullio
using ThreadsX
using LoopVectorization
using Base.Threads
using BenchmarkTools
using CUDA

function matri(A, B)
    
# A = cu(A)
# B = cu(B)

C = A .* B

return C

end;

In [120]:
n = 10000

A = rand(n, n)
B = rand(n, n);

In [124]:
@time matri(A, B);

  0.172829 seconds (2 allocations: 762.939 MiB, 0.51% gc time)


In [122]:
@btime matri(A, B);

  140.111 ms (2 allocations: 762.94 MiB)


In [107]:
function kernel_cal_matri(C, A, B)
    
    row = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    col = threadIdx().y + (blockIdx().y - 1) * blockDim().y
    
    # 行列サイズを取得
    n_rows, n_cols = size(C)

    # 有効な範囲内かチェック
    if row <= n_rows && col <= n_cols
        C[row, col] = A[row, col] * B[row, col]
    end
    
    return nothing
    
end

function cal_matri(A, B)
    
    n_rows, n_cols = size(A)
    
    # 非同期でデータを転送
    A_cu_task = @async cu(A)
    B_cu_task = @async cu(B)

    # # 非同期転送の完了を待ってデータを取得
    # A_cu = fetch(A_cu_task)
    # B_cu = fetch(B_cu_task)
    
    # A_cu = cu(Float32.(A))
    # B_cu = cu(Float32.(B))
    
    #非同期で計算を実行
    C = CUDA.zeros(Float32, n_rows, n_cols)
    
    threads = (4, 4)  # 1ブロック内のスレッド数 (16×16=256スレッド)
    blocks = (
        ceil(Int, n_rows / threads[1]), 
        ceil(Int, n_cols / threads[2])
    )
    
    @cuda threads=threads blocks=blocks kernel_cal_matri(C, fetch(A_cu), fetch(B_cu))
    
    #計算完了を待つ
    CUDA.synchronize()
    
    return Array(C)
end;

In [109]:
@time cal_matri(A, B);

  0.784098 seconds (94 allocations: 1.118 GiB, 16.87% gc time)


In [40]:
@time begin
A_cu = cu(A)
B_cu = cu(B);
C = A_cu .* B_cu;
end;

  0.367781 seconds (115 allocations: 762.942 MiB, 9.06% gc time)


In [43]:
@time C = A .* B;

  0.147221 seconds (3 allocations: 762.940 MiB)


In [44]:
@time C = @tullio C[i, j] := A[i, j] * B[i, j];

  3.214435 seconds (4.47 M allocations: 1.490 GiB, 2.87% gc time, 94.33% compilation time)


In [None]:
function elementwise_mult_lv(A::Matrix{Float64}, B::Matrix{Float64})
    C = similar(A)  # 結果を格納する配列を確保
    @tturbo for i in 1:size(A, 1), j in 1:size(A, 2)
        C[i, j] = A[i, j] * B[i, j]
    end
    return C
end;

In [55]:
@time C = elementwise_mult_lv(A, B);

  0.194790 seconds (2 allocations: 762.939 MiB, 29.15% gc time)


In [26]:
@time Threads.@threads for i in 1:n
        for j in 1:n
            C = A[i, j] * B[i, j]
        end
    end

  4.931461 seconds (769.37 M allocations: 12.956 GiB, 36.76% gc time, 18.57% compilation time)


In [47]:
using CUDA

function kernel_demand_sparse(S_H, L_H, S_F, L_F, R, W, prm)
    
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(R)
        R_S_H = prm.alpha_S_R[i] * R[i] ^ -prm.power_S_H_R
        W_S_H = prm.alpha_S_W[i] * W[i] ^ -prm.power_S_H_W
        S_H[i, j] = prm.T_power * R_S_H * W_S_H

        R_L_H = prm.alpha_L_R[i] * R[i] ^ -prm.power_L_H_R
        W_L_H = prm.alpha_L_W[i] * W[i] ^ -prm.power_L_H_W
        L_H[i] = prm.T_power * R_L_H * W_L_H

        S_F[i] = prm.beta_S_R * R[i] ^ -prm.power_S_F_R * prm.beta_S_W * W[i] ^ -prm.power_S_F_W
        L_F[i] = prm.beta_L_R * R[i] ^ -prm.power_L_F_R * prm.beta_L_W * W[i] ^ -prm.power_L_F_W
    end
end

function demand_sparse(prm, R, W)
    n = length(R)

    # 非同期でデータをGPUに転送
    R_cu = @async cu(R)
    W_cu = @async cu(W)
    prm_cu = @async cu(prm)

    # 非同期タスクを待たずに計算準備
    wait(R_cu)
    wait(W_cu)
    wait(prm_cu)

    # 非同期で計算を実行
    S_H, L_H, S_F, L_F = CUDA.zeros(Float32, n), CUDA.zeros(Float32, n), CUDA.zeros(Float32, n), CUDA.zeros(Float32, n)
    @cuda threads=256 blocks=ceil(Int, n / 256) kernel_demand_sparse(S_H, L_H, S_F, L_F, fetch(R_cu), fetch(W_cu), fetch(prm_cu))

    # 計算完了を待つ
    CUDA.synchronize()

    return S_H, L_H, S_F, L_F
end;

In [19]:
a = [1, 8, 3]
b = [4, 5, 6]
a .= max.(a, b)

println(a)

[4, 8, 6]


In [45]:
A = ones(3) #RW_before
B = zeros(3) #RW

println("A:", A)
println("B:", B)

for k in 1:3
    B .= B .+ 1
    
    A .= B
    
    println("A_{k}:", A)
    println("B_{k}:", B)
    
end

A:[1.0, 1.0, 1.0]
B:[0.0, 0.0, 0.0]
A_{k}:[1.0, 1.0, 1.0]
B_{k}:[1.0, 1.0, 1.0]
A_{k}:[2.0, 2.0, 2.0]
B_{k}:[2.0, 2.0, 2.0]
A_{k}:[3.0, 3.0, 3.0]
B_{k}:[3.0, 3.0, 3.0]


In [41]:
A

3-element Vector{Float64}:
 3.0
 3.0
 3.0

In [42]:
B

3-element Vector{Float64}:
 3.0
 3.0
 3.0

In [47]:
a = [1.0, 2.0, 3.0]
abs.(a)

3-element Vector{Float64}:
 1.0
 2.0
 3.0