In [2]:
using BenchmarkTools
using Distributed
using FLoops


using NBInclude
@nbinclude("solve_model.ipynb")


┌ Info: Precompiling FLoops [cc61a311-1640-44b5-9fba-1b764f453329]
└ @ Base loading.jl:1278


solve_tree (generic function with 5 methods)

#### Teste: gerar draws da uniforme

In [3]:
# Baseline: time generating 1000 uniform draws with a single rand(1000) call.
@btime rand(1000);

  930.100 ns (1 allocation: 7.94 KiB)


In [4]:
# Time 1000 scalar rand() calls in a plain loop; each drawn value is discarded.
@btime for i in 1:1000
    rand()
end

  2.822 μs (0 allocations: 0 bytes)


In [32]:
"""
    draws(ndraws, ncores)

Sum `ndraws` uniform `rand()` draws in a sequential loop and return the total.

# Arguments
- `ndraws`: number of draws to accumulate; `0` yields `0.0`.
- `ncores`: unused by this function. Presumably kept so the call has the same
  shape as `draws_floop(ndraws, ncores)` — confirm before removing.

# Returns
The accumulated sum as a `Float64`.
"""
function draws(ndraws, ncores)
    # Start from a Float64 zero so the accumulator keeps one type for the whole
    # loop instead of switching from Int to Float64 on the first iteration.
    v = 0.0

    for _ in 1:ndraws
        v += rand()
    end

    # Return the sum explicitly: a function ending in a `for` loop evaluates to
    # `nothing`, which discarded the accumulated value.
    return v
end

draws (generic function with 1 method)

In [33]:
# Time the sequential accumulation loop for 1000 draws (the second argument is unused by draws).
@btime draws(1000, 1)

  3.300 μs (0 allocations: 0 bytes)


In [34]:
"""
    draws_floop(ndraws, ncores)

Sum `ndraws` uniform `rand()` draws with an `@floop` reduction on a `ThreadedEx`
executor and return the reduced value.

# Arguments
- `ndraws`: number of draws (the loop runs over `1:ndraws`).
- `ncores`: used only to derive the executor's `basesize` as `ndraws ÷ ncores`.
"""
function draws_floop(ndraws, ncores)
    # NOTE(review): how `@reduce` treats this pre-existing binding (initial value
    # vs. simply overwritten) is not visible here — confirm against the FLoops docs.
    v = 0

    # Integer division (÷) appears to be required: the original author noted that
    # the other division operator raised an error. Clamp to 1 so that
    # ndraws < ncores does not yield a basesize of 0.
    chunk = max(1, ndraws ÷ ncores)

    @floop ThreadedEx(basesize = chunk) for n in 1:ndraws
        @reduce(v += rand())
    end

    # Hand the reduced value back to the caller, as somar_floop does.
    return v
end

draws_floop (generic function with 1 method)

In [36]:
# Threaded reduction with ncores = 1, i.e. basesize = 1000 ÷ 1 = 1000.
@btime draws_floop(1000, 1)

  3.200 μs (0 allocations: 0 bytes)


In [37]:
# Threaded reduction with ncores = 2, i.e. basesize = 1000 ÷ 2 = 500.
@btime draws_floop(1000, 2)

  5.500 μs (12 allocations: 992 bytes)


In [38]:
# Threaded reduction with ncores = 3, i.e. basesize = 1000 ÷ 3 = 333.
@btime draws_floop(1000, 3)

  5.450 μs (28 allocations: 2.76 KiB)


In [39]:
# Threaded reduction with ncores = 4, i.e. basesize = 1000 ÷ 4 = 250.
@btime draws_floop(1000, 4)

  5.475 μs (28 allocations: 2.77 KiB)


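Conclusão: com uma carga tão pequena (1000 chamadas a rand(), cerca de 3 μs no total), o custo de criar e sincronizar as tarefas (≈2 μs a mais e 12–28 alocações) supera qualquer ganho do paralelismo. O teste, portanto, não mostra que operações com rand() sejam intrinsecamente melhores em 1 core; para avaliar isso seria preciso repetir com um ndraws muito maior.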

#### Teste: somar valores em um loop

In [48]:
"""
    somar(n)

Add up every value of the range `1:n`, starting from `0`, and return the total.
An empty range leaves the result at `0`.
"""
function somar(n)
    acc = 0
    for k = 1:n
        acc = acc + k
    end
    return acc
end
        

somar (generic function with 1 method)

In [54]:
# Benchmark the plain loop for n = 10^9.
# NOTE(review): the ~1.4 ns result recorded below cannot be 10^9 additions actually
# executing; presumably the compiler replaced the loop with a closed form —
# confirm with @code_llvm before comparing this number with the @floop variants.
@btime somar(1_000_000_000)

  1.399 ns (0 allocations: 0 bytes)


500000000500000000

In [70]:
# Sum the values of 1:n with an `@floop` loop run by the given executor, so the
# same reduction can be timed under different executors.
#   n        - upper bound of the range 1:n
#   executor - executor handed to `@floop` (this notebook passes SequentialEx,
#              ThreadedEx and DistributedEx instances)
# Returns the value bound to `s` after the loop.
function somar_floop(n, executor)
    s=0
    @floop executor for i in 1:n
        # `@reduce` marks `s` as the accumulator of the loop's reduction.
        @reduce(s+=i)
    end
    return s
end


# Executors compared in the somar_floop benchmarks.
# `base` is passed as `basesize` / `threads_basesize` to the threaded and
# distributed executors respectively.
base = 2;
# Sequential executor constructed with the simd option set to Val(true).
ex1 = SequentialEx(simd = Val(true));
# Threaded executor.
ex2 = ThreadedEx(basesize = base)
# Distributed executor.
ex3 = DistributedEx(threads_basesize = base);

In [71]:
# `ex1` is a non-constant global: interpolate it with `$` so BenchmarkTools times
# the call itself and not the untyped global lookup plus dynamic dispatch.
@btime somar_floop(10, $ex1)

  20.260 ns (0 allocations: 0 bytes)


55

In [72]:
# `ex2` is a non-constant global: interpolate it with `$` so BenchmarkTools times
# the call itself and not the untyped global lookup plus dynamic dispatch.
@btime somar_floop(10, $ex2)

  6.167 μs (44 allocations: 4.55 KiB)


55

In [73]:
# `ex3` is a non-constant global: interpolate it with `$` so BenchmarkTools times
# the call itself and not the untyped global lookup plus dynamic dispatch.
@btime somar_floop(10, $ex3)

55

  154.899 μs (190 allocations: 12.14 KiB)


## Threads

In [8]:
# Number of threads available to this Julia session; it was set by changing the
# JULIA_NUM_THREADS environment variable on Windows 10.
Threads.nthreads()

4

In [6]:
# Time 1000 rand() calls split across threads. The macro is qualified as
# `Threads.@threads` because no `using Base.Threads` is visible among this
# notebook's imports, so the bare `@threads` depended on state defined elsewhere.
@btime Threads.@threads for i in 1:1000
    rand()
end


  5.620 μs (21 allocations: 3.42 KiB)


### Exemplo de rodar o código da solução do modelo

In [74]:
# First argument passed to solve_tree; entries are, in order: Vh/D, L/D, Dj/D
data = [1.0, 0.25, 0.68]; 

# Second argument passed to solve_tree; entries are, in order: ρ, β, c0, λj
game_parameters = [0.884, 9.84, 0.044, 0.346];

Sem paralelizar

In [75]:
# Sequential baseline: solve the model twice in a row and time the whole loop.
# NOTE(review): the recorded output has one timing line per call plus the total,
# so solve_tree presumably prints its own timing — confirm in solve_model.ipynb.
@time for i in 1:2
    solve_tree(data, game_parameters);
end
    

327.289926 seconds (635.93 M allocations: 210.130 GiB, 9.17% gc time)
343.756724 seconds (632.36 M allocations: 209.969 GiB, 10.04% gc time)
673.037577 seconds (1.27 G allocations: 420.811 GiB, 9.60% gc time)


Com paralelização

In [77]:
# Same two solves as the sequential baseline, run through a threaded @floop with
# basesize=1; the discarded loop index only sets how many solves are launched.
@time @floop ThreadedEx(basesize=1) for i in 1:2
    solve_tree(data, game_parameters);
end

573.967866 seconds (1.25 G allocations: 418.378 GiB, 26.39% gc time)
585.336963 seconds (1.26 G allocations: 419.923 GiB, 25.97% gc time)
587.342296 seconds (1.27 G allocations: 420.467 GiB, 25.90% gc time)


In [78]:
# Four solves through a threaded @floop with basesize=1 (one per available thread,
# given that Threads.nthreads() reported 4 in this session).
@time @floop ThreadedEx(basesize=1) for i in 1:4
    solve_tree(data, game_parameters);
end

1101.369947 seconds1105.853144 seconds1110.221093 seconds (2.52 G allocations:  (2.51 G allocations:  (2.53 G allocations: 1107.273044 seconds (2.52 G allocations: 840.212 GiB839.012 GiB837.832 GiB839.732 GiB, 34.79% gc time), 34.99% gc time, )34.89% gc time), 34.86% gc time)



1120.696898 seconds (2.53 G allocations: 840.882 GiB, 34.47% gc time)
