In [1]:
using Base.Threads

In [2]:
Threads.nthreads()

16

In [3]:
pwd()

"/home/xyu/github/BulkLMM.jl/analysis/Explore_Multithreading"

In [4]:
cd("..")

In [5]:
include("../test/BXDdata_for_test.jl");

In [6]:
include("../src/parallel_helpers.jl");

In [7]:
include("../test/testHelper.jl");

In [8]:
"""
    scan_perms_threads(y, g, K; nperms = 1024, rndseed = 0, reml = false, original = true)

Permutation scan of a single trait against every marker in `g`, with the
per-marker loop run under `Threads.@threads`.

The data are rotated with `rotateData`, the null (intercept-only) model is fit
once with `fitlmm`, and the resulting weights are applied to both the null
residuals and the design matrix. The re-weighted null residuals are then permuted
with `shuffleVector`, and each marker column is scanned against all permuted
copies via `rss`.

# Arguments
- `y`: trait matrix; must have exactly one column.
- `g`: genotype matrix; rows are samples and columns are markers.
- `K`: matrix passed to `rotateData` together with `y` and the design matrix.

# Keywords
- `nperms`: number of permutations passed to `shuffleVector`.
- `rndseed`: seed for the `MersenneTwister` used for the permutations.
- `reml`: forwarded to `fitlmm`.
- `original`: forwarded to `shuffleVector`; when `true` the result has
  `nperms + 1` rows, otherwise `nperms` rows.

# Returns
A matrix with one column per marker holding
`(-n/2) * (log10(rss1) - log10(rss0))`, where `n` is the number of rows of `g`.

# Throws
- `ErrorException` if `y` has more than one column.
"""
function scan_perms_threads(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
              nperms::Int64 = 1024, rndseed::Int64 = 0,
              reml::Bool = false, original::Bool = true)

    # Permutation testing here is only defined for a univariate trait.
    size(y, 2) == 1 || error("Can only handle one trait.")

    # nsamples - the sample size; nmarkers - the number of markers
    nsamples, nmarkers = size(g)

    # Prepend an intercept column, then rotate so that the errors are uncorrelated.
    design = hcat(ones(nsamples, 1), g)
    y0, X0, lambda0 = rotateData(y, design, K)

    # Variance components are estimated once, from the null (intercept-only) model,
    # and reused for every marker.
    nullfit = fitlmm(y0, X0[:, 1:1], lambda0; reml = reml)
    r0 = y0 - X0[:, 1] * nullfit.b

    # Square roots of the weights derived from the fitted h2 and lambda0.
    sqrtw = sqrt.(makeweights(nullfit.h2, lambda0))

    # Rescale residuals and design in place by those weights.
    rowMultiply!(r0, sqrtw)
    rowMultiply!(X0, sqrtw)

    # Residualize the (re-weighted) marker columns on the (re-weighted) intercept.
    X00 = resid(X0[:, 2:end], X0[:, 1:1])

    # Permute the null residuals with a seeded generator for reproducibility.
    rng = MersenneTwister(rndseed)
    r0perm = shuffleVector(rng, r0[:, 1], nperms; original = original)

    # Null RSS: a single scalar taken from the first column of r0perm
    # (a permutation leaves the sum of squares unchanged).
    rss0 = sum(r0perm[:, 1] .^ 2)

    # One row per permuted trait (plus the original trait when requested).
    nrows = original ? nperms + 1 : nperms
    rss1 = Matrix{Float64}(undef, nrows, nmarkers)

    # Each iteration writes a distinct column of rss1, so iterations are independent.
    Threads.@threads for marker in 1:nmarkers
        @inbounds rss1[:, marker] = rss(r0perm, view(X00, :, marker))
    end

    log_rss0 = log10(rss0)
    return (-nsamples / 2) * (log10.(rss1) .- log_rss0)

end

scan_perms_threads (generic function with 1 method)

In [9]:
BLAS.get_num_threads()

16

In [10]:
# Lower the BLAS thread count from the 16 reported above to 2 for the benchmarks that follow.
# NOTE(review): presumably done to avoid oversubscribing cores alongside the 16 Julia threads — confirm.
BLAS.set_num_threads(2)

In [11]:
BLAS.get_num_threads()

2

In [12]:
Threads.nthreads()

16

In [13]:
@benchmark scan_perms_threads(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true)

BenchmarkTools.Trial: 6 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m797.424 ms[22m[39m … [35m   1.860 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m10.28% … 53.52%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m824.175 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m11.96%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m   1.069 s[22m[39m ± [32m435.702 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m29.13% ± 19.52%

  [39m█[34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m 
  [39m█[34m█[39m[3

In [14]:
@benchmark scan_perms(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true)

BenchmarkTools.Trial: 2 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.643 s[22m[39m … [35m  2.693 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m6.42% … 8.86%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.668 s              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m7.65%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.668 s[22m[39m ± [32m35.087 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m7.65% ± 1.73%

  [34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [34m█[39m[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[

In [15]:
"""
    threads12_by_blocks(r0perm, X00, nblocks)

Split the columns of `X00` into roughly `nblocks` blocks, evaluate
`calcLODs_block(r0perm, X00, block)` for every block with `tmap`, and
horizontally concatenate the per-block results.

# Arguments
- `r0perm`: matrix passed unchanged to `calcLODs_block` for every block.
- `X00`: matrix whose number of columns determines the block layout.
- `nblocks`: requested number of blocks; the block size is `ceil(p / nblocks)`
  where `p = size(X00, 2)`.

# Returns
The result of `reduce(hcat, ...)` over the per-block outputs.

# Throws
- `ArgumentError` if `nblocks < 1`.
"""
function threads12_by_blocks(r0perm::Array{Float64, 2}, X00::Array{Float64, 2}, nblocks::Int64)
    # A non-positive block count would otherwise surface as an opaque InexactError
    # (division by zero -> Inf) or as a negative block size further down.
    nblocks >= 1 || throw(ArgumentError("nblocks must be at least 1, got $nblocks"))

    p = size(X00, 2)

    # Block size is rounded up so that all p columns are covered.
    block_size = ceil(Int, p / nblocks)
    blocks = createBlocks2(p, block_size)

    # Use the thread count of the running session rather than a hard-coded 16,
    # so the function behaves sensibly on machines with a different number of threads.
    ntasks = Threads.nthreads()
    LODs_blocks = tmap(block -> calcLODs_block(r0perm, X00, block), ntasks, blocks)

    return reduce(hcat, LODs_blocks)

end

threads12_by_blocks (generic function with 1 method)

In [16]:
"""
    scan_perms_threads12(y, g, K; reml = false, nperms = 1024, rndseed = 0,
                         original = true, option = "by blocks", nblocks = 1,
                         ncopies = 1, nprocs = 0)

Run the permutation scan by chaining `transform_rotation`, `transform_reweight`
and `transform_permute`, then dispatching the LOD computation according to `option`.

# Keywords
- `reml`: forwarded to `transform_reweight`.
- `nperms`, `rndseed`, `original`: forwarded to `transform_permute`
  (`nperms` and `original` are also forwarded to `distribute_by_nperms`).
- `option`: `"by blocks"` calls `threads12_by_blocks(r0perm, X00, nblocks)`;
  `"by nperms"` calls `distribute_by_nperms(r0, X00, nperms, ncopies, original)`.
- `nblocks`: used only by the `"by blocks"` option.
- `ncopies`: used only by the `"by nperms"` option.
- `nprocs`: accepted for interface compatibility; not used in this function.

# Returns
Whatever the selected strategy function returns.

# Throws
- `ErrorException("Option unsupported.")` if `option` is neither `"by blocks"`
  nor `"by nperms"`. This is checked before any computation is done.
"""
function scan_perms_threads12(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
                                reml::Bool = false,
                                nperms::Int64 = 1024, rndseed::Int64 = 0, original::Bool = true,
                                # (options for blocks, nperms distribution methods...)
                                option::String = "by blocks", nblocks::Int64 = 1, ncopies::Int64 = 1, 
                                nprocs::Int64 = 0)

    # Fail fast: reject an unknown option before the expensive transformations run.
    # `error` already throws, so no surrounding `throw` is needed.
    if option != "by blocks" && option != "by nperms"
        error("Option unsupported.")
    end

    (y0, X0, lambda0) = transform_rotation(y, g, K) # rotation of data
    (r0, X00) = transform_reweight(y0, X0, lambda0; reml = reml) # reweighting and taking residuals
    # NOTE(review): r0perm is only consumed by the "by blocks" branch; it is still computed
    # unconditionally here to keep the original call sequence of the helpers.
    r0perm = transform_permute(r0; nperms = nperms, rndseed = rndseed, original = original)

    if option == "by blocks"
        results = threads12_by_blocks(r0perm, X00, nblocks)
    else # option == "by nperms" (validated above)
        results = distribute_by_nperms(r0, X00, nperms, ncopies, original)
    end

    return results

end

scan_perms_threads12 (generic function with 1 method)

In [17]:
using ThreadTools

In [18]:
b = @benchmark scan_perms_threads12(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 120)

BenchmarkTools.Trial: 6 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m517.022 ms[22m[39m … [35m   1.904 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m12.63% … 59.25%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m576.172 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m11.57%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m838.552 ms[22m[39m ± [32m541.451 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m36.09% ± 23.40%

  [39m▁[39m▁[34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m 
  [39m█[39m█[34m█[

In [32]:
b.times

9-element Vector{Float64}:
 5.24511693e8
 4.75740902e8
 9.77500076e8
 4.91097884e8
 5.03934603e8
 5.00942729e8
 9.41041457e8
 4.32836939e8
 4.40687622e8

In [19]:
median(b.times)

5.761715455e8

In [20]:
runtimes_tmap = Array{Float64, 1}(undef, 10);

In [43]:
# Repeat the full @benchmark of the tmap-based scan 10 times and store one summary
# value per repetition (BenchmarkTools reports `times` in nanoseconds).
# NOTE(review): this loop stores mean(b.times), while the scan_perms_threads loop
# further down stores median(b_tloops.times); the two summaries are not the same
# statistic, so the averaged runtimes are not strictly comparable.
# NOTE(review): the benchmark arguments are non-constant globals that are not
# `$`-interpolated — confirm this is intended.
for t in 1:10
    
    b = @benchmark scan_perms_threads12(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 120)
    runtimes_tmap[t] = mean(b.times)
    
end

In [44]:
mean(runtimes_tmap)/1e9

0.5564858359577778

In [45]:
runtimes_tloops = Array{Float64, 1}(undef, 10);

In [46]:
# Repeat the full @benchmark of the @threads-loop scan 10 times and store one summary
# value per repetition (BenchmarkTools reports `times` in nanoseconds).
# NOTE(review): this loop stores median(b_tloops.times), while the scan_perms_threads12
# loop earlier stores mean(b.times); the two summaries are not the same statistic,
# so the averaged runtimes are not strictly comparable.
for t in 1:10
    
    b_tloops = @benchmark scan_perms_threads(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true)
    runtimes_tloops[t] = median(b_tloops.times)
    
end

In [47]:
mean(runtimes_tloops)/1e9

0.5778656253

In [48]:
nthreads()

16

In [49]:
BLAS.get_num_threads()

2

In [50]:
runtimes_tmap

10-element Vector{Float64}:
 5.591043052222222e8
 5.598697161111112e8
 5.516050436e8
 5.899172702222222e8
 5.774236751111112e8
 5.826831521111112e8
 5.147052465e8
 5.524582602e8
 5.436127193e8
 5.334789712e8

In [51]:
@time tmap_LODs = scan_perms_threads12(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 150);

  0.510240 seconds (77.70 k allocations: 13.675 GiB, 7.06% gc time)


In [52]:
@time tloops_LODs = scan_perms_threads(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true);

  0.542453 seconds (95.71 k allocations: 13.615 GiB, 5.96% gc time)


In [53]:
sumSqDiff(tmap_LODs, tloops_LODs)

0.0