In [1]:
using Base.Threads

In [2]:
Threads.nthreads()

16

In [3]:
pwd()

"/home/xyu/github/BulkLMM.jl/analysis/BXD/Explore_Multithreading"

In [4]:
cd("../..")

In [5]:
include("../test/BXDdata_for_test.jl");

In [6]:
include("../src/parallel_helpers.jl");

In [7]:
include("../test/testHelper.jl");

## Threaded loop

In [8]:
function scan_perms_threadsLoops(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
              nperms::Int64 = 1024, rndseed::Int64 = 0, 
              reml::Bool = false, original::Bool = true)

    # check the number of traits as this function only works for permutation testing of univariate trait
    if(size(y, 2) != 1)
        error("Can only handle one trait.")
    end

    # n - the sample size
    # p - the number of markers
    (n, p) = size(g)

    # make intercept
    intercept = ones(n, 1)

    # rotate data so errors are uncorrelated
    (y0, X0, lambda0) = rotateData(y, [intercept g], K)


    ## Note: estimate once the variance components from the null model and use for all marker scans
    # fit lmm

    # X0_intercept = @view X0[:, 1] # to compare
    vc = fitlmm(y0, reshape(X0[:, 1], :, 1), lambda0; reml = reml) # vc.b is estimated through weighted least square
    r0 = y0 - X0[:, 1]*vc.b

    # weights proportional to the variances
    sqrtw = sqrt.(makeweights(vc.h2, lambda0))

    # compared runtime of the following with "wls(X0[:, 2:end], X0[:, 1], wts)" ?
    # rescale by weights; now these have the same mean/variance and are independent
    rowMultiply!(r0, sqrtw);
    rowMultiply!(X0, sqrtw);

    
    # after re-weighting X, calling resid on re-weighted X is the same as doing wls on the X after rotation.
    X00 = resid(X0[:, 2:end], reshape(X0[:, 1], :, 1)) # consider not using sub-array, consider @view; in-place changes

    ## random permutations; the first column is the original trait (after transformation)
    rng = MersenneTwister(rndseed);
    ## permute r0 (which is an iid, standard normal distributed N-vector under the null)
    r0perm = shuffleVector(rng, r0[:, 1], nperms; original = original)

    ## Null RSS:
    # By null hypothesis, mean is 0. RSS just becomes the sum of squares of the residuals (r0perm's)
    # (For theoretical derivation of the results, see notebook)
    rss0 = sum(r0perm[:, 1].^2) # a scalar; bc rss0 for every permuted trait is the same under the null (zero mean);
    
    ## make array to hold Alternative RSS's for each permutated trait
    if original
        rss1 = Array{Float64, 2}(undef, nperms+1, p)
    else
        rss1 = Array{Float64, 2}(undef, nperms, p)
    end
    
    ## loop over markers
    Threads.@threads for i = 1:p

        ## alternative rss
        @inbounds rss1[:, i] = rss(r0perm, @view X00[:, i]);
        
    end

    lod = (-n/2)*(log10.(rss1) .- log10(rss0))

    return lod

end

scan_perms_threadsLoops (generic function with 1 method)

In [9]:
BLAS.get_num_threads()

16

In [10]:
BLAS.set_num_threads(2)

In [11]:
BLAS.get_num_threads()

2

In [12]:
Threads.nthreads()

16

In [13]:
@benchmark scan_perms_threadsLoops(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true)

BenchmarkTools.Trial: 5 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m964.955 ms[22m[39m … [35m   1.259 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 8.02% … 29.97%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m   1.046 s               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 8.19%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m   1.087 s[22m[39m ± [32m118.894 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m16.05% ± 10.14%

  [39m█[39m [39m [39m [39m [39m [39m [39m [39m [34m█[39m[39m [39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [39m█[39m▁[39m▁[

In [14]:
@benchmark scan_perms(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true)

BenchmarkTools.Trial: 2 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.627 s[22m[39m … [35m  2.693 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m6.49% … 9.07%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.660 s              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m7.80%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.660 s[22m[39m ± [32m46.250 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m7.80% ± 1.83%

  [34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [34m█[39m[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[

In [15]:
function threads_by_blocks(r0perm::Array{Float64, 2}, X00::Array{Float64, 2}, nblocks::Int64)
    # Does distributed processes of calculations of LOD scores for markers in each block

    p = size(X00, 2);

    ## (Create blocks...)
    
    block_size = ceil(Int, p/nblocks);
    blocks = createBlocks(p, block_size);

    LODs_blocks = tmap(x -> calcLODs_block(r0perm, X00, x), 16, blocks);
    results = reduce(hcat, LODs_blocks);

    return results

end

threads_by_blocks (generic function with 1 method)

In [16]:
function scan_perms_threadsBlocks(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
                                reml::Bool = false,
                                nperms::Int64 = 1024, rndseed::Int64 = 0, original::Bool = true,
                                # (options for blocks, nperms distribution methods...)
                                option::String = "by blocks", nblocks::Int64 = 1, ncopies::Int64 = 1, 
                                nprocs::Int64 = 0)

    (y0, X0, lambda0) = transform_rotation(y, g, K); # rotation of data
    (r0, X00) = transform_reweight(y0, X0, lambda0; reml = reml); # reweighting and taking residuals
    r0perm = transform_permute(r0; nperms = nperms, rndseed = rndseed, original = original);

    if option == "by blocks"
        results = threads_by_blocks(r0perm, X00, nblocks);
    elseif option == "by nperms"
        results = distribute_by_nperms(r0, X00, nperms, ncopies, original);
    else
        throw(error("Option unsupported."))
    end

    return results

end

scan_perms_threadsBlocks (generic function with 1 method)

In [17]:
using ThreadTools

In [18]:
@time scan_perms_threadsBlocks(pheno_y, geno, kinship; 
    reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 120)

  1.645631 seconds (515.11 k allocations: 13.697 GiB, 37.80% gc time, 21.51% compilation time)


1025×7321 Matrix{Float64}:
 0.00819636  0.00819636  0.00819636  0.00819636  …  0.0128283    0.0128283
 1.96594     1.96594     1.96594     1.96594        0.425106     0.425106
 0.261796    0.261796    0.261796    0.261796       0.0169187    0.0169187
 0.00246689  0.00246689  0.00246689  0.00246689     0.109368     0.109368
 0.742619    0.742619    0.742619    0.742619       0.00209752   0.00209752
 0.182933    0.182933    0.182933    0.182933    …  0.521991     0.521991
 0.376814    0.376814    0.376814    0.376814       0.240474     0.240474
 0.2393      0.2393      0.2393      0.2393         0.65133      0.65133
 0.1775      0.1775      0.1775      0.1775         0.60261      0.60261
 0.838544    0.838544    0.838544    0.838544       0.0155658    0.0155658
 0.217517    0.217517    0.217517    0.217517    …  0.070747     0.070747
 0.090706    0.090706    0.090706    0.090706       2.51674e-5   2.51674e-5
 0.0378054   0.0378054   0.0378054   0.0378054      0.0727514    0.0727514
 ⋮   

In [24]:
b = @benchmark scan_perms_threadsBlocks(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 160)

BenchmarkTools.Trial: 6 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m753.440 ms[22m[39m … [35m   1.230 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 4.66% … 40.82%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m773.295 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 5.24%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m847.689 ms[22m[39m ± [32m187.523 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m13.98% ± 14.54%

  [39m▁[39m█[34m [39m[39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m 
  [39m█[39m█[34m▁[

In [25]:
b.times

6-element Vector{Float64}:
 7.77600316e8
 7.87980582e8
 1.229760826e9
 7.68364947e8
 7.53440187e8
 7.68989778e8

In [26]:
median(b.times)

7.73295047e8

In [22]:
runtimes_tmap = Array{Float64, 1}(undef, 10);

In [46]:
for t in 1:10
    
    b = @benchmark scan_perms_threadsBlocks(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 120)
    runtimes_tmap[t] = mean(b.times)
    
end

In [47]:
runtimes_tmap

10-element Vector{Float64}:
 7.985361718571428e8
 8.545767453333334e8
 7.83490551e8
 7.860170172857143e8
 7.796807564285715e8
 8.069048968571428e8
 8.10312887e8
 1.0367473178333334e9
 8.598473711666666e8
 8.534531333333334e8

In [48]:
mean(runtimes_tmap)/1e9

0.8369566848095238

In [49]:
runtimes_tloops = Array{Float64, 1}(undef, 10);

In [None]:
for t in 1:10
    
    b_tloops = @benchmark scan_perms_threadsLoops(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true)
    runtimes_tloops[t] = median(b_tloops.times)
    
end

In [37]:
mean(runtimes_tloops)/1e9

1.33769649255

In [38]:
nthreads()

16

In [39]:
BLAS.get_num_threads()

2

In [40]:
@time tmap_LODs = scan_perms_threadsBlocks(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 150);

  0.794476 seconds (77.71 k allocations: 13.675 GiB, 6.33% gc time)


In [41]:
BLAS.set_num_threads(4)

In [42]:
@time tloops_LODs = scan_perms_threads(pheno_y, geno, kinship; nperms = 0, rndseed = 0, reml = false, original = true);

LoadError: UndefVarError: scan_perms_threads not defined

In [43]:
sumSqDiff(tmap_LODs, tloops_LODs)

LoadError: UndefVarError: tloops_LODs not defined

In [34]:
BLAS.get_num_threads()

2

In [57]:
BLAS.set_num_threads(4)

In [58]:
@benchmark test_block = scan_perms_threadsBlocks(pheno_y, geno, kinship; reml = true, nperms = 0, rndseed = 0, original = true, option = "by blocks", nblocks = 160)

BenchmarkTools.Trial: 202 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m10.123 ms[22m[39m … [35m82.389 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m13.572 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m24.771 ms[22m[39m ± [32m19.995 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m11.36% ± 8.50%

  [39m▅[39m [39m█[39m▆[34m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m▇[39m█[39m█[34m▇[39m[3

In [59]:
@benchmark test_normal = scan(pheno_y, geno, kinship; reml = true, method = "null")

BenchmarkTools.Trial: 140 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m27.772 ms[22m[39m … [35m49.786 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 27.65%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m34.582 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m35.737 ms[22m[39m ± [32m 6.990 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m8.96% ± 10.50%

  [39m [39m█[39m▃[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [34m [39m[39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▆[39m█[39m█[39m█[39m█[39m▃[3

In [60]:
maxSqDiff(test_block, reshape(test_normal[3], 1, :))

7.888609052210118e-29

In [61]:
2*60/35000

0.0034285714285714284

In [65]:
@time scan_perms_distributed(pheno_y, geno, kinship; reml = true, nperms = 1000, nblocks = 160)

  0.045124 seconds (53.62 k allocations: 44.370 MiB, 34.76% gc time)


1×7321 Matrix{Float64}:
 0.00808641  0.00808641  0.00808641  …  0.043462  0.0118178  0.0118178