In [None]:
using Base.Threads

In [None]:
Threads.nthreads()

In [None]:
pwd()

In [1]:
cd("../..")

In [2]:
include("../test/BXDdata_for_test.jl");

In [15]:
include("../src/parallel_helpers.jl");

In [4]:
include("../test/testHelper.jl");

## Threaded loop

In [5]:
"""
    scan_perms_threadsLoops(y, g, K; nperms=1024, rndseed=0, reml=false, original=true)

Permutation-test genome scan for a single trait, with the loop over markers run under
`Threads.@threads`.

The variance components are estimated once from the null (intercept-only) model and
reused for every marker.

# Arguments
- `y`: trait matrix; must have exactly one column.
- `g`: genotype matrix, one row per sample and one column per marker.
- `K`: kinship matrix, handed to `rotateData`.

# Keywords
- `nperms`: number of permutations requested from `shuffleVector`.
- `rndseed`: seed of the `MersenneTwister` used for the permutations.
- `reml`: forwarded to `fitlmm`.
- `original`: forwarded to `shuffleVector`; also decides whether the result has
  `nperms + 1` rows (`true`) or `nperms` rows (`false`).

# Returns
A matrix of LOD scores with one column per marker.

# Throws
- `ErrorException` if `y` does not have exactly one column.
"""
function scan_perms_threadsLoops(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
                                 nperms::Int64 = 1024, rndseed::Int64 = 0,
                                 reml::Bool = false, original::Bool = true)

    # Permutation testing is only implemented for a univariate trait.
    size(y, 2) == 1 || error("Can only handle one trait.")

    # nsamples - the sample size; nmarkers - the number of markers
    nsamples, nmarkers = size(g)

    # Prepend an intercept column, then rotate so that the errors are uncorrelated.
    design = [ones(nsamples, 1) g]
    (yrot, Xrot, lambda) = rotateData(y, design, K)

    # Null model: intercept only. Its variance components are reused for all markers.
    null_design = reshape(Xrot[:, 1], :, 1)
    vc = fitlmm(yrot, null_design, lambda; reml = reml)
    null_resid = yrot - Xrot[:, 1]*vc.b

    # Rescale trait residuals and design by the square-root weights (in place).
    sqrtw = sqrt.(makeweights(vc.h2, lambda))
    rowMultiply!(null_resid, sqrtw)
    rowMultiply!(Xrot, sqrtw)

    # Residualise the re-weighted markers on the re-weighted intercept. The intercept
    # column is re-read here because `Xrot` was modified by `rowMultiply!` above.
    weighted_intercept = reshape(Xrot[:, 1], :, 1)
    marker_resid = resid(Xrot[:, 2:end], weighted_intercept)

    # Permute the null residuals with a seeded generator.
    rng = MersenneTwister(rndseed)
    permuted = shuffleVector(rng, null_resid[:, 1], nperms; original = original)

    # Null RSS: a single scalar, computed from the first column of the permuted matrix.
    first_trait = permuted[:, 1]
    rss_null = sum(first_trait.^2)

    # One row per (permuted) trait, one column per marker.
    nrows = original ? nperms + 1 : nperms
    rss_alt = Array{Float64, 2}(undef, nrows, nmarkers)

    # Each iteration writes to its own column of `rss_alt`.
    Threads.@threads for j in 1:nmarkers
        @inbounds rss_alt[:, j] = rss(permuted, view(marker_resid, :, j))
    end

    log_ratio = log10.(rss_alt) .- log10(rss_null)
    return (-nsamples/2)*log_ratio

end

scan_perms_threadsLoops (generic function with 1 method)

In [6]:
BLAS.get_num_threads()

16

In [7]:
BLAS.set_num_threads(2)

In [8]:
BLAS.get_num_threads()

2

In [9]:
Threads.nthreads()

32

In [10]:
@benchmark scan_perms_threadsLoops(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true)

BenchmarkTools.Trial: 3 samples with 1 evaluation.
 Range (min … max):  1.463 s …    2.051 s  ┊ GC (min … max):  3.95% … 30.88%
 Time  (median):     1.541 s               ┊ GC (median):    25.82%
 Time  (mean ± σ):   1.685 s ± 319.644 ms  ┊ GC (mean ± σ):  21.54% ± 14.31%

  [34m█[39m[39m [39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [34m█[39m[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁

In [11]:
@benchmark scan_perms(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true)

BenchmarkTools.Trial: 2 samples with 1 evaluation.
 Range (min … max):  2.699 s …    3.245 s  ┊ GC (min … max): 7.05% … 9.21%
 Time  (median):     2.972 s               ┊ GC (median):    8.23%
 Time  (mean ± σ):   2.972 s ± 386.662 ms  ┊ GC (mean ± σ):  8.23% ± 1.52%

  [34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [34m█[39m[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m

In [16]:
using ThreadTools

In [17]:
"""
    threads_by_blocks(r0perm, X00, nblocks; ntasks=16)

Split the columns of `X00` (markers) into blocks and compute the LOD scores of each
block with `calcLODs_block`, mapping over the blocks with `tmap` from ThreadTools.
The per-block results are concatenated horizontally.

# Arguments
- `r0perm`: matrix of permuted traits, passed unchanged to `calcLODs_block`.
- `X00`: marker matrix; its number of columns is the number of markers.
- `nblocks`: requested number of blocks; the block size is `ceil(p / nblocks)` where
  `p = size(X00, 2)`.

# Keywords
- `ntasks`: limit handed to `tmap` as its second argument. Defaults to 16, the value
  that was previously hard-coded.

# Returns
The horizontal concatenation (`hcat`) of the per-block results.

# Throws
- `ArgumentError` if `nblocks < 1` or `ntasks < 1`.
"""
function threads_by_blocks(r0perm::Array{Float64, 2}, X00::Array{Float64, 2}, nblocks::Int64;
                           ntasks::Int64 = 16)

    # Reject sizes that would otherwise fail obscurely (`p/0` is `Inf`, and
    # `ceil(Int, Inf)` throws an InexactError).
    nblocks >= 1 || throw(ArgumentError("nblocks must be at least 1, got $nblocks"))
    ntasks >= 1 || throw(ArgumentError("ntasks must be at least 1, got $ntasks"))

    p = size(X00, 2)

    # Partition the p markers into blocks of (at most) block_size columns.
    block_size = ceil(Int, p/nblocks)
    blocks = createBlocks(p, block_size)

    # One call of calcLODs_block per block, run through ThreadTools' threaded map.
    LODs_blocks = tmap(x -> calcLODs_block(r0perm, X00, x), ntasks, blocks)

    return reduce(hcat, LODs_blocks)

end

threads_by_blocks (generic function with 1 method)

In [18]:
"""
    scan_perms_threadsBlocks(y, g, K; reml=false, nperms=1024, rndseed=0, original=true,
                             option="by blocks", nblocks=1, ncopies=1, nprocs=0)

Permutation-test genome scan that hands the LOD computation to one of two strategies
selected by `option`.

# Arguments
- `y`, `g`, `K`: trait, genotype and kinship matrices, passed to `transform_rotation`.

# Keywords
- `reml`: forwarded to `transform_reweight`.
- `nperms`, `rndseed`, `original`: forwarded to `transform_permute`; `nperms` and
  `original` are also forwarded to `distribute_by_nperms` when `option == "by nperms"`.
- `option`: `"by blocks"` calls `threads_by_blocks`; `"by nperms"` calls
  `distribute_by_nperms`.
- `nblocks`: number of marker blocks, used only by `"by blocks"`.
- `ncopies`: forwarded to `distribute_by_nperms`, used only by `"by nperms"`.
- `nprocs`: accepted for backward compatibility; not used by this function.

# Returns
Whatever the selected strategy returns.

# Throws
- `ErrorException("Option unsupported.")` if `option` is neither `"by blocks"` nor
  `"by nperms"`. This is checked before any computation is done.
"""
function scan_perms_threadsBlocks(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
                                reml::Bool = false,
                                nperms::Int64 = 1024, rndseed::Int64 = 0, original::Bool = true,
                                # (options for blocks, nperms distribution methods...)
                                option::String = "by blocks", nblocks::Int64 = 1, ncopies::Int64 = 1,
                                nprocs::Int64 = 0)

    # Fail fast: reject an unknown option before doing the rotation/reweighting work.
    # `error` already throws, so no surrounding `throw` is needed.
    if option != "by blocks" && option != "by nperms"
        error("Option unsupported.")
    end

    (y0, X0, lambda0) = transform_rotation(y, g, K) # rotation of data
    (r0, X00) = transform_reweight(y0, X0, lambda0; reml = reml) # reweighting and taking residuals

    # NOTE(review): r0perm is only consumed by the "by blocks" branch; it is still
    # computed unconditionally here to keep the original call sequence.
    r0perm = transform_permute(r0; nperms = nperms, rndseed = rndseed, original = original)

    if option == "by blocks"
        results = threads_by_blocks(r0perm, X00, nblocks)
    else
        # option == "by nperms" (guaranteed by the check at the top)
        results = distribute_by_nperms(r0, X00, nperms, ncopies, original)
    end

    return results

end

scan_perms_threadsBlocks (generic function with 1 method)

In [20]:
@time scan_perms_threadsBlocks(pheno_y, geno, kinship; 
    reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 120)

  0.830197 seconds (77.19 k allocations: 13.675 GiB, 11.99% gc time)


1025×7321 Matrix{Float64}:
 0.00819636  0.00819636  0.00819636  0.00819636  …  0.0128283    0.0128283
 1.96594     1.96594     1.96594     1.96594        0.425106     0.425106
 0.261796    0.261796    0.261796    0.261796       0.0169187    0.0169187
 0.00246689  0.00246689  0.00246689  0.00246689     0.109368     0.109368
 0.742619    0.742619    0.742619    0.742619       0.00209752   0.00209752
 0.182933    0.182933    0.182933    0.182933    …  0.521991     0.521991
 0.376814    0.376814    0.376814    0.376814       0.240474     0.240474
 0.2393      0.2393      0.2393      0.2393         0.65133      0.65133
 0.1775      0.1775      0.1775      0.1775         0.60261      0.60261
 0.838544    0.838544    0.838544    0.838544       0.0155658    0.0155658
 0.217517    0.217517    0.217517    0.217517    …  0.070747     0.070747
 0.090706    0.090706    0.090706    0.090706       2.51674e-5   2.51674e-5
 0.0378054   0.0378054   0.0378054   0.0378054      0.0727514    0.0727514
 ⋮   

In [21]:
b = @benchmark scan_perms_threadsBlocks(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 160)

BenchmarkTools.Trial: 6 samples with 1 evaluation.
 Range (min … max):  764.047 ms …    1.199 s  ┊ GC (min … max):  6.68% … 40.23%
 Time  (median):     807.933 ms               ┊ GC (median):     8.74%
 Time  (mean ± σ):   866.733 ms ± 164.865 ms  ┊ GC (mean ± σ):  15.44% ± 13.22%

  [39m█[39m [39m [39m█[39m█[34m [39m[39m [39m█[39m [39m█[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [39m█[39m▁[39m▁[

In [22]:
b.times

6-element Vector{Float64}:
 7.64047493e8
 7.8663239e8
 8.34355113e8
 1.199496515e9
 7.96808647e8
 8.19058179e8

In [23]:
median(b.times)

8.07933413e8

In [24]:
runtimes_tmap = Array{Float64, 1}(undef, 10);

In [25]:
# Run the tmap/blocks benchmark 10 times and store the mean sample time of each run.
# `b.times` holds the per-sample timings in nanoseconds (hence the later `/1e9`).
# NOTE(review): the threaded-loop benchmark further down stores `median(...)` per run
# instead of `mean(...)`, so the two averaged runtimes are not summarised the same way.
for t in 1:10
    
    b = @benchmark scan_perms_threadsBlocks(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 120)
    runtimes_tmap[t] = mean(b.times)
    
end

In [26]:
runtimes_tmap

10-element Vector{Float64}:
 8.930813385e8
 8.106226087142857e8
 8.646869235e8
 8.522457376666666e8
 8.429717463333334e8
 8.597489828333334e8
 8.76140424e8
 8.700922178333334e8
 9.298007205e8
 9.224707648333334e8

In [27]:
mean(runtimes_tmap)/1e9

0.8721861464714286

In [39]:
runtimes_tloops = Array{Float64, 1}(undef, 10);

In [40]:
# Run the threaded-loop benchmark 10 times and store the median sample time of each run.
# `b_tloops.times` holds the per-sample timings in nanoseconds (hence the later `/1e9`).
# NOTE(review): the tmap/blocks benchmark above stores `mean(...)` per run instead of
# `median(...)`, so the two averaged runtimes are not summarised the same way.
for t in 1:10
    
    b_tloops = @benchmark scan_perms_threadsLoops(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true)
    runtimes_tloops[t] = median(b_tloops.times)
    
end

In [56]:
mean(runtimes_tloops)/1e9

1.3997199087

In [55]:
nthreads()

32

In [56]:
BLAS.get_num_threads()

2

In [59]:
@time tmap_LODs = scan_perms_threadsBlocks(pheno_y, geno, kinship; reml = false, nperms = 0, rndseed = 0, original = true, option = "by blocks", nblocks = 120);

  0.031060 seconds (53.72 k allocations: 44.408 MiB)


In [60]:
@time tloops_LODs = scan_perms_threadsLoops(pheno_y, geno, kinship; nperms = 0, rndseed = 0, reml = false, original = true);

  0.021848 seconds (72.82 k allocations: 39.895 MiB)


In [53]:
sumSqDiff(tmap_LODs, tloops_LODs)

0.0

In [43]:
BLAS.get_num_threads()

2

In [44]:
BLAS.set_num_threads(2)

In [54]:
@benchmark test_block = scan_perms_threadsBlocks(pheno_y, geno, kinship; reml = true, nperms = 0, rndseed = 0, original = true, option = "by blocks", nblocks = 160)

BenchmarkTools.Trial: 174 samples with 1 evaluation.
 Range (min … max):  15.249 ms … 109.344 ms  ┊ GC (min … max):  0.00% … 39.82%
 Time  (median):     22.848 ms               ┊ GC (median):     0.00%
 Time  (mean ± σ):   29.060 ms ±  20.613 ms  ┊ GC (mean ± σ):  15.75% ± 16.23%

  [39m▃[39m▁[39m▄[39m▆[39m█[34m▇[39m[39m▅[39m▂[39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m█[39m█[39m█[3

In [65]:
@benchmark test_normal = scan(pheno_y, geno, kinship; reml = true, method = "null")

BenchmarkTools.Trial: 122 samples with 1 evaluation.
 Range (min … max):  25.554 ms … 77.232 ms  ┊ GC (min … max):  0.00% … 49.96%
 Time  (median):     38.826 ms              ┊ GC (median):     0.00%
 Time  (mean ± σ):   40.986 ms ± 12.400 ms  ┊ GC (mean ± σ):  13.26% ± 19.38%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [34m█[39m[39m▂[39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▇[39m▅[39m▄[39m▄[39m▃[39m▃