In [57]:
using Base.Threads

In [58]:
Threads.nthreads()

12

In [62]:
pwd()

"/home/xyu/github/BulkLMM.jl"

In [68]:
cd("..")

In [69]:
include("../test/BXDdata_for_test.jl");

In [70]:
include("../src/parallel_helpers.jl");

In [71]:
include("../test/testHelper.jl");

In [72]:
function scan_perms_threads(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
              nperms::Int64 = 1024, rndseed::Int64 = 0, 
              reml::Bool = false, original::Bool = true)

    # check the number of traits as this function only works for permutation testing of univariate trait
    if(size(y, 2) != 1)
        error("Can only handle one trait.")
    end

    # n - the sample size
    # p - the number of markers
    (n, p) = size(g)

    # make intercept
    intercept = ones(n, 1)

    # rotate data so errors are uncorrelated
    (y0, X0, lambda0) = rotateData(y, [intercept g], K)


    ## Note: estimate once the variance components from the null model and use for all marker scans
    # fit lmm

    # X0_intercept = @view X0[:, 1] # to compare
    vc = fitlmm(y0, reshape(X0[:, 1], :, 1), lambda0; reml = reml) # vc.b is estimated through weighted least square
    r0 = y0 - X0[:, 1]*vc.b

    # weights proportional to the variances
    sqrtw = sqrt.(makeweights(vc.h2, lambda0))

    # compared runtime of the following with "wls(X0[:, 2:end], X0[:, 1], wts)" ?
    # rescale by weights; now these have the same mean/variance and are independent
    rowMultiply!(r0, sqrtw);
    rowMultiply!(X0, sqrtw);

    
    # after re-weighting X, calling resid on re-weighted X is the same as doing wls on the X after rotation.
    X00 = resid(X0[:, 2:end], reshape(X0[:, 1], :, 1)) # consider not using sub-array, consider @view; in-place changes

    ## random permutations; the first column is the original trait (after transformation)
    rng = MersenneTwister(rndseed);
    ## permute r0 (which is an iid, standard normal distributed N-vector under the null)
    r0perm = shuffleVector(rng, r0[:, 1], nperms; original = original)

    ## Null RSS:
    # By null hypothesis, mean is 0. RSS just becomes the sum of squares of the residuals (r0perm's)
    # (For theoretical derivation of the results, see notebook)
    rss0 = sum(r0perm[:, 1].^2) # a scalar; bc rss0 for every permuted trait is the same under the null (zero mean);
    
    ## make array to hold Alternative RSS's for each permutated trait
    if original
        rss1 = Array{Float64, 2}(undef, nperms+1, p)
    else
        rss1 = Array{Float64, 2}(undef, nperms, p)
    end
    
    ## loop over markers
    Threads.@threads for i = 1:p

        ## alternative rss
        @inbounds rss1[:, i] = rss(r0perm, @view X00[:, i]);
        
    end

    lod = (-n/2)*(log10.(rss1) .- log10(rss0))

    return lod

end

scan_perms_threads (generic function with 1 method)

In [73]:
function scan_null_threads(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
                   reml::Bool = false)

    # number of markers
    (n, m) = size(g)
    # make intercept
    intercept = ones(n, 1)
    # rotate data
    (y0, X0, lambda0) = rotateData(y, [intercept g], K)
    # fit null lmm
    out00 = fitlmm(y0, reshape(X0[:, 1], :, 1), lambda0; reml = reml)
    # weights proportional to the variances
    sqrtw = sqrt.(makeweights(out00.h2, lambda0))
    # rescale by weights
    rowMultiply!(y0, sqrtw)
    rowMultiply!(X0, sqrtw)

    # perform genome scan
    out0 = rss(y0, reshape(X0[:, 1], n, 1))
    lod = zeros(m)
    X = zeros(n, 2)
    X[:, 1] = X0[:, 1]
    Threads.@threads for i = 1:m
        X[:, 2] = X0[:, i+1]
        out1 = rss(y0, X)
        lod[i] = (n/2)*(log10(out0[1]) - log10(out1[1]))
    end

    return (out00.sigma2, out00.h2, lod)

end

scan_null_threads (generic function with 1 method)

In [74]:
tloops_LODs_noperms = scan_null(pheno_y, geno, kinship; reml = false);

In [75]:
using Profile, POkg

In [79]:
Pkg.add("ProfileView")

LoadError: UndefVarError: Pkg not defined

In [76]:
@profile scan_null_threads(pheno_y, geno, kinship; reml = false);

In [77]:
Profile.print()

Overhead ╎ [+additional indent] Count File:Line; Function
    ╎3     ...mpiler/typeinfer.jl:996; typeinf_ext_toplevel(mi::Core....
    ╎ 3     ...piler/typeinfer.jl:1000; typeinf_ext_toplevel(interp::...
    ╎  3     ...piler/typeinfer.jl:967; typeinf_ext(interp::Core.Comp...
    ╎   3     ...piler/typeinfer.jl:213; typeinf(interp::Core.Compile...
    ╎    3     ...iler/typeinfer.jl:230; _typeinf(interp::Core.Compil...
    ╎     3     ...interpretation.jl:2462; typeinf_nocycle(interp::Cor...
    ╎    ╎ 3     ...nterpretation.jl:2366; typeinf_local(interp::Core...
    ╎    ╎  3     ...nterpretation.jl:1890; abstract_eval_statement(i...
    ╎    ╎   3     ...terpretation.jl:1733; abstract_call(interp::Cor...
    ╎    ╎    3     ...terpretation.jl:1764; abstract_call(interp::Co...
    ╎    ╎     3     ...terpretation.jl:153; abstract_call_gf_by_type...
    ╎    ╎    ╎ 3     ...terpretation.jl:641; abstract_call_method(int...
    ╎    ╎    ╎  3     ...r/typeinfer.jl:877; typeinf_edge(inter

In [40]:
@benchmark scan_null_threads(pheno_y, geno, kinship; reml = false)

BenchmarkTools.Trial: 69 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m64.451 ms[22m[39m … [35m196.889 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 66.20%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m65.947 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m73.162 ms[22m[39m ± [32m 28.645 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m9.49% ± 15.03%

  [39m█[34m▂[39m[39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[34m█[39m[39m▇[39m▄[

In [28]:
@benchmark scan_null(pheno_y, geno, kinship; reml = false)

BenchmarkTools.Trial: 120 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m27.622 ms[22m[39m … [35m79.075 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 50.60%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m39.126 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m41.720 ms[22m[39m ± [32m13.319 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m16.69% ± 20.85%

  [39m▆[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m▆[34m█[39m[39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m▆[39m▃[39m▃[39m▃[39m▃

In [41]:
tloops_LODs = scan_perms_threads(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true)

1025×7321 Matrix{Float64}:
 0.00819636  0.00819636  0.00819636  …  0.0449878    0.0128283   0.0128283
 0.604956    0.604956    0.604956       1.33928      1.29426     1.29426
 0.0264842   0.0264842   0.0264842      0.16927      0.069518    0.069518
 0.00585467  0.00585467  0.00585467     0.589608     0.14854     0.14854
 0.477297    0.477297    0.477297       0.0317133    0.0625366   0.0625366
 0.140293    0.140293    0.140293    …  0.0400303    0.0338204   0.0338204
 0.45141     0.45141     0.45141        0.459152     0.278652    0.278652
 0.179818    0.179818    0.179818       0.0430329    0.0218039   0.0218039
 0.213351    0.213351    0.213351       0.179031     0.193549    0.193549
 0.0250041   0.0250041   0.0250041      0.0679043    0.00622826  0.00622826
 0.801668    0.801668    0.801668    …  0.42936      0.123412    0.123412
 0.0351507   0.0351507   0.0351507      0.296432     0.0480549   0.0480549
 0.053255    0.053255    0.053255       0.281489     0.11254     0.11254
 ⋮     

In [42]:
normal_LODs = scan_perms(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true)

1025×7321 Matrix{Float64}:
 0.00819636  0.00819636  0.00819636  …  0.0449878    0.0128283   0.0128283
 0.604956    0.604956    0.604956       1.33928      1.29426     1.29426
 0.0264842   0.0264842   0.0264842      0.16927      0.069518    0.069518
 0.00585467  0.00585467  0.00585467     0.589608     0.14854     0.14854
 0.477297    0.477297    0.477297       0.0317133    0.0625366   0.0625366
 0.140293    0.140293    0.140293    …  0.0400303    0.0338204   0.0338204
 0.45141     0.45141     0.45141        0.459152     0.278652    0.278652
 0.179818    0.179818    0.179818       0.0430329    0.0218039   0.0218039
 0.213351    0.213351    0.213351       0.179031     0.193549    0.193549
 0.0250041   0.0250041   0.0250041      0.0679043    0.00622826  0.00622826
 0.801668    0.801668    0.801668    …  0.42936      0.123412    0.123412
 0.0351507   0.0351507   0.0351507      0.296432     0.0480549   0.0480549
 0.053255    0.053255    0.053255       0.281489     0.11254     0.11254
 ⋮     

In [43]:
@benchmark scan_perms_threads(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true)

BenchmarkTools.Trial: 5 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.039 s[22m[39m … [35m  1.297 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m4.58% … 2.84%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.134 s              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m7.87%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m1.146 s[22m[39m ± [32m95.571 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m7.93% ± 4.63%

  [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[34m [39m[39m [39m [39m [39m [39m [39m█[39m [39m [39m [32m [39m[39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [39m█[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁

In [44]:
@benchmark scan_perms(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true)

BenchmarkTools.Trial: 2 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m2.656 s[22m[39m … [35m 2.660 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m5.06% … 4.52%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m2.658 s             [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m4.79%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.658 s[22m[39m ± [32m3.342 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m4.79% ± 0.38%

  [34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [34m█[39m[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m

In [45]:
function threads12_by_blocks(r0perm::Array{Float64, 2}, X00::Array{Float64, 2}, nblocks::Int64)
    # Does distributed processes of calculations of LOD scores for markers in each block

    p = size(X00, 2);

    ## (Create blocks...)
    
    block_size = ceil(Int, p/nblocks);
    blocks = createBlocks2(p, block_size);
    # blocks = createBlocks(p, nblocks);

    LODs_blocks = tmap(x -> calcLODs_block(r0perm, X00, x), 24, blocks);
    results = reduce(hcat, LODs_blocks);

    return results

end

threads12_by_blocks (generic function with 1 method)

In [46]:
function scan_perms_threads12(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
                                reml::Bool = false,
                                nperms::Int64 = 1024, rndseed::Int64 = 0, original::Bool = true,
                                # (options for blocks, nperms distribution methods...)
                                option::String = "by blocks", nblocks::Int64 = 1, ncopies::Int64 = 1, 
                                nprocs::Int64 = 0)

    (y0, X0, lambda0) = transform_rotation(y, g, K); # rotation of data
    (r0, X00) = transform_reweight(y0, X0, lambda0; reml = reml); # reweighting and taking residuals
    r0perm = transform_permute(r0; nperms = nperms, rndseed = rndseed, original = original);

    if option == "by blocks"
        results = threads12_by_blocks(r0perm, X00, nblocks);
    elseif option == "by nperms"
        results = distribute_by_nperms(r0, X00, nperms, ncopies, original);
    else
        throw(error("Option unsupported."))
    end

    return results

end

scan_perms_threads12 (generic function with 1 method)

In [47]:
using ThreadTools

In [48]:
@benchmark scan_perms_threads12(pheno_y, geno, kinship; reml = false, nperms = 1024, rndseed = 0, original = true, option = "by blocks", nblocks = 120)

BenchmarkTools.Trial: 5 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m924.531 ms[22m[39m … [35m  1.063 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m4.09% … 4.29%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m   1.017 s              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m4.48%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m   1.002 s[22m[39m ± [32m57.001 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m5.28% ± 1.34%

  [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[34m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [39m█[39m▁[39m▁[39m▁[39m▁[39

In [49]:
using Profile

In [50]:
@profile scan_perms_threads(pheno_y, geno, kinship; nperms = 1024, rndseed = 0, reml = false, original = true);

In [51]:
Profile.print()

Overhead ╎ [+additional indent] Count File:Line; Function
   3╎3     @Base/reduce.jl:0; mapreduce_impl(f::typeof(ident...
   5╎5     @Base/reduce.jl:248; mapreduce_impl(f::typeof(ident...
    ╎283   @Base/task.jl:484; (::IJulia.var"#15#18")()
    ╎ 283   ...ia/src/eventloop.jl:8; eventloop(socket::ZMQ.Socket)
    ╎  283   @Base/essentials.jl:726; invokelatest
    ╎   283   @Base/essentials.jl:729; #invokelatest#2
    ╎    283   ...execute_request.jl:67; execute_request(socket::ZMQ....
    ╎     283   ...oftGlobalScope.jl:65; softscope_include_string(m::...
    ╎    ╎ 283   @Base/loading.jl:1428; include_string(mapexpr::ty...
   2╎    ╎  283   @Base/boot.jl:368; eval
    ╎    ╎   180   In[10]:1; (::var"#scan_perms_threads#...
    ╎    ╎    1     In[10]:18; scan_perms_threads(y::Mat...
    ╎    ╎     1     ...M.jl/src/lmm.jl:47; rotateData
    ╎    ╎    ╎ 1     ...a/src/matmul.jl:148; *
    ╎    ╎    ╎  1     .../src/matmul.jl:276; mul!
    ╎    ╎    ╎   1     .../src/matmul.jl:425; mul!