# Explore Runtimes:

In [1]:
pwd()

"/home/zyu20/git/BulkLMM.jl/analysis/BXD"

In [2]:
using LoopVectorization

In [3]:
cd("..")

In [4]:
include("../test/BXDdata_for_test.jl");

In [5]:
include("../src/parallel_helpers.jl");

In [6]:
size(pheno)

(79, 35556)

In [7]:
size(pheno_y)

(79, 1)

In [8]:
size(geno)

(79, 7321)

In [9]:
size(kinship)

(79, 79)

In [10]:
(n, m, p)

(79, 35556, 7321)

In [11]:
Threads.nthreads()

24

In [12]:
using LinearAlgebra

In [13]:
BLAS.get_num_threads()

8

In [14]:
BLAS.set_num_threads(24);

In [15]:
BLAS.get_num_threads()

24

In [14]:
function scan_ordinary(Y0::Array{Float64, 2}, X0::Array{Float64, 2}, lambda0::Array{Float64, 1}; reml::Bool = true)
    
    X0_inter = reshape(X0[:, 1], :, 1);
    
    full_results_ordinary = Array{Float64, 2}(undef, p, m);
    
    for i in 1:m
        y0i = reshape(Y0[:, i], :, 1);
        out00 = fitlmm(y0i, X0_inter, lambda0; reml = true);
        sqrtw = sqrt.(makeweights(out00.h2, lambda0))
        
        yw_i = rowMultiply(y0i, sqrtw);
        Xw = rowMultiply(X0, sqrtw);
        
        out0 = rss(yw_i, reshape(Xw[:, 1], :, 1));
        lod = Array{Float64, 1}(undef, p);
        
        X = Array{Float64, 2}(undef, n, 2);
        X[:, 1] = Xw[:, 1];
        
        for j in 1:p
            X[:, 2] = Xw[:, j+1];
            
            out1 = rss(yw_i, X);
            lod[j] = (n/2)*(log10(out0[1]) - log10(out1[1]))
        end
        
        full_results_ordinary[:, i] = lod;

    end
    
    
    return full_results_ordinary
end

scan_ordinary (generic function with 1 method)

In [None]:
@time begin
    
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship);
    full_results_ordinary = scan_ordinary(Y0, X0, lambda0);
    
end

## LiteQTL:

In [149]:
(m, n, p)

(35556, 79, 7321)

In [150]:
function r2lod(r::Float64, n::Int64)
    rsq = (r/n)^2;
    return -(n/2.0) * log10(1.0-rsq);
end

r2lod (generic function with 1 method)

In [151]:
map(x -> r2lod(x, n), [0.9, 0.1])

2-element Vector{Float64}:
 0.0022265908888506967
 2.748701454592006e-5

In [152]:
full_results_lite = Array{Float64, 2}(undef, p, m);

In [153]:
@time begin
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship);
    X0_intercept = reshape(X0[:, 1], :, 1);
    #copyX0_intercept = deepcopy(X0_intercept);
    X0_covar = X0[:, 2:end];
    #copyX0_covar = deepcopy(X0_covar);
    
    for i in 1:m
        pheno_y0i = reshape(Y0[:, i], :, 1);
        vc = fitlmm(pheno_y0i, X0_intercept, lambda0; reml = true);
        sqrtw = sqrt.(makeweights(vc.h2, lambda0));
    
        pheno_y00i = rowMultiply(pheno_y0i, sqrtw);
        X00_intercept = rowMultiply(X0_intercept, sqrtw);
        X00_covar = rowMultiply(X0_covar, sqrtw);
    
        y00 = resid(pheno_y00i, X00_intercept);
        X00 = resid(X00_covar, X00_intercept);
        
        sy = std(y00, dims = 1) |> vec;
        sx = std(X00, dims = 1) |> vec;
        colDivide!(y00, sy);
        colDivide!(X00, sx);
        R = y00' * X00;
    
        full_results_lite[:, i] = r2lod.(R, 79);
    
    end

end

238.296716 seconds (33.91 M allocations: 484.267 GiB, 9.37% gc time)


In [154]:
full_results_lite[1:6, :];

### Multi-threading LiteQTL:

In [155]:
Threads.nthreads()

24

In [156]:
nblocks = Threads.nthreads();

In [31]:
if nblocks == Threads.nthreads()
    BLAS.set_num_threads(1)
else
    BLAS.set_num_threads(Threads.nthreads() - nblocks)
end

In [33]:
BLAS.get_num_threads()

1

In [272]:
divrem(m, 4)

(8889, 0)

In [273]:
function scan_lite_univar(y0_j::Array{Float64, 1}, X0_intercept::Array{Float64, 2}, 
                            X0_covar::Array{Float64, 2}, lambda0::Array{Float64, 1};
                            reml::Bool = true)
    
    n = size(y0_j, 1);
    
    y0 = reshape(y0_j, :, 1);
    vc = fitlmm(y0, X0_intercept, lambda0; reml = reml);
    sqrtw = sqrt.(makeweights(vc.h2, lambda0));
    
    wy0 = rowMultiply(y0, sqrtw);
    wX0_intercept = rowMultiply(X0_intercept, sqrtw);
    wX0_covar = rowMultiply(X0_covar, sqrtw);
    
    y00 = resid(wy0, wX0_intercept);
    X00 = resid(wX0_covar, wX0_intercept);
        
        
    sy = std(y00, dims = 1) |> vec;
    sx = std(X00, dims = 1) |> vec;
    colDivide!(y00, sy);
    colDivide!(X00, sx);
    
    R = X00' * y00; # p-by-1 matrix
    
    return r2lod.(R, n); # results will be p-by-1, i.e. all LOD scores for the j-th trait and p markers
end

scan_lite_univar (generic function with 1 method)

In [274]:
function scan_lite_multithreads(Y::Array{Float64, 2}, G::Array{Float64, 2}, K::Array{Float64, 2}, nb::Int64;
                                reml::Bool = true)
    (n, m) = size(Y);
    p = size(G, 2);
    
    # rotate data
    (Y0, X0, lambda0) = transform_rotation(Y, G, K);
    X0_intercept = reshape(X0[:, 1], :, 1);
    X0_covar = X0[:, 2:end];
    
    (len, rem) = divrem(m, nb);
    
    results = Array{Array{Float64, 2}, 1}(undef, nb);
    
    Threads.@threads for t = 1:nb # so the N blocks will share the (nthreads - N) BLAS threads
        
        lods_currBlock = Array{Float64, 2}(undef, p, len);
        
        @simd for i = 1:len
            
            j = i+(t-1)*len;
            #@inbounds
            lods_currBlock[:, i] = scan_lite_univar(Y0[:, j], X0_intercept, X0_covar, lambda0;
                                                    reml = reml);
        end
                    
        results[t] = lods_currBlock;
        
    end
    
    # process up the remaining data
    lods_remBlock = Array{Float64, 2}(undef, p, rem);
    for i in 1:rem
        
        j = m-rem+i;
        
        lods_remBlock[:, i] = scan_lite_univar(Y0[:, j], X0_intercept, X0_covar, lambda0;
                                               reml = reml);
        
    end
    
    LODs_all = reduce(hcat, results);
    LODs_all = hcat(LODs_all, lods_remBlock);
    
    return LODs_all
    
end

scan_lite_multithreads (generic function with 1 method)

In [31]:
@time full_results_lite_multithreads = scan_lite_multithreads(pheno, geno, kinship, nblocks; reml = true);

 44.322109 seconds (33.66 M allocations: 490.079 GiB, 10.42% gc time)


In [32]:
full_results_lite[1:6, :]

LoadError: UndefVarError: full_results_lite not defined

In [33]:
full_results_lite_multithreads[1:6, :]

6×35556 Matrix{Float64}:
 0.000117904  0.0449468  0.00289554  …  0.0457906  0.00329113  0.344878
 0.000117904  0.0449468  0.00289554     0.0457906  0.00329113  0.344878
 0.000117904  0.0449468  0.00289554     0.0457906  0.00329113  0.344878
 0.000117904  0.0449468  0.00289554     0.0457906  0.00329113  0.344878
 0.000117904  0.0449468  0.00289554     0.0457906  0.00329113  0.344878
 0.000117904  0.0449468  0.00289554  …  0.0457906  0.00329113  0.344878

In [29]:
maxSqDiff(full_results_lite, full_results_lite_multithreads)

LoadError: UndefVarError: full_results_lite not defined

In [34]:
any(full_results_lite_multithreads .< 0.0)

false

In [None]:
test[2] = [1 2; 3 4; 5 6]

In [None]:
test

In [None]:
reduce(hcat, test[1:2])

### "Bulk" Calculation:

In [158]:
function scan_lite_multivar(Y0::Array{Float64, 2}, X0::Array{Float64, 2}, 
                            hsq::Float64, lambda0::Array{Float64, 1};
                            reml::Bool = true)
    
    n = size(Y0, 1)
    sqrtw = sqrt.(makeweights(hsq, lambda0));
    
    wY0 = rowMultiply(Y0, sqrtw);
    wX0 = rowMultiply(X0, sqrtw);
    
    wX0_intercept = wX0[:, 1];
    wX0_covar = wX0[:, 2:end];
    
    Y00 = resid(wY0, wX0_intercept);
    X00 = resid(wX0_covar, wX0_intercept);
        
        
    sy = std(Y00, dims = 1) |> vec;
    sx = std(X00, dims = 1) |> vec;
    colDivide!(Y00, sy);
    colDivide!(X00, sx);
    
    R = X00' * Y00; # p-by-m matrix
    
    return r2lod.(R, n); # results will be p-by-1, i.e. all LOD scores for the j-th trait and p markers
end

scan_lite_multivar (generic function with 1 method)

In [159]:
(Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship);

In [160]:
test_w = sqrt.(makeweights(0.1, lambda0));

In [161]:
@time begin
    test_wY0 = rowMultiply(Y0, test_w);
    test_wX0 = rowMultiply(X0, test_w);
end;

  0.025660 seconds (4 allocations: 25.844 MiB, 64.97% gc time)


In [162]:
@time begin
    test_wX0_intercept = test_wX0[:, 1];
    test_wX0_covar = test_wX0[:, 2:end];
end;

  0.001063 seconds (8 allocations: 4.413 MiB)


In [163]:
@time begin
    test_Y00 = resid(test_wY0, test_wX0_intercept);
    test_X00 = resid(test_wX0_covar, test_wX0_intercept);
end;

  0.022058 seconds (18 allocations: 52.341 MiB, 17.76% gc time)


In [164]:
@time begin
    sy = std(test_Y00, dims = 1) |> vec;
    sx = std(test_X00, dims = 1) |> vec;
    colDivide!(test_Y00, sy);
    colDivide!(test_X00, sx);
end;

  0.021850 seconds (24 allocations: 671.531 KiB)


In [165]:
Threads.nthreads()

24

In [166]:
BLAS.set_num_threads(24)

In [167]:
@time test_R = test_X00' * test_Y00;

  0.810603 seconds (3 allocations: 1.939 GiB, 0.47% gc time)


In [168]:
ctest_R = copy(test_R);

In [169]:
@time test_L = map(x -> r2lod(x, 79), test_R);

  3.358308 seconds (60.20 k allocations: 1.943 GiB, 4.24% gc time, 0.98% compilation time)


In [170]:
@time scan_lite_multivar(Y0, X0, 0.1, lambda0);

  4.420124 seconds (57 allocations: 3.960 GiB, 4.80% gc time)


In [171]:
function bulk_scan(Y::Array{Float64, 2}, G::Array{Float64, 2}, K::Array{Float64, 2}, hsq_list::Array{Float64, 1};
                reml::Bool = true)
    
    (Y0, X0, lambda0) = transform_rotation(Y, G, K);
    
    maxL = scan_lite_multivar(Y0, X0, hsq_list[1], lambda0);
    
    for hsq in hsq_list[2:end]
        
        currL = scan_lite_multivar(Y0, X0, hsq, lambda0);
        maxL = max.(maxL, currL);
        
    end
    
    return maxL
      
end

bulk_scan (generic function with 1 method)

In [172]:
hsq_grids = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0];

In [173]:
@time full_results_bulkscan = bulk_scan(pheno, geno, kinship, hsq_grids; reml = true);

 57.248463 seconds (672 allocations: 62.986 GiB, 5.58% gc time)


### Explore Multithreading the map:

#### Multithreading the R2LOD map:

Imagine we parallelize the map for converting of $r$ to LOD over a grid of $r$'s, across the columns. To process one column, the runtime is:

In [157]:
size(test_R)

(7321, 35556)

In [99]:
@time r2lod.(test_R[:, 1], 79);

  0.000129 seconds (7 allocations: 114.672 KiB)


In [97]:
divrem(size(test_R, 2), 16)

(2222, 4)

Then, if using 16 threads, as processing one block of traits takes about

In [101]:
@time r2lod.(test_R[:, 1:2222], 79);

  0.256500 seconds (8 allocations: 248.219 MiB, 1.59% gc time)


we would expect to compute one frame of LOD scores corresponding to one $h^2$ to be done in less than 0.5 seconds

In [102]:
Threads.nthreads()

24

In [103]:
size(test_R)

(7321, 35556)

In [104]:
function tR2LOD1(R::Array{Float64, 2}, n::Int64, nb::Int64)
    
    (p, m) = size(R)
    
    (len, rem) = divrem(m, nb);
    
    results = Array{Array{Float64, 2}, 1}(undef, nb);
    
    Threads.@threads for t in 1:nb
        
        head = 1+(t-1)*len; 
        tail = len+(t-1)*len;
        
        R_sub = R[:, head:tail];
        
        
        @simd for j in 1:size(R_sub, 2)
            @simd for i in 1:size(R_sub, 1)
                R_sub[i, j] = r2lod(R_sub[i, j], n)
            end
        end
        
        results[t] = R_sub
        
    end
    
    # process up the remaining data
    LOD_rem = r2lod.(R[:, (nb*len+1):end]);
    
    LODs_all = reduce(hcat, results);
    LODs_all = hcat(LODs_all, LOD_rem);
    
    return LODs_all
    
end

tR2LOD1 (generic function with 1 method)

In [105]:
function tR2LOD2!(R::Array{Float64, 2}, n::Int64)
    
    (p, m) = size(R)
    
    Threads.@threads for j in 1:m
        for i in 1:p
            @inbounds R[i, j] = r2lod(R[i, j], n)
        end
    end
    
end

tR2LOD2! (generic function with 1 method)

In [106]:
function tR2LOD3!(R::Array{Float64, 2}, n::Int64)
    
    (p, m) = size(R)
    
    Threads.@threads for i in 1:p
        @simd for j in 1:m
            @inbounds R[i, j] = r2lod(R[i, j], n)
        end
    end
    
end

tR2LOD3! (generic function with 1 method)

In [107]:
BLAS.get_num_threads()

24

In [119]:
@time test_LOD = tR2LOD1(test_R, 79, 4);

  2.620945 seconds (160 allocations: 5.818 GiB, 6.59% gc time)


In [111]:
ctest_R;

In [112]:
test_R = ctest_R;

In [117]:
@time tR2LOD2!(test_R, 79)

  0.218149 seconds (125 allocations: 11.594 KiB)


In [114]:
test_R = ctest_R;

In [115]:
@time tR2LOD3!(test_R, 79)

  0.876041 seconds (51.91 k allocations: 2.917 MiB, 4.51% compilation time)


In [121]:
sumSqDiff(test_R, test_LOD)

2.116580304281243

In [120]:
@time test_L = map(x -> r2lod(x, 79), test_R);

LoadError: InterruptException:

#### Multithreading the MAX map:

In [174]:
function mymax!(max::Array{Float64, 2}, toCompare::Array{Float64, 2})
    
    (p, m) = size(max);
    
    for i in 1:p
        for j in 1:m
            
            max[i, j] = (max[i, j] >= toCompare[i, j]) ? max[i, j] : toCompare[i, j];
            
        end
    end
    
end

mymax! (generic function with 1 method)

In [175]:
function mytmax!(max::Array{Float64, 2}, toCompare::Array{Float64, 2})
    
    (p, m) = size(max);
    
    @tturbo for j in 1:m
        for i in 1:p
            
            max[i, j] = (max[i, j] >= toCompare[i, j]) ? max[i, j] : toCompare[i, j];
            
        end
    end
    
end

mytmax! (generic function with 1 method)

In [176]:
A = rand(10000, 10000);
B = rand(10000, 10000);

In [177]:
cA = copy(A);

In [178]:
@time max1 = mymax!(A, B);

  0.887607 seconds


In [179]:
A = cA;

In [180]:
@time mytmax!(A, B);

  0.036367 seconds


After we incorporating the changes, run and time the entire task:

In [181]:
function tscan_lite_multivar(Y0::Array{Float64, 2}, X0::Array{Float64, 2}, 
                            hsq::Float64, lambda0::Array{Float64, 1};
                            reml::Bool = true)
    
    n = size(Y0, 1)
    sqrtw = sqrt.(makeweights(hsq, lambda0));
    
    wY0 = rowMultiply(Y0, sqrtw);
    wX0 = rowMultiply(X0, sqrtw);
    
    wX0_intercept = wX0[:, 1];
    wX0_covar = wX0[:, 2:end];
    
    Y00 = resid(wY0, wX0_intercept);
    X00 = resid(wX0_covar, wX0_intercept);
        
        
    sy = std(Y00, dims = 1) |> vec;
    sx = std(X00, dims = 1) |> vec;
    colDivide!(Y00, sy);
    colDivide!(X00, sx);
    
    R = X00' * Y00; # p-by-m matrix
    
    tR2LOD2!(R, n); # results will be p-by-1, i.e. all LOD scores for the j-th trait and p markers
    
    return R
end

tscan_lite_multivar (generic function with 1 method)

In [182]:
function tbulk_scan(Y::Array{Float64, 2}, G::Array{Float64, 2}, K::Array{Float64, 2}, hsq_list::Array{Float64, 1};
                reml::Bool = true)
    
    (Y0, X0, lambda0) = transform_rotation(Y, G, K);
    
    maxL = tscan_lite_multivar(Y0, X0, hsq_list[1], lambda0);
    
    for hsq in hsq_list[2:end]
        
        currL = tscan_lite_multivar(Y0, X0, hsq, lambda0);
        mytmax!(maxL, currL);
        
    end
    
    return maxL
      
end

tbulk_scan (generic function with 1 method)

In [203]:
hsq_grids = collect(1:10).*0.1;

In [204]:
Threads.nthreads()

24

In [205]:
BLAS.get_num_threads()

24

In [206]:
BenchmarkTools.DEFAULT_PARAMETERS.samples = 10

10

In [209]:
@time ordinary_bulkscanL = bulk_scan(pheno, geno, kinship, hsq_grids);

 68.319608 seconds (613 allocations: 57.086 GiB, 6.81% gc time)


In [207]:
@time t_bulkscanL = tbulk_scan(pheno, geno, kinship, hsq_grids);

 13.020161 seconds (1.89 k allocations: 20.237 GiB, 10.53% gc time)


In [210]:
sumSqDiff(ordinary_bulkscanL, t_bulkscanL)

0.0

In [211]:
@benchmark tbulk_scan(pheno, geno, kinship, hsq_grids)

BenchmarkTools.Trial: 1 sample with 1 evaluation.
 Single result which took [34m12.805 s[39m (8.39% GC) to evaluate,
 with a memory estimate of [33m20.24 GiB[39m, over [33m1894[39m allocations.

In [212]:
full_results_lite[1:6, :]

6×35556 Matrix{Float64}:
 0.000119096  0.0453796  0.00292759  …  0.0458121  0.003295  0.342962
 0.000119096  0.0453796  0.00292759     0.0458121  0.003295  0.342962
 0.000119096  0.0453796  0.00292759     0.0458121  0.003295  0.342962
 0.000119096  0.0453796  0.00292759     0.0458121  0.003295  0.342962
 0.000119096  0.0453796  0.00292759     0.0458121  0.003295  0.342962
 0.000119096  0.0453796  0.00292759  …  0.0458121  0.003295  0.342962

In [213]:
t_bulkscanL[1:6, :]

6×35556 Matrix{Float64}:
 0.0957757  0.130312  0.094194  0.0362664  …  0.0466633  0.354027  0.539208
 0.0957757  0.130312  0.094194  0.0362664     0.0466633  0.354027  0.539208
 0.0957757  0.130312  0.094194  0.0362664     0.0466633  0.354027  0.539208
 0.0957757  0.130312  0.094194  0.0362664     0.0466633  0.354027  0.539208
 0.0957757  0.130312  0.094194  0.0362664     0.0466633  0.354027  0.539208
 0.0957757  0.130312  0.094194  0.0362664  …  0.0466633  0.354027  0.539208

In [216]:
sum(t_bulkscanL .>= full_results_lite)/(p*m)

0.8130819230249309

## Estimate heritability independently for every trait:

### Iteratively:

In [104]:
function get_weights_for_timing(y0::Array{Float64, 2}, X0::Array{Float64, 2}, lambda0::Array{Float64, 1};
                   reml::Bool = false)
    
    # fit null lmm
    out00 = fitlmm(y0, reshape(X0[:, 1], :, 1), lambda0; reml = reml)
    # weights proportional to the variances
    sqrtw = sqrt.(makeweights(out00.h2, lambda0));
    
    return sqrtw
    
end

function get_h2s_for_timing(y0::Array{Float64, 2}, X0::Array{Float64, 2}, lambda0::Array{Float64, 1};
                   reml::Bool = false)
    
    # fit null lmm
    out00 = fitlmm(y0, reshape(X0[:, 1], :, 1), lambda0; reml = reml)
    # weights proportional to the variances
    # sqrtw = sqrt.(makeweights(out00.h2, lambda0));
    
    # return sqrtw
    return out00.h2
    
end

get_h2s_for_timing (generic function with 1 method)

In [None]:
m

In [None]:
nprocs()

In [None]:
Threads.nthreads()

In [105]:
(Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship); # rotate the entire matrix of all traits

In [None]:
h2s_list = Array{Float64, 1}(undef, m);
weights_list = Array{Float64, 2}(undef, n, m);

In [None]:
@time begin 
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship); # rotate the entire matrix of all traits
    
    for k in 1:m
        y0_k = reshape(Y0[:, k], :, 1);
        h2s_list[k] = get_h2s_for_timing(y0_k, X0, lambda0; reml = true);
    end
    
end

In [None]:
@time begin
    
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship); # rotate the entire matrix of all traits
    
    for k in 1:m
        y0_k = reshape(Y0[:, k], :, 1);
        weights_list[:, k] = get_weights_for_timing(y0_k, X0, lambda0; reml = true);
    end
    
end

In [None]:
get_weights_for_timing(reshape(Y0[:, 7919], :, 1), X0, lambda0; reml = true)

In [None]:
histogram(h2s_list, legend = false)

In [None]:
p

In [None]:
lod = Array{Float64, 2}(undef, p, m);

In [None]:
BLAS.get_num_threads()

In [None]:
function threads_by_blocks(r0perm::Array{Float64, 2}, X00::Array{Float64, 2}, nblocks::Int64)
    # Does distributed processes of calculations of LOD scores for markers in each block

    p = size(X00, 2);

    ## (Create blocks...)
    
    block_size = ceil(Int, p/nblocks);
    blocks = createBlocks(p, block_size);

    LODs_blocks = tmap(x -> calcLODs_block(r0perm, X00, x), 16, blocks);
    results = reduce(hcat, LODs_blocks);

    return results

end

In [None]:
function scan_perms_threadsBlocks(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
                                reml::Bool = false,
                                nperms::Int64 = 1024, rndseed::Int64 = 0, original::Bool = true,
                                # (options for blocks, nperms distribution methods...)
                                option::String = "by blocks", nblocks::Int64 = 1, ncopies::Int64 = 1, 
                                nprocs::Int64 = 0)

    (y0, X0, lambda0) = transform_rotation(y, g, K); # rotation of data
    (r0, X00) = transform_reweight(y0, X0, lambda0; reml = reml); # reweighting and taking residuals
    r0perm = transform_permute(r0; nperms = nperms, rndseed = rndseed, original = original);

    if option == "by blocks"
        results = threads_by_blocks(r0perm, X00, nblocks);
    elseif option == "by nperms"
        results = distribute_by_nperms(r0, X00, nperms, ncopies, original);
    else
        throw(error("Option unsupported."))
    end

    return results

end

In [None]:
block_bounds = quantile(h2s_list, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])

In [None]:
length(block_bounds)/2

In [None]:
sum(map(x -> (x < block_bounds[5] && x > block_bounds[4]), h2s_list))

### Parellely:

#### Distributed-processes

In [None]:
using Distributed

In [None]:
@everywhere begin
    
    include("../test/BXDdata_for_test.jl");
    include("../src/parallel_helpers.jl");
    
    function get_h2s_for_timing(y0::Array{Float64, 2}, X0::Array{Float64, 2}, lambda0::Array{Float64, 1};
                   reml::Bool = false)
    
    # fit null lmm
    out00 = fitlmm(y0, reshape(X0[:, 1], :, 1), lambda0; reml = reml)
    # weights proportional to the variances
    # sqrtw = sqrt.(makeweights(out00.h2, lambda0));
    
    # return sqrtw
    return out00.h2
    
    end
end

In [None]:
@time begin 
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship); # rotate the entire matrix of all traits
    
    pmap_h2s = pmap(x -> get_h2s_for_timing(reshape(Y0[:, x], :, 1), X0, lambda0; reml = true), 1:m)
end

In [None]:
Threads.nthreads()