# Explore Runtimes:

In [1]:
pwd()

"/home/xyu/github/BulkLMM.jl/analysis/BXD"

In [2]:
cd("..")

In [3]:
include("../test/BXDdata_for_test.jl");

In [4]:
include("../src/parallel_helpers.jl");

In [5]:
using Plots

In [5]:
size(pheno)

(79, 35556)

In [6]:
size(pheno_y)

(79, 1)

In [7]:
size(geno)

(79, 7321)

In [8]:
size(kinship)

(79, 79)

In [9]:
(n, m, p)

(79, 35556, 7321)

In [14]:
Threads.nthreads()

8

In [11]:
using LinearAlgebra

In [13]:
BLAS.get_num_threads()

16

In [15]:
BLAS.set_num_threads(1);

In [16]:
BLAS.get_num_threads()

1

In [14]:
# Baseline: run the ordinary `scan` once per trait and keep element [3] of its
# return value as that trait's column of the p-by-m result matrix.
full_results_ordinary = Matrix{Float64}(undef, p, m);

@time for i in 1:m
    # `pheno[:, i:i]` is the i-th trait as an n-by-1 matrix.
    pheno_yi = pheno[:, i:i];
    full_results_ordinary[:, i] = scan(pheno_yi, geno, kinship; method = "null", reml = true)[3]
end

full_results_ordinary

In [45]:
n

79

## LiteQTL:

In [17]:
"""
    r2lod(r::Float64, nobs::Real = n)

Convert the statistic `r` into a LOD score, computed as
`-(nobs/2) * log10(1 - (r/nobs)^2)`.

# Arguments
- `r`: the statistic to convert (in this notebook, an entry of the cross-product of
  the standardized trait and marker columns).
- `nobs`: the sample size used for scaling. Defaults to the notebook-level global `n`,
  so existing one-argument calls and broadcasts such as `r2lod.(R)` are unchanged.
  Pass it explicitly to avoid depending on global state.

# Returns
The LOD score as a `Float64`.
"""
function r2lod(r::Float64, nobs::Real = n)
    rsq = (r/nobs)^2
    return -(nobs/2.0) * log10(1.0-rsq);
end

r2lod (generic function with 1 method)

In [18]:
r2lod.([0.1, 0.9])

2-element Vector{Float64}:
 2.748701454592006e-5
 0.0022265908888506967

In [19]:
full_results_lite = Array{Float64, 2}(undef, p, m);

In [40]:
@time begin
    # Rotate all traits and markers once; column 1 of X0 is used as the intercept
    # and the remaining columns as the markers to scan.
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship);
    X0_intercept = X0[:, 1:1];
    X0_covar = X0[:, 2:end];

    for i in 1:m
        # Fit the null model for trait i and derive its square-root weights.
        pheno_y0i = Y0[:, i:i];
        vc = fitlmm(pheno_y0i, X0_intercept, lambda0; reml = true);
        sqrtw = sqrt.(makeweights(vc.h2, lambda0));

        # Non-mutating re-weighting: new arrays are created for every trait.
        pheno_y00i = rowMultiply(pheno_y0i, sqrtw);
        X00_intercept = rowMultiply(X0_intercept, sqrtw);
        X00_covar = rowMultiply(X0_covar, sqrtw);

        # Residualize trait and markers on the re-weighted intercept.
        y00 = resid(pheno_y00i, X00_intercept);
        X00 = resid(X00_covar, X00_intercept);

        # Scale every column by its standard deviation, then take cross-products.
        sy = vec(std(y00, dims = 1));
        sx = vec(std(X00, dims = 1));
        colDivide!(y00, sy);
        colDivide!(X00, sx);
        R = y00' * X00;

        full_results_lite[:, i] = r2lod.(R);
    end
end

379.958162 seconds (2.12 G allocations: 515.320 GiB, 2.23% gc time)


In [38]:
full_results_lite[1:6, :]

6×35556 Matrix{Float64}:
 0.000119377  0.0456475  0.00295519  …  0.0463614  0.00332262  0.347579
 0.000119377  0.0456475  0.00295519     0.0463614  0.00332262  0.347579
 0.000119377  0.0456475  0.00295519     0.0463614  0.00332262  0.347579
 0.000119377  0.0456475  0.00295519     0.0463614  0.00332262  0.347579
 0.000119377  0.0456475  0.00295519     0.0463614  0.00332262  0.347579
 0.000119377  0.0456475  0.00295519  …  0.0463614  0.00332262  0.347579

In [26]:
@time begin
    # Rotate all traits and markers once; column 1 of X0 is used as the intercept
    # and the remaining columns as the markers to scan.
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship);
    X0_intercept = reshape(X0[:, 1], :, 1);
    copyX0_intercept = copy(X0_intercept);   # pristine intercept, never mutated
    X0_covar = X0[:, 2:end];
    copyX0_covar = copy(X0_covar);           # pristine markers, never mutated

    for i in 1:m
        pheno_y0i = reshape(Y0[:, i], :, 1);
        vc = fitlmm(pheno_y0i, X0_intercept, lambda0; reml = true);
        sqrtw = sqrt.(makeweights(vc.h2, lambda0));

        # In-place re-weighting of the working buffers.
        rowMultiply!(pheno_y0i, sqrtw);
        rowMultiply!(X0_intercept, sqrtw);
        rowMultiply!(X0_covar, sqrtw);

        y00 = resid(pheno_y0i, X0_intercept);
        X00 = resid(X0_covar, X0_intercept);

        sy = std(y00, dims = 1) |> vec;
        sx = std(X00, dims = 1) |> vec;
        colDivide!(y00, sy);
        colDivide!(X00, sx);
        R = y00' * X00;

        full_results_lite[:, i] = r2lod.(R);

        # Restore the working buffers by copying the pristine CONTENTS back.
        # Plain assignment (`X0_intercept = copyX0_intercept`) only rebinds the name,
        # so the next `rowMultiply!` would scale the pristine array itself and the
        # weights of successive traits would accumulate.
        copyto!(X0_intercept, copyX0_intercept);
        copyto!(X0_covar, copyX0_covar);

    end

end

564.388355 seconds (2.12 G allocations: 361.891 GiB, 3.80% gc time, 0.01% compilation time)


### Multi-threading LiteQTL:

In [20]:
Threads.nthreads()

8

In [21]:
nblocks = Threads.nthreads();

In [22]:
# One BLAS thread when every Julia thread gets its own block; otherwise hand the
# difference (Julia threads minus blocks) to BLAS.
BLAS.set_num_threads(nblocks == Threads.nthreads() ? 1 : Threads.nthreads() - nblocks)

In [23]:
BLAS.get_num_threads()

1

In [24]:
divrem(m, 16)

(2222, 4)

In [25]:
"""
    scan_lite_univar(y0_j, X0_intercept, X0_covar, lambda0; reml = true)

Scan one (already rotated) trait against all markers.

Fits the null model with `fitlmm`, re-weights trait, intercept and markers by the
square root of `makeweights(h2, lambda0)`, residualizes trait and markers on the
re-weighted intercept, scales every column by its standard deviation, and converts
the marker-by-trait cross-products to LOD scores with `r2lod`.

# Arguments
- `y0_j`: the rotated trait, as a vector.
- `X0_intercept`: the rotated intercept column, as a one-column matrix.
- `X0_covar`: the rotated markers, one per column.
- `lambda0`: passed through to `fitlmm` and `makeweights`.
- `reml`: forwarded to `fitlmm`.

# Returns
`r2lod` applied element-wise to `X00' * y00`, i.e. one row per marker and a single
column for this trait.
"""
function scan_lite_univar(y0_j::Array{Float64, 1}, X0_intercept::Array{Float64, 2}, 
                            X0_covar::Array{Float64, 2}, lambda0::Array{Float64, 1};
                            reml::Bool = true)

    # Present the trait vector as a one-column matrix.
    trait = reshape(y0_j, :, 1)

    null_fit = fitlmm(trait, X0_intercept, lambda0; reml = reml)
    root_weights = sqrt.(makeweights(null_fit.h2, lambda0))

    # Non-mutating re-weighting, so the caller's arrays stay untouched.
    weighted_trait = rowMultiply(trait, root_weights)
    weighted_intercept = rowMultiply(X0_intercept, root_weights)
    weighted_markers = rowMultiply(X0_covar, root_weights)

    trait_resid = resid(weighted_trait, weighted_intercept)
    marker_resid = resid(weighted_markers, weighted_intercept)

    # Scale each column by its standard deviation (in place on the fresh residuals).
    trait_sd = vec(std(trait_resid, dims = 1))
    marker_sd = vec(std(marker_resid, dims = 1))
    colDivide!(trait_resid, trait_sd)
    colDivide!(marker_resid, marker_sd)

    crossprod = marker_resid' * trait_resid

    return r2lod.(crossprod)
end

scan_lite_univar (generic function with 1 method)

In [71]:
"""
    scan_lite_multithreads(Y, G, K, nb; reml = true)

Scan every trait (column of `Y`) against every marker (column of `G`) with
`scan_lite_univar`, splitting the traits into `nb` equally sized blocks that are
processed by `Threads.@threads`. Traits left over when the number of traits is not
a multiple of `nb` are processed sequentially afterwards.

# Arguments
- `Y`: traits, one per column.
- `G`: markers, one per column.
- `K`: kinship matrix, passed to `transform_rotation`.
- `nb`: number of blocks (must be at least 1).
- `reml`: forwarded to `scan_lite_univar`.

# Returns
A `size(G, 2)`-by-`size(Y, 2)` matrix whose column `j` holds the LOD scores of
trait `j`.

# Throws
- `ArgumentError` if `nb < 1`.
"""
function scan_lite_multithreads(Y::Array{Float64, 2}, G::Array{Float64, 2}, K::Array{Float64, 2}, nb::Int64;
                                reml::Bool = true)

    nb >= 1 || throw(ArgumentError("nb must be at least 1, got $nb"))

    m = size(Y, 2);
    p = size(G, 2);

    # rotate data
    (Y0, X0, lambda0) = transform_rotation(Y, G, K);
    X0_intercept = reshape(X0[:, 1], :, 1);
    X0_covar = X0[:, 2:end];

    # `len` traits per block; the last `nrem` traits do not fit into a full block.
    (len, nrem) = divrem(m, nb);

    # Every trait owns exactly one column, so blocks write to disjoint columns and
    # can fill the shared result matrix concurrently.
    LODs_all = Array{Float64, 2}(undef, p, m);

    Threads.@threads for t = 1:nb

        for i = 1:len

            j = i+(t-1)*len;
            LODs_all[:, j] = scan_lite_univar(Y0[:, j], X0_intercept, X0_covar, lambda0;
                                              reml = reml);
        end

    end

    # Process the remaining traits. The trait vector `Y0[:, j]` must be passed as
    # the first argument, exactly as in the blocked loop.
    for j = (m-nrem+1):m

        LODs_all[:, j] = scan_lite_univar(Y0[:, j], X0_intercept, X0_covar, lambda0;
                                          reml = reml);

    end

    return LODs_all

end

scan_lite_multithreads (generic function with 1 method)

In [None]:
@time full_results_lite_multithreads = scan_lite_multithreads(pheno, geno, kinship, nblocks; reml = true);

In [None]:
full_results_lite[1:6, :]

In [None]:
full_results_lite_multithreads[1:6, :]

In [63]:
test[2] = [1 2; 3 4; 5 6]

3×2 Matrix{Int64}:
 1  2
 3  4
 5  6

In [64]:
test

4-element Vector{Matrix{Float64}}:
    [0.0 0.0; 0.0 0.0; 0.0 0.0]
    [1.0 2.0; 3.0 4.0; 5.0 6.0]
 #undef
 #undef

In [66]:
reduce(hcat, test[1:2])

3×4 Matrix{Float64}:
 0.0  0.0  1.0  2.0
 0.0  0.0  3.0  4.0
 0.0  0.0  5.0  6.0

In [23]:
"""
    scan_multivar!(y0, X0, lambda0; reml = false, nperms = 1024, rndseed = 0,
                   original = true)

Permutation scan of one rotated trait against all markers. The variance components
are estimated once from the null model and reused for every marker.

`X0` is re-weighted IN PLACE (hence the `!`); `y0` is not modified.

# Arguments
- `y0`: the rotated trait, as a one-column matrix.
- `X0`: rotated design matrix; column 1 is used as the intercept, the remaining
  columns as the markers.
- `lambda0`: passed through to `fitlmm` and `makeweights`.

# Keywords
- `reml`: forwarded to `fitlmm`.
- `nperms`: number of permutations passed to `shuffleVector`.
- `rndseed`: seed of the `MersenneTwister` used for the permutations.
- `original`: forwarded to `shuffleVector`; when `true` the result has `nperms + 1`
  rows (per the original comment, the first one is the unpermuted trait).

# Returns
A matrix of LOD scores with one row per permuted trait and one column per marker.
"""
function scan_multivar!(y0::Array{Float64, 2}, X0::Array{Float64, 2}, lambda0::Array{Float64, 1}; 
                    reml::Bool = false,
                    nperms::Int64 = 1024, rndseed::Int64 = 0, original::Bool = true)

    # Sample size comes from the input instead of a notebook-level global.
    n = size(y0, 1)

    ## Note: estimate once the variance components from the null model and use for all marker scans
    # fit lmm
    vc = fitlmm(y0, reshape(X0[:, 1], :, 1), lambda0; reml = reml) # vc.b is estimated through weighted least square
    r0 = y0 - X0[:, 1]*vc.b

    # weights proportional to the variances
    sqrtw = sqrt.(makeweights(vc.h2, lambda0))

    # rescale by weights; now these have the same mean/variance and are independent
    rowMultiply!(r0, sqrtw);
    rowMultiply!(X0, sqrtw);

    # after re-weighting X, calling resid on re-weighted X is the same as doing wls on the X after rotation.
    X00 = resid(X0[:, 2:end], reshape(X0[:, 1], :, 1))

    # Number of markers is taken from the array the marker loop indexes.
    p = size(X00, 2)

    ## random permutations; the first column is the original trait (after transformation)
    rng = MersenneTwister(rndseed);
    ## permute r0 (which is an iid, standard normal distributed N-vector under the null)
    r0perm = shuffleVector(rng, r0[:, 1], nperms; original = original)

    ## Null RSS:
    # By null hypothesis, mean is 0. RSS just becomes the sum of squares of the residuals (r0perm's)
    rss0 = sum(r0perm[:, 1].^2) # a scalar; bc rss0 for every permuted trait is the same under the null (zero mean);

    ## array to hold the alternative RSS's for each permuted trait
    rss1 = Array{Float64, 2}(undef, original ? nperms+1 : nperms, p)

    ## loop over markers
    for i = 1:p

        ## alternative rss
        rss1[:, i] = rss(r0perm, @view X00[:, i]);

    end

    lod = (-n/2)*(log10.(rss1) .- log10(rss0))

    return lod

end

scan_multivar! (generic function with 1 method)

In [None]:
@time begin 
    # Rotate the entire matrix of all traits, then only slice out each trait as a
    # one-column matrix; no model is fitted here.
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship);

    for k in 1:m
        y0_k = Y0[:, k:k];
    end

end

In [61]:
full_results_ordinary[1:6, :]

6×35556 Matrix{Float64}:
 0.000120087  0.0459408  0.00296438  …  0.0462626  0.00334917  0.348691
 0.000120087  0.0459408  0.00296438     0.0462626  0.00334917  0.348691
 0.000120087  0.0459408  0.00296438     0.0462626  0.00334917  0.348691
 0.000120087  0.0459408  0.00296438     0.0462626  0.00334917  0.348691
 0.000120087  0.0459408  0.00296438     0.0462626  0.00334917  0.348691
 0.000120087  0.0459408  0.00296438  …  0.0462626  0.00334917  0.348691

In [65]:
size(full_results_lite)

(7321, 35556)

In [66]:
size(full_results_ordinary)

(7321, 35556)

In [72]:
tol = 2.0

2.0

In [73]:
sum((full_results_lite .- full_results_ordinary) .> 1.0)

1166

## Estimate heritability independently for every trait:

### Iteratively:

In [None]:
"""
    get_weights_for_timing(y0, X0, lambda0; reml = false)

Fit the null model for the trait `y0`, using column 1 of `X0` as the only regressor,
and return the element-wise square root of `makeweights(h2, lambda0)`, where `h2` is
taken from the fit.
"""
function get_weights_for_timing(y0::Array{Float64, 2}, X0::Array{Float64, 2}, lambda0::Array{Float64, 1};
                   reml::Bool = false)

    # Column 1 of X0 as a one-column matrix.
    intercept = X0[:, 1:1]
    null_fit = fitlmm(y0, intercept, lambda0; reml = reml)

    return sqrt.(makeweights(null_fit.h2, lambda0))

end

"""
    get_h2s_for_timing(y0, X0, lambda0; reml = false)

Fit the null model for the trait `y0`, using column 1 of `X0` as the only regressor,
and return the `h2` field of the fit.
"""
function get_h2s_for_timing(y0::Array{Float64, 2}, X0::Array{Float64, 2}, lambda0::Array{Float64, 1};
                   reml::Bool = false)

    # Column 1 of X0 as a one-column matrix.
    intercept = X0[:, 1:1]
    null_fit = fitlmm(y0, intercept, lambda0; reml = reml)

    return null_fit.h2

end

In [None]:
m

In [None]:
nprocs()

In [None]:
Threads.nthreads()

In [None]:
(Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship); # rotate the entire matrix of all traits

In [None]:
# Uninitialized buffers: one h2 value per trait, and one length-n weight column per trait.
h2s_list = Vector{Float64}(undef, m);
weights_list = Matrix{Float64}(undef, n, m);

In [None]:
@time begin 
    # Rotate the entire matrix of all traits once, then estimate h2 trait by trait.
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship);

    for k in 1:m
        y0_k = Y0[:, k:k];
        h2s_list[k] = get_h2s_for_timing(y0_k, X0, lambda0; reml = true);
    end

end

In [None]:
@time begin

    # Rotate the entire matrix of all traits once, then compute the square-root
    # weights trait by trait, one column of `weights_list` per trait.
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship);

    for k in 1:m
        y0_k = Y0[:, k:k];
        weights_list[:, k] = get_weights_for_timing(y0_k, X0, lambda0; reml = true);
    end

end

In [None]:
get_weights_for_timing(reshape(Y0[:, 7919], :, 1), X0, lambda0; reml = true)

In [None]:
histogram(h2s_list, legend = false)

In [None]:
p

In [None]:
lod = Array{Float64, 2}(undef, p, m);

In [None]:
BLAS.get_num_threads()

In [None]:
"""
    threads_by_blocks(r0perm, X00, nblocks)

Split the columns of `X00` into blocks of `ceil(size(X00, 2) / nblocks)` columns
(via `createBlocks`), evaluate `calcLODs_block(r0perm, X00, block)` for every block
through `tmap`, and horizontally concatenate the per-block results.
"""
function threads_by_blocks(r0perm::Array{Float64, 2}, X00::Array{Float64, 2}, nblocks::Int64)

    nmarkers = size(X00, 2)

    markers_per_block = ceil(Int, nmarkers/nblocks)
    marker_blocks = createBlocks(nmarkers, markers_per_block)

    # NOTE(review): the literal 16 is presumably the number of tasks/threads handed
    # to `tmap`; it is independent of `nblocks` and of `Threads.nthreads()` — confirm.
    block_lods = tmap(blk -> calcLODs_block(r0perm, X00, blk), 16, marker_blocks)

    return reduce(hcat, block_lods)

end

In [None]:
"""
    scan_perms_threadsBlocks(y, g, K; reml, nperms, rndseed, original, option,
                             nblocks, ncopies, nprocs)

Permutation scan driver: rotate the data, re-weight and residualize it, generate the
permuted traits, then hand the LOD computation to one of two strategies.

# Keywords
- `reml`: forwarded to `transform_reweight`.
- `nperms`, `rndseed`, `original`: forwarded to `transform_permute`.
- `option`: `"by blocks"` calls `threads_by_blocks` with `nblocks`; `"by nperms"`
  calls `distribute_by_nperms` with `nperms`, `ncopies` and `original`.
- `nprocs`: accepted but not used by this function.

# Throws
An error with the message "Option unsupported." for any other `option`. The data
transformations have already run by the time the option is checked.
"""
function scan_perms_threadsBlocks(y::Array{Float64,2}, g::Array{Float64,2}, K::Array{Float64,2};
                                reml::Bool = false,
                                nperms::Int64 = 1024, rndseed::Int64 = 0, original::Bool = true,
                                # (options for blocks, nperms distribution methods...)
                                option::String = "by blocks", nblocks::Int64 = 1, ncopies::Int64 = 1, 
                                nprocs::Int64 = 0)

    # rotation of data
    y0, X0, lambda0 = transform_rotation(y, g, K)
    # reweighting and taking residuals
    r0, X00 = transform_reweight(y0, X0, lambda0; reml = reml)
    r0perm = transform_permute(r0; nperms = nperms, rndseed = rndseed, original = original)

    if option == "by blocks"
        return threads_by_blocks(r0perm, X00, nblocks)
    elseif option == "by nperms"
        return distribute_by_nperms(r0, X00, nperms, ncopies, original)
    end

    error("Option unsupported.")

end

In [None]:
block_bounds = quantile(h2s_list, [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])

In [None]:
length(block_bounds)/2

In [None]:
sum(map(x -> (x < block_bounds[5] && x > block_bounds[4]), h2s_list))

### In parallel:

#### Distributed-processes

In [None]:
using Distributed

In [None]:
@everywhere begin

    # Load the data and helper functions, and define the h2 helper, inside the
    # `@everywhere` block so that they are available to the `pmap` call.
    include("../test/BXDdata_for_test.jl");
    include("../src/parallel_helpers.jl");

    # Fit the null model for the trait `y0`, using column 1 of `X0` as the only
    # regressor, and return the `h2` field of the fit.
    function get_h2s_for_timing(y0::Array{Float64, 2}, X0::Array{Float64, 2}, lambda0::Array{Float64, 1};
                   reml::Bool = false)

        intercept = X0[:, 1:1]
        null_fit = fitlmm(y0, intercept, lambda0; reml = reml)

        return null_fit.h2

    end
end

In [None]:
@time begin 
    # Rotate the entire matrix of all traits once, then estimate h2 for every trait
    # index through `pmap`.
    (Y0, X0, lambda0) = transform_rotation(pheno, geno, kinship);

    pmap_h2s = pmap(1:m) do x
        get_h2s_for_timing(Y0[:, x:x], X0, lambda0; reml = true)
    end
end

In [None]:
Threads.nthreads()