## Data Analysis using BulkLMM - BXD Longevity Study

In [1]:
using CSV, DelimitedFiles, DataFrames, Missings, XLSX
using LinearAlgebra, Statistics, Optim
using Random, Distributions, LoopVectorization
# using GeneNetworkAPI, Downloads
using BenchmarkTools

In [2]:
using Plots

In [3]:
local_path = "../../BulkLMM.jl/src";

## Load processed_data:

### By individuals:

In [4]:
pwd()

"/home/zyu20/git/BulkLMM_Analyses/BXDLongevity"

In [5]:
BXD_pheno_ind_summary_df = CSV.read("../../../shareddata/BXDLongevity/data/GN886_pheno_summary.csv", DataFrame);

By individuals, there are 248 individual samples for 32445 liver proteome traits.

In [6]:
println(size(BXD_pheno_ind_summary_df)) 
BXD_pheno_ind_summary_df[1:10, 1:10]

(248, 32448)


Row,Sample,Strain,Strain_num,P42209_DESGLNRK_2,P42209_GLRPLDVAFLR_3,Q99M02_VGDPVYR_2,Q99M02_VWIYPIK_2,Q99M02_LCDPSVK_2,Q99M02_CVLTTVDPDTGIIDR_2,Q99M02_LVQFDTSMK_2
Unnamed: 0_level_1,String7,String7,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,H1009,BXD9,9,11.349,11.534,17.587,17.517,17.309,16.574,16.323
2,H0370,BXD9,9,11.249,12.735,17.427,17.325,17.252,16.685,16.021
3,H2577,BXD9,9,12.415,10.487,17.89,17.488,17.594,16.731,16.208
4,H0365,BXD9,9,11.374,10.674,17.714,17.401,17.381,16.621,16.337
5,H1333,BXD13,13,11.687,11.524,17.362,17.367,17.071,16.465,15.97
6,H2259,BXD24,24,11.837,11.715,17.57,17.792,17.505,16.894,16.277
7,H1792,BXD24,24,11.563,11.434,17.789,17.847,17.416,17.042,16.292
8,H1791,BXD24,24,12.5,12.273,17.944,17.833,17.63,16.862,16.3
9,H1541,BXD24,24,11.815,11.564,17.794,17.759,17.456,16.878,16.377
10,H1277,BXD24,24,12.674,11.743,17.866,17.845,17.577,16.922,16.26


In [7]:
BXD_geno_ind_summary_df = CSV.read("../../../shareddata/BXDLongevity/data/GN886_geno_summary.csv", DataFrame);

By individuals, there are 248 individual samples for 7321 markers.

In [8]:
println(size(BXD_geno_ind_summary_df)) 
BXD_geno_ind_summary_df[1:10, 1:10]

(248, 7324)


Row,Sample,Strain,Strain_num,rs31443144,rs6269442,rs32285189,rs258367496,rs32430919,rs36251697,rs30658298
Unnamed: 0_level_1,String7,String7,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,H1009,BXD9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,H0370,BXD9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,H2577,BXD9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,H0365,BXD9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,H1333,BXD13,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,H2259,BXD24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,H1792,BXD24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,H1791,BXD24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,H1541,BXD24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,H1277,BXD24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### By strains:

In [9]:
pwd()

"/home/zyu20/git/BulkLMM_Analyses/BXDLongevity"

In [10]:
BXD_pheno_strains_summary_df = CSV.read("../../../shareddata/BXDLongevity/data/GN886_pheno_strain_means_summary.csv", DataFrame);

By strains, there are 50 BXD strains for 32445 liver proteome traits.

In [12]:
println(size(BXD_pheno_strains_summary_df)) 
BXD_pheno_strains_summary_df[1:10, 1:10]

(50, 32447)


Row,Strain,Number of Samples,P42209_DESGLNRK_2,P42209_GLRPLDVAFLR_3,Q99M02_VGDPVYR_2,Q99M02_VWIYPIK_2,Q99M02_LCDPSVK_2,Q99M02_CVLTTVDPDTGIIDR_2,Q99M02_LVQFDTSMK_2,Q99M02_QLQQVGTVSK_2
Unnamed: 0_level_1,String7,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,BXD9,4,11.5968,11.3575,17.6545,17.4327,17.384,16.6528,16.2222,16.3227
2,BXD13,1,11.687,11.524,17.362,17.367,17.071,16.465,15.97,15.98
3,BXD24,5,12.0778,11.7458,17.7926,17.8152,17.5168,16.9196,16.3012,16.4304
4,BXD27,3,11.717,11.149,17.336,17.3143,17.165,16.5283,15.9407,16.002
5,BXD29,11,11.735,11.3496,17.4583,17.3877,17.2431,16.4945,16.0869,16.1075
6,BXD32,8,11.6266,11.554,17.8254,17.7716,17.4639,16.9029,16.434,16.425
7,BXD34,13,11.7491,11.9135,17.6108,17.5974,17.2988,16.7761,16.3147,16.2926
8,BXD39,5,11.9158,11.719,17.395,17.385,17.157,16.5226,16.179,16.0594
9,BXD40,8,11.8651,11.6817,17.5986,17.6386,17.3861,16.6824,16.2855,16.2523
10,BXD43,4,12.378,11.4563,17.76,17.8035,17.3652,16.885,16.5245,16.3532


In [13]:
BXD_geno_strains_summary_df = CSV.read("../../../shareddata/BXDLongevity/data/GN886_geno_strains_summary.csv", DataFrame);

By strains, there are 50 BXD strains for 7321 markers.

In [14]:
println(size(BXD_geno_strains_summary_df)) 
BXD_geno_strains_summary_df[1:10, 1:10]

(50, 7322)


Row,Strain,rs31443144,rs6269442,rs32285189,rs258367496,rs32430919,rs36251697,rs30658298,rs51852623,rs31879829
Unnamed: 0_level_1,String7,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,BXD9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BXD13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BXD24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BXD27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,BXD29,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,BXD32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,BXD34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,BXD39,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,BXD40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,BXD43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load functions:

In [15]:
pwd()

"/home/zyu20/git/BulkLMM_Analyses/BXDLongevity"

In [16]:
include(joinpath(local_path, "kinship.jl"));
include(joinpath(local_path, "util.jl"));
include(joinpath(local_path, "wls.jl"));
include(joinpath(local_path, "lmm.jl"));
include(joinpath(local_path, "gridbrent.jl"));
include(joinpath(local_path, "transform_helpers.jl"));
include(joinpath(local_path, "scan.jl"));
include(joinpath(local_path, "bulkscan_helpers.jl"));
include(joinpath(local_path, "bulkscan.jl"));
include(joinpath(local_path, "readData.jl"));
include(joinpath(local_path, "../plot_utils/visuals_utils.jl"));
include(joinpath(local_path, "analysis_helpers/single_trait_analysis.jl"));
include("../../BigRiver_util_code/src/kinship_utils.jl");

## Objectives

- gemma one trait, ind and strains

- scan_alt one trait, ind and strains

- scan_null one trait, ind and strains

- bulkscan_grid (null), all traits, ...

- data structure for all traits results...

- Heatmap grids_taken, compare on strain means and individuals

- Histogram of heritabilities (null) for all traits (both strain means and individual level data)

- Loglikelihood by h2_grid plot, strain means v.s. individuals

- Report fixed effects:

- Adjusting difference in the number of samples across strains

- Histogram of heritabilities (null) for all traits (both strain means and individual level data)

## Run BulkLMM.jl:

### Calculate kinship:

First, calculate the kinship matrix of relatedness among the strains (50-by-50):

In [17]:
geno_strains = Matrix{Float64}(BXD_geno_strains_summary_df[:, 2:end]);

In [18]:
kinship_strains = calcKinship(geno_strains);

Then, construct the kinship matrix of relatedness among the individuals based on which strain they are from:

In [20]:
BXD_geno_ind_summary_df.Strain_num = Vector{Int64}(BXD_geno_ind_summary_df.Strain_num);

In [21]:
@time kinship_ind_from_strains = calcKinship2(kinship_strains, BXD_geno_ind_summary_df.Strain_num);

  0.002961 seconds (28 allocations: 493.812 KiB)


### Single-trait genome scans:

To see single-trait genome scan results, we take the 29437-th liver proteome trait as the single trait to analyze:  

In [31]:
lp_names = names(BXD_pheno_ind_summary_df)[4:end];

In [32]:
lp_id = 29437;
lp_names[lp_id]

"Q9Z2I8_SSGLPITSAVDLEDAAK_3"

In [33]:
prior = [1.0, 0.0];

#### For individual liver proteome

In [34]:
geno_ind = Matrix{Float64}(BXD_geno_ind_summary_df[:, 4:end]);

In [35]:
lp_ind_Y = Matrix{Float64}(BXD_pheno_ind_summary_df[:, 4:end]);
lp_ind_y = reshape(lp_ind_Y[:, lp_id], :, 1);

In [36]:
lpst_ind_Y = colStandardize(lp_ind_Y);
lpst_ind_y = reshape(lpst_ind_Y[:, lp_id], :, 1);

In [45]:
@time scan_results_ind = scan(lpst_ind_y, geno_ind, kinship_ind_from_strains;
                              prior_variance = 1.0, prior_sample_size = 0.1);

  0.164502 seconds (82.20 k allocations: 136.248 MiB, 12.94% gc time)


In [47]:
@time scan_results_ind_alt = scan(lpst_ind_y, geno_ind, kinship_ind_from_strains;
                                  assumption = "alt", 
                                  prior_variance = prior[1], prior_sample_size = prior[2]);

 13.322422 seconds (11.82 M allocations: 14.369 GiB, 7.34% gc time, 0.26% compilation time)


In [48]:
@time scan_perms_results_ind = scan(lpst_ind_y, geno_ind, kinship_ind_from_strains;
                                    prior_variance = prior[1], prior_sample_size = prior[2],
                                    permutation_test = true, nperms = 1000, original = true);

  0.977219 seconds (2.04 M allocations: 268.265 MiB, 1.56% gc time, 89.44% compilation time)


In [49]:
single_results_ind = DataFrame(hcat(scan_results_ind.lod, scan_results_ind_alt.lod, scan_perms_results_ind[:, 1]), ["scan_null", "scan_alt", "scan_perms.original"]);

In [50]:
single_results_ind[1:6, :]

Row,scan_null,scan_alt,scan_perms.original
Unnamed: 0_level_1,Float64,Float64,Float64
1,0.44861,0.44861,0.44861
2,0.44861,0.44861,0.44861
3,0.44861,0.44861,0.44861
4,0.44861,0.44861,0.44861
5,0.44861,0.44861,0.44861
6,0.44861,0.44861,0.44861


#### For liver proteome means by strains

In [51]:
lp_strains_Y = Matrix{Float64}(BXD_pheno_strains_summary_df[:, 3:end]);
lp_strains_y = reshape(lp_strains_Y[:, lp_id], :, 1);

In [52]:
lpst_strains_Y = colStandardize(lp_strains_Y);
lpst_strains_y = reshape(lpst_strains_Y[:, lp_id], :, 1);

In [67]:
@time scan_results_strains = scan(lpst_strains_y, geno_strains, kinship_strains; 
                                  prior_variance = prior[1], prior_sample_size = prior[2],
                                  optim_interval = 4);

  0.045805 seconds (84.35 k allocations: 33.226 MiB)


In [68]:
@time scan_results_strains_alt = scan(lpst_strains_y, geno_strains, kinship_strains;
                                      assumption = "alt",
                                      prior_variance = prior[1], prior_sample_size = prior[2]);

  2.856948 seconds (6.73 M allocations: 2.055 GiB, 7.77% gc time)


In [69]:
@time scan_perms_results_strains = scan(lpst_strains_y, geno_strains, kinship_strains;
                                        permutation_test = true, nperms = 1000, original = true, 
                                        prior_variance = prior[1], prior_sample_size = prior[2]);

  0.137267 seconds (90.54 k allocations: 78.811 MiB, 67.73% gc time)


In [70]:
single_results_strains = DataFrame(hcat(scan_results_strains.lod, scan_results_strains_alt.lod, scan_perms_results_strains[:, 1]), ["scan_null", "scan_alt", "scan_perms.original"]);

In [71]:
single_results_strains[1:6, :]

Row,scan_null,scan_alt,scan_perms.original
Unnamed: 0_level_1,Float64,Float64,Float64
1,0.387771,0.387771,0.387771
2,0.387771,0.387771,0.387771
3,0.387771,0.387771,0.387771
4,0.387771,0.387771,0.387771
5,0.387771,0.387771,0.387771
6,0.387771,0.387771,0.387771


### Multiple-trait genome scans:

In [72]:
BLAS.set_num_threads(Threads.nthreads())

In [73]:
BLAS.get_num_threads()

16

In [75]:
grid_list = collect(0.0:0.05:0.95);
# grid_list = collect(0.0:0.01:0.99);

#### For individual liver proteome

In [76]:
size(lpst_ind_Y)

(248, 32445)

In [77]:
size(lpst_strains_Y)

(50, 32445)

In [78]:
Threads.nthreads()

16

In [79]:
# Use the column-standardized traits (`lpst_ind_Y`), matching the inputs of the
# other bulk scans in this notebook so that their results are directly comparable.
@time bulkscan_results_ind_null = bulkscan_null(lpst_ind_Y, geno_ind, kinship_ind_from_strains; nb = Threads.nthreads(),
                                                prior_variance = 1.0, prior_sample_size = 0.0, optim_interval = 1);

163.091016 seconds (2.60 G allocations: 1.813 TiB, 14.57% gc time, 0.19% compilation time)


In [80]:
BLAS.set_num_threads(Threads.nthreads())

In [81]:
@time bulkscan_results_ind_alt_grid = bulkscan_alt_grid(lpst_ind_Y, geno_ind, kinship_ind_from_strains, grid_list);

 23.266629 seconds (8.86 M allocations: 41.849 GiB, 5.46% gc time, 0.20% compilation time)


In [83]:
@time bulkscan_results_ind_null_grid = bulkscan_null_grid(lpst_ind_Y, geno_ind, kinship_ind_from_strains, grid_list);

  6.073376 seconds (10.10 M allocations: 11.609 GiB, 3.39% gc time)


#### For liver proteome means by strains

In [145]:
@time bulkscan_results_strains_null = bulkscan_null(lpst_strains_Y, geno_strains, kinship_strains; nb = Threads.nthreads(),
                                                    prior_variance = 1.0, prior_sample_size = 0.0, optim_interval = 1);

 51.151220 seconds (2.61 G allocations: 434.477 GiB, 35.90% gc time)


In [85]:
BLAS.set_num_threads(Threads.nthreads())

In [86]:
@time bulkscan_results_strains_alt_grid = bulkscan_alt_grid(lpst_strains_Y, geno_strains, kinship_strains, grid_list);

 16.416681 seconds (8.69 M allocations: 36.859 GiB, 14.53% gc time)


In [146]:
@time bulkscan_results_strains_null_grid = bulkscan_null_grid(lpst_strains_Y, geno_strains, kinship_strains, grid_list;
                                                              prior_variance = 1.0, prior_sample_size = 0.0);

  5.281386 seconds (9.47 M allocations: 5.288 GiB, 10.84% gc time)


In [88]:
pwd()

"/home/zyu20/git/BulkLMM_Analyses/BXDLongevity"

In [132]:
blackSwans = findall(bulkscan_results_strains_null.h2_null_list .>= 0.9999);

In [144]:
# Heritability grid: 0.001, 0.002, ..., 0.999, followed by one extra point just below 1.
h2_grid = [i * 0.001 for i in 1:999];
push!(h2_grid, 0.9999999)

1000-element Vector{Float64}:
 0.001
 0.002
 0.003
 0.004
 0.005
 0.006
 0.007
 0.008
 0.009000000000000001
 0.01
 0.011
 0.012
 0.013000000000000001
 ⋮
 0.989
 0.99
 0.991
 0.992
 0.993
 0.994
 0.995
 0.996
 0.997
 0.998
 0.999
 0.9999999

In [134]:
test_id = blackSwans[5]

241

In [135]:
size(blackSwans, 1)

419

In [147]:
#= 
for test_id in blackSwans[1]
    println(test_id)
    scan(reshape(lpst_strains_Y[:, test_id], :, 1), geno_strains, kinship_strains;
         prior_variance = 1.0, prior_sample_size = 0.0, optim_interval = 1,
         plot_loglik = true, markerID = 1, h2_grid = h2_grid, y_lims = [-25.0, -21]);
end
=#

In [165]:
findall(abs.(bulkscan_results_strains_null.h2_null_list .- bulkscan_results_strains_null_grid.h2_null_list) .> 0.50)

2045-element Vector{Int64}:
     6
    13
    33
    38
    41
    47
    53
    55
    59
    63
    65
    68
    71
     ⋮
 32228
 32250
 32270
 32278
 32334
 32339
 32348
 32355
 32364
 32379
 32424
 32441

for trait 617...

In [150]:
test_grid_brent1 = scan(reshape(lpst_strains_Y[:, 617], :, 1), geno_strains, kinship_strains);

In [151]:
test_grid_brent4 = scan(reshape(lpst_strains_Y[:, 617], :, 1), geno_strains, kinship_strains;
                        optim_interval = 4);

In [194]:
test_grid_brent1c = scan(reshape(lpst_strains_Y[:, 617], :, 1), geno_strains, kinship_strains;
                         prior_variance = 1.0, prior_sample_size = 49.0);

In [195]:
test_grid_brent1.h2_null

0.6760670144691605

In [196]:
test_grid_brent4.h2_null

7.729668872377043e-16

In [197]:
test_grid_brent1c.h2_null

1.09920440735099e-15

In [198]:
hcat(test_grid_brent1.lod, test_grid_brent4.lod, test_grid_brent1c.lod)

7321×3 Matrix{Float64}:
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.125322    0.0708822  0.0643288
 0.112734    0.118025   0.0748609
 ⋮                      
 0.152007    0.118166   0.00133766
 0.152007    0.118166   0.00133766
 0.152007    0.118166   0.00133766
 0.115361    0.0950165  5.77928e-6
 0.140935    0.0833545  0.00068534
 0.140935    0.0833545  0.00068534
 0.117898    0.0649331  0.00521591
 0.0897677   0.0156876  0.0146325
 0.117898    0.0649331  0.00521591
 0.0897677   0.0156876  0.0146325
 2.98765e-6  0.0250059  0.00870479
 2.98765e-6  0.0250059  0.00870479

for trait 6

In [199]:
test_grid_brent1 = scan(reshape(lpst_strains_Y[:, 6], :, 1), geno_strains, kinship_strains);

In [200]:
test_grid_brent4 = scan(reshape(lpst_strains_Y[:, 6], :, 1), geno_strains, kinship_strains;
                        optim_interval = 10);

In [201]:
test_grid_brent1c = scan(reshape(lpst_strains_Y[:, 6], :, 1), geno_strains, kinship_strains;
                         prior_variance = 1.0, prior_sample_size = 49.0);

In [202]:
test_grid_brent1.h2_null

0.6760670144691605

In [203]:
test_grid_brent4.h2_null

7.729668872377043e-16

In [204]:
test_grid_brent1c.h2_null

5.897297997161383e-16

In [205]:
hcat(test_grid_brent1.lod, test_grid_brent4.lod, test_grid_brent1c.lod)

7321×3 Matrix{Float64}:
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.125322    0.0708822  0.0708822
 0.112734    0.118025   0.118025
 ⋮                      
 0.152007    0.118166   0.118166
 0.152007    0.118166   0.118166
 0.152007    0.118166   0.118166
 0.115361    0.0950165  0.0950165
 0.140935    0.0833545  0.0833545
 0.140935    0.0833545  0.0833545
 0.117898    0.0649331  0.0649331
 0.0897677   0.0156876  0.0156876
 0.117898    0.0649331  0.0649331
 0.0897677   0.0156876  0.0156876
 2.98765e-6  0.0250059  0.0250059
 2.98765e-6  0.0250059  0.0250059

### Performance check:

#### Individual data:
- n = 248
- m = 32445
- p = 7321

In [218]:
size(lpst_ind_Y)

(248, 32445)

In [223]:
size(geno_ind)

(248, 7321)

In [207]:
@benchmark scan(lpst_ind_y, geno_ind, kinship_ind_from_strains)

BenchmarkTools.Trial: 31 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m164.902 ms[22m[39m … [35m170.195 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m10.13% … 9.98%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m165.927 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m10.15%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m166.178 ms[22m[39m ± [32m  1.093 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m10.15% ± 0.10%

  [39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[34m [39m[39m [39m▂[32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m█[39m▁[39m▁[3

In [208]:
@benchmark scan(lpst_ind_y, geno_ind, kinship_ind_from_strains;
                prior_variance = 1.0, prior_sample_size = 0.1)

BenchmarkTools.Trial: 31 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m165.110 ms[22m[39m … [35m168.172 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m10.27% … 10.09%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m166.420 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m10.13%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m166.449 ms[22m[39m ± [32m755.808 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m10.18% ±  0.23%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m [39m [39m [39m [39m [39m█[39m▄[39m [39m [39m [39m [39m [39m [39m [34m [39m[39m [32m [39m[39m [39m [39m [39m▄[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▆[39m▁[39m▆

In [209]:
@benchmark scan(lpst_ind_y, geno_ind, kinship_ind_from_strains;
                assumption = "alt")

BenchmarkTools.Trial: 1 sample with 1 evaluation.
 Single result which took [34m13.868 s[39m (12.22% GC) to evaluate,
 with a memory estimate of [33m14.36 GiB[39m, over [33m11699751[39m allocations.

In [210]:
@benchmark scan(lpst_ind_y, geno_ind, kinship_ind_from_strains;
                assumption = "alt", 
                prior_variance = 1.0, prior_sample_size = 0.1)

BenchmarkTools.Trial: 1 sample with 1 evaluation.
 Single result which took [34m13.863 s[39m (12.31% GC) to evaluate,
 with a memory estimate of [33m14.36 GiB[39m, over [33m11697819[39m allocations.

In [None]:
@benchmark bulkscan_null(lpst_ind_Y, geno_ind, kinship_ind_from_strains,
                         prior_variance = 1.0, prior_sample_size = 0.1)

In [211]:
grid_perf = collect(0.0:0.05:0.95);

In [215]:
@benchmark bulkscan_null_grid(lpst_ind_Y, geno_ind, kinship_ind_from_strains, grid_perf;
                              prior_variance = 1.0, prior_sample_size = 0.1)

BenchmarkTools.Trial: 1 sample with 1 evaluation.
 Single result which took [34m17.366 s[39m (48.95% GC) to evaluate,
 with a memory estimate of [33m11.61 GiB[39m, over [33m10096104[39m allocations.

In [217]:
@benchmark bulkscan_alt_grid(lpst_ind_Y, geno_ind, kinship_ind_from_strains, grid_perf)

BenchmarkTools.Trial: 1 sample with 1 evaluation.
 Single result which took [34m101.817 s[39m (6.92% GC) to evaluate,
 with a memory estimate of [33m41.84 GiB[39m, over [33m8691488[39m allocations.

#### Strain mean data:
- n = 50
- m = 32445
- p = 7321

In [221]:
size(lpst_strains_Y)

(50, 32445)

In [222]:
size(geno_strains)

(50, 7321)

In [224]:
@benchmark scan(lpst_strains_y, geno_strains, kinship_strains)

BenchmarkTools.Trial: 134 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m32.937 ms[22m[39m … [35m44.974 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m38.483 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m13.39%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m37.325 ms[22m[39m ± [32m 2.487 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m10.17% ± 5.98%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m▄[34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▅[39m▇[39m▆[39m▄[39m▃[39m▁[

In [225]:
@benchmark scan(lpst_strains_y, geno_strains, kinship_strains;
                prior_variance = 1.0, prior_sample_size = 0.1)

BenchmarkTools.Trial: 135 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m32.836 ms[22m[39m … [35m44.496 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m38.337 ms              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m13.39%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m37.156 ms[22m[39m ± [32m 2.438 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m10.20% ± 5.97%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m▄[34m█[39m[39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m 
  [39m▅[39m▅[39m▅[39m▆[39m▅[39m▁[

In [226]:
@benchmark scan(lpst_strains_y, geno_strains, kinship_strains;
                assumption = "alt")

BenchmarkTools.Trial: 3 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.905 s[22m[39m … [35m  1.942 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m11.25% … 11.91%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.909 s              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m11.38%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m1.919 s[22m[39m ± [32m20.656 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m11.51% ±  0.35%

  [34m█[39m[39m [39m [39m [39m [39m█[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [34m█[39m[39m▁[39m▁[39m▁[39m▁[39m█[39m▁[39m▁[3

In [227]:
@benchmark scan(lpst_strains_y, geno_strains, kinship_strains;
                assumption = "alt", 
                prior_variance = 1.0, prior_sample_size = 0.1)

BenchmarkTools.Trial: 5 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.157 s[22m[39m … [35m  1.185 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m11.38% … 11.62%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.162 s              [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m11.42%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m1.165 s[22m[39m ± [32m11.653 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m11.45% ±  0.11%

  [39m█[39m [39m [34m█[39m[39m [39m [39m [39m [39m [39m [39m█[39m█[39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [39m█[39m▁[39m▁[34m█[39m[39m▁[39m▁[39m▁[39m▁[3

In [232]:
@benchmark bulkscan_null(lpst_strains_Y, geno_strains, kinship_strains,
                         prior_variance = 1.0, prior_sample_size = 0.1)

BenchmarkTools.Trial: 1 sample with 1 evaluation.
 Single result which took [34m50.878 s[39m (36.30% GC) to evaluate,
 with a memory estimate of [33m434.45 GiB[39m, over [33m2611858437[39m allocations.

In [233]:
grid_perf = collect(0.0:0.05:0.95);

In [234]:
@benchmark bulkscan_null_grid(lpst_strains_Y, geno_strains, kinship_strains, grid_perf;
                              prior_variance = 1.0, prior_sample_size = 0.1)

BenchmarkTools.Trial: 2 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m4.958 s[22m[39m … [35m   5.551 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 5.50% … 14.92%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m5.254 s               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m10.47%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m5.254 s[22m[39m ± [32m419.891 ms[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m10.47% ±  6.66%

  [34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [34m█[39m[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁

In [235]:
@benchmark bulkscan_alt_grid(lpst_strains_Y, geno_strains, kinship_strains, grid_perf)

BenchmarkTools.Trial: 1 sample with 1 evaluation.
 Single result which took [34m35.014 s[39m (1.94% GC) to evaluate,
 with a memory estimate of [33m36.86 GiB[39m, over [33m8691531[39m allocations.

### Weighted Error Variances:

Assuming that

$$y = X_0 B_0+g\beta+\epsilon$$

where 

$$\epsilon \sim N(0,  \sigma^2_g K + \sigma^2_e V) $$ and 

the matrix $V$ is diagonal, but not necessarily the identity matrix.

Since $V$ is diagonal, $V^{-1} = V^{-1/2}V^{-1/2}$, so the strategy is to pre-multiply the model by $V^{-1/2}$:

Let $$y^* = V^{-1/2}y = V^{-1/2}X_0 B_0+V^{-1/2}g\beta+V^{-1/2}\epsilon$$

then 
$$V^{-1/2}\epsilon \sim N(0, \sigma^2_g V^{-1/2}KV^{-1/2}+\sigma^2_e V^{-1/2}VV^{-1/2}) = N(0, \sigma^2_g K^*+\sigma^2_e I)$$

where $K^* = V^{-1/2}KV^{-1/2}$.

## Examine hsqs, maxLods:

In [None]:
L_ind_null = bulkscan_results_ind_null.L;

In [None]:
L_ind_null_grid = bulkscan_results_ind_null_grid.L;

In [None]:
"""
    getMaxLODs(L::AbstractMatrix{<:Real})

For every column of `L`, find the largest entry and the row index at which it occurs.

In this notebook `L` is a matrix of LOD scores that is indexed elsewhere as
`L[:, trait_id]`, i.e. one column per trait.

# Arguments
- `L`: any real-valued matrix (dense arrays, views, transposes, ...).

# Returns
A named tuple `(max_markers, max_lod)`:
- `max_markers::Vector{Int64}`: row index of the maximum of each column (the first
  such row when the maximum is attained more than once);
- `max_lod`: the corresponding maxima, with the element type of `L`.

A matrix with columns but no rows makes `findmax` throw, as it does for any empty
collection.
"""
function getMaxLODs(L::AbstractMatrix{T}) where {T<:Real}

    m = size(L, 2);
    max_marker = Vector{Int64}(undef, m);
    max_lod = Vector{T}(undef, m);

    # `eachcol` yields views, so no per-column copy of `L` is allocated.
    for (i, col) in enumerate(eachcol(L))
        max_lod[i], max_marker[i] = findmax(col);
    end

    return (max_markers = max_marker, max_lod = max_lod);
end

In [None]:
@time begin
    max_results_ind = getMaxLODs(L_ind_null);
    max_results_ind_grid = getMaxLODs(L_ind_null_grid);
end;

In [None]:
pwd()

In [None]:
# CSV.write("output/max_results_ind.csv", DataFrame(round.(hcat(max_results_ind.max_markers, max_results_ind.max_lod), digits = 2), ["max_marker", "max_lod"]))

In [None]:
# CSV.write("output/max_results_ind_grid.csv", DataFrame(round.(hcat(max_results_ind_grid.max_markers, max_results_ind_grid.max_lod), digits = 2), ["max_marker", "max_lod"]))

In [None]:
@time bulkscan_results_ind_null_4 = bulkscan_null(lpst_ind_Y, geno_ind, kinship_ind_from_strains; nb = Threads.nthreads(),
                                                  prior_variance = 1.0, prior_sample_size = 0.1, optim_interval = 4);

In [None]:
@time bulkscan_results_null_strains_4 = bulkscan_null(lpst_strains_Y, geno_strains, kinship_strains; nb = Threads.nthreads(),
                                                      prior_variance = 1.0, prior_sample_size = 0.1, optim_interval = 4);

In [None]:
L_ind = bulkscan_results_ind_null.L;
L_strains = bulkscan_results_strains_null.L;
h2s_ind = bulkscan_results_ind_null.h2_null_list;
h2s_strains = bulkscan_results_strains_null.h2_null_list;

In [None]:
L_ind_4 = bulkscan_results_ind_null_4.L;
L_strains_4 = bulkscan_results_null_strains_4.L;
h2s_ind_4 = bulkscan_results_ind_null_4.h2_null_list;
h2s_strains_4 = bulkscan_results_null_strains_4.h2_null_list;

## Get Plot:

In [None]:
BXD_pmap = CSV.read("data/BXD_pmap.csv", DataFrame);
BXD_gmap = CSV.read("data/BXD_gmap.csv", DataFrame);

In [None]:
max_results_df_ind = CSV.read("output/max_results_ind.csv", DataFrame);

In [None]:
max_results_df_ind_grid = CSV.read("output/max_results_ind_grid.csv", DataFrame);

In [None]:
using RecipesBase, Plots, Plots.PlotMeasures, ColorSchemes

In [None]:
"""
    plot_eQTL2(lodc, gmap, phenocovar; thr = 5.0, kwargs...)

Compute the plotting coordinates with `get_eQTL_accMb` and draw them with `eQTLplot`.

# Arguments
- `lodc`: matrix passed through to `get_eQTL_accMb` as its first argument.
- `gmap`, `phenocovar`: data frames passed through to `get_eQTL_accMb`.

# Keywords
- `thr`: threshold forwarded to `get_eQTL_accMb` (default `5.0`).
- `kwargs...`: extra options, forwarded as keywords to both `get_eQTL_accMb` and
  `eQTLplot`.
  NOTE(review): both helpers are defined outside this notebook; confirm that each
  accepts the options you pass.

# Returns
Whatever `eQTLplot` returns.
"""
function plot_eQTL2(lodc::Array{Float64, 2}, gmap::DataFrame, phenocovar::DataFrame;
                    thr::Float64 = 5.0, kwargs...)

    x, y, z, mysteps, mychr = get_eQTL_accMb(
                                lodc,
                                phenocovar,
                                gmap;
                                thr = thr,
                                kwargs...
                              )

    # Forward the extra options as keywords (after `;`). Splatting `kwargs...` in
    # positional position would append `name => value` pairs to the five
    # positional arguments instead.
    return eQTLplot(x, y, z, mysteps, mychr; kwargs...)

end

In [None]:
BXD_pmap.Chr = convert.(String, BXD_pmap.Chr);

In [None]:
BXD_gmap.Chr = convert.(String, BXD_gmap.Chr);

In [None]:
plot_eQTL2(Matrix(max_results_df_ind), BXD_gmap, BXD_pmap)

In [None]:
lp_id

In [None]:
lp_ind_y

In [None]:
lpst_ind_y

In [None]:
scan_results_ind_alt.lod

### Heritabilities:

### Maximum lod scores:

In [None]:
histogram(max_results_ind.max_lod, label = "Maximum lod - Individual LP")

## Run GEMMA for single-trait scans:

In [None]:
pwd()

In [None]:
gemma = "/home/zyu20/Softwares/gemma-0.98.5-linux-static-AMD64"

In [None]:
run(`$gemma -h`)

In [None]:
"""
    transform_bxd_pheno_to_gemma2(inputfile, outputfile, iter; startCol = nothing)

Read the phenotype summary table `inputfile` (CSV), and write its `iter`-th trait
to `outputfile` with `writedlm`, one value per row.

# Arguments
- `inputfile`: CSV file whose leading columns are identifiers and whose remaining
  columns are traits.
- `outputfile`: path of the text file to (over)write.
- `iter`: index of the trait to export, counted from the first trait column.

# Keywords
- `startCol`: column of the first trait. When left as `nothing`, traits are taken
  to start at column 3, and any of the identifier columns `Sample`, `Strain`,
  `Strain_num`, `Number of Samples` found at that position are skipped. This
  resolves to column 3 for the strain-means table (`Strain`, `Number of Samples`)
  and to column 4 for the individual-level table (`Sample`, `Strain`,
  `Strain_num`). With a fixed column 3 the individual-level `Strain_num` column
  was treated as the first trait, so every trait index was shifted by one.

# Returns
The full trait matrix (`Matrix{Float64}`, one column per trait).

# Throws
`BoundsError` if `iter` is not a valid trait index; the check happens before the
output file is opened, so an invalid `iter` does not truncate an existing file.
"""
function transform_bxd_pheno_to_gemma2(inputfile::AbstractString, outputfile::AbstractString, iter::Int64;
                                       startCol::Union{Nothing, Int64} = nothing)
    pheno_df = CSV.read(inputfile, DataFrame);

    first_trait = 3;
    if isnothing(startCol)
        # Skip identifier columns that sit where the traits would otherwise begin.
        id_cols = ("Sample", "Strain", "Strain_num", "Number of Samples");
        col_names = names(pheno_df);
        while first_trait <= length(col_names) && col_names[first_trait] in id_cols
            first_trait += 1;
        end
    else
        first_trait = startCol;
    end

    pheno = Matrix{Float64}(pheno_df[:, first_trait:end]);
    checkbounds(pheno, :, iter);

    open(outputfile, "w") do io
        writedlm(io, pheno[:, iter])
    end
    return pheno
end

In [None]:
"""
    transform_bxd_geno_to_gemma2(inputfile, outputfile, startCol)

Read the genotype summary table `inputfile` (CSV), keep the columns from `startCol`
onwards, and write them to `outputfile` through `writeToFile` with one row per
marker: marker name, the allele labels `"A"` and `"B"`, then the genotype values
multiplied by 2 for every row of the input table.

Returns the matrix that was written.
"""
function transform_bxd_geno_to_gemma2(inputfile::AbstractString, outputfile::AbstractString, 
                                      startCol::Int64)
    geno_df = CSV.read(inputfile, DataFrame)[:, startCol:end];
    marker_names = names(geno_df);
    n_markers = length(marker_names);

    # Genotype values are doubled before export.
    # NOTE(review): presumably to put them on an allele-count scale; confirm.
    doubled = Matrix{Float64}(2 .* geno_df);

    # Constant placeholder allele labels, one per marker.
    allele_a = fill("A", n_markers, 1);
    allele_b = fill("B", n_markers, 1);

    # Markers become rows: the genotype matrix is transposed next to the labels.
    output = hcat(marker_names, allele_a, allele_b, transpose(doubled));
    writeToFile(output, outputfile)
    return output
end

In [None]:
lp_id

In [None]:
# transform_bxd_geno_to_gemma2("data/GN886_geno_strains_summary.csv", "data/GEMMA_data/GN886_geno_strains_summary.txt", 2);
# transform_bxd_geno_to_gemma2("data/GN886_geno_summary.csv", "data/GEMMA_data/GN886_geno_ind_summary.txt", 4);

In [None]:
lp_id = 29437;

In [None]:
transform_bxd_pheno_to_gemma2("data/GN886_pheno_strain_means_summary.csv", "data/GEMMA_data/GN886_pheno_strains_summary.txt", lp_id);
transform_bxd_pheno_to_gemma2("data/GN886_pheno_summary.csv", "data/GEMMA_data/GN886_pheno_ind_summary.txt", lp_id);

In [None]:
# writedlm("data/GEMMA_data/kinship_strains.txt", kinship_strains, '\t')
# writedlm("data/GEMMA_data/kinship_ind.txt", kinship_ind_from_strains, '\t')

In [None]:
"""
    p2lod(pval::Real, df::Integer)

Convert an upper-tail chi-square p-value with `df` degrees of freedom into a LOD
score.

The p-value is first mapped back to the chi-square statistic `lrs` whose upper-tail
probability is `pval`; the LOD score is then `lrs / (2 * log(10))`.

# Arguments
- `pval`: p-value in `[0, 1]`.
- `df`: degrees of freedom of the chi-square reference distribution.

# Returns
The LOD score (`Inf` for `pval == 0`, `0` for `pval == 1`).
"""
function p2lod(pval::Real, df::Integer)

    # Invert the upper tail directly from log(pval). Going through
    # log(1 - pval) loses the p-value to rounding once it is below machine
    # epsilon (1 - pval == 1), which turned very significant results into Inf.
    lrs = invlogccdf(Chisq(df), log(pval))
    lod = lrs/(2*log(10))

    return lod

end

In [None]:
"""
    gemmaWrapper(pheno_filename, geno_filename, kinship_filename, output_filename)

Run the GEMMA executable whose path is stored in the global `gemma`, passing the
genotype (`-g`), phenotype (`-p`) and kinship (`-k`) files, the output name (`-o`),
and the fixed options `-lmm 2 -lmax 1000000`.

NOTE(review): per the GEMMA manual `-lmm 2` selects the likelihood-ratio test;
confirm against the installed version.

Returns the value of `run`, which throws if the process exits with an error.
"""
function gemmaWrapper(pheno_filename::String, geno_filename::String,
                      kinship_filename::String, output_filename::String)

    gemma_cmd = `$gemma -g $geno_filename -p $pheno_filename -k $kinship_filename -lmm 2 -lmax 1000000 -o $output_filename`;

    return run(gemma_cmd)

end

In [None]:
@time gemmaWrapper("data/GEMMA_data/GN886_pheno_strains_summary.txt", 
                   "data/GEMMA_data/GN886_geno_strains_summary.txt",
                   "data/GEMMA_data/kinship_strains.txt",
                   "results_strains.txt");

In [None]:
@time gemmaWrapper("data/GEMMA_data/GN886_pheno_ind_summary.txt", 
                   "data/GEMMA_data/GN886_geno_ind_summary.txt",
                   "data/GEMMA_data/kinship_ind.txt",
                   "results_ind.txt");

In [None]:
sum(eigen(kinship_ind_from_strains).values.<0.0001)

In [None]:
gemma_results_strains = readdlm("output/results_strains.txt.assoc.txt", '\t');
gemma_results_ind = readdlm("output/results_ind.txt.assoc.txt", '\t');

In [None]:
gemma_strains = gemma_results_strains[2:end, 10] |> x -> Array{Float64}(x);
gemma_ind = gemma_results_ind[2:end, 10] |> x -> Array{Float64}(x);

In [None]:
gemma_strains_lod = p2lod.(gemma_strains, 1);
gemma_ind_lod = p2lod.(gemma_ind, 1);

In [None]:
# Side-by-side LOD scores for the selected trait on individual-level data:
# GEMMA | single-trait scan | grid-based bulk scan.
hcat(gemma_ind_lod, scan_results_ind.lod, bulkscan_results_ind_null_grid.L[:, lp_id])

In [None]:
# Side-by-side LOD scores for the selected trait on strain means:
# GEMMA | single-trait scan | bulk scan.
hcat(gemma_strains_lod, scan_results_strains.lod, bulkscan_results_strains_null.L[:, lp_id])

In [None]:
lookRange = 1:7321

In [None]:
# Overlay the GEMMA and BulkLMM LOD profiles for the selected trait (strain means).
plot(gemma_strains_lod[lookRange], label = "GEMMA_strains");
# plot!(scan_results_strains_alt_hub.lod[lookRange], label = "BulkLMM_strains")
plot!(bulkscan_results_strains_null.L[lookRange, lp_id], label = "BulkLMM_strains")

In [None]:
scan_results_ind.h2_null