## Data Analysis using BulkLMM - BXD Spleen Study

In [29]:
using CSV, DelimitedFiles, DataFrames, Missings, XLSX
using LinearAlgebra, Statistics, Optim
using Random, Distributions, LoopVectorization
using BenchmarkTools

In [30]:
using Plots

In [3]:
# Relative path to the BulkLMM.jl `src` directory; the include and data paths in the
# following cells are built from it.
local_path = "../../BulkLMM.jl/src";

In [4]:
include(joinpath(local_path, "kinship.jl"));
include(joinpath(local_path, "util.jl"));
include(joinpath(local_path, "wls.jl"));
include(joinpath(local_path, "lmm.jl"));
include(joinpath(local_path, "gridbrent.jl"));
include(joinpath(local_path, "transform_helpers.jl"));
include(joinpath(local_path, "scan.jl"));
include(joinpath(local_path, "bulkscan_helpers.jl"));
include(joinpath(local_path, "bulkscan.jl"));
include(joinpath(local_path, "readData.jl"));
include(joinpath(local_path, "../plot_utils/visuals_utils.jl"));
include(joinpath(local_path, "analysis_helpers/single_trait_analysis.jl"));

In [5]:
include(joinpath(local_path, "../test/testHelpers.jl"));

### Load data:

In [6]:
bulklmmdir = local_path;
# The phenotype CSV sits one level above `src`, under data/bxdData.
pheno_file = joinpath(bulklmmdir,"..","data/bxdData/spleen-pheno-nomissing.csv");
# Read the raw comma-separated table; the first row is kept as data (header = false).
pheno = readdlm(pheno_file, ',', header = false);
# Drop the header row, the first column (transcript ID) and the last column (sex);
# `.*1.0` promotes the remaining numeric entries to floating point.
# NOTE(review): rows are described as strains in the next cell, so the first column may
# hold strain rather than transcript identifiers — confirm against the CSV.
pheno_processed = pheno[2:end, 2:(end-1)].*1.0; # exclude the header, the first (transcript ID) and the last (sex) columns

In [7]:
# Genotype-probability CSV from the same data/bxdData directory.
geno_file = joinpath(bulklmmdir,"..","data/bxdData/spleen-bxd-genoprob.csv");
# Read the raw comma-separated table; the first row is kept as data (header = false).
geno = readdlm(geno_file, ',', header = false);
# Drop the header row and keep every other column starting with the first
# (presumably one of two genotype-probability columns per marker — TODO confirm);
# `.* 1.0` promotes the entries to floating point.
geno_processed = geno[2:end, 1:2:end] .* 1.0;

In [8]:
size(pheno_processed) # (number of strains, number of traits)

(79, 35554)

In [9]:
size(geno_processed) # (number of strains, number of markers)

(79, 7321)

In [22]:
@time kinship = calcKinship(geno_processed); # calculate K

  0.003620 seconds (8 allocations: 4.508 MiB)


In [23]:
kinship

79×79 Matrix{Float64}:
 1.0       0.468775  0.561072  0.521241  …  0.413409  0.435544  0.486618
 0.468775  1.0       0.482672  0.504428     0.532658  0.48903   0.52606
 0.561072  0.482672  1.0       0.374636     0.450499  0.454097  0.463495
 0.521241  0.504428  0.374636  1.0          0.469941  0.421916  0.49879
 0.502612  0.431555  0.443824  0.57748      0.519702  0.489642  0.453339
 0.476439  0.483656  0.567143  0.462734  …  0.512509  0.549544  0.477227
 0.46737   0.501131  0.520699  0.527784     0.481495  0.519189  0.43882
 0.542552  0.601065  0.506128  0.463305     0.498992  0.479598  0.548914
 0.591885  0.403157  0.533686  0.4254       0.448777  0.531941  0.403805
 0.44131   0.626412  0.535927  0.447897     0.450781  0.461469  0.571935
 0.460221  0.457164  0.460105  0.489863  …  0.49494   0.538491  0.511439
 0.507045  0.570826  0.536865  0.614885     0.503228  0.478424  0.519035
 0.582004  0.544664  0.516427  0.492061     0.450206  0.442543  0.538099
 ⋮                             

In [18]:
# Round every kinship entry to 12 decimal places
# (presumably to remove floating-point noise in the matrix — TODO confirm the motivation).
kinship = round.(kinship; digits = 12)

79×79 Matrix{Float64}:
 1.0       0.468775  0.561072  0.521241  …  0.413409  0.435544  0.486618
 0.468775  1.0       0.482672  0.504428     0.532658  0.48903   0.52606
 0.561072  0.482672  1.0       0.374636     0.450499  0.454097  0.463495
 0.521241  0.504428  0.374636  1.0          0.469941  0.421916  0.49879
 0.502612  0.431555  0.443824  0.57748      0.519702  0.489642  0.453339
 0.476439  0.483656  0.567143  0.462734  …  0.512509  0.549544  0.477227
 0.46737   0.501131  0.520699  0.527784     0.481495  0.519189  0.43882
 0.542552  0.601065  0.506128  0.463305     0.498992  0.479598  0.548914
 0.591885  0.403157  0.533686  0.4254       0.448777  0.531941  0.403805
 0.44131   0.626412  0.535927  0.447897     0.450781  0.461469  0.571935
 0.460221  0.457164  0.460105  0.489863  …  0.49494   0.538491  0.511439
 0.507045  0.570826  0.536865  0.614885     0.503228  0.478424  0.519035
 0.582004  0.544664  0.516427  0.492061     0.450206  0.442543  0.538099
 ⋮                             

### Single trait scans:

In [11]:
# Column index of the trait used for the single-trait scans.
traitID = 1112;
# Extract that trait and reshape the column vector into a one-column matrix.
pheno_y = reshape(pheno_processed[:, traitID], :, 1);

In [26]:
# Time a single-trait `scan` of `pheno_y` against the genotype matrix with the kinship matrix.
@time single_results = scan(pheno_y, geno_processed, kinship);

  0.066642 seconds (80.96 k allocations: 47.299 MiB)


In [28]:
single_results.h2_null

0.8500907321951073

In [19]:
# Same scan with the permutation test enabled and 1000 permutations requested;
# the result carries the `L_perms` matrix displayed in the next cell.
@time single_results_perms = scan(pheno_y, geno_processed, kinship; permutation_test = true, nperms = 1000);

  0.071448 seconds (90.10 k allocations: 146.533 MiB)


In [20]:
single_results_perms.L_perms

7321×1000 Matrix{Float64}:
 0.000577766  0.0031705    1.38102   …  0.389878     0.287921   0.187091
 0.000577766  0.0031705    1.38102      0.389878     0.287921   0.187091
 0.000577766  0.0031705    1.38102      0.389878     0.287921   0.187091
 0.000577766  0.0031705    1.38102      0.389878     0.287921   0.187091
 0.000577766  0.0031705    1.38102      0.389878     0.287921   0.187091
 0.000577766  0.0031705    1.38102   …  0.389878     0.287921   0.187091
 0.000577766  0.0031705    1.38102      0.389878     0.287921   0.187091
 0.000577766  0.0031705    1.38102      0.389878     0.287921   0.187091
 0.000577766  0.0031705    1.38102      0.389878     0.287921   0.187091
 0.000577766  0.0031705    1.38102      0.389878     0.287921   0.187091
 0.000577762  0.00317049   1.38102   …  0.389879     0.287921   0.187091
 0.00057545   0.00316465   1.3806       0.390105     0.287916   0.187094
 0.00256859   0.000743197  0.629146     0.698901     0.226259   0.156643
 ⋮                      

In [16]:
# Derive thresholds from the permutation results at the levels 0.90 and 0.95,
# then display them rounded to 4 decimal places.
thrs = get_thresholds(single_results_perms.L_perms, [0.90, 0.95]).thrs
round.(thrs; digits = 4)

2-element Vector{Float64}:
 3.4102
 3.7489

In [25]:
# Derive thresholds from the permutation results at the levels 0.90 and 0.95,
# then display them rounded to 4 decimal places.
# NOTE(review): this cell duplicates an earlier one (execution count 16 vs 25) and the
# recorded values differ between the two runs — confirm which output is current.
thrs = get_thresholds(single_results_perms.L_perms, [0.90, 0.95]).thrs
round.(thrs; digits = 4)

2-element Vector{Float64}:
 3.3644
 3.6504

### Multiple trait scans:

In [15]:
Threads.nthreads()

16

In [16]:
BLAS.get_num_threads()

8

In [17]:
# Set the number of BLAS threads to 4 (the previous cell reported 8).
BLAS.set_num_threads(4)

In [18]:
# Fine h2 grid for the grid-based bulk scans: 0.00 to 0.99 in steps of 0.01 (100 values).
h2_grid = collect(0.0:0.01:0.99);

In [19]:
# Coarser h2 grid: 0.00 to 0.95 in steps of 0.05 (20 values).
h2_grid2 = collect(0.0:0.05:0.95);

In [20]:
# Standardized copy of the phenotype matrix produced by `colStandardize`
# (presumably column-wise centering/scaling — the helper is defined outside this notebook; confirm).
pheno_st = colStandardize(pheno_processed);

In [21]:
# Benchmark the grid-based null scan over all traits on the fine h2 grid.
# The global inputs are interpolated with `$` so that BenchmarkTools times the call
# itself rather than access to untyped globals, as its manual recommends.
@benchmark bulkscan_null_grid($pheno_processed, $geno_processed, $kinship, $h2_grid)

BenchmarkTools.Trial: 1 sample with 1 evaluation.
 Single result which took [34m8.501 s[39m (6.01% GC) to evaluate,
 with a memory estimate of [33m17.66 GiB[39m, over [33m49219720[39m allocations.

In [22]:
# Benchmark the grid-based null scan over all traits on the coarse h2 grid.
# The global inputs are interpolated with `$` so that BenchmarkTools times the call
# itself rather than access to untyped globals, as its manual recommends.
@benchmark bulkscan_null_grid($pheno_processed, $geno_processed, $kinship, $h2_grid2)

BenchmarkTools.Trial: 2 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m3.335 s[22m[39m … [35m4.999 s[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 5.07% … 32.63%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m4.167 s            [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m21.60%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m4.167 s[22m[39m ± [32m1.177 s[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m21.60% ± 19.49%

  [34m█[39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m█[39m [39m 
  [34m█[39m[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[39m▁[3

In [23]:
# Run the grid-based null scan once on the fine grid, timing it and keeping the results.
@time all_results_grid = bulkscan_null_grid(pheno_processed, geno_processed, kinship, h2_grid);

  9.863589 seconds (49.22 M allocations: 17.664 GiB, 7.68% gc time)


In [24]:
# Same grid-based null scan on the standardized phenotypes, with
# prior_variance = 1.0 and prior_sample_size = 0.1 passed as keywords.
@time all_results_grid_st = bulkscan_null_grid(pheno_st, geno_processed, kinship, h2_grid;
                                               prior_variance = 1.0, prior_sample_size = 0.1);

 10.736517 seconds (49.06 M allocations: 17.617 GiB, 11.09% gc time)


In [25]:
# Time `bulkscan_null` (no h2 grid argument) over all traits with optim_interval = 1;
# presumably h2 is optimized per trait instead of picked from a grid — confirm in bulkscan.jl.
@time all_results_exact = bulkscan_null(pheno_processed, geno_processed, kinship;
                                        optim_interval = 1);

 86.247687 seconds (2.86 G allocations: 706.927 GiB, 36.87% gc time, 0.07% compilation time)


In [26]:
# Same `bulkscan_null` run on the standardized phenotypes, with
# prior_variance = 1.0 and prior_sample_size = 0.1 passed as keywords.
@time all_results_exact_st = bulkscan_null(pheno_st, geno_processed, kinship;
                                           prior_variance = 1.0, prior_sample_size = 0.1,
                                           optim_interval = 1);

 76.416246 seconds (2.86 G allocations: 706.924 GiB, 37.34% gc time)


In [27]:
# Time `bulkscan_alt_grid` on the fine h2 grid; its result exposes `h2_panel` and `L`,
# which are inspected in later cells (h2 appears to be chosen per marker — confirm).
@time all_results_alt_grid = bulkscan_alt_grid(pheno_processed, geno_processed, kinship, h2_grid);

701.758396 seconds (90.10 M allocations: 810.950 GiB, 16.24% gc time, 0.03% compilation time)


In [28]:
# Same `bulkscan_alt_grid` run on the coarse h2 grid (20 values instead of 100).
@time all_results_alt_grid2 = bulkscan_alt_grid(pheno_processed, geno_processed, kinship, h2_grid2);

145.839825 seconds (18.27 M allocations: 171.527 GiB, 16.44% gc time)


In [32]:
# Indices of the traits whose null h2 estimate from the fine-grid scan is strictly positive.
findall(>(0.0), all_results_grid.h2_null_list)

3646-element Vector{Int64}:
    82
    95
   107
   108
   128
   153
   234
   253
   258
   298
   348
   354
   381
     ⋮
 35495
 35501
 35509
 35516
 35526
 35527
 35529
 35534
 35535
 35545
 35548
 35552

In [34]:
all_results_grid.h2_null_list[82]

0.78

In [35]:
all_results_alt_grid.h2_panel[:, 82]

7321-element Vector{Float64}:
 0.57
 0.57
 0.57
 0.57
 0.57
 0.57
 0.57
 0.57
 0.57
 0.57
 0.57
 0.57
 0.56
 ⋮
 0.57
 0.57
 0.57
 0.57
 0.59
 0.59
 0.58
 0.59
 0.59
 0.6
 0.59
 0.59

In [29]:
all_results_alt_grid.L[:, 1]

7321-element Vector{Float64}:
 0.00012008682101093471
 0.00012008682101093471
 0.00012008682101093471
 0.00012008682101093471
 0.00012008682101093471
 0.00012008682101093471
 0.00012008682101093471
 0.00012008682101093471
 0.00012008682099859131
 0.00012008682099859131
 0.00012008387637761301
 0.00011849771856038352
 0.009521311901900588
 ⋮
 0.26017848855422443
 0.26017851503361367
 0.26017851503361983
 0.2534463880193611
 0.3782893583076069
 0.37828935830770566
 0.21935139991554947
 0.20144120787677675
 0.20153298758761384
 0.17782339280604786
 0.18170671563925148
 0.18170671563923912

In [36]:
include("../../BigRiver_util_code/src/kinship_utils.jl");
include("../../BigRiver_util_code/src/run_gemma_utils.jl");

In [37]:
pwd()

"/home/zyu20/git/BulkLMM_Analyses/BXDSpleen"

In [41]:
# Load gmap.csv into a DataFrame; its `Locus` column supplies the marker names used in the next cell.
gmap = CSV.read("../../BulkLMM.jl/data/bxdData/gmap.csv", DataFrame);

In [44]:
# Marker names as a plain Vector{String}: convert each `Locus` entry to `String`,
# then materialize the result as an Array{String, 1}.
marker_names = Array{String, 1}(String.(gmap.Locus));

In [45]:
# File names handed to `run_gemma` in a later cell: phenotype, genotype and kinship
# text files (relative to the working directory) and the name used for GEMMA's output.
pheno_filename = "data/GEMMA_data/bxd_spleen_pheno.txt";
geno_filename = "data/GEMMA_data/bxd_spleen_geno.txt";
kinship_filename = "data/GEMMA_data/bxd_spleen_kinship.txt";
output_filename = "results_univariate_LMM";

In [48]:
# Path to the GEMMA executable passed to `run_gemma`.
# The original hard-coded location is kept as the default, but it can be overridden by
# setting the GEMMA_PATH environment variable so the notebook also runs on other machines.
gemma = get(ENV, "GEMMA_PATH", "/home/zyu20/Softwares/gemma-0.98.5-linux-static-AMD64");

In [49]:
# Time a GEMMA run for trait 82 through the `run_gemma` helper. Arguments: the trait as a
# one-column matrix, the genotype matrix, the kinship matrix, the labels ["A", "B"], the
# marker names, the phenotype/genotype/kinship/output file names, and the GEMMA executable.
# (Presumably the helper writes the inputs to those files before invoking GEMMA — see
# run_gemma_utils.jl to confirm.)
@time gemma_one_trait_results = run_gemma(reshape(pheno_processed[:, 82], :, 1), geno_processed, kinship,
                                        ["A", "B"], marker_names,
                                        pheno_filename, geno_filename, kinship_filename, 
                                        output_filename, 
                                        gemma);

GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 79
## number of analyzed individuals = 79
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =     7321
## number of analyzed SNPs         =     7321
Start Eigen-Decomposition...
pve estimate =0.677983
se(pve) =0.202993


**** INFO: Done.


  2.741495 seconds (3.83 M allocations: 329.091 MiB, 38.53% gc time, 10.02% compilation time)


In [53]:
all_results_exact.h2_null_list[82]

0.7817412174810534

In [55]:
# Single-trait `scan` of trait 82 (the trait given to GEMMA), kept for comparison.
@time test_single_trait = scan(reshape(pheno_processed[:, 82], :, 1), geno_processed, kinship);

  0.083102 seconds (81.13 k allocations: 47.366 MiB, 24.75% gc time)


In [56]:
test_single_trait.h2_null

0.7817412174810534

In [58]:
# Side-by-side comparison for trait 82, one row per marker. Columns: GEMMA result,
# single-trait `scan` LOD, `bulkscan_null` result, fine-grid `bulkscan_null_grid` result.
hcat(gemma_one_trait_results, test_single_trait.lod, all_results_exact.L[:, 82], all_results_grid.L[:, 82])

7321×4 Matrix{Float64}:
 0.356661    0.376053     0.376053     0.376463
 0.356661    0.376053     0.376053     0.376463
 0.356661    0.376053     0.376053     0.376463
 0.356661    0.376053     0.376053     0.376463
 0.356661    0.376053     0.376053     0.376463
 0.356661    0.376053     0.376053     0.376463
 0.356661    0.376053     0.376053     0.376463
 0.356661    0.376053     0.376053     0.376463
 0.356661    0.376053     0.376053     0.376463
 0.356661    0.376053     0.376053     0.376463
 0.356661    0.376052     0.376052     0.376463
 0.356494    0.375938     0.375938     0.376349
 0.0715195   0.176568     0.176568     0.177043
 ⋮                                     
 0.00111885  0.0029897    0.0029897    0.0030035
 0.00111885  0.0029897    0.0029897    0.00300349
 0.00111885  0.0029897    0.0029897    0.00300349
 3.41986e-5  0.000736934  0.000736934  0.000756148
 0.248875    0.0381599    0.0381599    0.0377643
 0.248875    0.0381599    0.0381599    0.0377643
 0.199677    0

In [62]:
# meanAbsDiff(x, y): mean of the element-wise absolute differences between `x` and `y`.
# The difference is formed by broadcasting, so the two arguments must have
# broadcast-compatible shapes (e.g. an n×1 matrix and a length-n vector).
function meanAbsDiff(x, y)
    deviations = abs.(x .- y)
    return mean(deviations)
end

meanAbsDiff (generic function with 1 method)

In [63]:
meanAbsDiff(gemma_one_trait_results, test_single_trait.lod)

0.10815214516551538

In [65]:
findmax(abs.(gemma_one_trait_results .- test_single_trait.lod))

(0.8248290873637598, CartesianIndex(4918, 1))

In [66]:
gemma_one_trait_results[4918]

1.9718690158524306

In [67]:
test_single_trait.lod[4918]

1.1470399284886708

In [None]:
plot()

In [60]:
# Positions of any NaN entries in the GEMMA results (empty when there are none).
findall(isnan, gemma_one_trait_results)

CartesianIndex{2}[]

In [61]:
# Positions of any infinite entries in the GEMMA results (empty when there are none).
findall(isinf, gemma_one_trait_results)

CartesianIndex{2}[]