## Data Analysis using BulkLMM - BXD Longevity Study

In [1]:
using CSV, DelimitedFiles, DataFrames, Missings, XLSX
using LinearAlgebra, Statistics, Optim
using Random, Distributions, LoopVectorization
using GeneNetworkAPI, Downloads
using BenchmarkTools

In [2]:
using Plots

In [3]:
local_path = "../../BulkLMM.jl/src";

In [4]:
import LossFunctions: HuberLoss, value
import Distributions: Chisq,ccdf
import StatsBase: mad, sample

"""
    huberize(y::Vector{Float64})

Return a robustly shrunk copy of `y`.

The values are standardized by the median and the normalized median absolute
deviation, passed through `HuberLoss(1)`, converted back to signed scores via
`sign(z) * sqrt(2 * loss)`, and finally mapped back to the original location
and scale.
"""
function huberize(y::Vector{Float64})
    center = median(y)
    spread = mad(y, normalize=true)
    zscores = (y .- center) ./ spread
    # Huber loss of each standardized value (threshold 1).
    huber_loss = value.(HuberLoss(1), zscores)
    # Signed square root of twice the loss gives the shrunk standardized score.
    shrunk = sign.(zscores) .* sqrt.(2 * huber_loss)
    return center .+ spread .* shrunk
end

huberize (generic function with 1 method)

## Load processed_data:

### By individuals:

In [5]:
BXD_pheno_ind_summary_df = CSV.read("data/GN886_pheno_summary.csv", DataFrame);

By individuals, there are 248 individual samples for 32445 liver proteome traits.

In [6]:
println(size(BXD_pheno_ind_summary_df)) 
BXD_pheno_ind_summary_df[1:10, 1:10]

(248, 32448)


Row,Sample,Strain,Strain_num,P42209_DESGLNRK_2,P42209_GLRPLDVAFLR_3,Q99M02_VGDPVYR_2,Q99M02_VWIYPIK_2,Q99M02_LCDPSVK_2,Q99M02_CVLTTVDPDTGIIDR_2,Q99M02_LVQFDTSMK_2
Unnamed: 0_level_1,String7,String7,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,H1009,BXD9,9,11.349,11.534,17.587,17.517,17.309,16.574,16.323
2,H0370,BXD9,9,11.249,12.735,17.427,17.325,17.252,16.685,16.021
3,H2577,BXD9,9,12.415,10.487,17.89,17.488,17.594,16.731,16.208
4,H0365,BXD9,9,11.374,10.674,17.714,17.401,17.381,16.621,16.337
5,H1333,BXD13,13,11.687,11.524,17.362,17.367,17.071,16.465,15.97
6,H2259,BXD24,24,11.837,11.715,17.57,17.792,17.505,16.894,16.277
7,H1792,BXD24,24,11.563,11.434,17.789,17.847,17.416,17.042,16.292
8,H1791,BXD24,24,12.5,12.273,17.944,17.833,17.63,16.862,16.3
9,H1541,BXD24,24,11.815,11.564,17.794,17.759,17.456,16.878,16.377
10,H1277,BXD24,24,12.674,11.743,17.866,17.845,17.577,16.922,16.26


In [8]:
BXD_geno_ind_summary_df = CSV.read("data/GN886_geno_summary.csv", DataFrame);

By individuals, there are 248 individual samples for 7321 markers.

In [9]:
println(size(BXD_geno_ind_summary_df)) 
BXD_geno_ind_summary_df[1:10, 1:10]

(248, 7324)


Row,Sample,Strain,Strain_num,rs31443144,rs6269442,rs32285189,rs258367496,rs32430919,rs36251697,rs30658298
Unnamed: 0_level_1,String7,String7,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,H1009,BXD9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,H0370,BXD9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,H2577,BXD9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,H0365,BXD9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,H1333,BXD13,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,H2259,BXD24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,H1792,BXD24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,H1791,BXD24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,H1541,BXD24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,H1277,BXD24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### By strains:

In [10]:
BXD_pheno_strains_summary_df = CSV.read("data/GN886_pheno_strain_means_summary.csv", DataFrame);

By strains, there are 50 BXD strains for 32445 liver proteome traits.

In [94]:
println(size(BXD_pheno_strains_summary_df)) 
BXD_pheno_strains_summary_df[1:10, 1:10]

(50, 32447)


Row,Strain,Number of Samples,P42209_DESGLNRK_2,P42209_GLRPLDVAFLR_3,Q99M02_VGDPVYR_2,Q99M02_VWIYPIK_2,Q99M02_LCDPSVK_2,Q99M02_CVLTTVDPDTGIIDR_2,Q99M02_LVQFDTSMK_2,Q99M02_QLQQVGTVSK_2
Unnamed: 0_level_1,String7,Int64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,BXD9,4,11.5968,11.3575,17.6545,17.4327,17.384,16.6528,16.2222,16.3227
2,BXD13,1,11.687,11.524,17.362,17.367,17.071,16.465,15.97,15.98
3,BXD24,5,12.0778,11.7458,17.7926,17.8152,17.5168,16.9196,16.3012,16.4304
4,BXD27,3,11.717,11.149,17.336,17.3143,17.165,16.5283,15.9407,16.002
5,BXD29,11,11.735,11.3496,17.4583,17.3877,17.2431,16.4945,16.0869,16.1075
6,BXD32,8,11.6266,11.554,17.8254,17.7716,17.4639,16.9029,16.434,16.425
7,BXD34,13,11.7491,11.9135,17.6108,17.5974,17.2988,16.7761,16.3147,16.2926
8,BXD39,5,11.9158,11.719,17.395,17.385,17.157,16.5226,16.179,16.0594
9,BXD40,8,11.8651,11.6817,17.5986,17.6386,17.3861,16.6824,16.2855,16.2523
10,BXD43,4,12.378,11.4563,17.76,17.8035,17.3652,16.885,16.5245,16.3532


In [12]:
BXD_geno_strains_summary_df = CSV.read("data/GN886_geno_strains_summary.csv", DataFrame);

By strains, there are 50 BXD strains for 7321 markers.

In [13]:
println(size(BXD_geno_strains_summary_df)) 
BXD_geno_strains_summary_df[1:10, 1:10]

(50, 7322)


Row,Strain,rs31443144,rs6269442,rs32285189,rs258367496,rs32430919,rs36251697,rs30658298,rs51852623,rs31879829
Unnamed: 0_level_1,String7,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64,Float64
1,BXD9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BXD13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BXD24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BXD27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,BXD29,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,BXD32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,BXD34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,BXD39,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,BXD40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,BXD43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load functions:

In [14]:
include(joinpath(local_path, "kinship.jl"));
include(joinpath(local_path, "util.jl"));
include(joinpath(local_path, "wls.jl"));
include(joinpath(local_path, "lmm.jl"));
include(joinpath(local_path, "gridbrent.jl"));
include(joinpath(local_path, "transform_helpers.jl"));
include(joinpath(local_path, "scan.jl"));
include(joinpath(local_path, "bulkscan_helpers.jl"));
include(joinpath(local_path, "bulkscan.jl"));
include(joinpath(local_path, "readData.jl"));

## Objectives

- gemma one trait, ind and strains

- scan_alt one trait, ind and strains

- scan_null one trait, ind and strains

- bulkscan_grid (null), all traits, ...

- data structure for all traits results...

- Heatmap grids_taken, compare on strain means and individuals

- Histogram of heritabilities (null) for all traits (both strain means and individual level data)

- Loglikelihood by h2_grid plot, strain means v.s. individuals

- Report fixed effects:

- Adjusting difference in the number of samples across strains

- Histogram of heritabilities (null) for all traits (both strain means and individual level data)

## Run BulkLMM.jl:

### Calculate kinship:

First, calculate the kinship matrix of relatedness among the strains (50-by-50):

In [15]:
geno_strains = Matrix{Float64}(BXD_geno_strains_summary_df[:, 2:end]);

In [16]:
kinship_strains = calcKinship(geno_strains);

Then, construct the kinship matrix of relatedness among the individuals based on which strain they are from:

In [17]:
"""
    calcRepeats(x::AbstractVector{<:Integer})

Count how many times each distinct value occurs in `x`.

# Arguments
- `x`: integer labels (e.g. the strain number of every sample).

# Returns
A `Dict` mapping each distinct value of `x` to its number of occurrences.
An empty `x` yields an empty dictionary.
"""
function calcRepeats(x::AbstractVector{T}) where {T<:Integer}

    counting_dict = Dict{T, Int64}();

    for key in x
        # `get` supplies 0 for a key seen for the first time.
        counting_dict[key] = get(counting_dict, key, 0) + 1;
    end

    return counting_dict

end

calcRepeats (generic function with 1 method)

In [18]:
"""
    calcIndKinship_from_StrainKinship(kinship_strains, strain_info_about_samples)

Expand a strain-level kinship matrix to an individual-level kinship matrix.

Entry `(a, b)` of the result is the kinship between the strain of sample `a`
and the strain of sample `b`, so samples of the same strain form constant
blocks.

# Arguments
- `kinship_strains`: square strain-by-strain kinship matrix. Its `i`-th
  row/column must correspond to the `i`-th distinct strain label in order of
  first appearance in `strain_info_about_samples`.
- `strain_info_about_samples`: the strain label of every sample, in sample order.

# Returns
An `n`-by-`n` matrix, where `n = length(strain_info_about_samples)`. Samples of
the same strain do not need to be stored contiguously; an empty label vector
yields a 0-by-0 matrix.

# Throws
- `BoundsError` if there are more distinct strain labels than rows/columns in
  `kinship_strains`.
"""
function calcIndKinship_from_StrainKinship(kinship_strains::AbstractMatrix{<:Real},
                                           strain_info_about_samples::AbstractVector{<:Integer})

    # Position of each strain label among the distinct labels (first-appearance order).
    strain_position = Dict{eltype(strain_info_about_samples), Int}();
    sample_to_strain = Vector{Int}(undef, length(strain_info_about_samples));

    for (sample_id, strain_label) in enumerate(strain_info_about_samples)
        # A label seen for the first time is assigned the next free position.
        sample_to_strain[sample_id] = get!(strain_position, strain_label,
                                           length(strain_position) + 1);
    end

    # Indexing rows and columns by the per-sample strain position replicates each
    # strain-level entry over the whole block of samples from that pair of strains.
    kinship_ind_from_strains = kinship_strains[sample_to_strain, sample_to_strain];

    return kinship_ind_from_strains
end

calcIndKinship_from_StrainKinship (generic function with 1 method)

In [19]:
BXD_geno_ind_summary_df.Strain_num = Vector{Int64}(BXD_geno_ind_summary_df.Strain_num);

In [20]:
@time kinship_ind_from_strains = calcIndKinship_from_StrainKinship(kinship_strains, BXD_geno_ind_summary_df.Strain_num);

  0.000313 seconds (76 allocations: 503.766 KiB)


Compare the runtime with directly calculating the kinship matrix from individual genotype data:

In [21]:
geno_ind = Matrix{Float64}(BXD_geno_ind_summary_df[:, 4:end]);

In [22]:
@time kinship_ind_from_ind = calcKinship(geno_ind);

  5.802048 seconds (306.28 k allocations: 8.368 GiB, 14.35% gc time)


In [23]:
maximum(abs.(kinship_ind_from_ind .- kinship_ind_from_strains))

0.04186586531894554

### Single-trait genome scans:

To see single-trait genome scan results, we take the 29437-th liver proteome trait as the single trait to analyze:

In [24]:
lp_names = names(BXD_pheno_ind_summary_df)[4:end];

In [25]:
lp_id = 29437;
lp_names[lp_id]

"Q9Z2I8_SSGLPITSAVDLEDAAK_3"

In [26]:
prior = [1.0, 0.1];

#### For individual liver proteome

In [27]:
lp_ind_Y = Matrix{Float64}(BXD_pheno_ind_summary_df[:, 4:end]);
lp_ind_y = reshape(lp_ind_Y[:, lp_id], :, 1);

In [28]:
lpst_ind_Y = colStandardize(lp_ind_Y);
lpst_ind_y = reshape(lpst_ind_Y[:, lp_id], :, 1);

In [29]:
@time scan_results_ind = scan(lpst_ind_y, geno_ind, kinship_ind_from_strains; 
                              prior_variance = prior[1], prior_sample_size = prior[2]);

  4.984315 seconds (14.51 M allocations: 857.320 MiB, 5.89% gc time, 98.18% compilation time)


In [30]:
BLAS.set_num_threads(Threads.nthreads())

In [31]:
BLAS.get_num_threads()

4

In [32]:
@time scan_results_ind_alt = scan(lpst_ind_y, geno_ind, kinship_ind_from_strains;
                                  assumption = "alt", 
                                  prior_variance = prior[1], prior_sample_size = prior[2]);

 11.527491 seconds (11.76 M allocations: 14.368 GiB, 31.64% gc time, 0.22% compilation time)


In [33]:
@time scan_perms_results_ind = scan(lpst_ind_y, geno_ind, kinship_ind_from_strains;
                                    prior_variance = prior[1], prior_sample_size = prior[2],
                                    permutation_test = true, nperms = 1000, original = true);

  0.788294 seconds (1.49 M allocations: 235.071 MiB, 82.23% compilation time)


In [34]:
single_results_ind = DataFrame(hcat(scan_results_ind.lod, scan_results_ind_alt.lod, scan_perms_results_ind[:, 1]), ["scan_null", "scan_alt", "scan_perms.original"]);

#### For liver proteome means by strains

In [35]:
lp_strains_Y = Matrix{Float64}(BXD_pheno_strains_summary_df[:, 3:end]);
lp_strains_y = reshape(lp_strains_Y[:, lp_id], :, 1);

In [36]:
lpst_strains_Y = colStandardize(lp_strains_Y);
lpst_strains_y = reshape(lpst_strains_Y[:, lp_id], :, 1);

In [37]:
@time scan_results_strains = scan(lpst_strains_y, geno_strains, kinship_strains; 
                                  prior_variance = prior[1], prior_sample_size = prior[2]);

  0.031661 seconds (81.02 k allocations: 32.316 MiB)


In [38]:
@time scan_results_strains_alt = scan(lpst_strains_y, geno_strains, kinship_strains;
                                      assumption = "alt",
                                      prior_variance = prior[1], prior_sample_size = prior[2]);

  1.207588 seconds (4.09 M allocations: 1.235 GiB, 18.00% gc time)


In [39]:
@time scan_perms_results_strains = scan(lpst_strains_y, geno_strains, kinship_strains;
                                        permutation_test = true, nperms = 1000, original = true, 
                                        prior_variance = prior[1], prior_sample_size = prior[2]);

  0.080691 seconds (146.57 k allocations: 81.902 MiB, 33.51% compilation time)


In [40]:
single_results_strains = DataFrame(hcat(scan_results_strains.lod, scan_results_strains_alt.lod, scan_perms_results_strains[:, 1]), ["scan_null", "scan_alt", "scan_perms.original"]);

### Multiple-trait genome scans:

In [41]:
BLAS.set_num_threads(Threads.nthreads())

In [42]:
BLAS.get_num_threads()

4

In [43]:
grid_list = collect(0.0:0.01:0.99);

#### For individual liver proteome

In [44]:
@time bulkscan_results_ind_null = bulkscan_null(lpst_ind_Y, geno_ind, kinship_ind_from_strains; nb = Threads.nthreads(),
                                                prior_variance = 1.0, prior_sample_size = 0.1);

628.968913 seconds (2.60 G allocations: 1.813 TiB, 31.29% gc time, 0.03% compilation time)


In [45]:
BLAS.set_num_threads(Threads.nthreads())

In [46]:
@time bulkscan_results_ind_alt_grid = bulkscan_alt_grid(lpst_ind_Y, geno_ind, kinship_ind_from_strains, grid_list);

232.112341 seconds (43.49 M allocations: 208.842 GiB, 14.95% gc time, 0.01% compilation time)


In [47]:
@time bulkscan_results_ind_null_grid = bulkscan_null_grid(lpst_ind_Y, geno_ind, kinship_ind_from_strains, grid_list);

 45.071388 seconds (49.77 M allocations: 41.910 GiB, 65.14% gc time, 2.21% compilation time)


#### For liver proteome means by strains

In [48]:
@time bulkscan_results_strains_null = bulkscan_null(lpst_strains_Y, geno_strains, kinship_strains; nb = Threads.nthreads(),
                                                    prior_variance = 1.0, prior_sample_size = 0.1);

139.307603 seconds (2.61 G allocations: 434.510 GiB, 21.96% gc time)


In [49]:
BLAS.set_num_threads(Threads.nthreads())

In [50]:
@time bulkscan_results_strains_alt_grid = bulkscan_alt_grid(lpst_strains_Y, geno_strains, kinship_strains, grid_list);

140.734533 seconds (43.45 M allocations: 184.223 GiB, 7.57% gc time)


In [51]:
@time bulkscan_results_strains_null_grid = bulkscan_null_grid(lpst_strains_Y, geno_strains, kinship_strains, grid_list);

  7.701381 seconds (43.67 M allocations: 11.782 GiB, 25.24% gc time)


In [52]:
pwd()

"/Users/FredYu/Documents/GitHub/BulkLMM_Analyses/BXDLongevity"

In [57]:
bulkscan_results_strains_null_grid.L

7321×32445 Matrix{Float64}:
 0.545307  0.468929   0.20646     …  0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646        0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646        0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646        0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646        0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646     …  0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646        0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646        0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646        0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646        0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646     …  0.00117692  0.0641232  0.0190642
 0.545307  0.468929   0.20646        0.00117692  0.0641232  0.0190642
 0.241439  0.290214   0.23155        0.00717917  0.0387175  0.0620101
 ⋮                                ⋱                         
 

### Write-in results:

In [59]:
lp_names[lp_id]

"Q9Z2I8_SSGLPITSAVDLEDAAK_3"

In [74]:
output_folder_ind_nh = "BulkLMM_Outputs/Ind_NoHub";
output_folder_strains_nh = "BulkLMM_Outputs/Strains_NoHub";

In [75]:
pwd()

"/Users/FredYu/Documents/GitHub/BulkLMM_Analyses/BXDLongevity"

#### Single_trait:

In [90]:
writedlm(joinpath(output_folder_ind_nh, "single_trait_null.txt"), 
         vcat(scan_results_ind.sigma2_e, scan_results_ind.h2_null, scan_results_ind.lod), '\t');

In [91]:
writedlm(joinpath(output_folder_ind_nh, "single_trait_alt.txt"), 
         hcat(scan_results_ind_alt.h2_each_marker, scan_results_ind_alt.lod), '\t');

In [92]:
writedlm(joinpath(output_folder_strains_nh, "single_trait_null.txt"), 
         vcat(scan_results_strains.sigma2_e, scan_results_strains.h2_null, scan_results_strains.lod), '\t');

In [93]:
writedlm(joinpath(output_folder_strains_nh, "single_trait_alt.txt"), 
         hcat(scan_results_strains_alt.h2_each_marker, scan_results_strains_alt.lod), '\t');

#### Multiple_traits:

In [101]:
b_ind_null = vcat(bulkscan_results_ind_null.h2_null_list', bulkscan_results_ind_null.L);

In [102]:
b_ind_null_grid = vcat(bulkscan_results_ind_null_grid.h2_null_list', bulkscan_results_ind_null_grid.L);

In [None]:
b_ind_null_grid = vcat(bulkscan_results_ind_null_grid.h2_null_list', bulkscan_results_ind_null_grid.L);

In [103]:
writedlm(joinpath(output_folder_ind_nh, "multi_traits_null.txt"), 
         b_ind_null, '\t');

In [104]:
writedlm(joinpath(output_folder_ind_nh, "multi_traits_null_grid.txt"), 
         b_ind_null_grid, '\t');

In [107]:
writedlm(joinpath(output_folder_ind_nh, "multi_traits_alt_grid.txt"), 
         bulkscan_results_ind_alt_grid, '\t');

In [108]:
b_strains_null = vcat(bulkscan_results_strains_null.h2_null_list', bulkscan_results_strains_null.L);

In [109]:
b_strains_null_grid = vcat(bulkscan_results_strains_null_grid.h2_null_list', bulkscan_results_strains_null_grid.L);

In [110]:
b_strains_null_grid = vcat(bulkscan_results_strains_null_grid.h2_null_list', bulkscan_results_strains_null_grid.L);

In [111]:
writedlm(joinpath(output_folder_strains_nh, "multi_traits_null.txt"), 
         b_strains_null, '\t');

In [112]:
writedlm(joinpath(output_folder_strains_nh, "multi_traits_null_grid.txt"), 
         b_strains_null_grid, '\t');

In [113]:
writedlm(joinpath(output_folder_strains_nh, "multi_traits_alt_grid.txt"), 
         bulkscan_results_strains_alt_grid, '\t');

## Examine loglikelihood:

In [None]:
"""
    getLL(y, G, K, markerID, h2; prior = [0.0, 0.0])

Evaluate the log-likelihood at a fixed heritability `h2`, both for the null
model (first column of the rotated design only) and for the model that also
contains marker `markerID`.

# Arguments
- `y`: trait, as a one-column matrix.
- `G`: genotype matrix; column `markerID` is the marker that is tested.
- `K`: kinship matrix.
- `markerID`: column index into `G`.
- `h2`: heritability value at which the log-likelihood is evaluated.
- `prior`: optional prior passed on to `wls`, used to regularize the
  log-likelihood near the upper boundary of `h2`.

# Returns
A named tuple `(ll_null, ll_markerID)` holding the `ell` field of the two
`wls` fits.
"""
function getLL(y::Array{Float64, 2}, G::Array{Float64, 2}, K::Array{Float64, 2}, markerID::Int64, h2::Float64; prior::Array{Float64, 1} = [0.0, 0.0])

    marker_column = reshape(G[:, markerID], :, 1);
    rotated_y, rotated_X, lambdas = transform_rotation(y, marker_column, K; addIntercept = true);
    weights = makeweights(h2, lambdas);

    # With `addIntercept = true` the first column of the rotated design is
    # taken as the intercept-only (null) design.
    null_design = reshape(rotated_X[:, 1], :, 1);

    null_fit = wls(rotated_y, null_design, weights, prior);
    marker_fit = wls(rotated_y, rotated_X, weights, prior);

    return (ll_null = null_fit.ell, ll_markerID = marker_fit.ell)
end

In [None]:
@time opt = optimize(x -> -getLL(lpst_strains_y, geno_strains, kinship_strains, 1997, x).ll_null, 0.0, 1.0)

In [None]:
@time gridopt = gridbrent(x -> -getLL(lpst_strains_y, geno_strains, kinship_strains, 1997, x).ll_null, 0.0, 1.0, 10);

In [None]:
@time ml_null = scan(lpst_strains_y, geno_strains, kinship_strains; 
                     prior_variance = prior[1], prior_sample_size = prior[2]);

In [None]:
@time reml_null = scan(lpst_strains_y, geno_strains, kinship_strains; 
                       reml = true, 
                       prior_variance = prior[1], prior_sample_size = prior[2]);

In [None]:
@time ml_alt = scan(lpst_strains_y, geno_strains, kinship_strains; 
                    reml = false, assumption = "alt", 
                    prior_variance = prior[1], prior_sample_size = prior[2]);

In [None]:
@time reml_alt = scan(lpst_strains_y, geno_strains, kinship_strains; 
                      reml = true, assumption = "alt",
                      prior_variance = prior[1], prior_sample_size = prior[2]);

In [None]:
plot(reml_null.lod[lookRange])
plot!(reml_alt.lod[lookRange])

In [None]:
plot(reml_null.lod .- reml_alt.lod)

In [None]:
opt.minimizer

In [None]:
opt.minimum

In [None]:
-getLL(lp_strains_y_hub, geno_strains, kinship_strains, 1997, 0.0).ll_null

In [None]:
opt.minimum <= -getLL(lp_strains_y_hub, geno_strains, kinship_strains, 1997, 0.0).ll_null

In [None]:
gridopt

In [None]:
prior = [1.0, 0.1];

In [None]:
# Log-likelihoods for marker 1997 on the h2 grid 0.0:0.001:0.999 (1000 grid points),
# for the individual-level data and for the strain means.
ind_ll = map(h2 -> getLL(lpst_ind_y, geno_ind, kinship_ind_from_strains, 1997, h2; prior = prior), collect(0.0:0.001:0.999));
strains_ll = map(h2 -> getLL(lpst_strains_y, geno_strains, kinship_strains, 1997, h2; prior = prior), collect(0.0:0.001:0.999));

# Only the first 999 grid points are kept, so entry k belongs to h2 = (k - 1) * 0.001.
ind_ell_list = Float64[ind_ll[k].ll_null for k in 1:999];
strains_ell_list = Float64[strains_ll[k].ll_null for k in 1:999];
strains_ell_list_alt = Float64[strains_ll[k].ll_markerID for k in 1:999];

In [None]:
getLL(lp_strains_y_hub, geno_strains, kinship_strains, 1997, 0.6011922559652536)

In [None]:
plot(strains_ell_list, label = "null")
scatter!()
plot!(strains_ell_list_alt, label = "alt")

In [None]:
plot(strains_ell_list) # covariates+marker

In [None]:
findmax(strains_ell_list)

In [None]:
# `ind_ell_list[k]` and `strains_ell_list[k]` hold the log-likelihood evaluated at
# h2 = (k - 1) * 0.001 (the first 999 points of the grid 0.0:0.001:0.999), so the
# x-axis has to start at 0.0 rather than at 0.001.
plot((0:998) .* 0.001, ind_ell_list, xlabel = "h2", ylabel = "loglik", label = "LP individuals", color = "blue", legend=:bottomleft)
plot!((0:998) .* 0.001, strains_ell_list, xlabel = "h2", ylabel = "loglik", label = "LP strain means", color = "red")

## Examine hsqs, maxLods:

In [None]:
"""
    getMaxLODs(L::AbstractMatrix{<:Real})

For every column of `L`, find the largest entry and the row where it occurs.

# Arguments
- `L`: matrix of LOD scores; the maximum is taken down each column.

# Returns
A named tuple `(max_markers, max_lod)`: `max_markers[i]` is the row index of
the maximum of column `i` (the first one in case of ties) and `max_lod[i]` is
that maximum. Both vectors are empty when `L` has no columns.
"""
function getMaxLODs(L::AbstractMatrix{T}) where {T<:Real}

    m = size(L, 2);
    max_marker = Vector{Int64}(undef, m);
    max_lod = Vector{T}(undef, m);

    # `eachcol` yields views, so no column is copied while searching.
    for (i, column) in enumerate(eachcol(L))
        max_lod[i], max_marker[i] = findmax(column);
    end

    return (max_markers = max_marker, max_lod = max_lod);
end

In [None]:
@time begin
    max_results_ind = getMaxLODs(L_ind);
    max_results_strains = getMaxLODs(L_strains);
end;

In [None]:
@time bulkscan_results_ind_null_4 = bulkscan_null(lpst_ind_Y, geno_ind, kinship_ind_from_strains; nb = Threads.nthreads(),
                                                  prior_variance = 1.0, prior_sample_size = 0.1, optim_interval = 4);

In [None]:
@time bulkscan_results_null_strains_4 = bulkscan_null(lpst_strains_Y, geno_strains, kinship_strains; nb = Threads.nthreads(),
                                                      prior_variance = 1.0, prior_sample_size = 0.1, optim_interval = 4);

In [None]:
L_ind = bulkscan_results_ind_null.L;
L_strains = bulkscan_results_strains_null.L;
h2s_ind = bulkscan_results_ind_null.h2_null_list;
h2s_strains = bulkscan_results_strains_null.h2_null_list;

In [None]:
L_ind_4 = bulkscan_results_ind_null_4.L;
L_strains_4 = bulkscan_results_null_strains_4.L;
h2s_ind_4 = bulkscan_results_ind_null_4.h2_null_list;
h2s_strains_4 = bulkscan_results_null_strains_4.h2_null_list;

### Heritabilities:

In [None]:
findall(abs.(h2s_strains .- h2s_strains_4) .>= 0.5)

In [None]:
findmax(abs.(h2s_strains .- h2s_strains_4))

In [None]:
largest_h2diff_strains_y = reshape(lpst_strains_Y[:, 6], :, 1);

In [None]:
@time test_scan_results_opt1 = scan(largest_h2diff_strains_y, geno_strains, kinship_strains; 
                             prior_variance = 1.0, prior_sample_size = 0.1, optim_interval = 1);

In [None]:
@time test_scan_results_opt2 = scan(largest_h2diff_strains_y, geno_strains, kinship_strains; 
                             prior_variance = 1.0, prior_sample_size = 0.1, optim_interval = 2);

In [None]:
@time test_scan_results_opt4 = scan(largest_h2diff_strains_y, geno_strains, kinship_strains; 
                             prior_variance = 1.0, prior_sample_size = 0.1, optim_interval = 4);

In [None]:
@time test_scan_results_opt10 = scan(largest_h2diff_strains_y, geno_strains, kinship_strains; 
                             prior_variance = 1.0, prior_sample_size = 0.1, optim_interval = 10);

In [None]:
test_scan_results_opt1.h2_null

In [None]:
test_scan_results_opt2.h2_null

In [None]:
test_scan_results_opt4.h2_null

In [None]:
test_scan_results_opt10.h2_null

In [None]:
getLL(largest_h2diff_strains_y, geno_strains, kinship_strains, 1, 0.0)

In [None]:
getLL(largest_h2diff_strains_y, geno_strains, kinship_strains, 1, 0.66)

In [None]:
# Log-likelihoods of the strain-mean trait stored in `largest_h2diff_strains_y`,
# for marker 1, on the h2 grid 0.0:0.001:0.999 (1000 grid points).
# (The individual-level counterpart is intentionally not computed here.)
l_strains_ll = map(h2 -> getLL(largest_h2diff_strains_y, geno_strains, kinship_strains, 1, h2;
                               prior = [1.0, 0.1]),
                   collect(0.0:0.001:0.999));

# Only the first 999 grid points are kept, so entry k belongs to h2 = (k - 1) * 0.001.
l_strains_ell_list = Float64[l_strains_ll[k].ll_null for k in 1:999];
l_strains_ell_list_alt = Float64[l_strains_ll[k].ll_markerID for k in 1:999];

In [None]:
plot(l_strains_ell_list)

In [None]:
plot(exp.(l_strains_ell_list .- maximum(l_strains_ell_list)))

In [None]:
plot(exp.(l_strains_ell_list .- maximum(l_strains_ell_list)))

### Maximum lod scores:

In [None]:
histogram(max_results_ind.max_lod, label = "Maximum lod - Individual LP")

In [None]:
histogram(max_results_strains.max_lod, label = "Maximum lod - Strain mean LP")

In [None]:
# Per-trait difference of the maximum LOD (individual-level minus strain means).
plot(max_results_ind.max_lod .- max_results_strains.max_lod, label = "Max lod: ind - strains")
# Horizontal reference line at zero; `hline!` takes the y-values as a vector.
hline!([0.0], label = "")

In [None]:
mean(max_results_ind.max_lod .- max_results_strains.max_lod .>= 0.0)

## Run GEMMA for single-trait scans:

In [None]:
pwd()

In [None]:
gemma = "/home/zyu20/Softwares/gemma-0.98.5-linux-static-AMD64"

In [None]:
run(`$gemma -h`)

In [None]:
"""
    transform_bxd_pheno_to_gemma2(inputfile, outputfile, iter, startCol = 3)

Read a phenotype summary CSV and write one trait, one value per line, to
`outputfile` for use as a GEMMA phenotype file.

# Arguments
- `inputfile`: path of the phenotype summary CSV.
- `outputfile`: path of the text file to write.
- `iter`: index of the trait to export, counted from the first trait column.
- `startCol`: index of the first trait column in the CSV. The default `3`
  matches the strain-means layout (`Strain`, `Number of Samples`, traits...);
  the individual-level layout (`Sample`, `Strain`, `Strain_num`, traits...)
  needs `startCol = 4`.

# Returns
The full trait matrix (all columns from `startCol` on) as `Matrix{Float64}`.

# Throws
- `BoundsError` if `iter` is not a valid trait index; this is checked before
  `outputfile` is opened, so an existing file is not truncated in that case.
"""
function transform_bxd_pheno_to_gemma2(inputfile::AbstractString, outputfile::AbstractString, iter::Int64,
                                       startCol::Int64 = 3)
    pheno_df = CSV.read(inputfile, DataFrame);
    pheno = Matrix{Float64}(pheno_df[:, startCol:end]);

    # Validate the trait index before touching the output file.
    checkbounds(pheno, :, iter);

    open(outputfile, "w") do io
        writedlm(io, pheno[:, iter])
    end
    return pheno
end

In [None]:
"""
    transform_bxd_geno_to_gemma2(inputfile, outputfile, startCol)

Convert a genotype summary CSV into a marker-by-sample table and write it with
`writeToFile`.

Each output row consists of the marker name, the allele labels `"A"` and `"B"`,
and the genotype values of all samples multiplied by 2.

# Arguments
- `inputfile`: path of the genotype summary CSV (samples in rows).
- `outputfile`: path handed to `writeToFile`.
- `startCol`: index of the first marker column in the CSV.

# Returns
The table that was written (markers in rows).
"""
function transform_bxd_geno_to_gemma2(inputfile::AbstractString, outputfile::AbstractString, 
                                      startCol::Int64)
    geno_df = CSV.read(inputfile, DataFrame);

    marker_names = names(geno_df)[startCol:end];
    # Genotype values are doubled before being written.
    doubled_geno = Matrix{Float64}(2 .* geno_df[:, startCol:end]);

    n_markers = length(marker_names);
    minor_allele = fill("A", n_markers, 1);
    major_allele = fill("B", n_markers, 1);

    # Annotation columns first, then one column per sample (markers in rows).
    annotation = hcat(marker_names, minor_allele, major_allele);
    output = hcat(annotation, transpose(doubled_geno))

    writeToFile(output, outputfile)
    return output
end

In [None]:
lp_id

In [None]:
# transform_bxd_geno_to_gemma2("data/GN886_geno_strains_summary.csv", "data/GEMMA_data/GN886_geno_strains_summary.txt", 2);
# transform_bxd_geno_to_gemma2("data/GN886_geno_summary.csv", "data/GEMMA_data/GN886_geno_ind_summary.txt", 4);

In [None]:
lp_id = 29437;

In [None]:
transform_bxd_pheno_to_gemma2("data/GN886_pheno_strain_means_summary.csv", "data/GEMMA_data/GN886_pheno_strains_summary.txt", lp_id);
# The individual-level file has three leading metadata columns (Sample, Strain, Strain_num),
# while the helper slices the table from the 3rd column onward. Strain_num therefore becomes
# column 1 of the matrix it builds and trait `lp_id` sits at column `lp_id + 1`.
transform_bxd_pheno_to_gemma2("data/GN886_pheno_summary.csv", "data/GEMMA_data/GN886_pheno_ind_summary.txt", lp_id + 1);

In [None]:
# writedlm("data/GEMMA_data/kinship_strains.txt", kinship_strains, '\t')
# writedlm("data/GEMMA_data/kinship_ind.txt", kinship_ind_from_strains, '\t')

In [None]:
"""
    p2lod(pval::Real, df::Integer)

Convert a p-value from a chi-square test with `df` degrees of freedom into a
LOD score.

The likelihood-ratio statistic is the upper-tail quantile of the chi-square
distribution at `pval`; dividing it by `2 * log(10)` gives the LOD score.

# Arguments
- `pval`: p-value (upper-tail probability).
- `df`: degrees of freedom of the chi-square reference distribution.

# Returns
The LOD score corresponding to `pval`.
"""
function p2lod(pval::Real, df::Integer)

    # Ask for the upper-tail quantile directly. Going through `log(1 - pval)`
    # loses all precision for very small p-values (`1 - pval` rounds to 1),
    # which turns strong signals into an infinite LOD score.
    lrs = cquantile(Chisq(df), pval)
    lod = lrs/(2*log(10))

    return lod

end

In [None]:
"""
    gemmaWrapper(pheno_filename, geno_filename, kinship_filename, output_filename)

Run the GEMMA executable stored in the global `gemma` on the given phenotype,
genotype and kinship files, with the options `-lmm 2 -lmax 1000000`, and pass
`output_filename` as the `-o` output name.

# Returns
The value returned by `run` for the GEMMA command.
"""
function gemmaWrapper(pheno_filename::String, geno_filename::String,
                      kinship_filename::String, output_filename::String)

    # Assemble the command first; backtick interpolation keeps every file name a single argument.
    gemma_cmd = `$gemma -g $geno_filename -p $pheno_filename -k $kinship_filename -lmm 2 -lmax 1000000 -o $output_filename`

    return run(gemma_cmd)

end

In [None]:
@time gemmaWrapper("data/GEMMA_data/GN886_pheno_strains_summary.txt", 
                   "data/GEMMA_data/GN886_geno_strains_summary.txt",
                   "data/GEMMA_data/kinship_strains.txt",
                   "results_strains.txt");

In [None]:
@time gemmaWrapper("data/GEMMA_data/GN886_pheno_ind_summary.txt", 
                   "data/GEMMA_data/GN886_geno_ind_summary.txt",
                   "data/GEMMA_data/kinship_ind.txt",
                   "results_ind.txt");

In [None]:
sum(eigen(kinship_ind_from_strains).values.<0.0001)

In [None]:
gemma_results_strains = readdlm("output/results_strains.txt.assoc.txt", '\t');
gemma_results_ind = readdlm("output/results_ind.txt.assoc.txt", '\t');

In [None]:
gemma_strains = gemma_results_strains[2:end, 10] |> x -> Array{Float64}(x);
gemma_ind = gemma_results_ind[2:end, 10] |> x -> Array{Float64}(x);

In [None]:
gemma_strains_lod = p2lod.(gemma_strains, 1);
gemma_ind_lod = p2lod.(gemma_ind, 1);

In [None]:
# Side-by-side LOD scores for trait `lp_id` (individual-level data):
# GEMMA, single-trait scan, and the grid-based null bulk scan.
hcat(gemma_ind_lod, scan_results_ind.lod, bulkscan_results_ind_null_grid.L[:, lp_id])

In [None]:
# Side-by-side LOD scores for trait `lp_id` (strain means):
# GEMMA, single-trait scan, and the null bulk scan.
hcat(gemma_strains_lod, scan_results_strains.lod, bulkscan_results_strains_null.L[:, lp_id])

In [None]:
lookRange = 1:7321

In [None]:
# Overlay the GEMMA LOD scores and the BulkLMM null bulk-scan LOD scores
# for trait `lp_id` (strain means) over the markers in `lookRange`.
plot(gemma_strains_lod[lookRange], label = "GEMMA_strains");
plot!(bulkscan_results_strains_null.L[lookRange, lp_id], label = "BulkLMM_strains")

In [None]:
scan_results_ind.h2_null