In [1]:
using CSV, DelimitedFiles, DataFrames, Missings, XLSX
using LinearAlgebra, Statistics, Optim, StatsBase
using Random, Distributions, LoopVectorization
using GeneNetworkAPI, Downloads
using BenchmarkTools

In [2]:
using Plots

In [3]:
pwd()

"/home/zyu20/git/BulkLMM_Analyses/HS-Palmer"

In [4]:
local_path = "../../BulkLMM.jl/src";

In [5]:
include(joinpath(local_path, "kinship.jl"));
include(joinpath(local_path, "util.jl"));
include(joinpath(local_path, "wls.jl"));
include(joinpath(local_path, "lmm.jl"));
include(joinpath(local_path, "gridbrent.jl"));
include(joinpath(local_path, "transform_helpers.jl"));
include(joinpath(local_path, "scan.jl"));
include(joinpath(local_path, "bulkscan_helpers.jl"));
include(joinpath(local_path, "bulkscan.jl"));
include(joinpath(local_path, "readData.jl"));
include(joinpath(local_path, "../plot_utils/visuals_utils.jl"));
include(joinpath(local_path, "analysis_helpers/single_trait_analysis.jl"));
include("../../BigRiver_util_code/src/kinship_utils.jl");
include("../../BigRiver_util_code/src/run_gemma_utils.jl");

## Omics traits:

In [6]:
@time omics_pheno_df = CSV.read("/home/zyu20/shareddata/HSNIH-Palmer/HSNIH-Rat-PL-RSeq-0818_nomissing.csv", DataFrame);

 13.052676 seconds (8.19 M allocations: 467.961 MiB, 0.95% gc time, 90.87% compilation time)


In [7]:
names(omics_pheno_df)[1:6]

6-element Vector{String}:
 "id"
 "ENSRNOG00000000001"
 "ENSRNOG00000000007"
 "ENSRNOG00000000008"
 "ENSRNOG00000000009"
 "ENSRNOG00000000010"

In [8]:
sample_ids = omics_pheno_df[:, 1];

In [9]:
omics_pheno = omics_pheno_df[1:end, 2:end] |> Matrix{Float64};

Remove missings or columns of all 0's...

In [10]:
no_value_traits = vec(sum(omics_pheno, dims = 1) .== 0.0);

In [11]:
omics_pheno_processed = omics_pheno[:, map(x -> x .== 0.0, no_value_traits)];

In [12]:
grid_loose = collect(0.0:0.05:0.95);

## HS Genotypes：

Takes too long...

In [13]:
# @time hs_geno = parse_geno("/home/zyu20/shareddata/HSNIH-Palmer/HSNIH-Palmer_true.geno")

In [14]:
open("/home/zyu20/shareddata/HSNIH-Palmer/HSNIH-Palmer_true.geno") do io
           readline(io) # throw out the first line
           readline(io)
           # readlines(io)
end

"@name:HSNIH-Palmer"

In [15]:
count = 0;
test_line = readline("/home/zyu20/shareddata/HSNIH-Palmer/HSNIH-Palmer_true.geno")

for line in eachline("/home/zyu20/shareddata/HSNIH-Palmer/HSNIH-Palmer_true.geno")
    
    count = count + 1;
    println(line)
    
    test_line = line
    
    if count == 8
        break
    end
    
end

@type:riset
@name:HSNIH-Palmer
@mat:0
@pat:2
#heterozygous, optional, default is \"H\"
@het:1
#Unknown, optional, default is \"U\"
@unk:U


In [16]:
geno_colnames = split(test_line, '\t')

1-element Vector{SubString{String}}:
 "@unk:U"

In [17]:
size(geno_colnames)

(1,)

In [18]:
@time cols_to_extract_from_fullgeno = map(x -> x in sample_ids, geno_colnames);

  0.170417 seconds (431.58 k allocations: 22.404 MiB, 99.80% compilation time)


In [19]:
sum(cols_to_extract_from_fullgeno)

0

In [20]:
cols_ids_in_geno = findall(cols_to_extract_from_fullgeno .== 1) 

Int64[]

In [21]:
#=
@time begin
    myfile = open("my_genofile.txt", "w")
    row_count = 0
    for line in eachline("/home/zyu20/shareddata/HSNIH-Palmer/HSNIH-Palmer_true.geno")

        row_count = row_count + 1;

        if row_count >= 9
            col_count = 0;
            words_in_curr_line = split(line, '\t');
            for word in words_in_curr_line
                col_count = col_count + 1;
                if col_count in cols_ids_in_geno
                    to_write = word * "\t";
                    write(myfile, to_write);
                end
            end
            write(myfile, "\n")
        end
    end

    close(myfile)
end
=#

In [22]:
@time omics_geno = readdlm("my_genofile.txt", '\t')[2:end, 1:(end-1)]

 13.409058 seconds (192.84 M allocations: 5.101 GiB, 6.31% gc time, 4.01% compilation time)


134918×80 Matrix{Any}:
 2        2        1        2        …  2        1        2        2
 2        2        2        2           2        2        2        2
 2        2        2        1           2        2        2        2
 2        2        2        1           2        2        2        2
 2        2        2        1           2        2        2        1
 2        2        2        2        …  2        2        1        2
 2        2        2        2           2        2        1        2
 0.997    1.976    0.991    1.979       1.979    1.979    1.979    0.007
 1.941    1.559    1.006    1.994       1.993    1.994    1.993    1.998
 1.978    1.992    1.997    1.994       1.993    1.993    1.993    1.997
 0.031    0.39     0.99298  0.008    …  0.01     0.009    0.01     0.003
 1.997    1.997    1.998    1.997       1.997    1.997    1.997    1.998
 1.997    1.997    1.006    1.997       1.997    1.997    1.997    1.998
 ⋮                                   ⋱                  

In [23]:
NA_entries = findall(x -> typeof(x) == SubString{String}, omics_geno)

1494-element Vector{CartesianIndex{2}}:
 CartesianIndex(21302, 1)
 CartesianIndex(21316, 1)
 CartesianIndex(35852, 1)
 CartesianIndex(35855, 1)
 CartesianIndex(78338, 1)
 CartesianIndex(78340, 1)
 CartesianIndex(78341, 1)
 CartesianIndex(78363, 1)
 CartesianIndex(95701, 1)
 CartesianIndex(95704, 1)
 CartesianIndex(95731, 1)
 CartesianIndex(95812, 1)
 CartesianIndex(21302, 2)
 ⋮
 CartesianIndex(95776, 80)
 CartesianIndex(95777, 80)
 CartesianIndex(95779, 80)
 CartesianIndex(95781, 80)
 CartesianIndex(95782, 80)
 CartesianIndex(95784, 80)
 CartesianIndex(95787, 80)
 CartesianIndex(95812, 80)
 CartesianIndex(109662, 80)
 CartesianIndex(124557, 80)
 CartesianIndex(124558, 80)
 CartesianIndex(130075, 80)

There are still some missings coded as "NA"'s...

In [24]:
omics_geno[21302, 1]

"NA"

In [25]:
omics_geno[130075, 80]

"NA"

In [26]:
NA_markers = zeros(length(NA_entries));

for i in 1:length(NA_entries)
    NA_markers[i] = NA_entries[i][1];
end

NA_markers = trunc.(Int64, NA_markers);

In [27]:
NA_markers

1494-element Vector{Int64}:
  21302
  21316
  35852
  35855
  78338
  78340
  78341
  78363
  95701
  95704
  95731
  95812
  21302
      ⋮
  95776
  95777
  95779
  95781
  95782
  95784
  95787
  95812
 109662
 124557
 124558
 130075

In [28]:
omics_geno_nomissing = omics_geno[map(x -> !(x in NA_markers), collect(1:size(omics_geno, 1))), :]';

In [29]:
# writedlm("/home/zyu20/shareddata/HSNIH-Palmer/HSNIH-Palmer_true_omics_geno_nomissing.txt", omics_geno_nomissing, '\t')

In [30]:
omics_geno_nomissing = float.(omics_geno_nomissing)./2.0;

In [31]:
@time maf_omics_geno = vec(mean(omics_geno_nomissing; dims = 1));

  0.205472 seconds (502.51 k allocations: 26.978 MiB, 94.42% compilation time)


In [32]:
to_keep = map(x -> (x < 0.95) & (x > 0.05), maf_omics_geno);

In [33]:
omics_geno_processed = omics_geno_nomissing[:, to_keep];

In [34]:
(n, m) = size(omics_pheno_processed);
p = size(omics_geno_processed, 2);

In [35]:
n, m, p

(80, 18416, 117618)

## Load BulkLMM functions:

In [36]:
include(joinpath(local_path, "kinship.jl"));
include(joinpath(local_path, "util.jl"));
include(joinpath(local_path, "wls.jl"));
include(joinpath(local_path, "lmm.jl"));
include(joinpath(local_path, "gridbrent.jl"));
include(joinpath(local_path, "transform_helpers.jl"));
include(joinpath(local_path, "scan.jl"));
include(joinpath(local_path, "bulkscan_helpers.jl"));
include(joinpath(local_path, "bulkscan.jl"));
include(joinpath(local_path, "readData.jl"));
include(joinpath(local_path, "../plot_utils/visuals_utils.jl"));
include(joinpath(local_path, "analysis_helpers/single_trait_analysis.jl"));
include("../../BigRiver_util_code/src/kinship_utils.jl");
include("../../BigRiver_util_code/src/run_gemma_utils.jl");

In [37]:
@time omics_kinship = calcKinship(omics_geno_processed);

 10.215746 seconds (31.60 k allocations: 13.847 GiB, 10.20% gc time)


In [38]:
@time scan_null_results = scan(reshape(omics_pheno_processed[:, 1997], :, 1), omics_geno_processed, omics_kinship;
                               optim_interval = 10);

  1.475880 seconds (1.97 M allocations: 799.621 MiB, 9.97% gc time, 31.00% compilation time)


In [39]:
# @time bulkscan_null_results = bulkscan_null(omics_pheno_nozeros, omics_geno_nomissing_filtered, omics_kinship);

In [44]:
grid_loose = collect(0.00:0.05:0.95);

In [45]:
grid_fine = collect(0.00:0.001:0.999);

In [41]:
@time bulkscan_null_grid_results = bulkscan_null_grid(omics_pheno_processed, omics_geno_processed, omics_kinship, grid_loose;
                                                      reml = false);

 70.045321 seconds (227.92 M allocations: 95.681 GiB, 7.65% gc time, 3.55% compilation time)


In [46]:
BLAS.get_num_threads()

8

In [47]:
BLAS.set_num_threads(4)

In [48]:
# @time bulkscan_alt_grid_results = bulkscan_alt_grid(omics_pheno_nozeros, omics_geno_nomissing_filtered, omics_kinship, grid_loose);

In [49]:
scan_null_results.h2_null

1.6099624375063655e-15

In [50]:
bulkscan_null_grid_results.h2_null_list[1997]

0.0

In [51]:
quantile(bulkscan_null_grid_results.h2_null_list, collect(0.1:0.1:1.0))

10-element Vector{Float64}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.999

In [52]:
hcat(scan_null_results.lod, bulkscan_null_grid_results.L[:, 1997])

117618×2 Matrix{Float64}:
 0.618842   0.618842
 0.161133   0.161133
 0.102444   0.102444
 0.656556   0.656556
 0.0857209  0.0857209
 0.598879   0.598879
 0.418926   0.418926
 0.543063   0.543063
 0.0715733  0.0715733
 0.0820948  0.0820948
 0.342029   0.342029
 0.272401   0.272401
 0.643292   0.643292
 ⋮          
 0.0353781  0.0353781
 0.0180401  0.0180401
 0.0178843  0.0178843
 0.0179356  0.0179356
 0.0176449  0.0176449
 0.0180401  0.0180401
 0.0932019  0.0932019
 0.609019   0.609019
 0.307771   0.307771
 0.328802   0.328802
 0.3583     0.3583
 0.358103   0.358103

In [49]:
# bulkscan_alt_grid_results.h2_panel

## GEMMA:

In [53]:
gemma = "/home/zyu20/Softwares/gemma-0.98.5-linux-static-AMD64"

"/home/zyu20/Softwares/gemma-0.98.5-linux-static-AMD64"

In [54]:
rand_samples = sample(1:m, 1000);

In [55]:
pheno_filename = "data/GEMMA_data/hs_omics_pheno.txt";
geno_filename = "data/GEMMA_data/hs_omics_geno.txt";
kinship_filename = "data/GEMMA_data/hs_omics_kinship.txt";
output_filename = "results_univariate_LMM";

In [56]:
marker_names = "marker: " .* string.(collect(1:p));

In [None]:
@time gemma_samples_results = run_gemma(omics_pheno_processed[:, rand_samples], omics_geno_processed, omics_kinship,
                                  ["A", "B"], marker_names,
                                  pheno_filename, geno_filename, kinship_filename, 
                                  output_filename, 
                                  gemma);

GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =0.135784
se(pve) =0.627626


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =0.999997
se(pve) =0.000478139


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =0.965651
se(pve) =0.776553


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =0.230465
se(pve) =0.559326


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =0.0475157
se(pve) =0.686069


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =3.5548e-06
se(pve) =0.760351


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =3.5548e-06
se(pve) =0.521191


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =0.587294
se(pve) =0.680588


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =0.568454
se(pve) =0.703232


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =3.5548e-06
se(pve) =0.674689


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =0.125336
se(pve) =0.648045


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =nan
se(pve) =nan


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =3.5548e-06
se(pve) =0.661099


**** INFO: Done.


GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021
Reading Files ... 
## number of total individuals = 80
## number of analyzed individuals = 80
## number of covariates = 1
## number of phenotypes = 1
## number of total SNPs/var        =   117618
## number of analyzed SNPs         =   117618
Start Eigen-Decomposition...
pve estimate =3.5548e-06
se(pve) =0.0899421


In [None]:
gemma_samples_results

In [None]:
bulkscan_null_grid_results.L[:, rand_samples]

In [None]:
sum(abs.(bulkscan_null_grid_results.L[:, rand_samples] .- gemma_samples_results))

#### Issues with data found so far:

- Identical genotype probabilities at some markers for all samples;