In [2]:
#using Pkg
#Pkg.add("GeneNetworkAPI")

In [3]:
using Helium;
using DataFrames;
using CSV;
using GeneNetworkAPI;
using DelimitedFiles;
using LinearAlgebra;
using Optim;
using Distributions;
using BenchmarkTools;
using Random;
using Plots;

In [4]:
pwd()

"/home/xyu/github/BulkLMM.jl/analysis/HSNIH-Palmer"

In [5]:
include("../../src/readData.jl");

In [6]:
pwd()

"/home/xyu/github/BulkLMM.jl/analysis/HSNIH-Palmer"

In [7]:
pheno_10441 = get_pheno("HSNIH-Palmer", "10441");

In [8]:
size(pheno_10441)

(4099, 4)

In [9]:
pheno_10441[1:6, :]

Unnamed: 0_level_0,data_id,sample_name,sample_name_2,value
Unnamed: 0_level_1,Int64,String,String,Float64
1,41013852,00071F4FAF,00071F4FAF,67.0
2,41013852,00071FB160,00071FB160,32.0
3,41013852,0007207A73,0007207A73,70.0
4,41013852,0007213055,0007213055,45.0
5,41013852,0007213615,0007213615,59.0
6,41013852,00072136BA,00072136BA,38.0


In [10]:
geno_filename = "HSNIH-Palmer.he";

In [11]:
geno_mat = readhe(geno_filename);

In [12]:
size(geno_mat)

(134918, 6147)

## Check for infinite values in the genotype array:

In [17]:
sum(geno_mat)

NaN

In [18]:
NaN_to_remove = zeros(size(geno_mat, 1));

In [19]:
for p = 1:size(geno_mat, 1)
    
    if any(!isfinite, geno_mat[p, :])
        
        NaN_to_remove[p] = 1
        
    end
    
end

LoadError: InterruptException:

In [20]:
sum(NaN_to_remove) # 282 markers have infinite values; we simply remove those rows

259.0

In [21]:
geno_mat = geno_mat[NaN_to_remove .!= 1, :];
size(geno_mat)

(134659, 6147)

In [22]:
geno_supp = Helium.getsupp(geno_filename);
size(geno_supp)

LoadError: KeyError: key "302e39392c302e39" not found

In [23]:
geno_supp[1:10, :]

LoadError: UndefVarError: geno_supp not defined

In [24]:
geno_supp_header = geno_supp[1, :];
geno_supp = geno_supp[2:end, :];

LoadError: UndefVarError: geno_supp not defined

In [25]:
geno_supp = geno_supp[NaN_to_remove .!= 1, :]

LoadError: UndefVarError: geno_supp not defined

In [26]:
geno_sampleID = Helium.getcolnames(geno_filename);

LoadError: KeyError: key "302e39392c302e39" not found

In [27]:
pheno_sampleID = pheno_10441.sample_name;

In [28]:
match_id = Array{Int, 1}(undef, 6147);

In [29]:
@time begin 
    for i in 1:6147
    
    indicator = 0;
    for j in 1:4099
        if geno_sampleID[i] == pheno_sampleID[j]
            indicator = indicator + 1;
        end
    end
    
    match_id[i] = indicator;
end
end

LoadError: UndefVarError: geno_sampleID not defined

In [30]:
sum(match_id)

2332864606977916928

In [31]:
typeof(match_id)

Vector{Int64}[90m (alias for [39m[90mArray{Int64, 1}[39m[90m)[39m

In [32]:
samples_in_common = geno_sampleID[match_id .==1];

LoadError: UndefVarError: geno_sampleID not defined

In [33]:
match_id_pheno = Array{Int, 1}(undef, 4099);

In [34]:
@time begin 
    for i in 1:length(match_id_pheno)
    
    indicator = 0;
    for j in 1:length(samples_in_common)
        if pheno_sampleID[i] == samples_in_common[j]
            indicator = indicator + 1;
        end
    end
    
    match_id_pheno[i] = indicator;
end
end

LoadError: UndefVarError: samples_in_common not defined

In [35]:
sum(match_id_pheno)

37141831977676648

In [36]:
size(geno_mat)

(134659, 6147)

In [37]:
size(pheno_10441)

(4099, 4)

In [38]:
## subset geno:
geno_common = geno_mat[:, match_id .== 1];
## subset pheno:
pheno_common = pheno_10441[match_id_pheno .== 1, :];

In [39]:
size(pheno_common)

(0, 4)

In [40]:
size(geno_common)

(134659, 0)

Finally, we got a sample genofile with 4094 individuals, and 134918 genotype markers.

In [41]:
n = size(geno_common, 2)

0

In [42]:
mean_freq = sum(geno_common; dims = 2)./(2*n);

In [43]:
less_common_markers = map(x -> (x > 0.90) || (x < 0.10), mean_freq)[:, 1];

In [44]:
typeof(less_common_markers)

Vector{Bool}[90m (alias for [39m[90mArray{Bool, 1}[39m[90m)[39m

In [45]:
size(geno_common, 1) - sum(less_common_markers)

134659

In [46]:
geno_processed = permutedims(geno_common[less_common_markers .!= 1, :]);

In [47]:
pheno_processed = reshape(pheno_common[:, 4], :, 1);

In [48]:
size(geno_processed)

(0, 134659)

In [49]:
size(pheno_processed)

(0, 1)

## Run Gemma:

In [50]:
gemma = "/home/xyu/software/GEMMA/gemma-0.98.5-linux-static-AMD64";

In [51]:
run(`$gemma -h`)

GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021

 type ./gemma -h [num] for detailed help
 options: 
  1: quick guide
  2: file I/O related
  3: SNP QC
  4: calculate relatedness matrix
  5: perform eigen decomposition
  6: perform variance component estimation
  7: fit a linear model
  8: fit a linear mixed model
  9: fit a multivariate linear mixed model
 10: fit a Bayesian sparse linear mixed model
 11: obtain predicted values
 12: calculate snp variance covariance
 13: note
 14: debug options

The GEMMA software is distributed under the GNU General Public v3
   -license    show license information
   see also http://www.xzlab.org/software.html, https://github.com/genetics-statistics


Process(`[4m/home/xyu/software/GEMMA/gemma-0.98.5-linux-static-AMD64[24m [4m-h[24m`, ProcessExited(0))

In [52]:
pwd()

"/home/xyu/github/BulkLMM.jl/analysis/HSNIH-Palmer"

In [53]:
include("../../src/readData.jl");

In [54]:
geno_supp

LoadError: UndefVarError: geno_supp not defined

In [55]:
geno_supp_processed = geno_supp[less_common_markers .!= 1, :];

LoadError: UndefVarError: geno_supp not defined

In [56]:
size(geno_supp_processed)

LoadError: UndefVarError: geno_supp_processed not defined

In [57]:
@time geno_processed = permutedims(geno_processed);

  0.000005 seconds (1 allocation: 64 bytes)


In [58]:
size(geno_processed)

(134659, 0)

In [59]:
marker_names = geno_supp_processed[:, 2];
minor_allele = geno_supp_processed[:, 4];
major_allele = geno_supp_processed[:, 5];
@time geno_prob_bimbam = hcat(marker_names, minor_allele, major_allele);

LoadError: UndefVarError: geno_supp_processed not defined

In [60]:
geno_prob_bimbam = hcat(geno_prob_bimbam, geno_processed);

LoadError: UndefVarError: geno_prob_bimbam not defined

In [61]:
geno_prob_bimbam[1:10, :]

LoadError: UndefVarError: geno_prob_bimbam not defined

In [62]:
size(geno_prob_bimbam)

LoadError: UndefVarError: geno_prob_bimbam not defined

In [63]:
pheno_filename = "pheno.txt";

In [64]:
geno_filename = "geno.txt";

In [65]:
#writedlm(pheno_filename, pheno_processed);

In [66]:
#writeToFile(geno_prob_bimbam, geno_filename);

In [67]:
# @time run(`$gemma -g $geno_filename -p $pheno_filename -gk 1 -o kinship`)

In [68]:
kinship_Gc = CSV.read("output/kinship.cXX.txt", DataFrame, delim = '\t', header = false);

In [69]:
size(kinship_Gc)

(4094, 4094)

In [70]:
pheno_processed

0×1 Matrix{Float64}

In [71]:
kinshipMat_Gc = Matrix(kinship_Gc); 

In [72]:
size(pheno_processed)

(0, 1)

In [73]:
geno_processed_transpose = permutedims(geno_processed);

In [74]:
size(geno_processed_transpose)

(0, 134659)

In [75]:
# writedlm("pheno.csv", pheno_processed);
# writedlm("geno.csv", geno_processed_transpose);

In [None]:
# @time results_bulklmm_alt = scan(pheno_processed, geno_processed_transpose, kinshipMat_Gc; reml = false, method = "null");

In [None]:
# @time run(`$gemma -g $geno_filename -p $pheno_filename -k output/kinship.cXX.txt -lmm 2 -o gemma_results.txt`)

## Run BulkLMM:

In [None]:
# @time kinship_BulkLMM = calcKinship(geno_processed)

In [None]:
# size(kinship_BulkLMM)

In [None]:
# pheno_common

In [None]:
# @time results_BulkLMM = scan(pheno_processed, geno_processed, kinship_BulkLMM; reml = false, method = "alt")