In [1]:
#using Pkg
#Pkg.add("GeneNetworkAPI")

In [2]:
using Helium;
using DataFrames;
using CSV;
using GeneNetworkAPI;

In [41]:
pwd()

"/home/xyu/github/BulkLMM.jl/analysis/HSNIH-Palmer"

In [42]:
pheno_10441 = get_pheno("HSNIH-Palmer", "10441");

In [43]:
size(pheno_10441)

(4099, 4)

In [44]:
pheno_10441[1:6, :]

Unnamed: 0_level_0,data_id,sample_name,sample_name_2,value
Unnamed: 0_level_1,Int64,String,String,Float64
1,41013852,00071F4FAF,00071F4FAF,67.0
2,41013852,00071FB160,00071FB160,32.0
3,41013852,0007207A73,0007207A73,70.0
4,41013852,0007213055,0007213055,45.0
5,41013852,0007213615,0007213615,59.0
6,41013852,00072136BA,00072136BA,38.0


In [45]:
geno_filename = "HSNIH-Palmer.he";

In [46]:
geno_mat = readhe(geno_filename);

In [48]:
size(geno_mat)

(134918, 6147)

In [49]:
geno_supp = Helium.getsupp(geno_filename);
geno_supp[1:10, :]

10×5 Matrix{String}:
 "SNPID"  "rsid"          "position"  "alleleA"  "alleleB"
 "1"      "chr1:55365"    "55365"     "A"        "T"
 "1"      "chr1:666374"   "666374"    "T"        "C"
 "1"      "chr1:666382"   "666382"    "A"        "T"
 "1"      "chr1:666394"   "666394"    "G"        "A"
 "1"      "chr1:669529"   "669529"    "C"        "T"
 "1"      "chr1:669562"   "669562"    "T"        "A"
 "1"      "chr1:671466"   "671466"    "T"        "C"
 "1"      "chr1:759319"   "759319"    "T"        "C"
 "1"      "chr1:1134030"  "1134030"   "A"        "G"

In [50]:
geno_sampleID = Helium.getcolnames(geno_filename);

In [51]:
pheno_sampleID = pheno_10441.sample_name;

In [52]:
match_id = Array{Int, 1}(undef, 6147);

In [53]:
@time begin 
    for i in 1:6147
    
    indicator = 0;
    for j in 1:4099
        if geno_sampleID[i] == pheno_sampleID[j]
            indicator = indicator + 1;
        end
    end
    
    match_id[i] = indicator;
end
end

  2.030349 seconds (45.16 M allocations: 689.133 MiB)


In [54]:
sum(match_id)

4094

In [55]:
typeof(match_id)

Vector{Int64}[90m (alias for [39m[90mArray{Int64, 1}[39m[90m)[39m

In [56]:
samples_in_common = geno_sampleID[match_id .==1];

In [57]:
match_id_pheno = Array{Int, 1}(undef, 4099);

In [58]:
@time begin 
    for i in 1:length(match_id_pheno)
    
    indicator = 0;
    for j in 1:length(samples_in_common)
        if pheno_sampleID[i] == samples_in_common[j]
            indicator = indicator + 1;
        end
    end
    
    match_id_pheno[i] = indicator;
end
end

  2.440472 seconds (46.17 M allocations: 960.750 MiB)


In [59]:
sum(match_id_pheno)

4094

In [60]:
size(geno_mat)

(134918, 6147)

In [61]:
size(pheno_10441)

(4099, 4)

In [62]:
## subset geno:
geno_common = geno_mat[:, match_id .== 1];
## subset pheno:
pheno_common = pheno_10441[match_id_pheno .== 1, :];

In [63]:
size(pheno_common)

(4094, 4)

In [64]:
size(geno_common)

(134918, 4094)

Finally, we got a sample genofile with 4094 individuals, and 134918 genotype markers.

In [65]:
n = size(geno_common, 2)

4094

In [66]:
mean_freq = sum(geno_common; dims = 2)./(2*n);

In [67]:
less_common_markers = map(x -> (x > 0.95) || (x < 0.05), mean_freq)[:, 1];

In [68]:
typeof(less_common_markers)

Vector{Bool}[90m (alias for [39m[90mArray{Bool, 1}[39m[90m)[39m

In [69]:
sum(less_common_markers)

16546

In [70]:
134918 - 16546

118372

In [71]:
geno_processed = permutedims(geno_common[less_common_markers .!= 1, :]);

In [79]:
pheno_processed = reshape(pheno_common[:, 4], :, 1);

In [80]:
size(geno_processed)

(4094, 118372)

In [81]:
size(pheno_processed)

(4094, 1)

## Run BulkLMM:

In [82]:
using DelimitedFiles
using LinearAlgebra
using Optim
using Distributions
using Test
using BenchmarkTools
using Random
using Plots
using Profile
using Distributed

In [83]:
pwd()

"/home/xyu/github/BulkLMM.jl/analysis/HSNIH-Palmer"

In [84]:
include("../../src/scan.jl");
include("../../src/kinship.jl");
include("../../src/lmm.jl");
include("../../src/parallel_helpers.jl");
include("../../src/util.jl");
include("../../src/wls.jl");

In [None]:
@time kinship_BulkLMM = calcKinship(geno_processed);

In [None]:
size(kinship_BulkLMM)

In [None]:
pheno_common

In [None]:
@time scan(pheno_processed, geno_processed, kinship_BulkLMM; reml = false, method = "alt")

## Run Gemma:

In [36]:
gemma = "/home/xyu/software/GEMMA/gemma-0.98.5-linux-static-AMD64";

In [37]:
run(`$gemma -h`)

GEMMA 0.98.5 (2021-08-25) by Xiang Zhou, Pjotr Prins and team (C) 2012-2021

 type ./gemma -h [num] for detailed help
 options: 
  1: quick guide
  2: file I/O related
  3: SNP QC
  4: calculate relatedness matrix
  5: perform eigen decomposition
  6: perform variance component estimation
  7: fit a linear model
  8: fit a linear mixed model
  9: fit a multivariate linear mixed model
 10: fit a Bayesian sparse linear mixed model
 11: obtain predicted values
 12: calculate snp variance covariance
 13: note
 14: debug options

The GEMMA software is distributed under the GNU General Public v3
   -license    show license information
   see also http://www.xzlab.org/software.html, https://github.com/genetics-statistics


Process(`[4m/home/xyu/software/GEMMA/gemma-0.98.5-linux-static-AMD64[24m [4m-h[24m`, ProcessExited(0))