In [1]:
from datetime import datetime
import os 
import pandas as pd
import numpy as np

In [2]:
# Workspace bucket URI taken from the environment (None when the variable is unset).
bucket = os.getenv("WORKSPACE_BUCKET")

# Short user name: the part of OWNER_EMAIL before the first '@', with dots replaced by dashes.
USER_NAME = os.getenv('OWNER_EMAIL').partition('@')[0].replace('.', '-')

In [3]:
# Export the Python-side USER_NAME value into the kernel's process environment.
%env USER_NAME={USER_NAME}

env: USER_NAME=williamsjacr


In [5]:
# Install (or upgrade to the latest release of) the dsub command-line tool.
# NOTE(review): the version is not pinned, so re-running may pull a different dsub release.
!pip3 install --upgrade dsub


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
%%writefile ~/aou_dsub.bash
#!/bin/bash
function aou_dsub () {

  # Keep only the part of OWNER_EMAIL before the '@'; a short user id leaves
  # more characters available for the job name.
  local short_user
  short_user="$(cut -d@ -f1 <<< "${OWNER_EMAIL}")"

  # AoU RWB projects use these fixed network / subnetwork names.
  local vpc_network=network
  local vpc_subnetwork=subnetwork

  # Flags shared by every submission. Caller-supplied arguments are appended
  # after them, so a flag repeated by the caller comes later on the command line.
  local -a common_flags=(
    --provider google-cls-v2
    --user-project "${GOOGLE_PROJECT}"
    --project "${GOOGLE_PROJECT}"
    --image 'marketplace.gcr.io/google/ubuntu1804:latest'
    --network "${vpc_network}"
    --subnetwork "${vpc_subnetwork}"
    --service-account "$(gcloud config get-value account)"
    --user "${short_user}"
    --regions us-central1
    --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log"
  )

  dsub "${common_flags[@]}" "$@"
}

Writing /home/jupyter/aou_dsub.bash


In [7]:
!echo source ~/aou_dsub.bash >> ~/.bashrc

In [12]:
%%writefile Common_Variants.R
rm(list = ls())

# Filters one chromosome of PLINK data down to variants that are common
# (MAF > 0.01, non-missing) in each of the AFR, AMR and EUR ancestry groups.
#
# Positional arguments (as passed by Common_Variants.sh):
#   1  chromosome number
#   2  path to the chromosome .bed file
#   3  path to the .bim file (not referenced directly in this script)
#   4  path to the .fam file (not referenced directly in this script)
#   5  ancestry predictions file with research_id and ancestry_pred columns
#   6  output directory
args <- commandArgs(TRUE)

chr <- as.numeric(args[1])
print(chr)

# plink --bfile takes the fileset prefix, so strip only a trailing ".bed".
BED_file <- sub("\\.bed$", "", args[2])
print(BED_file)

Ancestry_File <- args[5]
print(Ancestry_File)

OUTPUT_PATH <- args[6]
print(OUTPUT_PATH)

# Run a shell command and abort the script when it exits non-zero, so that a
# failed plink step cannot be followed by file deletions or a "successful" exit.
run_cmd <- function(cmd) {
  status <- system(cmd)
  if (status != 0) {
    stop("Command failed (exit status ", status, "): ", cmd, call. = FALSE)
  }
  invisible(status)
}

ancestries <- read.delim(Ancestry_File)
# The ancestry file is removed from local disk once it has been read.
unlink(Ancestry_File)
ancestries <- data.frame(IID = as.numeric(ancestries[, "research_id"]),
                         ancestry = toupper(ancestries[, "ancestry_pred"]))

# Step 1: drop variants with > 10% missing calls, then delete the input fileset.
run_cmd(paste0("plink --bfile ", shQuote(BED_file), " --geno 0.10 --make-bed --out tmp1"))
unlink(paste0(BED_file, c(".bed", ".bim", ".fam")))

# Step 2: drop samples with > 5% missing calls, then delete the intermediate fileset.
run_cmd("plink --bfile tmp1 --mind 0.05 --make-bed --out tmp2")
unlink(paste0("tmp1", c(".bed", ".bim", ".fam")))

# Step 3: per-ancestry allele frequencies on the filtered fileset.
ancestry_groups <- c("AFR", "AMR", "EUR")
freq <- lapply(ancestry_groups, function(anc) {
  # which() drops rows whose ancestry is NA instead of emitting NA ids.
  ids <- ancestries$IID[which(ancestries$ancestry == anc)]
  keep_file <- paste0(anc, "_Keep.txt")
  # format(..., scientific = FALSE) keeps ids such as 1000000 from being
  # written as "1e+06", which would not match the .fam file.
  keep <- data.frame(FID = 0, IID = format(ids, scientific = FALSE, trim = TRUE))
  write.table(keep, keep_file, row.names = FALSE, col.names = FALSE, quote = FALSE)

  out_prefix <- paste0("chr", chr, "_freq_", anc)
  run_cmd(paste0("plink --bfile tmp2 --keep ", keep_file, " --freq --out ", out_prefix))
  read.csv(paste0(out_prefix, ".frq"), sep = "")
})
names(freq) <- ancestry_groups

a <- freq$AFR
b <- freq$AMR
c <- freq$EUR

# The three tables are compared row by row, so they must list the same variants in the same order.
stopifnot(identical(as.character(a$SNP), as.character(b$SNP)),
          identical(as.character(a$SNP), as.character(c$SNP)))

# Step 4: exclude any variant that is missing or has MAF <= 0.01 in at least one group.
exclude_list <- a$SNP[is.na(a$MAF) | is.na(b$MAF) | is.na(c$MAF) | a$MAF <= 0.01 | b$MAF <= 0.01 | c$MAF <= 0.01]
write.table(exclude_list, "exclude_list.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

run_cmd(paste0("plink --bfile tmp2 --exclude exclude_list.txt --make-bed --out ",
               shQuote(paste0(OUTPUT_PATH, "/chr", chr, "_corrected"))))


Overwriting Common_Variants.R


In [13]:
%%writefile score_task.R

# dsub task table for the common-variant filter: one row per chromosome,
# one column per dsub flag (column names are the literal flag strings).
# NOTE(review): only chromosomes 1-12 are listed here, while merge_bed.R merges
# chr1-chr22 outputs -- confirm chromosomes 13-22 were produced by another run.
chroms <- 1:12
acaf_prefix <- paste0("gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/acaf_threshold_v7.1/plink_bed/acaf_threshold.chr", chroms)

tasks <- data.frame(
  '--env CHR' = chroms,
  '--input BED_File' = paste0(acaf_prefix, ".bed"),
  '--input BIM_File' = paste0(acaf_prefix, ".bim"),
  '--input FAM_File' = paste0(acaf_prefix, ".fam"),
  '--input Ancestry_File' = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv",
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/Common_Variants.R",
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants",
  check.names = FALSE
)

# Tab-separated, unquoted, with a header row.
write.table(tasks, file = "score_task.txt", sep = '\t', quote = FALSE,
            row.names = FALSE, col.names = TRUE)

Overwriting score_task.R


In [14]:
%%writefile Common_Variants.sh
#!/bin/bash

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Every variable is expected in the environment when this script runs (nounset
# aborts otherwise). Quoting keeps each value a single argument even if it
# contains spaces or glob characters.
Rscript "${R_Script}" "${CHR}" "${BED_File}" "${BIM_File}" "${FAM_File}" "${Ancestry_File}" "${OUTPUT_PATH}"

Overwriting Common_Variants.sh


In [15]:
# Run the task-definition script, which (re)writes score_task.txt.
!Rscript score_task.R

In [16]:
# Upload the R script to the bucket folder that the task table's '--input R_Script' column points to.
!gsutil -m cp Common_Variants.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://Common_Variants.R [Content-Type=application/octet-stream]...
/ [1/1 files][  2.1 KiB/  2.1 KiB] 100% Done                                    
Operation completed over 1 objects/2.1 KiB.                                      


In [17]:
%%bash --out score_batch

# Bring the aou_dsub wrapper function into this shell.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit the tasks listed in score_task.txt, each running Common_Variants.sh.
# The --image and --logging values given here are passed after the ones that
# aou_dsub already puts on the dsub command line.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 1000 \
  --boot-disk-size 25 \
  --min-ram 30 \
  --timeout "48h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script Common_Variants.sh \
  --tasks score_task.txt

Job properties:
  job-id: common-var--williamsjacr--250410-125128-66
  job-name: common-variants
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/609494964870255421
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12779355517925240545
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/5962273838059799743
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12207914947044021913
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/9136467727547312618
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4436065283973798346
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/1749630105134682131
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/17896130386990975270
Provider inte

In [36]:
%%writefile Common_Variants_BED_Sum.R
rm(list = ls())

# Per-chromosome SNP summary: counts variants that are common or rare across
# the AFR / AMR / EUR groups and records the row counts of two reference .bim
# files, then writes everything as a one-row CSV.
#
# Positional arguments (as passed by Common_Variants_BED_Sum.sh):
#   1 chromosome, 2 .bed path, 3 CT .bim, 4 PROSPER .bim, 5 ancestry file, 6 output directory
cli <- commandArgs(TRUE)

chr <- as.numeric(cli[1])
print(chr)

BED_file <- gsub(".bed", "", cli[2], fixed = TRUE)
print(BED_file)

CT_BIM_FILE <- cli[3]
print(CT_BIM_FILE)

PROSPER_BIM_FILE <- cli[4]
print(PROSPER_BIM_FILE)

Ancestry_File <- cli[5]
print(Ancestry_File)

OUTPUT_PATH <- cli[6]
print(OUTPUT_PATH)


ancestries <- read.delim(Ancestry_File)
ancestries <- data.frame(IID = as.numeric(ancestries[, "research_id"]),
                         ancestry = toupper(ancestries[, "ancestry_pred"]))

groups <- c("AFR", "AMR", "EUR")

# One plink --keep file (FID 0, IID) per ancestry group.
for (anc in groups) {
  keep_ids <- data.frame(FID = 0, IID = ancestries$IID[ancestries$ancestry == anc])
  write.table(keep_ids, paste0(anc, "_Keep.txt"), row.names = FALSE, col.names = FALSE)
}

# Allele frequencies within each group; print() echoes the captured plink output.
for (anc in groups) {
  print(system(paste0("plink --bfile ", BED_file, " --keep ", anc, "_Keep.txt --freq --out chr", chr, "_freq_", anc), intern = TRUE))
}

frq <- lapply(groups, function(anc) read.csv(paste0("chr", chr, "_freq_", anc, ".frq"), sep = ""))
names(frq) <- groups

maf_afr <- frq$AFR$MAF
maf_amr <- frq$AMR$MAF
maf_eur <- frq$EUR$MAF

# Missing in any group; and, more broadly, missing or MAF <= 0.01 in any group.
missing_any <- is.na(maf_afr) | is.na(maf_amr) | is.na(maf_eur)
exclude_list <- frq$AFR$SNP[missing_any | maf_afr <= 0.01 | maf_amr <= 0.01 | maf_eur <= 0.01]
rarevariant_list <- frq$AFR$SNP[missing_any]

n_variants <- nrow(frq$AFR)
Number_Common_Variants <- n_variants - length(exclude_list)
Number_Rare_Variants <- n_variants - length(rarevariant_list) - Number_Common_Variants

# Row counts of the two reference .bim files.
CT_SNPs <- nrow(read.delim(CT_BIM_FILE, header = FALSE))
PROSPER_SNPs <- nrow(read.delim(PROSPER_BIM_FILE, header = FALSE))

summary_row <- data.frame(Chr = chr,
                          Number_Common_Variants = Number_Common_Variants,
                          Number_Rare_Variants = Number_Rare_Variants,
                          CT_SNPs = CT_SNPs,
                          PROSPER_SNPs = PROSPER_SNPs)
write.csv(summary_row, row.names = FALSE, file = paste0(OUTPUT_PATH, "/", chr, "_NumSNPs.csv"))


Overwriting Common_Variants_BED_Sum.R


In [37]:
%%writefile score_task.R

# dsub task table for the exome SNP summary: one row per autosome (1-22),
# one column per dsub flag (column names are the literal flag strings).
chroms <- 1:22
exome_prefix <- paste0("gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/exome_v7.1/plink_bed/exome.chr", chroms)

tasks <- data.frame(
  '--env CHR' = chroms,
  '--input BED_File' = paste0(exome_prefix, ".bed"),
  '--input BIM_File' = paste0(exome_prefix, ".bim"),
  '--input FAM_File' = paste0(exome_prefix, ".fam"),
  '--input CT_BIM_FILE' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bim",
  '--input PROSPER_BIM_FILE' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/JointPRS_MUSSEL_PROSPER_BED/all_JointPRS_MUSSEL_PROSPER.bim",
  '--input Ancestry_File' = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv",
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/Common_Variants_BED_Sum.R",
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/Number_SNPs",
  check.names = FALSE
)

# Tab-separated, unquoted, with a header row.
write.table(tasks, file = "score_task.txt", sep = '\t', quote = FALSE,
            row.names = FALSE, col.names = TRUE)

Overwriting score_task.R


In [38]:
%%writefile Common_Variants_BED_Sum.sh
#!/bin/bash

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Every variable is expected in the environment when this script runs (nounset
# aborts otherwise). Quoting keeps each value a single argument even if it
# contains spaces or glob characters.
Rscript "${R_Script}" "${CHR}" "${BED_File}" "${CT_BIM_FILE}" "${PROSPER_BIM_FILE}" "${Ancestry_File}" "${OUTPUT_PATH}"

Overwriting Common_Variants_BED_Sum.sh


In [39]:
# Run the task-definition script, which (re)writes score_task.txt.
!Rscript score_task.R

In [40]:
# Upload the R script to the bucket folder that the task table's '--input R_Script' column points to.
!gsutil -m cp Common_Variants_BED_Sum.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://Common_Variants_BED_Sum.R [Content-Type=application/octet-stream]...
/ [1/1 files][  2.3 KiB/  2.3 KiB] 100% Done                                    
Operation completed over 1 objects/2.3 KiB.                                      


In [41]:
%%bash --out score_batch

# Bring the aou_dsub wrapper function into this shell.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit the tasks listed in score_task.txt, each running Common_Variants_BED_Sum.sh.
# The --image and --logging values given here are passed after the ones that
# aou_dsub already puts on the dsub command line.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 400 \
  --boot-disk-size 25 \
  --min-ram 32 \
  --timeout "48h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script Common_Variants_BED_Sum.sh \
  --tasks score_task.txt

Job properties:
  job-id: common-var--williamsjacr--240901-131943-66
  job-name: common-variants-bed-sum
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/11196432129391680803
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7824575572013587726
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/9758527684499917372
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/15865405136015907616
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/13738081057261705017
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/1215447884564362733
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12706738842639517649
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/9128748637006943225
Pro

In [12]:
%%writefile merge_bed.R

rm(list = ls())

# Merges the per-chromosome "<INPUT_PATH>/chrN_corrected" PLINK filesets
# (N = 1..22) into a single "<OUTPUT_PATH>/all_chr" fileset.
#
# Positional arguments (as passed by merge_bed.sh): 1 input directory, 2 output directory.
args <- commandArgs(TRUE)

INPUT_PATH <- args[1]
print(INPUT_PATH)

OUTPUT_PATH <- args[2]
print(OUTPUT_PATH)

prefixes <- paste0(INPUT_PATH, "/chr", 1:22, "_corrected")

# Stop with a readable message when a chromosome is missing instead of letting
# plink fail part-way through the merge.
absent <- prefixes[!file.exists(paste0(prefixes, ".bed"))]
if (length(absent) > 0) {
  stop("Missing per-chromosome filesets: ", paste(absent, collapse = ", "), call. = FALSE)
}

# chr1 is the base fileset; chr2..chr22 go into the merge list.
write.table(prefixes[-1], file = "merge_list.txt", col.names = FALSE, row.names = FALSE, quote = FALSE)

status <- system(paste0("plink --bfile ", prefixes[1], " --merge-list merge_list.txt --make-bed --out ", OUTPUT_PATH, "/all_chr"))
# system() returns the exit status; without this check a failed merge would
# still let the script (and the job) finish successfully.
if (status != 0) {
  stop("plink merge failed with exit status ", status, call. = FALSE)
}

Overwriting merge_bed.R


In [13]:
%%writefile score_task.R

# Single-row dsub task table for the merge job; column names are the literal dsub flags.
common_variants_dir <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants"

tasks <- data.frame(
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/merge_bed.R",
  '--input-recursive INPUT_PATH' = common_variants_dir,
  '--output-recursive OUTPUT_PATH' = common_variants_dir,
  check.names = FALSE
)

# Tab-separated, unquoted, with a header row.
write.table(tasks, file = "score_task.txt", sep = '\t', quote = FALSE,
            row.names = FALSE, col.names = TRUE)

Overwriting score_task.R


In [14]:
%%writefile merge_bed.sh
#!/bin/bash

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Quoting keeps each value a single argument even if it contains spaces or glob characters.
Rscript "${R_Script}" "${INPUT_PATH}" "${OUTPUT_PATH}"

Overwriting merge_bed.sh


In [15]:
# Run the task-definition script, which (re)writes score_task.txt.
!Rscript score_task.R

In [16]:
# Upload the R script to the bucket folder that the task table's '--input R_Script' column points to.
!gsutil -m cp merge_bed.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://merge_bed.R [Content-Type=application/octet-stream]...
/ [1/1 files][  417.0 B/  417.0 B] 100% Done                                    
Operation completed over 1 objects/417.0 B.                                      


In [17]:
%%bash --out score_batch

# Bring the aou_dsub wrapper function into this shell.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit the task listed in score_task.txt, running merge_bed.sh.
# The --image and --logging values given here are passed after the ones that
# aou_dsub already puts on the dsub command line.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 2000 \
  --boot-disk-size 25 \
  --min-ram 72 \
  --timeout "72h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script merge_bed.sh \
  --tasks score_task.txt

Job properties:
  job-id: merge-bed--williamsjacr--250414-112849-13
  job-name: merge-bed
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/1734854414342758511
Launched job-id: merge-bed--williamsjacr--250414-112849-13
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'merge-bed--williamsjacr--250414-112849-13' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'merge-bed--williamsjacr--250414-112849-13' --users 'williamsjacr'


In [58]:
%%writefile all_phenotypes.R

rm(list = ls())

library(dplyr)

# Builds one wide table of phenotypes, covariates and predicted ancestry per
# participant, after removing the ids listed in the relatedness file and any
# row with a missing covariate.
#
# Positional arguments (as passed by all_phenotypes.sh):
#   1 relatedness-flagged ids, 2 ancestry predictions, 3 input directory, 4 output directory
cli <- commandArgs(TRUE)

Related_File <- cli[1]
print(Related_File)

Ancestry_File <- cli[2]
print(Ancestry_File)

INPUT_PATH <- cli[3]
print(INPUT_PATH)

OUTPUT_PATH <- cli[4]
print(OUTPUT_PATH)

flagged_relatedness <- read.delim(Related_File)
colnames(flagged_relatedness) <- "IID"

ancestries <- read.delim(Ancestry_File)
ancestries <- data.frame(IID = as.numeric(ancestries[, "research_id"]),
                         ancestry = toupper(ancestries[, "ancestry_pred"]))

# Path of a file inside the input directory.
in_dir <- function(file_name) paste0(INPUT_PATH, "/", file_name)

# Phenotype tables, in the order they are joined below. Some files are read as
# tab-delimited and others as whitespace-delimited (sep = "").
pheno_tables <- list(
  read.delim(in_dir("aou_BMI_pheno.tsv")),
  read.delim(in_dir("aou_LDL_pheno.tsv")),
  read.delim(in_dir("aou_HDL_pheno.tsv")),
  read.delim(in_dir("aou_TG_pheno.tsv")),
  read.delim(in_dir("aou_TC_pheno.tsv")),
  read.csv(in_dir("aou_Height_pheno.tsv"), sep = "")
)
# Third column of the TG table is renamed to logTG.
colnames(pheno_tables[[4]])[3] <- "logTG"

# Covariate tables, in the order they are joined below.
cov_tables <- list(
  read.delim(in_dir("aou_BMI_cov.tsv")),
  read.csv(in_dir("aou_LDL_cov.tsv"), sep = ""),
  read.csv(in_dir("aou_HDL_cov.tsv"), sep = ""),
  read.csv(in_dir("aou_TG_cov.tsv"), sep = ""),
  read.csv(in_dir("aou_TC_cov.tsv"), sep = ""),
  read.csv(in_dir("aou_Height_cov.tsv"), sep = "")
)

# Every id seen in any phenotype or covariate table, minus the flagged ones.
every_id <- unique(unlist(lapply(c(pheno_tables, cov_tables), function(tbl) tbl$IID)))
all_phenotypes <- data.frame(IID = every_id)
all_phenotypes <- all_phenotypes[!(all_phenotypes$IID %in% flagged_relatedness$IID), , drop = FALSE]

# NOTE(review): left_join() is called without `by`, so each join uses every
# column name the two tables share. If the covariate tables share columns
# beyond IID, ids absent from the first covariate table may never pick up
# covariates from later ones -- confirm this is intended.
for (tbl in c(pheno_tables, cov_tables, list(ancestries))) {
  all_phenotypes <- left_join(all_phenotypes, tbl)
}

# Keep only rows where every covariate is present.
for (covariate in c("age", "female", paste0("PC", 1:10))) {
  all_phenotypes <- all_phenotypes[!is.na(all_phenotypes[[covariate]]), ]
}

write.csv(all_phenotypes, file = paste0(OUTPUT_PATH, "/all_phenotypes.csv"), row.names = FALSE)

Overwriting all_phenotypes.R


In [59]:
%%writefile score_task.R

# Single-row dsub task table for the phenotype-assembly job; column names are the literal dsub flags.
tasks <- data.frame(
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/all_phenotypes.R",
  '--input Related_File' = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/relatedness/relatedness_flagged_samples.tsv",
  '--input Ancestry_File' = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv",
  '--input-recursive INPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/Tony_Files",
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/",
  check.names = FALSE
)

# Tab-separated, unquoted, with a header row.
write.table(tasks, file = "score_task.txt", sep = '\t', quote = FALSE,
            row.names = FALSE, col.names = TRUE)

Overwriting score_task.R


In [60]:
%%writefile all_phenotypes.sh
#!/bin/bash

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Quoting keeps each value a single argument even if it contains spaces or glob characters.
Rscript "${R_Script}" "${Related_File}" "${Ancestry_File}" "${INPUT_PATH}" "${OUTPUT_PATH}"

Overwriting all_phenotypes.sh


In [61]:
# Run the task-definition script, which (re)writes score_task.txt.
!Rscript score_task.R

In [62]:
# Upload the R script to the bucket folder that the task table's '--input R_Script' column points to.
!gsutil -m cp all_phenotypes.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://all_phenotypes.R [Content-Type=application/octet-stream]...
/ [1/1 files][  3.4 KiB/  3.4 KiB] 100% Done                                    
Operation completed over 1 objects/3.4 KiB.                                      


In [63]:
%%bash --out score_batch

# Bring the aou_dsub wrapper function into this shell.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit the task listed in score_task.txt, running all_phenotypes.sh.
# The --image and --logging values given here are passed after the ones that
# aou_dsub already puts on the dsub command line.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 100 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --timeout "48h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script all_phenotypes.sh \
  --tasks score_task.txt

Job properties:
  job-id: all-phenot--williamsjacr--240627-152247-11
  job-name: all-phenotypes
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3165015638967736981
Launched job-id: all-phenot--williamsjacr--240627-152247-11
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'all-phenot--williamsjacr--240627-152247-11' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'all-phenot--williamsjacr--240627-152247-11' --users 'williamsjacr'


In [73]:
%%writefile Train_Tune_Validation.R

rm(list = ls())

# Splits the phenotype table into train / tune / validation sets, writes the id
# lists (plus a 3000-id reference subset of train), and writes a standardized
# phenotype table for each split.
#
# Positional arguments (as passed by Train_Tune_Validation.sh):
#   1 phenotype CSV, 2 output directory
args <- commandArgs(TRUE)

all_phenotype_File <- args[1]
print(all_phenotype_File)

OUTPUT_PATH <- args[2]
print(OUTPUT_PATH)

# Fixed seed so that re-running the job regenerates exactly the same split.
set.seed(2024)

all_phenotypes <- read.csv(all_phenotype_File)
n_total <- nrow(all_phenotypes)

# Draw `size` elements of `pool` without replacement. Going through
# sample.int() avoids sample()'s special case in which a length-one numeric
# vector x is treated as 1:x.
draw <- function(pool, size) pool[sample.int(length(pool), size)]

# Training rows: 70% of the whole table (+1), drawn only from EUR/AFR/AMR rows.
train_pool <- which(all_phenotypes$ancestry %in% c("EUR", "AFR", "AMR"))
train_number <- round(n_total * 0.7) + 1
if (train_number > length(train_pool)) {
  stop("Requested ", train_number, " training rows but only ", length(train_pool),
       " EUR/AFR/AMR rows are available", call. = FALSE)
}
train <- draw(train_pool, train_number)

# Everything not in train is split half/half into tune and validation within
# each ancestry. which() skips rows whose ancestry is NA, so no NA index can
# end up in the tune set; such rows fall through to validation.
held_out <- setdiff(seq_len(n_total), train)
tune <- as.integer(unlist(lapply(c("EUR", "AFR", "SAS", "EAS", "MID", "AMR"), function(anc) {
  pool <- held_out[which(all_phenotypes$ancestry[held_out] == anc)]
  draw(pool, round(length(pool) / 2))
})))
validation <- held_out[!(held_out %in% tune)]

# Convert row positions to participant ids.
train <- all_phenotypes$IID[train]
tune <- all_phenotypes$IID[tune]
validation <- all_phenotypes$IID[validation]

write.table(train, paste0(OUTPUT_PATH, "/train.txt"), row.names = FALSE, col.names = FALSE)
write.table(tune, paste0(OUTPUT_PATH, "/tune.txt"), row.names = FALSE, col.names = FALSE)
write.table(validation, paste0(OUTPUT_PATH, "/validation.txt"), row.names = FALSE, col.names = FALSE)

# Random subset of 3000 training ids.
reference <- draw(train, 3000)
write.table(reference, paste0(OUTPUT_PATH, "/reference.txt"), row.names = FALSE, col.names = FALSE)

# NOTE(review): the rename is positional; column 10 is presumably the "female"
# covariate -- confirm against all_phenotypes.csv.
colnames(all_phenotypes)[10] <- "sex"

output_cols <- c("IID", "FID", "BMI", "TC", "HDL", "LDL", "logTG", "Height", "age", "sex",
                 "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10")
# Columns standardized within each split: every covariate except sex, plus age^2.
scaled_cols <- c("age", paste0("PC", 1:10), "age2")

# Phenotype table for one split: subset rows, set FID to 0, order the columns,
# add age^2 and standardize the covariates within the split.
build_split <- function(ids) {
  split_tbl <- all_phenotypes[all_phenotypes$IID %in% ids, ]
  split_tbl$FID <- 0
  split_tbl <- split_tbl[, output_cols]
  split_tbl$age2 <- split_tbl$age^2
  split_tbl[scaled_cols] <- lapply(split_tbl[scaled_cols], function(x) c(scale(x)))
  split_tbl
}

phenotype_train <- build_split(train)
phenotype_tune <- build_split(tune)
phenotype_validation <- build_split(validation)

write.table(phenotype_train, file = paste0(OUTPUT_PATH, "/All_Train.txt"), sep = '\t', row.names = FALSE, quote = FALSE)
write.table(phenotype_tune, file = paste0(OUTPUT_PATH, "/All_Tune.txt"), sep = '\t', row.names = FALSE, quote = FALSE)
write.table(phenotype_validation, file = paste0(OUTPUT_PATH, "/All_Validation.txt"), sep = '\t', row.names = FALSE, quote = FALSE)


Overwriting Train_Tune_Validation.R


In [74]:
%%writefile score_task.R

# Single-row dsub task table for the train/tune/validation split; column names are the literal dsub flags.
tasks <- data.frame(
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/Train_Tune_Validation.R",
  '--input all_phenotype_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/all_phenotypes.csv",
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/",
  check.names = FALSE
)

# Tab-separated, unquoted, with a header row.
write.table(tasks, file = "score_task.txt", sep = '\t', quote = FALSE,
            row.names = FALSE, col.names = TRUE)

Overwriting score_task.R


In [75]:
%%writefile Train_Tune_Validation.sh
#!/bin/bash

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Quoting keeps each value a single argument even if it contains spaces or glob characters.
Rscript "${R_Script}" "${all_phenotype_File}" "${OUTPUT_PATH}"

Overwriting Train_Tune_Validation.sh


In [76]:
# Run the task-definition script, which (re)writes score_task.txt.
!Rscript score_task.R

In [77]:
# Upload the R script to the bucket folder that the task table's '--input R_Script' column points to.
!gsutil -m cp Train_Tune_Validation.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://Train_Tune_Validation.R [Content-Type=application/octet-stream]...
/ [1/1 files][  3.4 KiB/  3.4 KiB] 100% Done                                    
Operation completed over 1 objects/3.4 KiB.                                      


In [78]:
%%bash --out score_batch

# Bring the aou_dsub wrapper function into this shell.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit the task listed in score_task.txt, running Train_Tune_Validation.sh.
# The --image and --logging values given here are passed after the ones that
# aou_dsub already puts on the dsub command line.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 800 \
  --boot-disk-size 25 \
  --min-ram 60 \
  --timeout "48h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script Train_Tune_Validation.sh \
  --tasks score_task.txt

Job properties:
  job-id: train-tune--williamsjacr--240627-153410-54
  job-name: train-tune-validation
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/9958687970653510792
Launched job-id: train-tune--williamsjacr--240627-153410-54
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'train-tune--williamsjacr--240627-153410-54' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'train-tune--williamsjacr--240627-153410-54' --users 'williamsjacr'


In [8]:
%%writefile REGENIE_Prep_TraitSpecific.R
rm(list = ls())

# Writes one REGENIE phenotype/covariate file per (ancestry, trait) pair,
# containing the training rows of that ancestry with a non-missing trait value.
#
# Positional arguments (as passed by REGENIE_Prep_TraitSpecific.sh):
#   1 all_phenotypes CSV, 2 All_Train table, 3 output directory
args <- commandArgs(TRUE)

all_phenotypes_file <- args[1]
print(all_phenotypes_file)

all_train_file <- args[2]
# Log the training file path (the original printed all_phenotypes_file a second time here).
print(all_train_file)

OUTPUT_PATH <- args[3]
print(OUTPUT_PATH)

all_phenotypes <- read.csv(all_phenotypes_file)
all_train <- read.delim(all_train_file)

regenie_cols <- c("FID", "IID", "BMI", "TC", "HDL", "LDL", "logTG", "Height", "age", "age2", "sex",
                  "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10")

for (anc in c("AFR", "AMR", "EUR")) {
    # The ancestry subset does not depend on the trait, so compute it once per
    # ancestry. which() skips rows whose ancestry is NA.
    anc_ids <- all_phenotypes$IID[which(all_phenotypes$ancestry == anc)]
    anc_train <- all_train[all_train$IID %in% anc_ids, ]

    for (trait in c("BMI", "TC", "logTG", "LDL", "HDL", "Height")) {
        # All trait columns are kept; only rows missing the current trait are dropped.
        tmp <- anc_train[!is.na(anc_train[[trait]]), regenie_cols]
        write.table(tmp, file = paste0(OUTPUT_PATH, "/", "All_Train_", trait, "_", anc, "_REGENIE.txt"),
                    sep = '\t', row.names = FALSE, quote = FALSE)
    }
}

Overwriting REGENIE_Prep_TraitSpecific.R


In [9]:
%%writefile score_task.R

# Single-row dsub task table for the trait-specific REGENIE prep; column names are the literal dsub flags.
tasks <- data.frame(
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/REGENIE_Prep_TraitSpecific.R",
  '--input all_phenotype_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/all_phenotypes.csv",
  '--input all_train_file' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Train.txt",
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/",
  check.names = FALSE
)

# Tab-separated, unquoted, with a header row.
write.table(tasks, file = "score_task.txt", sep = '\t', quote = FALSE,
            row.names = FALSE, col.names = TRUE)

Overwriting score_task.R


In [10]:
%%writefile REGENIE_Prep_TraitSpecific.sh
#!/bin/bash

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Quoting keeps each value a single argument even if it contains spaces or glob characters.
Rscript "${R_Script}" "${all_phenotype_File}" "${all_train_file}" "${OUTPUT_PATH}"

Overwriting REGENIE_Prep_TraitSpecific.sh


In [11]:
# Run the task-definition script, which (re)writes score_task.txt.
!Rscript score_task.R

In [12]:
# Upload the R script to the bucket folder that the task table's '--input R_Script' column points to.
!gsutil -m cp REGENIE_Prep_TraitSpecific.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://REGENIE_Prep_TraitSpecific.R [Content-Type=application/octet-stream]...
/ [1/1 files][  840.0 B/  840.0 B] 100% Done                                    
Operation completed over 1 objects/840.0 B.                                      


In [13]:
%%bash --out score_batch

# Bring the aou_dsub wrapper function into this shell.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit the task listed in score_task.txt, running REGENIE_Prep_TraitSpecific.sh.
# The --image and --logging values given here are passed after the ones that
# aou_dsub already puts on the dsub command line.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 10 \
  --boot-disk-size 25 \
  --min-ram 10 \
  --timeout "4h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script REGENIE_Prep_TraitSpecific.sh \
  --tasks score_task.txt

Job properties:
  job-id: regenie-pr--williamsjacr--250426-143127-77
  job-name: regenie-prep-traitspecific
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/5063303981653871125
Launched job-id: regenie-pr--williamsjacr--250426-143127-77
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'regenie-pr--williamsjacr--250426-143127-77' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'regenie-pr--williamsjacr--250426-143127-77' --users 'williamsjacr'


In [9]:
%%writefile REGENIE_Prep_BED.R
rm(list = ls())

# Builds the REGENIE step-1 fileset: a random subset of at most 900000 variants
# that are non-missing and have MAF > 0.05 in each of AFR, AMR and EUR.
#
# Positional arguments: 1 ancestry predictions file, 2 .bed path, 3 output directory.
args <- commandArgs(TRUE)

Ancestry_File <- args[1]
print(Ancestry_File)

# Strip only a literal trailing ".bed". The previous pattern ".bed" was an
# unanchored regex in which "." matched any character.
BED_File <- sub("\\.bed$", "", args[2])
print(BED_File)

OUTPUT_PATH <- args[3]
print(OUTPUT_PATH)

# Fixed seed so that the sampled variant set can be regenerated.
set.seed(2024)

# Run a shell command and abort the script when it exits non-zero.
run_cmd <- function(cmd) {
  status <- system(cmd)
  if (status != 0) {
    stop("Command failed (exit status ", status, "): ", cmd, call. = FALSE)
  }
  invisible(status)
}

ancestries <- read.delim(Ancestry_File)
ancestries <- data.frame(IID = as.numeric(ancestries[, "research_id"]),
                         ancestry = toupper(ancestries[, "ancestry_pred"]))

# Per-ancestry keep file and allele-frequency run.
ancestry_groups <- c("AFR", "AMR", "EUR")
freq <- lapply(ancestry_groups, function(anc) {
  # which() drops rows whose ancestry is NA instead of emitting NA ids.
  ids <- ancestries$IID[which(ancestries$ancestry == anc)]
  keep_file <- paste0(anc, "_Keep.txt")
  # scientific = FALSE keeps ids such as 1000000 from being written as "1e+06".
  keep <- data.frame(FID = 0, IID = format(ids, scientific = FALSE, trim = TRUE))
  write.table(keep, keep_file, row.names = FALSE, col.names = FALSE, quote = FALSE)

  run_cmd(paste0("plink --bfile ", shQuote(BED_File), " --keep ", keep_file, " --freq --out freq_", anc))
  read.csv(paste0("freq_", anc, ".frq"), sep = "")
})
names(freq) <- ancestry_groups

a <- freq$AFR
b <- freq$AMR
c <- freq$EUR

print(summary(a))
print(summary(b))
print(summary(c))

# Row positions of variants that are missing or have MAF <= 0.05 in any group.
exclude_list <- which(is.na(a$MAF) | is.na(b$MAF) | is.na(c$MAF) | a$MAF <= 0.05 | b$MAF <= 0.05 | c$MAF <= 0.05)

print(length(exclude_list) / nrow(a))

all_chr <- read.table(paste0(BED_File, ".bim"), header = FALSE)
# Positions in the .frq tables are used to index the .bim rows, so the row counts must agree.
stopifnot(nrow(all_chr) == nrow(a))

# setdiff() stays correct when exclude_list is empty, whereas x[-integer(0)]
# would select nothing at all.
eligible <- setdiff(seq_len(nrow(all_chr)), exclude_list)
# Take every eligible variant when fewer than 900000 remain.
n_keep <- min(900000, length(eligible))
samps <- sort(eligible[sample.int(length(eligible), n_keep)])

# Column 2 of the .bim file is the variant id.
write.table(all_chr[samps, 2], file = "extract_snps", row.names = FALSE, col.names = FALSE, quote = FALSE)

run_cmd(paste0("plink2 --bfile ", shQuote(BED_File), " --extract extract_snps --make-bed --out ",
               shQuote(paste0(OUTPUT_PATH, "/", "REGENIE_Step1"))))

Overwriting REGENIE_Prep_BED.R


In [10]:
%%writefile score_task.R

# Single-row dsub task table for the REGENIE_Prep_BED job. Each column name is
# a dsub flag plus variable name; each value is the corresponding gs:// path.
tasks <- data.frame(
  '--input BED_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bed",
  '--input BIM_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bim",
  '--input FAM_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.fam",
  '--input Ancestry_File' = "gs://fc-aou-datasets-controlled/v7/wgs/short_read/snpindel/aux/ancestry/ancestry_preds.tsv",
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/REGENIE_Prep_BED.R",
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants",
  check.names = FALSE  # keep the flag strings verbatim as column names
)

# Tab-separated, unquoted, header row included.
write.table(tasks,
            file = "score_task.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)


Overwriting score_task.R


In [11]:
%%writefile REGENIE_Prep_BED.sh
#!/bin/bash

# Wrapper script for the REGENIE_Prep_BED job: forwards the job's environment
# variables to the R script as positional arguments
# (ancestry file, .bed path, output directory).

set -o errexit
set -o nounset

# Quote every expansion so a path containing whitespace or glob characters is
# still passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${Ancestry_File}" "${BED_File}" "${OUTPUT_PATH}"

Overwriting REGENIE_Prep_BED.sh


In [12]:
# Run the most recently written score_task.R, which writes the task table score_task.txt.
!Rscript score_task.R

In [13]:
# Upload the R script to the bucket path that the task table lists as the R_Script input.
!gsutil -m cp REGENIE_Prep_BED.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://REGENIE_Prep_BED.R [Content-Type=application/octet-stream]...
/ [1/1 files][  1.8 KiB/  1.8 KiB] 100% Done                                    
Operation completed over 1 objects/1.8 KiB.                                      


In [14]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit REGENIE_Prep_BED.sh through the aou_dsub wrapper, with tasks taken
# from score_task.txt. Disk, RAM and timeout requests are the flag values below.
# NOTE(review): aou_dsub already passes its own --logging before "$@";
# presumably the value given here takes precedence - confirm.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 1000 \
  --boot-disk-size 25 \
  --min-ram 30 \
  --timeout "48h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script REGENIE_Prep_BED.sh \
  --tasks score_task.txt

Job properties:
  job-id: regenie-pr--williamsjacr--250415-112618-55
  job-name: regenie-prep-bed
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12235351109116825389
Launched job-id: regenie-pr--williamsjacr--250415-112618-55
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'regenie-pr--williamsjacr--250415-112618-55' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'regenie-pr--williamsjacr--250415-112618-55' --users 'williamsjacr'


In [8]:
%%writefile REGENIE_Test.R
# Minimal smoke-test script: clears the workspace and prints the result of 1 + 1.
rm(list = ls())

print(1 + 1)

Writing REGENIE_Test.R


In [9]:
%%writefile score_task.R

# Single-row dsub task table for the REGENIE_Test job: the script to run and
# the output directory.
tasks <- data.frame(
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/REGENIE_Test.R",
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants",
  check.names = FALSE  # keep the flag strings verbatim as column names
)

# Tab-separated, unquoted, header row included.
write.table(tasks,
            file = "score_task.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)


Writing score_task.R


In [10]:
%%writefile REGENIE_Test.sh
#!/bin/bash

# Wrapper script for the REGENIE_Test job: runs the R script named by R_Script.

set -o errexit
set -o nounset

# Quoted so a path containing whitespace or glob characters stays one argument.
Rscript "${R_Script}"

Writing REGENIE_Test.sh


In [11]:
# Run the most recently written score_task.R, which writes the task table score_task.txt.
!Rscript score_task.R

In [12]:
# Upload the R script to the bucket path that the task table lists as the R_Script input.
!gsutil -m cp REGENIE_Test.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://REGENIE_Test.R [Content-Type=application/octet-stream]...
/ [1/1 files][   30.0 B/   30.0 B] 100% Done                                    
Operation completed over 1 objects/30.0 B.                                       


In [25]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit REGENIE_Test.sh through the aou_dsub wrapper, with tasks taken from
# score_task.txt.
# NOTE(review): the script only prints 1 + 1, so the disk/RAM/timeout requests
# below look far larger than needed - confirm they are intentional.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 100 \
  --boot-disk-size 100 \
  --min-ram 100 \
  --timeout "48h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script REGENIE_Test.sh \
  --tasks score_task.txt

Job properties:
  job-id: regenie-te--williamsjacr--250523-160030-74
  job-name: regenie-test
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/11227930139315377219
Launched job-id: regenie-te--williamsjacr--250523-160030-74
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'regenie-te--williamsjacr--250523-160030-74' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'regenie-te--williamsjacr--250523-160030-74' --users 'williamsjacr'


In [36]:
# Query the status of one dsub job. The job id is hard-coded and must be
# replaced with the id printed by the submission being tracked.
!dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'regenie-te--williamsjacr--250523-154923-89' --users 'williamsjacr' --status '*'

Job Name        Task  Status                                      Last Update
------------  ------  ------------------------------------------  --------------
regenie-test       1  pulling image: docker pull: retry budge...  05-23 16:06:49



In [46]:
# Query the status of one dsub job. The job id is hard-coded and must be
# replaced with the id printed by the submission being tracked.
!dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'regenie-te--williamsjacr--250523-160014-73' --users 'williamsjacr' --status '*'

Job Name        Task  Status                                   Last Update
------------  ------  ---------------------------------------  --------------
regenie-test       1  Pulling "willja16/r_with_plink_regenie"  05-23 16:01:19



In [47]:
# Query the status of one dsub job. The job id is hard-coded and must be
# replaced with the id printed by the submission being tracked.
!dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'regenie-te--williamsjacr--250523-160030-74' --users 'williamsjacr' --status '*'

Job Name        Task  Status                           Last Update
------------  ------  -------------------------------  --------------
regenie-test       1  Pulling "willja16/r_with_plink"  05-23 16:01:19



In [63]:
%%writefile score_task.R

# dsub task table for the REGENIE job: one row per (ancestry, trait) pair.
# expand.grid varies its first factor fastest, so listing trait first gives
# rows ordered by ancestry and, within each ancestry, by trait.
combos <- expand.grid(
  trait = c("BMI","LDL","logTG","HDL","TC","Height"),
  anc = c("AFR","AMR","EUR"),
  stringsAsFactors = FALSE
)

# Length-one values are recycled across all rows.
tasks <- data.frame(
  '--input Regenie_Step1_Bed' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/REGENIE_Step1.bed",
  '--input Regenie_Step1_Bim' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/REGENIE_Step1.bim",
  '--input Regenie_Step1_Fam' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/REGENIE_Step1.fam",
  '--input Regenie_Step2_Bed' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bed",
  '--input Regenie_Step2_Bim' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bim",
  '--input Regenie_Step2_Fam' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.fam",
  '--env anc' = combos$anc,
  '--env trait' = combos$trait,
  '--input Regenie_Train_File' = paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Train_",combos$trait,"_",combos$anc,"_REGENIE.txt"),
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats",
  check.names = FALSE  # keep the flag strings verbatim as column names
)

# Tab-separated, unquoted, header row included.
write.table(tasks,
            file = "score_task.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)

Overwriting score_task.R


In [64]:
!Rscript score_task.R

In [65]:
%%writefile REGENIE.sh
#!/bin/bash

# Run REGENIE step 1 and then step 2 for one (ancestry, trait) task.
#
# Environment variables used:
#   Regenie_Step1_Bed   path to the step-1 .bed file
#   Regenie_Step2_Bed   path to the step-2 .bed file
#   Regenie_Train_File  file used as both phenotype and covariate file
#   trait               phenotype column to analyse
#   anc                 ancestry label, used in the step-2 output prefix
#   OUTPUT_PATH         directory receiving the step-2 results

set -o errexit
set -o nounset

# Derive the fileset prefixes from the input variables instead of hard-coding
# the mount path.
step1_prefix="${Regenie_Step1_Bed%.bed}"
step2_prefix="${Regenie_Step2_Bed%.bed}"

covariates="age,age2,sex,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10"

# Step 1 runs in the foreground. It used to be backgrounded and followed by a
# bare `wait`, which always returns 0, so errexit never saw a step-1 failure.
regenie --step 1 \
  --bed "${step1_prefix}" \
  --phenoFile "${Regenie_Train_File}" \
  --phenoColList "${trait}" \
  --covarFile "${Regenie_Train_File}" \
  --covarColList "${covariates}" \
  --qt \
  --bsize 1000 \
  --lowmem \
  --lowmem-prefix regenie_step1_tmp \
  --out regenie_step1_withsex

# Step 2 reads the prediction list written by step 1.
regenie --step 2 \
  --bed "${step2_prefix}" \
  --phenoFile "${Regenie_Train_File}" \
  --phenoColList "${trait}" \
  --covarFile "${Regenie_Train_File}" \
  --covarColList "${covariates}" \
  --pred regenie_step1_withsex_pred.list \
  --qt \
  --bsize 400 \
  --out "${OUTPUT_PATH}/regenie_step2_${anc}"


Overwriting REGENIE.sh


In [66]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit REGENIE.sh through the aou_dsub wrapper, with tasks taken from
# score_task.txt. Disk, RAM and timeout requests are the flag values below.
aou_dsub \
  --image willja16/r_with_plink_regenie \
  --disk-size 800 \
  --boot-disk-size 25 \
  --min-ram 30 \
  --timeout "144h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script REGENIE.sh \
  --tasks score_task.txt

Job properties:
  job-id: regenie--williamsjacr--250427-234426-19
  job-name: regenie
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/5130628365722022552
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12685230392961720833
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/1553640902003377733
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10114656875896946847
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/2348449403932750315
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/9354960312708151806
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3715688646150788466
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3795155697146286595
Provider internal-id (op

In [71]:
%%writefile GWAS_SumStats.R
rm(list = ls())

# Run a plink2 linear-regression GWAS for one trait in one ancestry.
#
# Command-line arguments:
#   1. path to the .bed file (the fileset prefix is derived from it)
#   2. all_phenotypes_file - CSV with `IID` and `ancestry` columns
#   3. all_train_file      - tab-delimited training table with `IID`, the trait and covariate columns
#   4. anc                 - ancestry label used to subset the training table
#   5. trait               - phenotype column name
#   6. OUTPUT_PATH         - directory receiving <trait>_sumstats_<anc>.*

# Run a shell command and stop the script if it exits non-zero, so a failed
# plink2 run fails the job instead of finishing with no results.
run_cmd <- function(cmd){
  status <- system(cmd)
  if(status != 0){
    stop("Command failed with exit status ",status,": ",cmd)
  }
  invisible(status)
}

args <- commandArgs(TRUE)

# Strip only a literal trailing ".bed" (the old ".bed" pattern was an
# unanchored regex).
BED_file <- sub("\\.bed$","",args[1])
print(BED_file)

all_phenotypes_file <- args[2]
print(all_phenotypes_file)

all_train_file <- args[3]
print(all_train_file)  # previously printed all_phenotypes_file by mistake

anc <- args[4]
print(anc)

trait <- args[5]
print(trait)

OUTPUT_PATH <- args[6]
print(OUTPUT_PATH)

all_phenotypes <- read.csv(all_phenotypes_file)
train_all <- read.delim(all_train_file)

# Keep only training rows whose IID belongs to the requested ancestry.
anc_ids <- all_phenotypes$IID[all_phenotypes$ancestry == anc]
write.table(train_all[train_all$IID %in% anc_ids,],file = paste0("All_Train_",anc,".txt"),sep = '\t',row.names = FALSE,quote = FALSE)

# NOTE(review): the --covar-name list uses ", " separators, so the shell passes
# tokens such as "age," to plink2 - confirm plink2 parses these as intended.
run_cmd(paste0("plink2 --bfile ",BED_file," --pheno All_Train_",anc,".txt --pheno-name ",trait," --linear --covar All_Train_",anc,".txt --covar-name age, age2, sex, PC1, PC2, PC3, PC4, PC5, PC6, PC7, PC8, PC9, PC10 --vif 999 --out ",OUTPUT_PATH,"/",trait,"_sumstats_",anc))

Writing GWAS_SumStats.R


In [72]:
%%writefile score_task.R

# dsub task table for the GWAS_SumStats job: one row per (ancestry, trait) pair.
# expand.grid varies its first factor fastest, so listing trait first gives
# rows ordered by ancestry and, within each ancestry, by trait.
combos <- expand.grid(
  trait = c("BMI","LDL","HDL","TC","logTG","Height"),
  anc = c("AFR","AMR","EUR"),
  stringsAsFactors = FALSE
)

# Length-one values are recycled across all rows.
tasks <- data.frame(
  '--input BED_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bed",
  '--input BIM_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bim",
  '--input FAM_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.fam",
  '--input all_phenotypes_file' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/all_phenotypes.csv",
  '--input all_train_file' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Train.txt",
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/GWAS_SumStats.R",
  '--env anc' = combos$anc,
  '--env trait' = combos$trait,
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats",
  check.names = FALSE  # keep the flag strings verbatim as column names
)

# Tab-separated, unquoted, header row included.
write.table(tasks,
            file = "score_task.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)

Overwriting score_task.R


In [73]:
%%writefile GWAS_SumStats.sh
#!/bin/bash

# Wrapper script for the GWAS_SumStats job: forwards the job's environment
# variables to the R script as positional arguments (.bed path, phenotype CSV,
# training table, ancestry, trait, output directory).

set -o errexit
set -o nounset

# Quote every expansion so a value containing whitespace or glob characters is
# still passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${BED_File}" "${all_phenotypes_file}" "${all_train_file}" "${anc}" "${trait}" "${OUTPUT_PATH}"

Writing GWAS_SumStats.sh


In [74]:
# Run the most recently written score_task.R, which writes the task table score_task.txt.
!Rscript score_task.R

In [75]:
# Upload the R script to the bucket path that the task table lists as the R_Script input.
!gsutil -m cp GWAS_SumStats.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://GWAS_SumStats.R [Content-Type=application/octet-stream]...
/ [1/1 files][  905.0 B/  905.0 B] 100% Done                                    
Operation completed over 1 objects/905.0 B.                                      


In [80]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit GWAS_SumStats.sh through the aou_dsub wrapper, with tasks taken from
# score_task.txt. Disk, RAM and timeout requests are the flag values below.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 800 \
  --boot-disk-size 25 \
  --min-ram 30 \
  --timeout "48h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script GWAS_SumStats.sh \
  --tasks score_task.txt

Job properties:
  job-id: gwas-sumst--williamsjacr--250424-173116-13
  job-name: gwas-sumstats
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3214562358622228061
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/17511041392178526707
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/1214001968959801695
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4518281125617921996
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7051887284840326656
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10142665077198811106
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/6126429204444064864
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/6321005950925884043
Provider intern

In [36]:
%%writefile GWAS_SumStats_Clean_Plink.R

# Reduce a plink2 .glm.linear result file to the additive-test rows and a
# fixed set of renamed columns.
#
# Command-line arguments:
#   1. path to the plink2 .glm.linear file
#   2. anc         - ancestry label, used in the output file name
#   3. trait       - trait name, used in the output file name
#   4. OUTPUT_PATH - directory receiving <anc>_<trait>_Plink_GWAS_SumStats_Cleaned

library(data.table)

args <- commandArgs(TRUE)

# Keep the path and the loaded table under separate names.
sumstats_file <- args[1]
print(sumstats_file)

anc <- args[2]
print(anc)

trait <- args[3]
print(trait)

OUTPUT_PATH <- args[4]
print(OUTPUT_PATH)

sumstats <- fread(sumstats_file)
# str() prints its own report; wrapping it in print() also printed "NULL".
str(sumstats)
sumstats <- as.data.frame(sumstats)

# Column names are assigned by position, so fail loudly when the file does not
# have exactly the expected number of columns.
plink_cols <- c("CHROM","POS","ID","REF","ALT","PROVISIONAL_REF","A1","OMITTED","A1_FREQ","TEST","OBS_CT","BETA","SE","T_STAT","P","ERRCODE")
if(ncol(sumstats) != length(plink_cols)){
  stop("Expected ",length(plink_cols)," columns in ",sumstats_file," but found ",ncol(sumstats))
}
colnames(sumstats) <- plink_cols

# Keep only the additive-model rows.
sumstats <- sumstats[sumstats$TEST == "ADD",]
sumstats <- sumstats[,c("CHROM","ID","POS","REF","ALT","A1","OBS_CT","BETA","SE","P")]
colnames(sumstats) <- c("CHR","SNP","BP","REF","ALT","A1","OBS_CT","BETA","SE","P")
write.table(sumstats, file = paste0(OUTPUT_PATH,"/",anc,"_",trait,"_Plink_GWAS_SumStats_Cleaned"), sep = "\t", row.names = FALSE, quote = FALSE)

Overwriting GWAS_SumStats_Clean_Plink.R


In [37]:
%%writefile GWAS_SumStats_Clean_Plink.sh
#!/bin/bash

# Wrapper script for the GWAS_SumStats_Clean_Plink job: forwards the job's
# environment variables to the R script as positional arguments
# (summary-statistics file, ancestry, trait, output directory).

set -o errexit
set -o nounset

# Quote every expansion so a value containing whitespace or glob characters is
# still passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${sumstats}" "${anc}" "${trait}" "${OUTPUT_PATH}"

Overwriting GWAS_SumStats_Clean_Plink.sh


In [38]:
%%writefile score_task.R

# dsub task table for the GWAS_SumStats_Clean_Plink job: one row per
# (ancestry, trait) pair.
#
# The former top-level `anc <- "EUR"` / `trait <- "HDL"` assignments were dead
# code (the loops overwrote them immediately) and have been removed. The table
# is now built in one call instead of growing row by row with rbind.
# expand.grid varies its first factor fastest, so listing trait first keeps
# the original row order (ancestry outer, trait inner).
combos <- expand.grid(
  trait = c("BMI","LDL","HDL","Height","TC","logTG"),
  anc = c("EUR","AMR","AFR"),
  stringsAsFactors = FALSE
)

# Length-one values are recycled across all rows.
tasks <- data.frame(
  '--input sumstats' = paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats/",combos$trait,"_sumstats_",combos$anc,".",combos$trait,".glm.linear"),
  '--env anc' = combos$anc,
  '--env trait' = combos$trait,
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats",
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/GWAS_SumStats_Clean_Plink.R",
  check.names = FALSE  # keep the flag strings verbatim as column names
)

# Tab-separated, unquoted, header row included.
write.table(tasks,
            file = "score_task.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)

Overwriting score_task.R


In [39]:
# Run the most recently written score_task.R, which writes the task table score_task.txt.
!Rscript score_task.R

In [40]:
# Upload the R script to the bucket path that the task table lists as the R_Script input.
!gsutil -m cp GWAS_SumStats_Clean_Plink.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://GWAS_SumStats_Clean_Plink.R [Content-Type=application/octet-stream]...
/ [1/1 files][  793.0 B/  793.0 B] 100% Done                                    
Operation completed over 1 objects/793.0 B.                                      


In [41]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit GWAS_SumStats_Clean_Plink.sh through the aou_dsub wrapper, with tasks
# taken from score_task.txt. Disk, RAM and timeout requests are the flag values below.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 40 \
  --boot-disk-size 25 \
  --min-ram 60 \
  --timeout "48h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script GWAS_SumStats_Clean_Plink.sh \
  --tasks score_task.txt

Job properties:
  job-id: gwas-sumst--williamsjacr--250425-122607-63
  job-name: gwas-sumstats-clean-plink
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/14674126123918469510
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12452681274097805714
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/14256211189955613424
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/14878270159713398819
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7764722351286179164
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10541295791420624510
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4394802813209458800
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4845252920812637302


In [8]:
%%writefile tune_validation_anc_bed.R

rm(list = ls())

# Write ancestry-specific tuning and validation PLINK filesets.
#
# Command-line arguments:
#   1. Tune_File   - tab-delimited table with an `IID` column (tuning samples)
#   2. Valid_File  - tab-delimited table with an `IID` column (validation samples)
#   3. all_File    - CSV with `IID` and `ancestry` columns
#   4. path to the full .bed file (the fileset prefix is derived from it)
#   5. anc         - ancestry label to subset to
#   6. OUTPUT_PATH - directory receiving all_tune_<anc>.* and all_valid_<anc>.*

# Run a shell command and stop the script if it exits non-zero, so a failed
# plink run fails the job instead of being silently ignored.
run_cmd <- function(cmd){
  status <- system(cmd)
  if(status != 0){
    stop("Command failed with exit status ",status,": ",cmd)
  }
  invisible(status)
}

args <- commandArgs(TRUE)

Tune_File <- args[1]
print(Tune_File)

Valid_File <- args[2]
print(Valid_File)

all_File <- args[3]
print(all_File)

# Strip only a literal trailing ".bed" (the old ".bed" pattern was an
# unanchored regex).
BED_Full_File <- sub("\\.bed$","",args[4])
print(BED_Full_File)

anc <- args[5]
print(anc)

OUTPUT_PATH <- args[6]
print(OUTPUT_PATH)

# Read the phenotype table once; it was previously read separately for the
# tuning and validation halves.
All <- read.csv(all_File)
anc_ids <- All$IID[All$ancestry == anc]

# Subset the full fileset to the samples of `split_file` that belong to the
# requested ancestry and write it to <OUTPUT_PATH>/<out_name>.
subset_to <- function(split_file,out_name){
  split_ids <- read.delim(split_file)$IID
  Keep <- data.frame(FID = 0,IID = split_ids[split_ids %in% anc_ids])
  write.table(Keep,"Keep.txt",row.names = FALSE,col.names = FALSE)
  run_cmd(paste0("plink --bfile ",BED_Full_File," --keep Keep.txt --make-bed --out ",OUTPUT_PATH,"/",out_name))
}

subset_to(Tune_File,paste0("all_tune_",anc))
subset_to(Valid_File,paste0("all_valid_",anc))


Writing tune_validation_anc_bed.R


In [9]:
%%writefile score_task.R

# dsub task table for the tune_validation_anc_bed job: one row per ancestry.
# The ancestry vector sets the row count; length-one values are recycled.
tasks <- data.frame(
  '--env anc' = c("AFR","EUR","AMR"),
  '--input Tune_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Tune.txt",
  '--input Valid_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Validation.txt",
  '--input all_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/all_phenotypes.csv",
  '--input BED_Full_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bed",
  '--input BIM_Full_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bim",
  '--input FAM_Full_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.fam",
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/tune_validation_anc_bed.R",
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/Tune_Validation_BedFiles",
  check.names = FALSE  # keep the flag strings verbatim as column names
)

# Tab-separated, unquoted, header row included.
write.table(tasks,
            file = "score_task.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)

Overwriting score_task.R


In [10]:
%%writefile tune_validation_anc_bed.sh
#!/bin/bash

# Wrapper script for the tune_validation_anc_bed job: forwards the job's
# environment variables to the R script as positional arguments (tuning table,
# validation table, phenotype CSV, .bed path, ancestry, output directory).

set -o errexit
set -o nounset

# Quote every expansion so a value containing whitespace or glob characters is
# still passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${Tune_File}" "${Valid_File}" "${all_File}" "${BED_Full_File}" "${anc}" "${OUTPUT_PATH}"

Writing tune_validation_anc_bed.sh


In [11]:
# Run the most recently written score_task.R, which writes the task table score_task.txt.
!Rscript score_task.R

In [12]:
# Upload the R script to the bucket path that the task table lists as the R_Script input.
!gsutil -m cp tune_validation_anc_bed.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://tune_validation_anc_bed.R [Content-Type=application/octet-stream]...
- [1/1 files][  1.0 KiB/  1.0 KiB] 100% Done                                    
Operation completed over 1 objects/1.0 KiB.                                      


In [13]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit tune_validation_anc_bed.sh through the aou_dsub wrapper, with tasks
# taken from score_task.txt. Disk, RAM and timeout requests are the flag values below.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 700 \
  --boot-disk-size 25 \
  --min-ram 30 \
  --timeout "96h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script tune_validation_anc_bed.sh \
  --tasks score_task.txt

Job properties:
  job-id: tune-valid--williamsjacr--250417-112318-03
  job-name: tune-validation-anc-bed
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7194155009525131398
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/984999540801392388
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12867490442755241193
Launched job-id: tune-valid--williamsjacr--250417-112318-03
3 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'tune-valid--williamsjacr--250417-112318-03' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'tune-valid--williamsjacr--250417-112318-03' --users 'williamsjacr'


In [117]:
%%writefile JointPRS_MUSSEL_PROSPER_BED_Files.R

rm(list = ls())

library(dplyr)

# Build the PLINK filesets used by JointPRS / MUSSEL / PROSPER.
#
# Steps:
#   1. Map each variant ID in the full .bim to an rsID through the
#      GRCh37/38 match table, trying both allele orders.
#   2. Extract the variants whose rsID is in the extract list and rename them
#      to rsIDs (all_JointPRS_MUSSEL_PROSPER).
#   3. Subset that fileset to the tuning and validation samples, overall and
#      per ancestry (AFR, EUR, AMR).
#
# Command-line arguments:
#   1. Tune_File            - tab-delimited table with an `IID` column
#   2. Valid_File           - tab-delimited table with an `IID` column
#   3. all_File             - CSV with `IID` and `ancestry` columns
#   4. snp_info_rsids       - .rds match table (chr, pos38, allele1_38, allele2_38, rsid)
#   5. snp_info_extractlist - CSV with a `SNP` column of rsIDs to keep
#   6. path to the full .bed file (the fileset prefix is derived from it)
#   7. OUTPUT_PATH          - directory receiving every output fileset

# Run a shell command and stop the script if it exits non-zero, so a failed
# plink step fails the job instead of cascading into later steps.
run_cmd <- function(cmd){
    status <- system(cmd)
    if(status != 0){
        stop("Command failed with exit status ",status,": ",cmd)
    }
    invisible(status)
}

Tune_File <- commandArgs(TRUE)[1]
print(Tune_File)

Valid_File <- commandArgs(TRUE)[2]
print(Valid_File)

all_File <- commandArgs(TRUE)[3]
print(all_File)

snp_info_rsids <- commandArgs(TRUE)[4]
print(snp_info_rsids)

snp_info_extractlist <- commandArgs(TRUE)[5]
print(snp_info_extractlist)

# Strip only a literal trailing ".bed" (the old ".bed" pattern was an
# unanchored regex).
BED_Full_File <- commandArgs(TRUE)[6]
BED_Full_File <- sub("\\.bed$","",BED_Full_File)
print(BED_Full_File)

OUTPUT_PATH <- commandArgs(TRUE)[7]
print(OUTPUT_PATH)

all_chr_bim <- read.delim(paste0(BED_Full_File,".bim"), header=FALSE)
all_chr_bim <- as.data.frame(all_chr_bim)
colnames(all_chr_bim) <- c("CHR","SNP","IDK","BP","A1","A2")

# `snp` keeps the original ID, `rs_ID` starts as the original ID and is
# replaced by an rsID where one is found; `SNP` becomes the normalized join key.
all_chr_bim$snp <- all_chr_bim$SNP
all_chr_bim$rs_ID <- all_chr_bim$SNP

all_chr_bim$SNP <- toupper(all_chr_bim$SNP)
all_chr_bim$SNP <- gsub(" ","",all_chr_bim$SNP)

SNP_GRCh37_38_match_update <- readRDS(snp_info_rsids)

# The match table is in memory now; delete the localized copy.
unlink(snp_info_rsids)

# Two candidate keys per variant, one for each allele order.
SNP_GRCh37_38_match_update$unique_id1 <- paste0("chr",SNP_GRCh37_38_match_update$chr,":",SNP_GRCh37_38_match_update$pos38,":",
                                                SNP_GRCh37_38_match_update$allele1_38,":",SNP_GRCh37_38_match_update$allele2_38)
SNP_GRCh37_38_match_update$unique_id2 <- paste0("chr",SNP_GRCh37_38_match_update$chr,":",SNP_GRCh37_38_match_update$pos38,":",
                                                SNP_GRCh37_38_match_update$allele2_38,":",SNP_GRCh37_38_match_update$allele1_38)

SNP_GRCh37_38_match_update$unique_id1 <- toupper(SNP_GRCh37_38_match_update$unique_id1)
SNP_GRCh37_38_match_update$unique_id1 <- gsub(" ","",SNP_GRCh37_38_match_update$unique_id1)

SNP_GRCh37_38_match_update$unique_id2 <- toupper(SNP_GRCh37_38_match_update$unique_id2)
SNP_GRCh37_38_match_update$unique_id2 <- gsub(" ","",SNP_GRCh37_38_match_update$unique_id2)

# First pass: match on allele1:allele2 order.
all_chr_bim <- left_join(all_chr_bim,SNP_GRCh37_38_match_update[,c("unique_id1","rsid")],by = c("SNP"="unique_id1"))
all_chr_bim$rs_ID[!is.na(all_chr_bim$rsid)] <- all_chr_bim$rsid[!is.na(all_chr_bim$rsid)]

all_chr_bim <- subset(all_chr_bim,select = -c(rsid))

# Second pass: match on the swapped allele order.
all_chr_bim <- left_join(all_chr_bim,SNP_GRCh37_38_match_update[,c("unique_id2","rsid")],by = c("SNP"="unique_id2"))
all_chr_bim$rs_ID[!is.na(all_chr_bim$rsid)] <- all_chr_bim$rsid[!is.na(all_chr_bim$rsid)]

snp_info_extractlist <- read.csv(snp_info_extractlist)
extract_list <- all_chr_bim$snp[all_chr_bim$rs_ID %in% snp_info_extractlist$SNP]
write.table(extract_list,"extract_list.txt",row.names = F,col.names = F,quote=F)

# Extract Needed SNPs in Full File
run_cmd(paste0("plink --bfile ",BED_Full_File," --extract extract_list.txt --make-bed --out ",OUTPUT_PATH,"/all_JointPRS_MUSSEL_PROSPER"))

# The full fileset is no longer needed; delete the localized copies.
unlink(paste0(BED_Full_File,c(".bed",".bim",".fam")))

write.table(all_chr_bim[,c("snp","rs_ID")],file = "snp_rename.txt",row.names = F,col.names = F,quote=F)

# Rename SNPs to rsids
run_cmd(paste0("plink --bfile ",OUTPUT_PATH,"/all_JointPRS_MUSSEL_PROSPER --update-name snp_rename.txt 2 1 --make-bed --out ",OUTPUT_PATH,"/all_JointPRS_MUSSEL_PROSPER"))

# Write `ids` to Keep.txt (FID fixed at 0) and subset the renamed fileset to
# those samples, writing <OUTPUT_PATH>/<out_name>.
subset_samples <- function(ids,out_name){
    Keep <- data.frame(FID = 0,IID = ids)
    write.table(Keep,"Keep.txt",row.names = FALSE,col.names = FALSE)
    run_cmd(paste0("plink --bfile ",OUTPUT_PATH,"/all_JointPRS_MUSSEL_PROSPER --keep Keep.txt --make-bed --out ",OUTPUT_PATH,"/",out_name))
}

# Phenotype table, read once and used for both the tuning and validation splits.
All <- read.csv(all_File)

Tune <- read.delim(Tune_File)

## Full Tune
subset_samples(Tune$IID,"all_tune_JointPRS_MUSSEL_PROSPER")

# Anc Specific Tune
for(anc in c("AFR","EUR","AMR")){
    subset_samples(Tune$IID[Tune$IID %in% All$IID[All$ancestry == anc]],
                   paste0("all_tune_JointPRS_MUSSEL_PROSPER_",anc))
}

Valid <- read.delim(Valid_File)

## Full Valid
subset_samples(Valid$IID,"all_valid_JointPRS_MUSSEL_PROSPER")

# Anc Specific Valid
# NOTE(review): these outputs use a capitalised "all_Valid_" prefix, unlike
# "all_valid_" above; kept unchanged in case later steps read the files under
# this exact name - confirm before normalising.
for(anc in c("AFR","EUR","AMR")){
    subset_samples(Valid$IID[Valid$IID %in% All$IID[All$ancestry == anc]],
                   paste0("all_Valid_JointPRS_MUSSEL_PROSPER_",anc))
}
}

Overwriting JointPRS_MUSSEL_PROSPER_BED_Files.R


In [118]:
%%writefile score_task.R

# Single-row dsub task table for the JointPRS_MUSSEL_PROSPER_BED_Files job.
# Each column name is a dsub flag plus variable name; each value is a gs:// path.
tasks <- data.frame(
  '--input Tune_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Tune.txt",
  '--input Valid_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Validation.txt",
  '--input all_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/all_phenotypes.csv",
  '--input snp_info_rsids' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/SNP_GRCh37_38_match_update.rds",
  '--input snp_info_extractlist' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/SNP_Info_JointPRS_PROSPER_MUSSEL.csv",
  '--input BED_Full_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bed",
  '--input BIM_Full_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bim",
  '--input FAM_Full_File' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.fam",
  '--input R_Script' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/JointPRS_MUSSEL_PROSPER_BED_Files.R",
  '--output-recursive OUTPUT_PATH' = "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/JointPRS_MUSSEL_PROSPER_BED",
  check.names = FALSE  # keep the flag strings verbatim as column names
)

# Tab-separated, unquoted, header row included.
write.table(tasks,
            file = "score_task.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)

Overwriting score_task.R


In [119]:
%%writefile JointPRS_MUSSEL_PROSPER_BED_Files.sh
#!/bin/bash

# Wrapper script for the JointPRS_MUSSEL_PROSPER_BED_Files job: forwards the
# job's environment variables to the R script as positional arguments (tuning
# table, validation table, phenotype CSV, rsID match table, extract list,
# .bed path, output directory).

set -o errexit
set -o nounset

# Quote every expansion so a value containing whitespace or glob characters is
# still passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${Tune_File}" "${Valid_File}" "${all_File}" "${snp_info_rsids}" "${snp_info_extractlist}" "${BED_Full_File}" "${OUTPUT_PATH}"

Overwriting JointPRS_MUSSEL_PROSPER_BED_Files.sh


In [120]:
# Run the most recently written score_task.R, which writes the task table score_task.txt.
!Rscript score_task.R

In [121]:
# Upload the R script to the bucket path that the task table lists as the R_Script input.
!gsutil -m cp JointPRS_MUSSEL_PROSPER_BED_Files.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://JointPRS_MUSSEL_PROSPER_BED_Files.R [Content-Type=application/octet-stream]...
/ [1/1 files][  4.5 KiB/  4.5 KiB] 100% Done                                    
Operation completed over 1 objects/4.5 KiB.                                      


In [122]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit JointPRS_MUSSEL_PROSPER_BED_Files.sh through the aou_dsub wrapper,
# with tasks taken from score_task.txt. Disk, RAM and timeout requests are the
# flag values below.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 700 \
  --boot-disk-size 25 \
  --min-ram 80 \
  --timeout "168h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script JointPRS_MUSSEL_PROSPER_BED_Files.sh \
  --tasks score_task.txt

Job properties:
  job-id: jointprs-m--williamsjacr--250417-151205-37
  job-name: jointprs-mussel-prosper-bed-files
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/2830465926782299950
Launched job-id: jointprs-m--williamsjacr--250417-151205-37
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'jointprs-m--williamsjacr--250417-151205-37' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'jointprs-m--williamsjacr--250417-151205-37' --users 'williamsjacr'


In [24]:
%%writefile tune_validation_bed.R

rm(list = ls())

# Write the tuning and validation PLINK filesets (all ancestries together).
#
# Command-line arguments:
#   1. Tune_File   - tab-delimited table with an `IID` column (tuning samples)
#   2. Valid_File  - tab-delimited table with an `IID` column (validation samples)
#   3. path to the full .bed file (the fileset prefix is derived from it)
#   4. OUTPUT_PATH - directory receiving all_tune.* and all_valid.*

# Run a shell command and stop the script if it exits non-zero, so a failed
# plink run fails the job instead of being silently ignored.
run_cmd <- function(cmd){
  status <- system(cmd)
  if(status != 0){
    stop("Command failed with exit status ",status,": ",cmd)
  }
  invisible(status)
}

args <- commandArgs(TRUE)

Tune_File <- args[1]
print(Tune_File)

Valid_File <- args[2]
print(Valid_File)

# Strip only a literal trailing ".bed" (the old ".bed" pattern was an
# unanchored regex).
BED_Full_File <- sub("\\.bed$","",args[3])
print(BED_Full_File)

OUTPUT_PATH <- args[4]
print(OUTPUT_PATH)

# Subset the full fileset to the IIDs listed in `split_file` (FID fixed at 0)
# and write it to <OUTPUT_PATH>/<out_name>.
subset_to <- function(split_file,out_name){
  Keep <- data.frame(FID = 0,IID = read.delim(split_file)$IID)
  write.table(Keep,"Keep.txt",row.names = FALSE,col.names = FALSE)
  run_cmd(paste0("plink --bfile ",BED_Full_File," --keep Keep.txt --make-bed --out ",OUTPUT_PATH,"/",out_name))
}

subset_to(Tune_File,"all_tune")
subset_to(Valid_File,"all_valid")


Writing tune_validation_bed.R


In [25]:
%%writefile score_task.R

# One-row dsub task table for the tune/validation BED split.
# Each column header is a dsub flag plus variable name; the cell is its value.
tasks <- data.frame(
    '--input Tune_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Tune.txt",
    '--input Valid_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Validation.txt",
    '--input BED_Full_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bed",
    '--input BIM_Full_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bim",
    '--input FAM_Full_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.fam",
    '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/tune_validation_bed.R",
    '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/Tune_Validation_BedFiles",
    check.names = FALSE   # keep the flag-style headers verbatim
)

# Tab-separated, header row, no quoting, no row names.
write.table(
    tasks,
    file = "score_task.txt",
    sep = '\t',
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE
)

Overwriting score_task.R


In [26]:
%%writefile tune_validation_bed.sh
#!/bin/bash

# dsub task entry point: runs the R script on the task's inputs.
# R_Script, Tune_File, Valid_File, BED_Full_File and OUTPUT_PATH are read from
# the environment (presumably exported by dsub from the score_task.txt columns
# of the same names). The .bim/.fam inputs are not passed explicitly; the R
# script derives the fileset prefix from BED_Full_File.

set -o errexit
set -o nounset

# Quote every expansion so a path containing spaces or glob characters is
# passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${Tune_File}" "${Valid_File}" "${BED_Full_File}" "${OUTPUT_PATH}"

Writing tune_validation_bed.sh


In [27]:
# Regenerate score_task.txt from the score_task.R written above; the dsub
# submission below passes that file via --tasks.
!Rscript score_task.R

In [28]:
# Upload the R script to the bucket's Scripts/ folder; score_task.R lists that
# gs:// object as the task's `--input R_Script`.
!gsutil -m cp tune_validation_bed.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://tune_validation_bed.R [Content-Type=application/octet-stream]...
/ [1/1 files][  823.0 B/  823.0 B] 100% Done                                    
Operation completed over 1 objects/823.0 B.                                      


In [29]:
%%bash --out score_batch

# Load the aou_dsub wrapper function (written to ~/aou_dsub.bash by a setup cell).
source ~/aou_dsub.bash

# Job-specific dsub flags, collected in an array and forwarded unchanged.
# --logging is also set inside aou_dsub; this later value presumably takes
# precedence -- confirm against dsub's argument handling.
dsub_args=(
  --image willja16/r_with_plink
  --disk-size 700
  --boot-disk-size 40
  --min-ram 30
  --timeout "96h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script tune_validation_bed.sh
  --tasks score_task.txt
)

aou_dsub "${dsub_args[@]}"

Job properties:
  job-id: tune-valid--williamsjacr--250417-112549-30
  job-name: tune-validation-bed
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/5390226490247056300
Launched job-id: tune-valid--williamsjacr--250417-112549-30
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'tune-valid--williamsjacr--250417-112549-30' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'tune-valid--williamsjacr--250417-112549-30' --users 'williamsjacr'


In [21]:
%%writefile tune_validation_phenotypes.R

# Write headerless, tab-separated covariate and phenotype files for the tuning
# and validation sets, overall and per ancestry.
#
# Positional arguments:
#   1. Tune_File    tab-delimited tuning table (FID, IID, covariates, traits)
#   2. Valid_File   tab-delimited validation table (same columns)
#   3. all_File     CSV with at least IID and ancestry columns
#   4. OUTPUT_PATH  directory receiving the output files
#
# Files written for <split> in {tune, valid}:
#   full_<split>_covar.txt            full_<split>_covar_<ANC>.txt
#   <trait>_full_<split>_y.txt        <trait>_full_<split>_y_<ANC>.txt

rm(list = ls())

args <- commandArgs(TRUE)

Tune_File <- args[1]
print(Tune_File)

Valid_File <- args[2]
print(Valid_File)

all_File <- args[3]
print(all_File)

OUTPUT_PATH <- args[4]
print(OUTPUT_PATH)

ANCESTRIES <- c("AFR","AMR","EUR")
TRAITS <- c("BMI","LDL","HDL","logTG","TC","Height")
# Column order of the covariate files (no header is written, so order matters).
COVARIATES <- c("FID","IID","age","age2","sex",paste0("PC",1:10))

# Ancestry labels are read once and shared by both splits.
All <- read.csv(all_File)

# Write a data frame as tab-separated text with no header, quotes or row names.
write_plain <- function(df, path) {
    write.table(df,file = path,sep = '\t',row.names = FALSE,quote = FALSE,col.names = FALSE)
}

# Write every covariate/phenotype file for one split ("tune" or "valid").
# Selecting columns by name stops with an error if an expected column is
# missing, instead of silently producing a narrower file.
write_split <- function(pheno, split) {
    # Row membership per ancestry, computed once and reused for every file.
    in_ancestry <- lapply(setNames(ANCESTRIES, ANCESTRIES), function(anc) {
        pheno$IID %in% All$IID[All$ancestry == anc]
    })

    covar <- pheno[, COVARIATES]
    write_plain(covar, paste0(OUTPUT_PATH,"/full_",split,"_covar.txt"))
    for(anc in ANCESTRIES){
        write_plain(covar[in_ancestry[[anc]],], paste0(OUTPUT_PATH,"/full_",split,"_covar_",anc,".txt"))
    }

    for(trait in TRAITS){
        y <- pheno[, c("FID","IID",trait)]
        write_plain(y, paste0(OUTPUT_PATH,"/",trait,"_full_",split,"_y.txt"))
        for(anc in ANCESTRIES){
            write_plain(y[in_ancestry[[anc]],], paste0(OUTPUT_PATH,"/",trait,"_full_",split,"_y_",anc,".txt"))
        }
    }
}

write_split(read.delim(Tune_File), "tune")
write_split(read.delim(Valid_File), "valid")



Writing tune_validation_phenotypes.R


In [22]:
%%writefile score_task.R

# One-row dsub task table for the tune/validation phenotype export.
# Each column header is a dsub flag plus variable name; the cell is its value.
tasks <- data.frame(
    '--input Tune_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Tune.txt",
    '--input Valid_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Validation.txt",
    '--input all_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/all_phenotypes.csv",
    '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/tune_validation_phenotypes.R",
    '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/Tune_Validation_Phenotypes",
    check.names = FALSE   # keep the flag-style headers verbatim
)

# Tab-separated, header row, no quoting, no row names.
write.table(
    tasks,
    file = "score_task.txt",
    sep = '\t',
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE
)

Overwriting score_task.R


In [23]:
%%writefile tune_validation_phenotypes.sh
#!/bin/bash

# dsub task entry point: runs the R script on the task's inputs.
# R_Script, Tune_File, Valid_File, all_File and OUTPUT_PATH are read from the
# environment (presumably exported by dsub from the score_task.txt columns of
# the same names).

set -o errexit
set -o nounset

# Quote every expansion so a path containing spaces or glob characters is
# passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${Tune_File}" "${Valid_File}" "${all_File}" "${OUTPUT_PATH}"

Writing tune_validation_phenotypes.sh


In [24]:
# Regenerate score_task.txt from the score_task.R written above; the dsub
# submission below passes that file via --tasks.
!Rscript score_task.R

In [25]:
# Upload the R script to the bucket's Scripts/ folder; score_task.R lists that
# gs:// object as the task's `--input R_Script`.
!gsutil -m cp tune_validation_phenotypes.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://tune_validation_phenotypes.R [Content-Type=application/octet-stream]...
/ [1/1 files][  2.8 KiB/  2.8 KiB] 100% Done                                    
Operation completed over 1 objects/2.8 KiB.                                      


In [26]:
%%bash --out score_batch

# Load the aou_dsub wrapper function (written to ~/aou_dsub.bash by a setup cell).
source ~/aou_dsub.bash

# Job-specific dsub flags, collected in an array and forwarded unchanged.
# --logging is also set inside aou_dsub; this later value presumably takes
# precedence -- confirm against dsub's argument handling.
dsub_args=(
  --image willja16/r_with_plink
  --disk-size 40
  --boot-disk-size 25
  --min-ram 8
  --timeout "96h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script tune_validation_phenotypes.sh
  --tasks score_task.txt
)

aou_dsub "${dsub_args[@]}"

Job properties:
  job-id: tune-valid--williamsjacr--240722-175637-83
  job-name: tune-validation-phenotypes
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/16970955615058126312
Launched job-id: tune-valid--williamsjacr--240722-175637-83
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'tune-valid--williamsjacr--240722-175637-83' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'tune-valid--williamsjacr--240722-175637-83' --users 'williamsjacr'


In [10]:
%%writefile reference_bed.R

# Build one reference PLINK fileset per ancestry from a random subset of the
# training samples.
#
# Positional arguments:
#   1. pheno_train          tab-delimited training table with an IID column
#   2. all_phenotypes_file  CSV with at least IID and ancestry columns
#   3. BED_Full_File        path to the full .bed file; the .bim/.fam are
#                           expected under the same prefix
#   4. OUTPUT_PATH          directory receiving all_chr_reference_<ANC>.*

rm(list = ls())

# Write sample IDs in plain decimal form: by default R may emit a round value
# such as 1000000 as "1e+06", which would not match the IDs in the .fam file.
options(scipen = 999)

# Fixed (arbitrary) seed so the reference panel can be regenerated exactly.
set.seed(2025)

# Target number of reference samples per ancestry.
REFERENCE_SIZE <- 3000

args <- commandArgs(TRUE)

pheno_train_file <- args[1]
print(pheno_train_file)

all_phenotypes_file <- args[2]
print(all_phenotypes_file)

# Strip only a literal, trailing ".bed". The previous pattern ".bed" treated
# "." as a wildcard and removed every match anywhere in the path.
BED_Full_File <- sub("\\.bed$","",args[3])
print(BED_Full_File)

OUTPUT_PATH <- args[4]
print(OUTPUT_PATH)

pheno_train <- read.delim(pheno_train_file)

all_phenotypes <- read.csv(all_phenotypes_file)

for(anc in c("AFR","AMR","EUR")){
    candidates <- pheno_train$IID[pheno_train$IID %in% all_phenotypes$IID[all_phenotypes$ancestry == anc]]

    if(length(candidates) == 0){
        stop("No training samples found for ancestry ", anc)
    }

    # sample(x, 3000) errors when fewer than 3000 candidates exist; fall back to
    # all of them and say so.
    n_ref <- min(REFERENCE_SIZE, length(candidates))
    if(n_ref < REFERENCE_SIZE){
        warning("Only ", n_ref, " training samples for ", anc, "; using all of them")
    }

    # Sample positions rather than values: sample(x, n) on a single numeric x
    # would draw from 1:x instead of returning x.
    Keep <- data.frame(FID = 0,IID = candidates[sample.int(length(candidates), n_ref)])
    write.table(Keep,"Keep.txt",row.names = FALSE,col.names = FALSE,quote = FALSE)

    cmd <- paste0("plink --bfile ",shQuote(BED_Full_File),
                  " --keep Keep.txt --make-bed --out ",shQuote(paste0(OUTPUT_PATH,"/all_chr_reference_",anc)))

    # Without intern = TRUE, system() returns the exit status and plink's own
    # output goes straight to the job log.
    status <- system(cmd)
    if(status != 0){
        stop("plink failed for ancestry ", anc, " (exit status ", status, ")")
    }
}
}

Writing reference_bed.R


In [11]:
%%writefile score_task.R

# One-row dsub task table for building the per-ancestry reference BED files.
# Each column header is a dsub flag plus variable name; the cell is its value.
tasks <- data.frame(
    '--input pheno_train'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Train.txt",
    '--input all_phenotypes_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/all_phenotypes.csv",
    '--input BED_Full_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bed",
    '--input BIM_Full_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.bim",
    '--input FAM_Full_File'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants/all_chr.fam",
    '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/reference_bed.R",
    '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/CommonVariants",
    check.names = FALSE   # keep the flag-style headers verbatim
)

# Tab-separated, header row, no quoting, no row names.
write.table(
    tasks,
    file = "score_task.txt",
    sep = '\t',
    quote = FALSE,
    row.names = FALSE,
    col.names = TRUE
)

Overwriting score_task.R


In [12]:
%%writefile reference_bed.sh
#!/bin/bash

# dsub task entry point: runs the R script on the task's inputs.
# R_Script, pheno_train, all_phenotypes_file, BED_Full_File and OUTPUT_PATH are
# read from the environment (presumably exported by dsub from the
# score_task.txt columns of the same names). The .bim/.fam inputs are not
# passed explicitly; the R script derives the fileset prefix from BED_Full_File.

set -o errexit
set -o nounset

# Quote every expansion so a path containing spaces or glob characters is
# passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${pheno_train}" "${all_phenotypes_file}" "${BED_Full_File}" "${OUTPUT_PATH}"

Writing reference_bed.sh


In [13]:
# Regenerate score_task.txt from the score_task.R written above; the dsub
# submission below passes that file via --tasks.
!Rscript score_task.R

In [14]:
# Upload the R script to the bucket's Scripts/ folder; score_task.R lists that
# gs:// object as the task's `--input R_Script`.
!gsutil -m cp reference_bed.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://reference_bed.R [Content-Type=application/octet-stream]...
/ [1/1 files][  799.0 B/  799.0 B] 100% Done                                    
Operation completed over 1 objects/799.0 B.                                      


In [15]:
%%bash --out score_batch

# Load the aou_dsub wrapper function (written to ~/aou_dsub.bash by a setup cell).
source ~/aou_dsub.bash

# Job-specific dsub flags, collected in an array and forwarded unchanged.
# --logging is also set inside aou_dsub; this later value presumably takes
# precedence -- confirm against dsub's argument handling.
dsub_args=(
  --image willja16/r_with_plink
  --disk-size 700
  --boot-disk-size 25
  --min-ram 30
  --timeout "96h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script reference_bed.sh
  --tasks score_task.txt
)

aou_dsub "${dsub_args[@]}"

Job properties:
  job-id: reference---williamsjacr--250418-123556-81
  job-name: reference-bed
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/16501531450550937699
Launched job-id: reference---williamsjacr--250418-123556-81
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'reference---williamsjacr--250418-123556-81' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'reference---williamsjacr--250418-123556-81' --users 'williamsjacr'


In [54]:
%%writefile GWAS_SumStats_Clean.R

# Reduce one GWAS result file to a nine-column, tab-separated summary table.
#
# Positional arguments:
#   1. sumstats     whitespace-delimited GWAS result file
#   2. anc          ancestry label (used in the output file name)
#   3. trait        trait label (used in the output file name)
#   4. OUTPUT_PATH  directory receiving <anc>_<trait>_GWAS_SumStats_Cleaned

cli <- commandArgs(TRUE)

sumstats_path <- cli[1]

anc <- cli[2]
print(anc)

trait <- cli[3]
print(trait)

OUTPUT_PATH <- cli[4]
print(OUTPUT_PATH)

# Columns are renamed by position, so the input must have exactly these 13
# columns in this order (presumably the regenie step-2 layout -- confirm).
gwas <- read.csv(sumstats_path, sep="")
names(gwas) <- c("CHROM","POS","ID","REF","ALT","A1_FREQ","N","TEST","BETA","SE","CHISQ","LOG10P","EXTRA")

# Back-transform -log10(p) to a p-value. NOTE(review): double precision
# underflows to 0 once LOG10P exceeds roughly 323.
gwas$P <- 10^(-1*gwas$LOG10P)

# Select and rename in one step; output column order is fixed here.
cleaned <- setNames(
    gwas[c("CHROM","ID","POS","REF","ALT","N","BETA","SE","P")],
    c("CHR","SNP","BP","REF","ALT","OBS_CT","BETA","SE","P")
)

out_file <- paste0(OUTPUT_PATH,"/",anc,"_",trait,"_GWAS_SumStats_Cleaned")
write.table(cleaned, file = out_file, sep = "\t", row.names = FALSE, quote = FALSE)


Overwriting GWAS_SumStats_Clean.R


In [55]:
%%writefile GWAS_SumStats_Clean.sh
#!/bin/bash

# dsub task entry point: runs the R script on one ancestry/trait pair.
# R_Script, sumstats, anc, trait and OUTPUT_PATH are read from the environment
# (presumably exported by dsub from the score_task.txt columns of the same
# names).

set -o errexit
set -o nounset

# Quote every expansion so a value containing spaces or glob characters is
# passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${sumstats}" "${anc}" "${trait}" "${OUTPUT_PATH}"

Overwriting GWAS_SumStats_Clean.sh


In [56]:
%%writefile score_task.R

# dsub task table for GWAS summary-statistic cleaning: one row per
# ancestry x trait pair. Each column header is a dsub flag plus variable name.

ancestries <- c("EUR","AMR","AFR")
traits <- c("BMI","LDL","HDL","Height","TC","logTG")

# All pairs, built in one pass instead of growing the table with rbind().
# Row order matches the previous nested loop: trait varies fastest within
# each ancestry.
anc <- rep(ancestries, each = length(traits))
trait <- rep(traits, times = length(ancestries))

# Length-one values are recycled to every row.
tasks <- data.frame(
    '--input sumstats'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats/regenie_step2_",anc,"_",trait,".regenie"),
    '--env anc'=anc,
    '--env trait'=trait,
    '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats",
    '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/GWAS_SumStats_Clean.R",
    check.names = FALSE   # keep the flag-style headers verbatim
)

write.table(tasks,
            file="score_task.txt",
            row.names=FALSE, col.names=TRUE,
            sep='\t', quote=FALSE)

Overwriting score_task.R


In [57]:
# Regenerate score_task.txt from the score_task.R written above; the dsub
# submission below passes that file via --tasks.
!Rscript score_task.R

In [58]:
# Upload the R script to the bucket's Scripts/ folder; score_task.R lists that
# gs:// object as the task's `--input R_Script`.
!gsutil -m cp GWAS_SumStats_Clean.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://GWAS_SumStats_Clean.R [Content-Type=application/octet-stream]...
/ [1/1 files][  662.0 B/  662.0 B] 100% Done                                    
Operation completed over 1 objects/662.0 B.                                      


In [59]:
%%bash --out score_batch

# Load the aou_dsub wrapper function (written to ~/aou_dsub.bash by a setup cell).
source ~/aou_dsub.bash

# Job-specific dsub flags, collected in an array and forwarded unchanged.
# --logging is also set inside aou_dsub; this later value presumably takes
# precedence -- confirm against dsub's argument handling.
dsub_args=(
  --image willja16/r_with_plink
  --disk-size 10
  --boot-disk-size 25
  --min-ram 20
  --timeout "48h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script GWAS_SumStats_Clean.sh
  --tasks score_task.txt
)

aou_dsub "${dsub_args[@]}"

Job properties:
  job-id: gwas-sumst--williamsjacr--250428-173207-76
  job-name: gwas-sumstats-clean
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7034685886252727257
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4082315975050440483
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7938481039455991176
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/15274961768330524042
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/11793126664326088779
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3982309362940487567
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10594059716017372268
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3991644517047102351
Provider

In [60]:
%%writefile GWAS_SumStats_Clean_QQPlots.R

# Reduce one GWAS result file to the five columns kept for QQ plots.
#
# Positional arguments:
#   1. sumstats     whitespace-delimited GWAS result file
#   2. anc          ancestry label (used in the output file name)
#   3. trait        trait label (used in the output file name)
#   4. OUTPUT_PATH  directory receiving
#                   <anc>_<trait>_GWAS_SumStats_Cleaned_QQPlots

cli <- commandArgs(TRUE)

sumstats_path <- cli[1]

anc <- cli[2]
print(anc)

trait <- cli[3]
print(trait)

OUTPUT_PATH <- cli[4]
print(OUTPUT_PATH)

# Columns are renamed by position, so the input must have exactly these 13
# columns in this order (presumably the regenie step-2 layout -- confirm).
gwas <- read.csv(sumstats_path, sep="")
names(gwas) <- c("CHROM","POS","ID","REF","ALT","A1_FREQ","N","TEST","BETA","SE","CHISQ","LOG10P","EXTRA")

# Back-transform -log10(p) to a p-value. NOTE(review): double precision
# underflows to 0 once LOG10P exceeds roughly 323.
gwas$P <- 10^(-1*gwas$LOG10P)

# Select and rename in one step; output column order is fixed here.
qq_table <- setNames(
    gwas[c("CHROM","ID","POS","A1_FREQ","P")],
    c("CHR","SNP","BP","A1_FREQ","P")
)

out_file <- paste0(OUTPUT_PATH,"/",anc,"_",trait,"_GWAS_SumStats_Cleaned_QQPlots")
write.table(qq_table, file = out_file, sep = "\t", row.names = FALSE, quote = FALSE)


Overwriting GWAS_SumStats_Clean_QQPlots.R


In [61]:
%%writefile GWAS_SumStats_Clean_QQPlots.sh
#!/bin/bash

# dsub task entry point: runs the R script on one ancestry/trait pair.
# R_Script, sumstats, anc, trait and OUTPUT_PATH are read from the environment
# (presumably exported by dsub from the score_task.txt columns of the same
# names).

set -o errexit
set -o nounset

# Quote every expansion so a value containing spaces or glob characters is
# passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${sumstats}" "${anc}" "${trait}" "${OUTPUT_PATH}"

Overwriting GWAS_SumStats_Clean_QQPlots.sh


In [62]:
%%writefile score_task.R

# dsub task table for the QQ-plot summary-statistic export: one row per
# ancestry x trait pair. Each column header is a dsub flag plus variable name.

ancestries <- c("EUR","AMR","AFR")
traits <- c("BMI","LDL","HDL","Height","TC","logTG")

# All pairs, built in one pass instead of growing the table with rbind().
# Row order matches the previous nested loop: trait varies fastest within
# each ancestry.
anc <- rep(ancestries, each = length(traits))
trait <- rep(traits, times = length(ancestries))

# Length-one values are recycled to every row.
tasks <- data.frame(
    '--input sumstats'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats/regenie_step2_",anc,"_",trait,".regenie"),
    '--env anc'=anc,
    '--env trait'=trait,
    '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats",
    '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/GWAS_SumStats_Clean_QQPlots.R",
    check.names = FALSE   # keep the flag-style headers verbatim
)

write.table(tasks,
            file="score_task.txt",
            row.names=FALSE, col.names=TRUE,
            sep='\t', quote=FALSE)

Overwriting score_task.R


In [63]:
# Regenerate score_task.txt from the score_task.R written above; the dsub
# submission below passes that file via --tasks.
!Rscript score_task.R

In [64]:
# Upload the R script to the bucket's Scripts/ folder; score_task.R lists that
# gs:// object as the task's `--input R_Script`.
!gsutil -m cp GWAS_SumStats_Clean_QQPlots.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://GWAS_SumStats_Clean_QQPlots.R [Content-Type=application/octet-stream]...
/ [1/1 files][  628.0 B/  628.0 B] 100% Done                                    
Operation completed over 1 objects/628.0 B.                                      


In [65]:
%%bash --out score_batch

# Load the aou_dsub wrapper function (written to ~/aou_dsub.bash by a setup cell).
source ~/aou_dsub.bash

# Job-specific dsub flags, collected in an array and forwarded unchanged.
# --logging is also set inside aou_dsub; this later value presumably takes
# precedence -- confirm against dsub's argument handling.
dsub_args=(
  --image willja16/r_with_plink
  --disk-size 10
  --boot-disk-size 25
  --min-ram 10
  --timeout "48h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script GWAS_SumStats_Clean_QQPlots.sh
  --tasks score_task.txt
)

aou_dsub "${dsub_args[@]}"

Job properties:
  job-id: gwas-sumst--williamsjacr--250428-173226-64
  job-name: gwas-sumstats-clean-qqplots
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/2403754769293709418
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/13781852294473506066
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12577162130862398103
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10308808330836817463
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/8410554366338355886
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4842582575030511619
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/2512501573097638363
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/5789696100044841828


In [74]:
%%writefile Add_RSIDs_Sumstats.R
# Replace variant IDs in a cleaned summary-statistics table with rsIDs where a
# match exists in the SNP lookup table, then drop IDs that occur more than once.
#
# Positional arguments:
#   1. sumstats     cleaned summary statistics (columns CHR, SNP, BP, REF,
#                   ALT, OBS_CT, BETA, SE, P)
#   2. snp_info     .rds lookup with chr, pos38, allele1_38, allele2_38, rsid
#   3. anc          ancestry label (used in the output file name)
#   4. trait        trait label (used in the output file name)
#   5. OUTPUT_PATH  directory receiving <anc>_<trait>_GWAS_SumStats_RSIDs.csv
rm(list = ls())

cli <- commandArgs(TRUE)

sumstats <- cli[1]
print(sumstats)

snp_info <- cli[2]
print(snp_info)

anc <- cli[3]
print(anc)

trait <- cli[4]
print(trait)

OUTPUT_PATH <- cli[5]
print(OUTPUT_PATH)

library(data.table)
library(dplyr)

# Upper-case an ID and remove spaces so both sides of the join compare equal.
normalise_id <- function(id) gsub(" ","",toupper(id))

snp_map <- readRDS(snp_info)

# Two keys per lookup row, one for each allele order: chr<chr>:<pos38>:<a>:<b>.
snp_map$unique_id1 <- normalise_id(paste0("chr",snp_map$chr,":",snp_map$pos38,":",
                                          snp_map$allele1_38,":",snp_map$allele2_38))
snp_map$unique_id2 <- normalise_id(paste0("chr",snp_map$chr,":",snp_map$pos38,":",
                                          snp_map$allele2_38,":",snp_map$allele1_38))

lookups <- list(
    unique_id1 = snp_map[,c("unique_id1","rsid")],
    unique_id2 = snp_map[,c("unique_id2","rsid")]
)

gwas <- as.data.frame(fread(sumstats))
gwas <- gwas[,c("CHR","SNP","BP","REF","ALT","OBS_CT","BETA","SE","P")]

# rs_ID starts as the untouched original ID; SNP becomes the normalised join key.
gwas$rs_ID <- gwas$SNP
gwas$SNP <- normalise_id(gwas$SNP)

# Try each allele order in turn; a hit overwrites rs_ID with the rsid.
for(key in names(lookups)){
    gwas <- left_join(gwas,lookups[[key]],by = setNames(key,"SNP"))
    matched <- !is.na(gwas$rsid)
    gwas$rs_ID[matched] <- gwas$rsid[matched]
    gwas$rsid <- NULL
}

gwas <- gwas[,c("CHR","SNP","BP","REF","ALT","OBS_CT","BETA","SE","P","rs_ID")]

gwas$SNP <- gwas$rs_ID

# IDs that appear on more than one row are removed entirely.
id_counts <- table(gwas$rs_ID)
repeated_ids <- names(id_counts)[id_counts > 1]

gwas <- gwas[!(gwas$SNP %in% repeated_ids),]

write.csv(gwas,file = paste0(OUTPUT_PATH,"/",anc,"_",trait,"_GWAS_SumStats_RSIDs.csv"),row.names = FALSE)

Overwriting Add_RSIDs_Sumstats.R


In [75]:
%%writefile Add_RSIDs_Sumstats.sh
#!/bin/bash

# dsub task entry point: runs the R script on one ancestry/trait pair.
# R_Script, sumstats, snp_info, anc, trait and OUTPUT_PATH are read from the
# environment (presumably exported by dsub from the score_task.txt columns of
# the same names).

set -o errexit
set -o nounset

# Quote every expansion so a value containing spaces or glob characters is
# passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${sumstats}" "${snp_info}" "${anc}" "${trait}" "${OUTPUT_PATH}"

Overwriting Add_RSIDs_Sumstats.sh


In [76]:
%%writefile score_task.R

# dsub task table for adding rsIDs to the cleaned summary statistics: one row
# per ancestry x trait pair. Each column header is a dsub flag plus variable name.

ancestries <- c("EUR","AFR","AMR")
traits <- c("BMI","LDL","HDL","TC","logTG","Height")

# All pairs, built in one pass instead of growing the table with rbind().
# Row order matches the previous nested loop: trait varies fastest within
# each ancestry.
anc <- rep(ancestries, each = length(traits))
trait <- rep(traits, times = length(ancestries))

# Length-one values are recycled to every row.
tasks <- data.frame(
    '--input sumstats'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats/",anc,"_",trait,"_GWAS_SumStats_Cleaned"),
    '--input snp_info'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Data/SNP_GRCh37_38_match_update.rds",
    '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/Add_RSIDs_Sumstats.R",
    '--env anc'=anc,
    '--env trait'=trait,
    '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats",
    check.names = FALSE   # keep the flag-style headers verbatim
)

write.table(tasks,
            file="score_task.txt",
            row.names=FALSE, col.names=TRUE,
            sep='\t', quote=FALSE)

Overwriting score_task.R


In [77]:
# Regenerate score_task.txt from the score_task.R written above; the dsub
# submission below passes that file via --tasks.
!Rscript score_task.R

In [78]:
# Upload the R script to the bucket's Scripts/ folder; score_task.R lists that
# gs:// object as the task's `--input R_Script`.
!gsutil -m cp Add_RSIDs_Sumstats.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://Add_RSIDs_Sumstats.R [Content-Type=application/octet-stream]...
/ [1/1 files][  2.3 KiB/  2.3 KiB] 100% Done                                    
Operation completed over 1 objects/2.3 KiB.                                      


In [79]:
%%bash --out score_batch

# Load the aou_dsub wrapper function (written to ~/aou_dsub.bash by a setup cell).
source ~/aou_dsub.bash

# Job-specific dsub flags, collected in an array and forwarded unchanged.
# --logging is also set inside aou_dsub; this later value presumably takes
# precedence -- confirm against dsub's argument handling.
dsub_args=(
  --image willja16/r_with_plink
  --disk-size 10
  --boot-disk-size 25
  --min-ram 80
  --timeout "96h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script Add_RSIDs_Sumstats.sh
  --tasks score_task.txt
)

aou_dsub "${dsub_args[@]}"

Job properties:
  job-id: add-rsids---williamsjacr--250428-174430-30
  job-name: add-rsids-sumstats
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/6998726100006922535
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4293973618065607811
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3001410113428019042
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/15389202591089947051
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/14367914056569811633
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/15698518150691807952
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/11655765565357011582
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3650844991244160939
Provider

In [98]:
%%writefile PROSPER_SumStats_Format.R
# Convert rsID-annotated summary statistics into the column layout written to
# <anc>_<trait>_GWAS_SumStats_PROSPER.csv (rsid, chr, a1, a0, beta, beta_se,
# n_eff), restricted to variants present in a reference .bim table.
#
# Positional arguments:
#   1. sumstats     CSV with columns CHR, SNP, BP, REF, ALT, OBS_CT, BETA, SE,
#                   P, rs_ID in that order (renamed by position)
#   2. ref_bim      header-less reference table; column 2 holds the variant
#                   ID, columns 5 and 6 the two alleles
#   3. anc          ancestry label (used in the output file name)
#   4. trait        trait label (used in the output file name)
#   5. OUTPUT_PATH  directory receiving the output CSV
rm(list = ls())

args <- commandArgs(TRUE)

sumstats_file <- args[1]
print(sumstats_file)

ref_bim_file <- args[2]
print(ref_bim_file)

anc <- args[3]
print(anc)

trait <- args[4]
print(trait)

OUTPUT_PATH <- args[5]
print(OUTPUT_PATH)

library(data.table)
library(dplyr)

sumstats <- as.data.frame(fread(sumstats_file))
colnames(sumstats) <- c("CHR","SNP","BP","REF","ALT","OBS_CT","BETA","SE","P","rs_ID")

sumstats <- sumstats[,c("rs_ID","CHR","ALT","REF","BETA","SE","OBS_CT")]
colnames(sumstats) <- c("rsid","chr","a1","a0","beta","beta_se","n_eff")

sumstats$n_eff <- as.numeric(sumstats$n_eff)

ref_bim <- as.data.frame(fread(ref_bim_file))

# Keep only variants present in the reference, then line the reference up row
# for row with the summary statistics. match() picks the first reference row
# for each ID, so no separate pre-filter of ref_bim is needed.
sumstats <- sumstats[sumstats$rsid %in% ref_bim[[2]],]
ref_bim <- ref_bim[match(sumstats$rsid,ref_bim[[2]]),]

# Reference alleles, addressed by position throughout (the original mixed
# positional access with the V5/V6 names).
ref_allele5 <- ref_bim[[5]]
ref_allele6 <- ref_bim[[6]]

# Rows where neither summary-statistic allele equals reference column 6 take
# both alleles from the reference.
no_match6 <- which((sumstats$a0 != ref_allele6) & (sumstats$a1 != ref_allele6))
sumstats$a0[no_match6] <- ref_allele5[no_match6]
sumstats$a1[no_match6] <- ref_allele6[no_match6]

# Same check against reference column 5. It is evaluated only here, after the
# step above; the original also computed it earlier and discarded the result.
no_match5 <- which((sumstats$a0 != ref_allele5) & (sumstats$a1 != ref_allele5))
sumstats$a0[no_match5] <- ref_allele5[no_match5]
sumstats$a1[no_match5] <- ref_allele6[no_match5]

print(summary(sumstats$n_eff))

# Keep variants whose sample size exceeds 90% of the median. na.rm and which()
# make a missing n_eff drop that row instead of turning the result into
# all-NA rows.
n_eff_cutoff <- median(sumstats$n_eff, na.rm = TRUE)*.9
sumstats <- sumstats[which(sumstats$n_eff > n_eff_cutoff),]

write.csv(sumstats,file = paste0(OUTPUT_PATH,"/",anc,"_",trait,"_GWAS_SumStats_PROSPER.csv"),row.names = FALSE)

Writing PROSPER_SumStats_Format.R


In [99]:
%%writefile PROSPER_SumStats_Format.sh
#!/bin/bash

# dsub task entry point: runs the R script on one ancestry/trait pair.
# R_Script, sumstats, ref_bim, anc, trait and OUTPUT_PATH are read from the
# environment (presumably exported by dsub from the score_task.txt columns of
# the same names).

set -o errexit
set -o nounset

# Quote every expansion so a value containing spaces or glob characters is
# passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${sumstats}" "${ref_bim}" "${anc}" "${trait}" "${OUTPUT_PATH}"

Writing PROSPER_SumStats_Format.sh


In [100]:
%%writefile score_task.R

# dsub task table for the PROSPER summary-statistic formatting: one row per
# ancestry x trait pair. Each column header is a dsub flag plus variable name.

ancestries <- c("EUR","AFR","AMR")
traits <- c("BMI","LDL","HDL","TC","logTG","Height")

# All pairs, built in one pass instead of growing the table with rbind().
# Row order matches the previous nested loop: trait varies fastest within
# each ancestry.
anc <- rep(ancestries, each = length(traits))
trait <- rep(traits, times = length(ancestries))

# Length-one values are recycled to every row.
tasks <- data.frame(
    '--input sumstats'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GWAS_SummaryStats/",anc,"_",trait,"_GWAS_SumStats_RSIDs.csv"),
    '--input ref_bim'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/PROSPER/ref_bim.txt",
    '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/PROSPER_SumStats_Format.R",
    '--env anc'=anc,
    '--env trait'=trait,
    '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/PROSPER",
    check.names = FALSE   # keep the flag-style headers verbatim
)

write.table(tasks,
            file="score_task.txt",
            row.names=FALSE, col.names=TRUE,
            sep='\t', quote=FALSE)

Overwriting score_task.R


In [101]:
# Regenerate score_task.txt from the score_task.R written above; the dsub
# submission below passes that file via --tasks.
!Rscript score_task.R

In [102]:
# Upload the R script to the bucket's Scripts/ folder; score_task.R lists that
# gs:// object as the task's `--input R_Script`.
!gsutil -m cp PROSPER_SumStats_Format.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://PROSPER_SumStats_Format.R [Content-Type=application/octet-stream]...
/ [1/1 files][  1.4 KiB/  1.4 KiB] 100% Done                                    
Operation completed over 1 objects/1.4 KiB.                                      


In [103]:
%%bash --out score_batch

# Load the aou_dsub wrapper function (written to ~/aou_dsub.bash by a setup cell).
source ~/aou_dsub.bash

# Job-specific dsub flags, collected in an array and forwarded unchanged.
# --logging is also set inside aou_dsub; this later value presumably takes
# precedence -- confirm against dsub's argument handling.
dsub_args=(
  --image willja16/r_with_plink
  --disk-size 10
  --boot-disk-size 25
  --min-ram 8
  --timeout "96h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script PROSPER_SumStats_Format.sh
  --tasks score_task.txt
)

aou_dsub "${dsub_args[@]}"

Job properties:
  job-id: prosper-su--williamsjacr--250428-184627-04
  job-name: prosper-sumstats-format
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/11674139616768273866
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/13484182636830392265
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3962884142189636599
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/13392857554919715171
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12029767872049378365
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/895516038504869640
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/93042629627218191
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/2397665020924857596
Provid

In [81]:
%%writefile CT_SLEB_dimCT_prseb.R
# CT-SLEB tuning + empirical-Bayes (EB) step for a single (ancestry, trait) pair.
#
# Pipeline: filter both GWAS summary-statistic files to P < 0.1, run
# CTSLEB::dimCT, choose the PRS column with the largest R^2 against
# covariate-adjusted phenotype residuals in the tuning table, run
# CTSLEB::CalculateEBEffectSize for that column, then save the per-threshold
# EB coefficient table and the EB PRS matrix under OUTPUT_PATH.
#
# Positional arguments (supplied by CT_SLEB_dimCT_prseb.sh):
#    1  BED_Tune_File        .bed of the tuning genotypes
#    2  BED_Full_File        .bed of the full genotype set
#    3  BED_Ref_pop1_File    .bed of the pop1 (EUR) reference panel
#    4  BED_Ref_pop2_File    .bed of the pop2 (target ancestry) reference panel
#    5  pop1_sumstats        pop1 GWAS summary statistics
#    6  pop2_sumstats        pop2 GWAS summary statistics
#    7  all_phenotypes_file  printed only; not read by this script
#    8  all_tune_file        tab-delimited tuning table (IID, trait, covariates)
#    9  anc                  ancestry label used in the output file names
#   10  trait                phenotype column name in all_tune_file
#   11  OUTPUT_PATH          directory receiving the .RData outputs
rm(list = ls())

library(devtools)
library(CTSLEB)
library(data.table)
library(dplyr)

args <- commandArgs(TRUE)

# Convert "<prefix>.bed" into the PLINK prefix. The pattern is escaped and
# anchored so only a literal trailing ".bed" is removed; an unescaped ".bed"
# is a regex that matches any character followed by "bed" anywhere in the path.
plink_prefix <- function(bed_path) sub("\\.bed$", "", bed_path)

# Read one GWAS summary-statistics file and reshape it to the columns passed
# to dimCT below: CHR, SNP, BP, A1 (= ALT), BETA, SE, P, rs_ID (= SNP).
# Only variants with P < p_max are returned.
read_sumstats <- function(path, p_max = 0.1) {
  sumstats <- as.data.frame(fread(path))
  sumstats <- sumstats[,c("CHR","SNP","BP","ALT","BETA","SE","P","SNP")]
  colnames(sumstats) <- c("CHR","SNP","BP","A1","BETA","SE","P","rs_ID")
  sumstats$P <- as.numeric(sumstats$P)
  sumstats[which(sumstats$P < p_max),]
}

BED_Tune_File <- plink_prefix(args[1])
print(BED_Tune_File)

BED_Full_File <- plink_prefix(args[2])
print(BED_Full_File)

#EUR
BED_Ref_pop1_File <- plink_prefix(args[3])
print(BED_Ref_pop1_File)

#ANC
BED_Ref_pop2_File <- plink_prefix(args[4])
print(BED_Ref_pop2_File)

pop1_sumstats <- args[5]
print(pop1_sumstats)

pop2_sumstats <- args[6]
print(pop2_sumstats)

all_phenotypes_file <- args[7]
print(all_phenotypes_file)

all_tune_file <- args[8]
print(all_tune_file)

anc <- args[9]
print(anc)

trait <- args[10]
print(trait)

OUTPUT_PATH <- args[11]
print(OUTPUT_PATH)

# Working directory handed to CTSLEB as results_dir.
dir.create("CTSLEB_TestCode", showWarnings = FALSE)

sum_pop1 <- read_sumstats(pop1_sumstats)
sum_pop2 <- read_sumstats(pop2_sumstats)

print(nrow(sum_pop1));print(nrow(sum_pop2))

# NOTE(review): mem = 80490 is larger than the `--min-ram 30` requested by the
# dsub cell that launches this script -- confirm the units and intent match.
PRS_farm <- SetParamsFarm(plink19_exec = "plink",plink2_exec = "plink2",mem = 80490,pthres = c(5E-08,5E-07,5E-06,5E-05,5E-04,5E-03,0.05,0.1))

prs_mat <- dimCT(results_dir = "CTSLEB_TestCode/",sum_target = sum_pop2,sum_ref = sum_pop1,
                 ref_plink = BED_Ref_pop1_File,
                 target_plink = BED_Ref_pop2_File,
                 test_target_plink = BED_Tune_File,
                 out_prefix = "dimCT",
                 params_farm = PRS_farm)

# save(prs_mat,file = paste0(OUTPUT_PATH,"/",anc,"_",trait,"_prs_mat_dimCT.RData"))

pheno_tuning <- read.delim(all_tune_file)

pheno_tuning <- left_join(pheno_tuning,prs_mat,by = "IID")

# na.exclude pads the residuals with NA for rows dropped from the fit, so they
# stay aligned row-for-row with pheno_tuning even when a covariate is missing.
model.null <- lm(as.formula(paste0(trait,"~age+age2+sex+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10")),
                 data = pheno_tuning,na.action = na.exclude)
null_resid <- residuals(model.null)

# NOTE(review): pthres, r2_vec, wc_base_vec (and plink_list, unique_infor_post,
# plink_list_eb further down) are never defined in this script; presumably
# CTSLEB creates them in the global environment -- verify against the
# installed package version.
n.total.prs <- length(pthres)^2*length(r2_vec)*length(wc_base_vec)
# The PRS columns are the last n.total.prs columns of the joined table.
indices <- (ncol(pheno_tuning) - n.total.prs + 1):ncol(pheno_tuning)

# R^2 of each candidate PRS against the covariate-adjusted residuals.
prs_r2_vec_test <- vapply(indices,function(col_ind){
  summary(lm(null_resid~pheno_tuning[,col_ind]))$r.squared
},numeric(1))

max_ind <- which.max(prs_r2_vec_test)

# `indices` are positions in the joined pheno_tuning table, so the winning
# column name has to be looked up there; indexing colnames(prs_mat) with them
# points at a different (or non-existent) column.
best_snps <- colnames(pheno_tuning)[indices[max_ind]]
print(best_snps)

#calculate eb effect using EB coefficients
prs_mat_eb <- CalculateEBEffectSize(bfile = BED_Full_File,
                                    snp_ind = best_snps,
                                    plink_list = plink_list,
                                    out_prefix = "EB",
                                    results_dir = "CTSLEB_TestCode/",
                                    params_farm = PRS_farm)

# One coefficient column per (pop1 threshold, target threshold, EB score column):
# a SNP keeps its EB coefficient when it passes either p-value threshold and is
# set to 0 otherwise.
score_full <- matrix(NA,ncol = ncol(prs_mat_eb) - 2,nrow = nrow(unique_infor_post))
names_score_full <- NULL
count <- 1
for(i in seq_along(pthres)){
  for(j in seq_along(pthres)){
    # The selected SNP rows do not depend on k, so compute them once per (i, j).
    keep <- which(unique_infor_post$P < pthres[j] | unique_infor_post$P_ref < pthres[i])
    # NOTE(review): 24 is hard-coded; presumably the number of score columns in
    # plink_list_eb$scores_eb after its first two columns -- confirm.
    for(k in 1:24){
      tmp <- matrix(0,nrow = nrow(unique_infor_post),ncol = 1)
      tmp[keep,1] <- plink_list_eb$scores_eb[keep,k + 2]
      names_score_full <- c(names_score_full,paste0(colnames(plink_list_eb$scores_eb)[k + 2],"_p_other_",pthres[i],"_p_tar_",pthres[j]))
      score_full[,count] <- tmp[,1]
      count <- count + 1
    }
  }
}

final <- data.frame(SNP = unique_infor_post$SNP,A1 = unique_infor_post$A1,score_full)
colnames(final) <- c("SNP","A1",names_score_full)


save(final,file = paste0(OUTPUT_PATH,"/",anc,"_",trait,"_betas_eb.RData"))
save(prs_mat_eb,file = paste0(OUTPUT_PATH,"/",anc,"_",trait,"_prs_mat_eb.RData"))

Overwriting CT_SLEB_dimCT_prseb.R


In [82]:
%%writefile CT_SLEB_dimCT_prseb.sh
#!/bin/bash
# dsub entry point for CT_SLEB_dimCT_prseb.R.
# Every ${...} below is an environment variable expected from the dsub task
# (the --input / --env / --output-recursive columns of score_task.txt).
# Only the .bed paths are forwarded; the R script strips ".bed" itself to get
# the PLINK prefix.

set -o errexit
set -o nounset

# Each expansion is quoted so a value containing spaces or glob characters
# still arrives as exactly one positional argument, in the order the R script
# reads them.
Rscript "${R_Script}" \
  "${BED_Tune_File}" \
  "${BED_Full_File}" \
  "${BED_Ref_pop1_File}" \
  "${BED_Ref_pop2_File}" \
  "${pop1_sumstats}" \
  "${pop2_sumstats}" \
  "${all_phenotypes_file}" \
  "${all_tune_file}" \
  "${anc}" \
  "${trait}" \
  "${OUTPUT_PATH}"

Overwriting CT_SLEB_dimCT_prseb.sh


In [83]:
%%writefile score_task.R

# Write score_task.txt for the CT-SLEB job: one dsub task (one TSV row) per
# ancestry x trait combination. Column headers are dsub flags plus the
# environment-variable name the job script sees.

# Single place to change the workspace bucket / project folder.
bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes"
gs_path <- function(...) paste0(bucket_root, ...)

ancestries <- c("AFR","AMR")
traits <- c("BMI","LDL","HDL","TC","logTG","Height")

# Build the one-row task description for a given target ancestry and trait.
make_task <- function(anc, trait) {
  tune_prefix <- gs_path("/Data/Tune_Validation_BedFiles/all_tune_", anc)
  full_prefix <- gs_path("/Data/CommonVariants/all_chr")
  ref_pop1_prefix <- gs_path("/Data/CommonVariants/all_chr_reference_", "EUR")
  ref_pop2_prefix <- gs_path("/Data/CommonVariants/all_chr_reference_", anc)
  sumstats_dir <- gs_path("/Results/Continuous/GWAS_SummaryStats/")

  data.frame(
    '--input BED_Tune_File' = paste0(tune_prefix, ".bed"),
    '--input BIM_Tune_File' = paste0(tune_prefix, ".bim"),
    '--input FAM_Tune_File' = paste0(tune_prefix, ".fam"),
    '--input BED_Full_File' = paste0(full_prefix, ".bed"),
    '--input BIM_Full_File' = paste0(full_prefix, ".bim"),
    '--input FAM_Full_File' = paste0(full_prefix, ".fam"),
    '--input BED_Ref_pop1_File' = paste0(ref_pop1_prefix, ".bed"),
    '--input BIM_Ref_pop1_File' = paste0(ref_pop1_prefix, ".bim"),
    '--input FAM_Ref_pop1_File' = paste0(ref_pop1_prefix, ".fam"),
    '--input BED_Ref_pop2_File' = paste0(ref_pop2_prefix, ".bed"),
    '--input BIM_Ref_pop2_File' = paste0(ref_pop2_prefix, ".bim"),
    '--input FAM_Ref_pop2_File' = paste0(ref_pop2_prefix, ".fam"),
    '--input pop1_sumstats' = paste0(sumstats_dir, "EUR_", trait, "_GWAS_SumStats_Cleaned"),
    '--input pop2_sumstats' = paste0(sumstats_dir, anc, "_", trait, "_GWAS_SumStats_Cleaned"),
    '--input all_phenotypes_file' = gs_path("/Results/all_phenotypes.csv"),
    '--input all_tune_file' = gs_path("/Results/All_Tune.txt"),
    '--input R_Script' = gs_path("/Scripts/CT_SLEB_dimCT_prseb.R"),
    '--env anc' = anc,
    '--env trait' = trait,
    '--output-recursive OUTPUT_PATH' = gs_path("/Results/Continuous/CTSLEB"),
    check.names = FALSE
  )
}

# Ancestry is the outer loop and trait the inner one, which fixes the row order.
rows <- list()
for (anc in ancestries) {
  for (trait in traits) {
    rows[[length(rows) + 1L]] <- make_task(anc, trait)
  }
}
tasks <- do.call(rbind, rows)

write.table(tasks,
            file="score_task.txt",
            row.names=F, col.names=T,
            sep='\t', quote=F)

Overwriting score_task.R


In [84]:
# Regenerate score_task.txt from the score_task.R written in the previous cell.
# Every stage in this notebook overwrites the same score_task.R / score_task.txt,
# so run this immediately before the matching dsub submission.
!Rscript score_task.R

In [85]:
# Upload the R script to the bucket location that score_task.R hands to dsub as `--input R_Script`.
!gsutil -m cp CT_SLEB_dimCT_prseb.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://CT_SLEB_dimCT_prseb.R [Content-Type=application/octet-stream]...
/ [1/1 files][  4.5 KiB/  4.5 KiB] 100% Done                                    
Operation completed over 1 objects/4.5 KiB.                                      


In [86]:
%%bash --out score_batch

# Submit the CT-SLEB job: one dsub task per row of score_task.txt, each running
# CT_SLEB_dimCT_prseb.sh. The cell's stdout is captured into the Python
# variable `score_batch` (via `%%bash --out`).
# The --logging flag below is passed in addition to the one aou_dsub already
# sets; presumably the later one wins -- verify.
# NOTE(review): CT_SLEB_dimCT_prseb.R calls SetParamsFarm(mem = 80490) while
# this job only requests --min-ram 30; confirm the units and intent match.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 800 \
  --boot-disk-size 25 \
  --min-ram 30 \
  --timeout "144h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script CT_SLEB_dimCT_prseb.sh \
  --tasks score_task.txt

Job properties:
  job-id: ct-sleb-di--williamsjacr--250428-174505-06
  job-name: ct-sleb-dimct-prseb
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12444147214727260830
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/8856746294784154544
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/15219274510332855362
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3476450934576849204
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/1798407164403620421
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/752621326411423424
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12262684282836058048
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/17686097527311174742
Provider

In [108]:
%%writefile PROSPER_LASSOSUM.sh
#!/bin/bash
# dsub entry point for the PROSPER lassosum2 step (EUR, AFR and AMR jointly).
# Every ${...} other than path_plink is an environment variable expected from
# the dsub task (columns of score_task.txt).

path_plink='plink2'

set -o errexit
set -o nounset

# The bed inputs are declared with a trailing ".*" wildcard so that .bed/.bim/.fam
# are all localized. Strip that literal ".*" to obtain the PLINK prefix; unlike
# a blind two-character chop (%??), this leaves any other value untouched.
tuning_bed_EUR=${tuning_bed_EUR%.\*}
tuning_bed_AFR=${tuning_bed_AFR%.\*}
tuning_bed_AMR=${tuning_bed_AMR%.\*}

valid_bed_EUR=${valid_bed_EUR%.\*}
valid_bed_AFR=${valid_bed_AFR%.\*}
valid_bed_AMR=${valid_bed_AMR%.\*}

# Population order is EUR,AFR,AMR in every comma-separated list below.
# Expansions are quoted so each option value stays a single argument.
Rscript "${PROSPER_Scripts}/scripts/lassosum2.R" \
--PATH_package "${PROSPER_Scripts}" \
--PATH_out "${OUTPUT_PATH}" \
--PATH_plink "${path_plink}" \
--FILE_sst "${EUR_Sumstats},${AFR_Sumstats},${AMR_Sumstats}" \
--pop EUR,AFR,AMR \
--chrom 1-22 \
--bfile_tuning "${tuning_bed_EUR},${tuning_bed_AFR},${tuning_bed_AMR}" \
--pheno_tuning "${tuning_y_EUR},${tuning_y_AFR},${tuning_y_AMR}" \
--covar_tuning "${tuning_covar_EUR},${tuning_covar_AFR},${tuning_covar_AMR}" \
--bfile_testing "${valid_bed_EUR},${valid_bed_AFR},${valid_bed_AMR}" \
--pheno_testing "${valid_y_EUR},${valid_y_AFR},${valid_y_AMR}" \
--covar_testing "${valid_covar_EUR},${valid_covar_AFR},${valid_covar_AMR}" \
--testing TRUE \
--NCORES 1

Writing PROSPER_LASSOSUM.sh


In [109]:
%%writefile score_task.R

# Write score_task.txt for the PROSPER lassosum2 job: one dsub task (one TSV
# row) per trait, each carrying the EUR/AFR/AMR inputs for that trait.

# Single place to change the workspace bucket / project folder.
bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes"
gs_path <- function(...) paste0(bucket_root, ...)

traits <- c("BMI","LDL","HDL","TC","logTG","Height")

# Build the one-row task description for a given trait.
make_task <- function(trait) {
  prosper_dir <- gs_path("/Results/Continuous/PROSPER/")
  # The trailing ".*" makes dsub localize .bed/.bim/.fam together.
  bed_dir <- gs_path("/Data/JointPRS_MUSSEL_PROSPER_BED/")
  pheno_dir <- gs_path("/Data/Tune_Validation_Phenotypes/")

  data.frame(
    '--input EUR_Sumstats' = paste0(prosper_dir, "EUR_", trait, "_GWAS_SumStats_PROSPER.csv"),
    '--input AFR_Sumstats' = paste0(prosper_dir, "AFR_", trait, "_GWAS_SumStats_PROSPER.csv"),
    '--input AMR_Sumstats' = paste0(prosper_dir, "AMR_", trait, "_GWAS_SumStats_PROSPER.csv"),
    '--input-recursive PROSPER_Scripts' = gs_path("/Scripts/PROSPER"),
    '--input tuning_bed_EUR' = paste0(bed_dir, "all_tune_JointPRS_MUSSEL_PROSPER_EUR.*"),
    '--input tuning_bed_AMR' = paste0(bed_dir, "all_tune_JointPRS_MUSSEL_PROSPER_AMR.*"),
    '--input tuning_bed_AFR' = paste0(bed_dir, "all_tune_JointPRS_MUSSEL_PROSPER_AFR.*"),
    # NOTE(review): "all_Valid_" has a capital V while the tuning files use
    # lower-case "all_tune_"; bucket object names are case-sensitive -- confirm.
    '--input valid_bed_EUR' = paste0(bed_dir, "all_Valid_JointPRS_MUSSEL_PROSPER_EUR.*"),
    '--input valid_bed_AMR' = paste0(bed_dir, "all_Valid_JointPRS_MUSSEL_PROSPER_AMR.*"),
    '--input valid_bed_AFR' = paste0(bed_dir, "all_Valid_JointPRS_MUSSEL_PROSPER_AFR.*"),
    '--input tuning_y_EUR' = paste0(pheno_dir, trait, "_full_tune_y_EUR.txt"),
    '--input tuning_y_AMR' = paste0(pheno_dir, trait, "_full_tune_y_AMR.txt"),
    '--input tuning_y_AFR' = paste0(pheno_dir, trait, "_full_tune_y_AFR.txt"),
    '--input valid_y_EUR' = paste0(pheno_dir, trait, "_full_valid_y_EUR.txt"),
    '--input valid_y_AMR' = paste0(pheno_dir, trait, "_full_valid_y_AMR.txt"),
    '--input valid_y_AFR' = paste0(pheno_dir, trait, "_full_valid_y_AFR.txt"),
    '--input tuning_covar_EUR' = paste0(pheno_dir, "full_tune_covar_EUR.txt"),
    '--input tuning_covar_AMR' = paste0(pheno_dir, "full_tune_covar_AMR.txt"),
    '--input tuning_covar_AFR' = paste0(pheno_dir, "full_tune_covar_AFR.txt"),
    '--input valid_covar_EUR' = paste0(pheno_dir, "full_valid_covar_EUR.txt"),
    '--input valid_covar_AMR' = paste0(pheno_dir, "full_valid_covar_AMR.txt"),
    '--input valid_covar_AFR' = paste0(pheno_dir, "full_valid_covar_AFR.txt"),
    '--output-recursive OUTPUT_PATH' = paste0(prosper_dir, trait),
    check.names = FALSE
  )
}

tasks <- do.call(rbind, lapply(traits, make_task))

write.table(tasks,
            file="score_task.txt",
            row.names=F, col.names=T,
            sep='\t', quote=F)

Overwriting score_task.R


In [110]:
# Regenerate score_task.txt for the PROSPER lassosum2 stage (overwrites the task file of any earlier stage).
!Rscript score_task.R

In [111]:
%%bash --out score_batch

# Submit the PROSPER lassosum2 job: one dsub task per row of score_task.txt
# (one per trait), each running PROSPER_LASSOSUM.sh. The cell's stdout is
# captured into the Python variable `score_batch`.
# The --logging flag below is passed in addition to the one aou_dsub already
# sets; presumably the later one wins -- verify.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 60 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --min-cores 1 \
  --timeout "144h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script PROSPER_LASSOSUM.sh \
  --tasks score_task.txt

Job properties:
  job-id: prosper-la--williamsjacr--250428-190015-72
  job-name: prosper-lassosum
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/13383691184684028660
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/14622073818099952684
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/96705514830176920
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/9905529682641647141
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/340593287056854147
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/15735860106761377480
Launched job-id: prosper-la--williamsjacr--250428-190015-72
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'prosper-la--williamsjacr--250428-19001

In [38]:
%%writefile MUSSEL_LDPred2.sh
#!/bin/bash
# dsub entry point for the MUSSEL LDpred2 step (EUR, AFR and AMR jointly).
# Every ${...} below is an environment variable expected from the dsub task
# (columns of score_task.txt).

set -o errexit
set -o nounset

# The bed inputs are declared with a trailing ".*" wildcard so that .bed/.bim/.fam
# are all localized. Strip that literal ".*" to obtain the PLINK prefix; unlike
# a blind two-character chop (%??), this leaves any other value untouched.
tuning_bed_EUR=${tuning_bed_EUR%.\*}
tuning_bed_AFR=${tuning_bed_AFR%.\*}
tuning_bed_AMR=${tuning_bed_AMR%.\*}

# Population order is EUR,AFR,AMR in every comma-separated list below.
# Expansions are quoted so each option value stays a single argument.
Rscript "${MUSSEL_Scripts}/R/LDpred2.R" \
--PATH_package "${MUSSEL_Scripts}" \
--PATH_ref "${MUSSEL_LD}" \
--PATH_out "${OUTPUT_PATH}" \
--FILE_sst "${EUR_Sumstats},${AFR_Sumstats},${AMR_Sumstats}" \
--pop EUR,AFR,AMR \
--bfile_tuning "${tuning_bed_EUR},${tuning_bed_AFR},${tuning_bed_AMR}" \
--verbose 2 \
--NCORES 1

Overwriting MUSSEL_LDPred2.sh


In [39]:
%%writefile score_task.R

# Write score_task.txt for the MUSSEL LDpred2 job: one dsub task (one TSV row)
# per trait.

# Single place to change the workspace bucket / project folder.
bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes"
gs_path <- function(...) paste0(bucket_root, ...)

traits <- c("BMI","LDL","HDL","TC","logTG","Height")

# Build the one-row task description for a given trait.
make_task <- function(trait) {
  mussel_dir <- gs_path("/Results/Continuous/MUSSEL/")
  # The trailing ".*" makes dsub localize .bed/.bim/.fam together.
  bed_dir <- gs_path("/Data/JointPRS_MUSSEL_PROSPER_BED/")

  data.frame(
    '--input EUR_Sumstats' = paste0(mussel_dir, "EUR_", trait, "_GWAS_SumStats_MUSSEL.csv"),
    '--input AFR_Sumstats' = paste0(mussel_dir, "AFR_", trait, "_GWAS_SumStats_MUSSEL.csv"),
    '--input AMR_Sumstats' = paste0(mussel_dir, "AMR_", trait, "_GWAS_SumStats_MUSSEL.csv"),
    '--input-recursive MUSSEL_Scripts' = gs_path("/Scripts/MUSSEL/MUSSEL"),
    '--input-recursive MUSSEL_LD' = gs_path("/Scripts/MUSSEL/MUSSEL_LD"),
    '--input tuning_bed_EUR' = paste0(bed_dir, "all_tune_JointPRS_MUSSEL_PROSPER_EUR.*"),
    '--input tuning_bed_AMR' = paste0(bed_dir, "all_tune_JointPRS_MUSSEL_PROSPER_AMR.*"),
    '--input tuning_bed_AFR' = paste0(bed_dir, "all_tune_JointPRS_MUSSEL_PROSPER_AFR.*"),
    '--output-recursive OUTPUT_PATH' = paste0(mussel_dir, trait),
    check.names = FALSE
  )
}

tasks <- do.call(rbind, lapply(traits, make_task))

write.table(tasks,
            file="score_task.txt",
            row.names=F, col.names=T,
            sep='\t', quote=F)

Overwriting score_task.R


In [40]:
# Regenerate score_task.txt for the MUSSEL LDpred2 stage (overwrites the task file of any earlier stage).
!Rscript score_task.R

In [41]:
%%bash --out score_batch

# Submit the MUSSEL LDpred2 job: one dsub task per row of score_task.txt
# (one per trait), each running MUSSEL_LDPred2.sh. The cell's stdout is
# captured into the Python variable `score_batch`.
# The --logging flag below is passed in addition to the one aou_dsub already
# sets; presumably the later one wins -- verify.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 200 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --min-cores 1 \
  --timeout "144h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script MUSSEL_LDPred2.sh \
  --tasks score_task.txt

Job properties:
  job-id: mussel-ldp--williamsjacr--240722-180603-76
  job-name: mussel-ldpred2
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/1876996133311691452
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/2094648451624967140
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/13690489926848506048
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/5508878527126587761
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/16970031399968614553
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/5105032732639826203
Launched job-id: mussel-ldp--williamsjacr--240722-180603-76
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'mussel-ldp--williamsjacr--240722-18060

In [9]:
%%writefile PROSPER_Prosper.sh
#!/bin/bash
# dsub entry point for the PROSPER step, run after lassosum2 has produced the
# per-population optimal_param.txt files passed in as *_Optim_Param.
# The shebang is the first line of the written file, matching the other job
# scripts in this notebook. Every ${...} below is an environment variable
# expected from the dsub task (columns of score_task.txt).

set -o errexit
set -o nounset

# Population order is EUR,AFR,AMR in every comma-separated list below.
# Expansions are quoted so each option value stays a single argument.
Rscript "${PROSPER_Scripts}/scripts/PROSPER.R" \
--PATH_package "${PROSPER_Scripts}" \
--PATH_out "${OUTPUT_PATH}" \
--FILE_sst "${EUR_Sumstats},${AFR_Sumstats},${AMR_Sumstats}" \
--pop EUR,AFR,AMR \
--lassosum_param "${EUR_Optim_Param},${AFR_Optim_Param},${AMR_Optim_Param}" \
--chrom 1-22 \
--NCORES 1

Writing PROSPER_Prosper.sh


In [10]:
%%writefile score_task.R

# Write score_task.txt for the PROSPER job: one dsub task (one TSV row) per
# trait. The *_Optim_Param inputs live under the same per-trait folder that is
# used as OUTPUT_PATH.

# Single place to change the workspace bucket / project folder.
bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes"
gs_path <- function(...) paste0(bucket_root, ...)

traits <- c("BMI","LDL","HDL","TC","logTG","Height")

# Build the one-row task description for a given trait.
make_task <- function(trait) {
  prosper_dir <- gs_path("/Results/Continuous/PROSPER/")
  trait_dir <- paste0(prosper_dir, trait)

  data.frame(
    '--input EUR_Sumstats' = paste0(prosper_dir, "EUR_", trait, "_GWAS_SumStats_PROSPER.csv"),
    '--input AFR_Sumstats' = paste0(prosper_dir, "AFR_", trait, "_GWAS_SumStats_PROSPER.csv"),
    '--input AMR_Sumstats' = paste0(prosper_dir, "AMR_", trait, "_GWAS_SumStats_PROSPER.csv"),
    '--input EUR_Optim_Param' = paste0(trait_dir, "/EUR/optimal_param.txt"),
    '--input AFR_Optim_Param' = paste0(trait_dir, "/AFR/optimal_param.txt"),
    '--input AMR_Optim_Param' = paste0(trait_dir, "/AMR/optimal_param.txt"),
    '--input-recursive PROSPER_Scripts' = gs_path("/Scripts/PROSPER"),
    '--output-recursive OUTPUT_PATH' = trait_dir,
    check.names = FALSE
  )
}

tasks <- do.call(rbind, lapply(traits, make_task))

write.table(tasks,
            file="score_task.txt",
            row.names=F, col.names=T,
            sep='\t', quote=F)

Overwriting score_task.R


In [11]:
# Regenerate score_task.txt for the PROSPER stage (overwrites the task file of any earlier stage).
!Rscript score_task.R

In [12]:
%%bash --out score_batch

# Submit the PROSPER job: one dsub task per row of score_task.txt (one per
# trait), each running PROSPER_Prosper.sh. The cell's stdout is captured into
# the Python variable `score_batch`.
# The --logging flag below is passed in addition to the one aou_dsub already
# sets; presumably the later one wins -- verify.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 60 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --min-cores 1 \
  --timeout "144h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script PROSPER_Prosper.sh \
  --tasks score_task.txt

Job properties:
  job-id: prosper-pr--williamsjacr--250428-212815-36
  job-name: prosper-prosper
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7388948163095099351
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/15375682377900572725
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3784459766502440264
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/17282830596162308910
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3601401621939499964
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7321139786719999179
Launched job-id: prosper-pr--williamsjacr--250428-212815-36
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'prosper-pr--williamsjacr--250428-2128

In [9]:
%%writefile PROSPER_SL.sh
#!/bin/bash
# dsub entry point for the PROSPER tuning/testing (ensemble) step.
# The shebang is the first line of the written file, matching the other job
# scripts in this notebook. Every ${...} other than path_plink is an
# environment variable expected from the dsub task (columns of score_task.txt).

path_plink='plink2'

set -o errexit
set -o nounset

# The bed inputs are declared with a trailing ".*" wildcard so that .bed/.bim/.fam
# are all localized. Strip that literal ".*" to obtain the PLINK prefix; unlike
# a blind two-character chop (%??), this leaves any other value untouched.
tuning_bed=${tuning_bed%.\*}

valid_bed=${valid_bed%.\*}

# Expansions are quoted so each option value stays a single argument.
Rscript "${PROSPER_Scripts}/scripts/tuning_testing.R" \
--PATH_plink "${path_plink}" \
--PATH_out "${OUTPUT_PATH}" \
--PATH_input "${INPUT_PATH}" \
--prefix 'full_results' \
--SL_library 'SL.glmnet,SL.ridge,SL.lm' \
--testing TRUE \
--linear_score TRUE \
--bfile_tuning "${tuning_bed}" \
--pheno_tuning "${tuning_y}" \
--covar_tuning "${tuning_covar}" \
--bfile_testing "${valid_bed}" \
--pheno_testing "${valid_y}" \
--covar_testing "${valid_covar}" \
--cleanup F \
--verbose 2 \
--NCORES 1

Writing PROSPER_SL.sh


In [10]:
%%writefile score_task.R

# Write score_task.txt for the PROSPER tuning/testing job: one dsub task (one
# TSV row) per trait. INPUT_PATH and OUTPUT_PATH both point at the per-trait
# PROSPER results folder.

# Single place to change the workspace bucket / project folder.
bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes"
gs_path <- function(...) paste0(bucket_root, ...)

traits <- c("BMI","LDL","HDL","TC","logTG","Height")

# Build the one-row task description for a given trait.
make_task <- function(trait) {
  trait_dir <- gs_path("/Results/Continuous/PROSPER/", trait)
  # The trailing ".*" makes dsub localize .bed/.bim/.fam together.
  bed_dir <- gs_path("/Data/JointPRS_MUSSEL_PROSPER_BED/")
  pheno_dir <- gs_path("/Data/Tune_Validation_Phenotypes/")

  data.frame(
    '--input-recursive PROSPER_Scripts' = gs_path("/Scripts/PROSPER"),
    '--input-recursive INPUT_PATH' = trait_dir,
    '--input tuning_bed' = paste0(bed_dir, "all_tune_JointPRS_MUSSEL_PROSPER.*"),
    '--input valid_bed' = paste0(bed_dir, "all_valid_JointPRS_MUSSEL_PROSPER.*"),
    '--input tuning_y' = paste0(pheno_dir, trait, "_full_tune_y.txt"),
    '--input valid_y' = paste0(pheno_dir, trait, "_full_valid_y.txt"),
    '--input tuning_covar' = paste0(pheno_dir, "full_tune_covar.txt"),
    '--input valid_covar' = paste0(pheno_dir, "full_valid_covar.txt"),
    '--output-recursive OUTPUT_PATH' = trait_dir,
    check.names = FALSE
  )
}

tasks <- do.call(rbind, lapply(traits, make_task))

write.table(tasks,
            file="score_task.txt",
            row.names=F, col.names=T,
            sep='\t', quote=F)

Overwriting score_task.R


In [11]:
# Regenerate score_task.txt for the PROSPER tuning/testing stage (overwrites the task file of any earlier stage).
!Rscript score_task.R

In [12]:
%%bash --out score_batch

# Submit the PROSPER tuning/testing job: one dsub task per row of
# score_task.txt (one per trait), each running PROSPER_SL.sh. The cell's
# stdout is captured into the Python variable `score_batch`.
# The --logging flag below is passed in addition to the one aou_dsub already
# sets; presumably the later one wins -- verify.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 60 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --min-cores 1 \
  --timeout "144h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script PROSPER_SL.sh \
  --tasks score_task.txt

Job properties:
  job-id: prosper-sl--williamsjacr--250429-000731-50
  job-name: prosper-sl
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7692107175929691071
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3463749443434059570
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/16794735172943099674
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/8470576111325403060
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/14809784528643477355
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/744517568757117769
Launched job-id: prosper-sl--williamsjacr--250429-000731-50
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'prosper-sl--williamsjacr--250429-000731-50'

In [15]:
%%writefile JointPRS_Score.R
# Score the full PLINK fileset with the JointPRS meta weights, one PRS per trait.
#
# Positional arguments (supplied by JointPRS_Score.sh):
#   1  INPUT_PATH     directory holding JointPRS_META_Score_<trait>.csv
#   2  BED_Full_File  .bed of the fileset to score
#   3  OUTPUT_PATH    directory receiving the plink2 outputs PRS_META_<trait>.*
rm(list = ls())

args <- commandArgs(TRUE)

INPUT_PATH <- args[1]
print(INPUT_PATH)

# Escaped and anchored so only a literal trailing ".bed" is removed; an
# unescaped ".bed" is a regex that matches any character followed by "bed".
BED_Full_File <- sub("\\.bed$","",args[2])
print(BED_Full_File)

OUTPUT_PATH <- args[3]
print(OUTPUT_PATH)


for(trait in c("BMI","LDL","HDL","logTG","TC","Height")){
    print(trait)
    weights <- read.csv(paste0(INPUT_PATH,"/JointPRS_META_Score_",trait,".csv"))
    # The score file is assumed to have exactly these six columns, in this order.
    colnames(weights) <- c("CHR", "rsID", "BP", "A1", "A2", "Post_EffectSize")
    weights <- data.frame(SNP = weights$rsID,A1 = weights$A1,BETA = weights$Post_EffectSize)
    # Scratch file, overwritten on every iteration.
    write.table(weights,file = "Final_PRS_Coefficients",col.names = T,row.names = F,quote=F)

    plink_cmd <- paste0("plink2 --threads 2 --score Final_PRS_Coefficients cols=+scoresums,-scoreavgs header no-mean-imputation  --bfile ",BED_Full_File," --out ",OUTPUT_PATH,"/PRS_META_",trait)
    # Fail the job when plink2 fails instead of silently moving on to the next
    # trait and exiting successfully with missing outputs.
    exit_status <- system(plink_cmd)
    if(exit_status != 0){
        stop("plink2 --score failed for trait ",trait," (exit status ",exit_status,")")
    }
}


Writing JointPRS_Score.R


In [16]:
%%writefile JointPRS_Score.sh
#!/bin/bash
# dsub entry point for JointPRS_Score.R.
# Every ${...} below is an environment variable expected from the dsub task
# (columns of score_task.txt).

set -o errexit
set -o nounset

# Quoted so each value arrives as exactly one positional argument, in the
# order the R script reads them.
Rscript "${R_Script}" "${INPUT_PATH}" "${BED_Full_File}" "${OUTPUT_PATH}"

Writing JointPRS_Score.sh


In [17]:
%%writefile score_task.R

# Write score_task.txt for the JointPRS scoring job: a single dsub task whose
# script loops over all traits itself.

# Single place to change the workspace bucket / project folder.
bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes"
gs_path <- function(...) paste0(bucket_root, ...)

# .bed/.bim/.fam of the fileset to score share this prefix.
full_prefix <- gs_path("/Data/JointPRS_MUSSEL_PROSPER_BED/all_JointPRS_MUSSEL_PROSPER")

tasks <- data.frame(
    '--input-recursive INPUT_PATH' = gs_path("/Results/Continuous/JointPRS/Scores"),
    '--input BED_Full_File' = paste0(full_prefix, ".bed"),
    '--input BIM_Full_File' = paste0(full_prefix, ".bim"),
    '--input FAM_Full_File' = paste0(full_prefix, ".fam"),
    '--input R_Script' = gs_path("/Scripts/JointPRS_Score.R"),
    '--output-recursive OUTPUT_PATH' = gs_path("/Results/Continuous/JointPRS/PRSs"),
    check.names = FALSE
)

write.table(tasks,
            file="score_task.txt",
            row.names=F, col.names=T,
            sep='\t', quote=F)

Overwriting score_task.R


In [18]:
# Regenerate score_task.txt for the JointPRS scoring stage (overwrites the task file of any earlier stage).
!Rscript score_task.R

In [19]:
# Upload the R script to the bucket location that score_task.R hands to dsub as `--input R_Script`.
!gsutil -m cp JointPRS_Score.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://JointPRS_Score.R [Content-Type=application/octet-stream]...
/ [1/1 files][  812.0 B/  812.0 B] 100% Done                                    
Operation completed over 1 objects/812.0 B.                                      


In [20]:
%%bash --out score_batch

# Submit the JointPRS scoring job: a single dsub task (score_task.txt has one
# row) running JointPRS_Score.sh. The cell's stdout is captured into the
# Python variable `score_batch`.
# The --logging flag below is passed in addition to the one aou_dsub already
# sets; presumably the later one wins -- verify.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 200 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --timeout "144h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script JointPRS_Score.sh \
  --tasks score_task.txt

Job properties:
  job-id: jointprs-s--williamsjacr--250501-113726-10
  job-name: jointprs-score
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/2359695272220548302
Launched job-id: jointprs-s--williamsjacr--250501-113726-10
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'jointprs-s--williamsjacr--250501-113726-10' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'jointprs-s--williamsjacr--250501-113726-10' --users 'williamsjacr'


In [9]:
%%writefile PROSPER_Score.R
# Score the full PLINK fileset with the PROSPER weights, one PRS per trait.
#
# Positional arguments (supplied by PROSPER_Score.sh):
#   1-6  PROSPER weight files for BMI, LDL, HDL, logTG, TC, Height (in that order)
#   7    BED_Full_File  .bed of the fileset to score
#   8    OUTPUT_PATH    directory receiving the plink2 outputs PRS_<trait>.*
rm(list = ls())

args <- commandArgs(TRUE)

# Trait order must match the argument order used by PROSPER_Score.sh.
traits <- c("BMI","LDL","HDL","logTG","TC","Height")
score_files <- setNames(args[1:6],traits)
for(score_file in score_files){
    print(score_file)
}

# Escaped and anchored so only a literal trailing ".bed" is removed; an
# unescaped ".bed" is a regex that matches any character followed by "bed".
BED_Full_File <- sub("\\.bed$","",args[7])
print(BED_Full_File)

OUTPUT_PATH <- args[8]
print(OUTPUT_PATH)


for(trait in traits){

    Score_File <- score_files[[trait]]

    # Only the rsid, a1 and weight columns of the weight file are used.
    weights <- read.delim(Score_File)
    weights <- data.frame(SNP = weights$rsid,A1 = weights$a1,BETA = weights$weight)
    # Scratch file, overwritten on every iteration.
    write.table(weights,file = "Final_PRS_Coefficients",col.names = T,row.names = F,quote=F)

    plink_cmd <- paste0("plink2 --threads 2 --score Final_PRS_Coefficients cols=+scoresums,-scoreavgs header no-mean-imputation  --bfile ",BED_Full_File," --out ",OUTPUT_PATH,"/PRS_",trait)
    # Fail the job when plink2 fails instead of silently moving on to the next
    # trait and exiting successfully with missing outputs.
    exit_status <- system(plink_cmd)
    if(exit_status != 0){
        stop("plink2 --score failed for trait ",trait," (exit status ",exit_status,")")
    }
}

Writing PROSPER_Score.R


In [10]:
%%writefile PROSPER_Score.sh
#!/bin/bash
# Task entry point: runs the PROSPER scoring R script with the six per-trait
# score files, the .bed path and the output directory as positional arguments.
# Every variable is expected in the environment; the names match the columns of
# the task table written by score_task.R.
# NOTE(review): the .bim/.fam inputs of the task table are not passed here -
# presumably they are localized next to the .bed file; confirm.

set -o errexit   # abort on the first failing command
set -o nounset   # abort if any expected variable is unset

# Quote every expansion so a path containing spaces or glob characters stays a
# single argument.
Rscript "${R_Script}" "${Score_BMI}" "${Score_LDL}" "${Score_HDL}" "${Score_logTG}" "${Score_TC}" "${Score_Height}" "${BED_Full_File}" "${OUTPUT_PATH}"

Writing PROSPER_Score.sh


In [11]:
%%writefile score_task.R

# Build the single-row dsub task table for PROSPER_Score.sh and write it to
# score_task.txt. Each column header is a dsub flag plus the variable name the
# job script reads.
bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes"
prosper_dir <- paste0(bucket_root,"/Results/Continuous/PROSPER")
bfile_prefix <- paste0(bucket_root,"/Data/JointPRS_MUSSEL_PROSPER_BED/all_JointPRS_MUSSEL_PROSPER")

# Location of the final (post-ensemble) PROSPER score file of one trait.
final_score_file <- function(trait_name){
    paste0(prosper_dir,"/",trait_name,"/after_ensemble_full_results/PROSPER_prs_file.txt")
}

tasks <- data.frame(
    '--input Score_BMI' = final_score_file("BMI"),
    '--input Score_LDL' = final_score_file("LDL"),
    '--input Score_HDL' = final_score_file("HDL"),
    '--input Score_logTG' = final_score_file("logTG"),
    '--input Score_TC' = final_score_file("TC"),
    '--input Score_Height' = final_score_file("Height"),
    '--input BED_Full_File' = paste0(bfile_prefix,".bed"),
    '--input BIM_Full_File' = paste0(bfile_prefix,".bim"),
    '--input FAM_Full_File' = paste0(bfile_prefix,".fam"),
    '--input R_Script' = paste0(bucket_root,"/Scripts/PROSPER_Score.R"),
    '--output-recursive OUTPUT_PATH' = paste0(prosper_dir,"/PRSs"),
    check.names = FALSE   # keep the "--flag NAME" headers verbatim
)

write.table(tasks,
            file = "score_task.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)

Overwriting score_task.R


In [12]:
!Rscript score_task.R

In [13]:
!gsutil -m cp PROSPER_Score.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://PROSPER_Score.R [Content-Type=application/octet-stream]...
/ [1/1 files][  1.3 KiB/  1.3 KiB] 100% Done                                    
Operation completed over 1 objects/1.3 KiB.                                      


In [14]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit PROSPER_Score.sh through the aou_dsub wrapper, with one task per row of
# score_task.txt (the table written by score_task.R).
# The wrapper forwards these flags after its own defaults, so --image and
# --logging are each given twice.
# NOTE(review): presumably the later value wins - confirm against dsub's
# argument parsing.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 200 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --timeout "144h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script PROSPER_Score.sh \
  --tasks score_task.txt

Job properties:
  job-id: prosper-sc--williamsjacr--250429-112419-13
  job-name: prosper-score
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7246999809422609194
Launched job-id: prosper-sc--williamsjacr--250429-112419-13
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'prosper-sc--williamsjacr--250429-112419-13' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'prosper-sc--williamsjacr--250429-112419-13' --users 'williamsjacr'


In [16]:
%%writefile PROSPER_Score_All.R
# Score every pre-ensemble PROSPER score column of one trait with plink2.
#
# Positional arguments (supplied by PROSPER_Score_All.sh):
#   1  trait name, used only in the output prefix PRS_All_<trait>
#   2  score file; columns 4..last are passed to plink2 as score columns
#   3  path of the .bed file; the extension is stripped to form the --bfile prefix
#   4  output directory
rm(list = ls())

cli_args <- commandArgs(TRUE)

trait <- cli_args[1]
print(trait)

Score_Trait <- cli_args[2]
print(Score_Trait)

BED_Full_File <- cli_args[3]
# Strip only a literal trailing ".bed". The previous pattern ".bed" was an
# unescaped, unanchored regex and also removed any "<char>bed" inside the path.
BED_Full_File <- sub("\\.bed$","",BED_Full_File)
print(BED_Full_File)

OUTPUT_PATH <- cli_args[4]
print(OUTPUT_PATH)

# Only the column count is needed, so read the header plus a single row instead
# of loading the whole score file.
n_score_cols <- ncol(read.delim(Score_Trait,nrows = 1))
if(n_score_cols < 4){
    stop("Score file ",Score_Trait," has ",n_score_cols," column(s); expected score columns starting at column 4")
}

status <- system(paste0("plink2 --threads 2 --score ",Score_Trait," cols=+scoresums,-scoreavgs header no-mean-imputation --score-col-nums 4-",n_score_cols," --bfile ",BED_Full_File," --out ",OUTPUT_PATH,"/PRS_All_",trait))
# system() returns the exit status; fail loudly instead of finishing
# successfully after a failed plink2 run.
if(status != 0){
    stop("plink2 --score failed for trait ",trait," (exit status ",status,")")
}

Writing PROSPER_Score_All.R


In [17]:
%%writefile PROSPER_Score_All.sh
#!/bin/bash
# Task entry point: runs the per-trait "all scores" PROSPER R script with the
# trait name, its score file, the .bed path and the output directory.
# Every variable is expected in the environment; the names match the columns of
# the task table written by score_task.R.

set -o errexit   # abort on the first failing command
set -o nounset   # abort if any expected variable is unset

# Quote every expansion so a value containing spaces or glob characters stays a
# single argument.
Rscript "${R_Script}" "${trait}" "${Score_Trait}" "${BED_Full_File}" "${OUTPUT_PATH}"

Writing PROSPER_Score_All.sh


In [18]:
%%writefile score_task.R

# Build the dsub task table for PROSPER_Score_All.sh: one row per trait, each
# pointing at that trait's pre-ensemble score file and the shared PLINK fileset.
bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes"
prosper_dir <- paste0(bucket_root,"/Results/Continuous/PROSPER")
bfile_prefix <- paste0(bucket_root,"/Data/JointPRS_MUSSEL_PROSPER_BED/all_JointPRS_MUSSEL_PROSPER")
trait_names <- c("BMI","LDL","HDL","logTG","TC","Height")

# The per-trait columns are vectors; the shared values are length one and are
# recycled by data.frame() to one entry per trait.
tasks <- data.frame(
    '--env trait' = trait_names,
    '--input Score_Trait' = paste0(prosper_dir,"/",trait_names,"/before_ensemble/score_file.txt"),
    '--input BED_Full_File' = paste0(bfile_prefix,".bed"),
    '--input BIM_Full_File' = paste0(bfile_prefix,".bim"),
    '--input FAM_Full_File' = paste0(bfile_prefix,".fam"),
    '--input R_Script' = paste0(bucket_root,"/Scripts/PROSPER_Score_All.R"),
    '--output-recursive OUTPUT_PATH' = paste0(prosper_dir,"/PRSs"),
    check.names = FALSE   # keep the "--flag NAME" headers verbatim
)

write.table(tasks,
            file = "score_task.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)

Overwriting score_task.R


In [19]:
!Rscript score_task.R

In [20]:
!gsutil -m cp PROSPER_Score_All.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://PROSPER_Score_All.R [Content-Type=application/octet-stream]...
/ [1/1 files][  548.0 B/  548.0 B] 100% Done                                    
Operation completed over 1 objects/548.0 B.                                      


In [21]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit PROSPER_Score_All.sh through the aou_dsub wrapper, with one task per
# row of score_task.txt (one row per trait, as written by score_task.R).
# The wrapper forwards these flags after its own defaults, so --image and
# --logging are each given twice.
# NOTE(review): presumably the later value wins - confirm against dsub's
# argument parsing.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 200 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --timeout "144h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script PROSPER_Score_All.sh \
  --tasks score_task.txt

Job properties:
  job-id: prosper-sc--williamsjacr--250429-112439-24
  job-name: prosper-score-all
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10280476335283298601
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/11984450039516978224
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/157940060024560932
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/16892770664284640810
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12521981021396864440
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/465680316409281147
Launched job-id: prosper-sc--williamsjacr--250429-112439-24
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'prosper-sc--williamsjacr--250429-11

In [8]:
%%writefile CT_SLEB_SL.R
rm(list = ls())

library(caret)
library(ranger)
library(SuperLearner)
library(dplyr)
library(boot)
library(stringr)
library(glmnet)

# ---- Positional arguments (supplied by CT_SLEB_SL.sh) ----
cli_args <- commandArgs(TRUE)

AFR_PRS <- cli_args[1]               # .RData file that defines `prs_mat_eb`
print(AFR_PRS)

AMR_PRS <- cli_args[2]               # .RData file that defines `prs_mat_eb`
print(AMR_PRS)

all_phenotypes_file <- cli_args[3]
print(all_phenotypes_file)

all_train_file <- cli_args[4]
print(all_train_file)

all_tune_file <- cli_args[5]
print(all_tune_file)

all_validation_file <- cli_args[6]
print(all_validation_file)

trait <- cli_args[7]                 # name of the phenotype column to model
print(trait)

OUTPUT_PATH <- cli_args[8]
print(OUTPUT_PATH)


# ---- Load and merge the two PRS matrices ----

# Load `prs_mat_eb` from an .RData file and prefix every column from the third
# onwards, so the two matrices keep distinct score names once joined.
load_prs_matrix <- function(rdata_file,prefix){
  loaded <- new.env()
  load(rdata_file,envir = loaded)
  prs_mat <- get("prs_mat_eb",envir = loaded,inherits = FALSE)
  score_cols <- 3:ncol(prs_mat)
  colnames(prs_mat)[score_cols] <- paste0(prefix,colnames(prs_mat)[score_cols])
  prs_mat
}

# No `by`: the join uses every column name the two matrices share.
# NOTE(review): presumably just the two leading, unprefixed ID columns - confirm.
PRS_Mat_All <- inner_join(load_prs_matrix(AFR_PRS,"AFR_"),load_prs_matrix(AMR_PRS,"AMR_"))
colnames(PRS_Mat_All) <- gsub("-","_",colnames(PRS_Mat_All))


# ---- Attach the PRS columns to each split ----
# left_join keeps every phenotype row and only the PRS rows whose IID matches,
# so the PRS matrix does not need to be pre-filtered.

pheno_training <- read.delim(all_train_file)
# Number of phenotype/covariate columns; everything after them in a joined
# table is PRS data. Taken from the training file and reused for all three
# splits, i.e. the three files are assumed to have the same number of columns.
ndrop <- ncol(pheno_training)
pheno_training <- left_join(pheno_training,PRS_Mat_All,by = "IID")
prs_train_all <- pheno_training[,-c(1:ndrop)]

pheno_tuning <- read.delim(all_tune_file)
pheno_tuning <- left_join(pheno_tuning,PRS_Mat_All,by = "IID")
prs_tune_all <- pheno_tuning[,-c(1:ndrop)]
print(dim(prs_tune_all))

pheno_valid <- read.delim(all_validation_file)
pheno_valid <- left_join(pheno_valid,PRS_Mat_All,by = "IID")
prs_validation_all <- pheno_valid[,-c(1:ndrop)]


# ---- Prune PRS columns (decided on the tuning split, applied to all splits) ----

# all_of() makes `cols` an explicit vector of column names; a bare external
# vector inside select() is ambiguous if a column happens to share its name.
remove_cols <- function(prs,cols){
  dplyr::select(prs,-all_of(cols))
}

# 1. Columns whose correlation with every other column is NA.
mtx <- cor(prs_tune_all)
drop <- names(prs_tune_all)[apply(mtx,2,function(x){sum(is.na(x))}) == (nrow(mtx) - 1)]
prs_train_all <- remove_cols(prs_train_all,drop)
prs_tune_all <- remove_cols(prs_tune_all,drop)
prs_validation_all <- remove_cols(prs_validation_all,drop)

# 2. Columns that are exact linear combinations of others.
drop <- names(prs_tune_all)[caret::findLinearCombos(prs_tune_all)$remove]
prs_train_all <- remove_cols(prs_train_all,drop)
prs_tune_all <- remove_cols(prs_tune_all,drop)
prs_validation_all <- remove_cols(prs_validation_all,drop)

# 3. Columns flagged by caret::findCorrelation at a pairwise cutoff of 0.98.
mtx <- cor(prs_tune_all)
drop <- names(prs_tune_all)[findCorrelation(mtx,cutoff=0.98)]
prs_train_all <- remove_cols(prs_train_all,drop)
prs_tune_all <- remove_cols(prs_tune_all,drop)
prs_validation_all <- remove_cols(prs_validation_all,drop)


# ---- Covariate-adjusted phenotype ----
# Residuals of the trait after regressing out age, age2, sex and PC1-PC10.
# na.exclude pads the residuals with NA for every row the model dropped, so
# they stay aligned with the data frame even when a covariate (not only the
# trait) is missing. Writing the compact residual vector back by
# trait-missingness alone would misalign it in that case.
null_formula <- as.formula(paste0(trait,"~age+age2+sex+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10"))

pheno_tuning$y_tune <- unname(residuals(lm(null_formula,data = pheno_tuning,na.action = na.exclude)))
pheno_valid$y_validation <- unname(residuals(lm(null_formula,data = pheno_valid,na.action = na.exclude)))

# Diagnostics: remaining PRS columns and missing values in the tuning matrix.
print(dim(prs_tune_all))
print(sum(is.na(prs_tune_all)))

print(sum(is.na(prs_tune_all[!is.na(pheno_tuning[,"y_tune"]),])))
# Stack a lasso and a ridge fit for a continuous outcome and express the
# stacked prediction as a single linear combination of the input columns.
#   x: predictors, one row per sample (coerced with as.matrix)
#   y: numeric outcome; samples with NA are dropped before fitting
# Returns list(Coefficients = intercept followed by one weight per column of x).
Ensemble_Function_Continuous <- function(x,y){
  observed <- !is.na(y)
  features <- as.matrix(x[observed,])
  outcome <- y[observed]

  # Cross-validated penalised fits: alpha = 1 is lasso, alpha = 0 is ridge.
  lasso_fit <- cv.glmnet(features,outcome,family = "gaussian",alpha = 1,type.measure = "mse",nfold = 10)
  ridge_fit <- cv.glmnet(features,outcome,family = "gaussian",alpha = 0,type.measure = "mse",nfold = 10)

  lasso_pred <- predict(lasso_fit, features,s = "lambda.min")
  ridge_pred <- predict(ridge_fit, features,s = "lambda.min")

  # Second stage: linear model of the outcome on the two base predictions.
  stacked_fit <- lm(outcome~.,data = data.frame(lasso_pred,ridge_pred))
  stacked_pred <- fitted(stacked_fit)

  # Regress the stacked prediction on the raw columns to recover one weight
  # per column.
  projection <- lm(y~.,data.frame(y = stacked_pred,features))
  list(Coefficients = coef(projection))
}
# Stack a lasso and a ridge logistic fit for a binary outcome and express the
# stacked linear predictor as a single linear combination of the input columns.
#   x: predictors, one row per sample (coerced with as.matrix)
#   y: binary outcome; samples with NA are dropped before fitting
# Returns list(Coefficients = intercept followed by one weight per column of x).
Ensemble_Function_Binary <- function(x,y){
  observed <- !is.na(y)
  features <- as.matrix(x[observed,])
  outcome <- y[observed]

  # Cross-validated penalised fits scored by AUC: alpha = 1 is lasso, 0 is ridge.
  lasso_fit <- cv.glmnet(features,outcome,family = "binomial",alpha = 1,type.measure = "auc")
  ridge_fit <- cv.glmnet(features,outcome,family = "binomial",alpha = 0,type.measure = "auc")

  lasso_pred <- predict(lasso_fit, features,type = "link")
  ridge_pred <- predict(ridge_fit, features,type = "link")

  # Second stage: logistic model of the outcome on the two base linear predictors.
  base_preds <- data.frame(lasso_pred,ridge_pred)
  stacked_fit <- glm(outcome~.,data = base_preds,family = binomial())
  stacked_link <- predict(stacked_fit,base_preds,type = "link")

  # Regress the stacked linear predictor on the raw columns to recover one
  # weight per column.
  projection <- lm(y~.,data.frame(y = stacked_link,features))
  list(Coefficients = coef(projection))
}
# Dispatch to the ensemble fit matching the outcome type.
#   x: predictors, one row per sample
#   y: outcome; NA entries are dropped by the fitting functions
#   family: "continuous" (default) or "binary"
# Returns the list produced by the chosen fitting function
# (list(Coefficients = ...)).
Ensemble_Function <- function(x,y,family = c("continuous","binary")){
  # match.arg() collapses the two-element default to "continuous" and rejects
  # unknown values. Previously the default vector went straight into if(),
  # giving a length-2 condition.
  family <- match.arg(family)
  if(family == "continuous"){
    return(Ensemble_Function_Continuous(x,y))
  }else{
    return(Ensemble_Function_Binary(x,y))
  }
}

# ---- Fit the ensemble on the tuning split ----
Results <- Ensemble_Function(x = prs_tune_all,y = pheno_tuning[,"y_tune"],family = "continuous")
# Coefficients reported as NA get zero weight.
Results$Coefficients[is.na(Results$Coefficients)] <- 0
# NOTE(review): save() writes R's binary format although the file name ends in
# ".csv"; the name is left unchanged because other steps may read this path.
save(Results,file = paste0(OUTPUT_PATH,"/",trait,"_Final_Coefficients.csv"))
print(names(Results$Coefficients))
print(colnames(pheno_training))
print(colnames(pheno_tuning))
print(colnames(pheno_valid))


# ---- Score every split with the fitted weights ----
prs_weights <- Results$Coefficients[-1]   # drop the intercept

# Weighted sum of the selected PRS columns for one split, keyed by IID.
score_split <- function(pheno){
  prs <- as.matrix(pheno[,names(prs_weights)]) %*% matrix(prs_weights,ncol = 1)
  data.frame(IID = pheno$IID,PRS = prs)
}

PRS_Train <- score_split(pheno_training)
PRS_Tune <- score_split(pheno_tuning)
PRS_Validation <- score_split(pheno_valid)

write.table(PRS_Train,file=paste0(OUTPUT_PATH,"/",trait,"_Best_Train_All.txt"),sep = "\t",row.names = FALSE)
write.table(PRS_Tune,file=paste0(OUTPUT_PATH,"/",trait,"_Best_Tune_All.txt"),sep = "\t",row.names = FALSE)
write.table(PRS_Validation,file=paste0(OUTPUT_PATH,"/",trait,"_Best_Validation_All.txt"),sep = "\t",row.names = FALSE)


# ---- Validation tables: raw PRS and PC-adjusted PRS ----
all_phenotypes <- read.csv(all_phenotypes_file)

pheno_validation_raw <- inner_join(pheno_valid[,c("IID","age","age2","sex",paste0("PC",1:10),"y_validation")],PRS_Validation,by = "IID")
pheno_validation_adjusted <- pheno_validation_raw

# Standardise the PRS on PC1-PC5: remove the PC-predicted mean, then divide by
# the square root of the PC-predicted squared residual.
pc_cols <- c("PC1","PC2","PC3","PC4","PC5")
mean_fit <- lm(y~.,data = data.frame(y = pheno_validation_adjusted[,"PRS"],pheno_validation_adjusted[,pc_cols]))
prs_resid <- mean_fit$residuals
variance_data <- data.frame(y = prs_resid^2,pheno_validation_adjusted[,pc_cols])
y_hat <- predict(lm(y~.,data = variance_data),variance_data)
# A linear fit of squared residuals can go negative, which would make sqrt()
# return NaN and abort the if() below. Fall back to a constant variance, as
# Joint_PRS_Results.R does.
if(any(y_hat < 0)){
  y_hat <- predict(lm(y~1,data = variance_data),variance_data)
}
if(sum(sqrt(y_hat)) == 0){
  pheno_validation_adjusted[,"PRS"] <- 0
}else{
  pheno_validation_adjusted[,"PRS"] <- prs_resid/sqrt(y_hat)
}


# ---- Split by ancestry and standardise within each group ----
# Creates pheno_validation_raw_<ANC> and pheno_validation_adjusted_<ANC> for
# every ancestry label. The outcome is scaled in both tables; the PRS is
# re-scaled only in the raw table.
for(anc in c("EUR","SAS","AMR","AFR","EAS","MID")){
  anc_ids <- all_phenotypes$IID[all_phenotypes$ancestry == anc]

  raw_anc <- pheno_validation_raw[pheno_validation_raw$IID %in% anc_ids,]
  adjusted_anc <- pheno_validation_adjusted[pheno_validation_adjusted$IID %in% anc_ids,]

  raw_anc$y_validation <- scale(raw_anc$y_validation)
  adjusted_anc$y_validation <- scale(adjusted_anc$y_validation)
  raw_anc[,"PRS"] <- scale(raw_anc[,"PRS"])

  assign(paste0("pheno_validation_raw_",anc),raw_anc)
  assign(paste0("pheno_validation_adjusted_",anc),adjusted_anc)
}

# Bootstrap statistic for boot(): slope of y_validation on PRS in one resample.
#   data:    data frame with columns y_validation and PRS
#   indices: row indices of the resample
# Returns the PRS coefficient as a length-one named numeric.
Beta_Boot <- function(data,indices){
  resample <- data[indices, ]
  fit <- lm(y_validation ~ PRS,data = resample)
  c(coef(fit)[2])
}

# Bootstrap statistic for boot(): R-squared of y_validation on PRS in one
# resample.
#   data:    data frame with columns y_validation and PRS
#   indices: row indices of the resample
# Returns the R-squared as a length-one numeric.
R2_Boot <- function(data,indices){
  resample <- data[indices, ]
  fit_summary <- summary(lm(y_validation ~ PRS,data = resample))
  c(fit_summary$r.squared)
}

# ---- Per-ancestry effect size and R2 with bootstrap standard errors ----
# For every ancestry, and for both the raw and the PC-adjusted PRS, regress
# y_validation on PRS and bootstrap the slope and the R2. Bootstraps run in a
# fixed order: raw before adjusted, ancestries in the order below, slope before
# R2 within each table.
ancestries <- c("EUR","SAS","AMR","AFR","EAS","MID")
n_boot <- 10000   # bootstrap replicates per statistic

# Point estimates, bootstrap SEs and bootstrap draws for one validation table.
# A table with fewer than three complete rows cannot be fitted meaningfully;
# it yields NA results (with a warning) instead of aborting the whole run.
evaluate_prs <- function(validation_data,label){
  n_complete <- sum(!is.na(validation_data$y_validation) & !is.na(validation_data$PRS))
  if(n_complete < 3){
    warning("Skipping ",label,": only ",n_complete," complete validation row(s)")
    return(list(beta = NA_real_,beta_se = NA_real_,beta_boot = rep(NA_real_,n_boot),
                R2 = NA_real_,R2_se = NA_real_,R2_boot = rep(NA_real_,n_boot)))
  }

  fit <- lm(y_validation ~ PRS,data = validation_data)
  boot_beta <- boot(data = validation_data, statistic = Beta_Boot, R = n_boot)
  boot_R2 <- boot(data = validation_data, statistic = R2_Boot, R = n_boot)

  list(beta = unname(coef(fit)[2]),
       beta_se = sd(boot_beta$t),
       beta_boot = as.vector(boot_beta$t),
       R2 = summary(fit)$r.squared,
       R2_se = sd(boot_R2$t),
       R2_boot = as.vector(boot_R2$t))
}

CV_Results <- data.frame(trait = trait,ancestry = ancestries)
boot_columns <- list()

for(kind in c("raw","adjusted")){
  kind_stats <- matrix(NA_real_,nrow = length(ancestries),ncol = 4,
                       dimnames = list(ancestries,c("beta","beta_se","R2","R2_se")))

  for(anc in ancestries){
    table_name <- paste0("pheno_validation_",kind,"_",anc)
    res <- evaluate_prs(get(table_name),table_name)

    kind_stats[anc,] <- c(res$beta,res$beta_se,res$R2,res$R2_se)
    # One column of draws per statistic, named beta_/R2_<kind>_<ANC>_boot.
    boot_columns[[paste0("beta_",kind,"_",anc,"_boot")]] <- res$beta_boot
    boot_columns[[paste0("R2_",kind,"_",anc,"_boot")]] <- res$R2_boot
  }

  CV_Results[[paste0("beta_",kind)]] <- unname(kind_stats[,"beta"])
  CV_Results[[paste0("beta_se_",kind)]] <- unname(kind_stats[,"beta_se"])
  CV_Results[[paste0("R2_",kind)]] <- unname(kind_stats[,"R2"])
  CV_Results[[paste0("R2_se_",kind)]] <- unname(kind_stats[,"R2_se"])
}

CV_Boot_Results <- data.frame(trait = trait,boot_columns)

# NOTE(review): "Best_Betas.csv" has no "_" after the trait, unlike
# "_Bootstraps.csv"; the name is kept because other steps may read this path.
write.csv(CV_Results,file = paste0(OUTPUT_PATH,"/",trait,"Best_Betas.csv"),row.names = FALSE)
write.csv(CV_Boot_Results,file = paste0(OUTPUT_PATH,"/",trait,"_Bootstraps.csv"),row.names = FALSE)

Writing CT_SLEB_SL.R


In [9]:
%%writefile CT_SLEB_SL.sh
#!/bin/bash
# Task entry point: runs the CT-SLEB ensemble R script with the AFR and AMR PRS
# matrices, the phenotype and split files, the trait name and the output
# directory.
# Every variable is expected in the environment; the names match the columns of
# the task table written by score_task.R.

set -o errexit   # abort on the first failing command
set -o nounset   # abort if any expected variable is unset

# Quote every expansion so a value containing spaces or glob characters stays a
# single argument.
Rscript "${R_Script}" "${AFR_PRS}" "${AMR_PRS}" "${all_phenotypes_file}" "${all_train_file}" "${all_tune_file}" "${all_validation_file}" "${trait}" "${OUTPUT_PATH}"

Writing CT_SLEB_SL.sh


In [10]:
%%writefile score_task.R

# Build the dsub task table for CT_SLEB_SL.sh: one row per trait, each pointing
# at that trait's AFR and AMR PRS matrices plus the shared phenotype and split
# files.
bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes"
results_dir <- paste0(bucket_root,"/Results")
ctsleb_dir <- paste0(results_dir,"/Continuous/CTSLEB")
trait_names <- c("BMI","LDL","HDL","TC","logTG","Height")

# The per-trait columns are vectors; the shared values are length one and are
# recycled by data.frame() to one entry per trait.
tasks <- data.frame(
    '--input AFR_PRS' = paste0(ctsleb_dir,"/AFR_",trait_names,"_prs_mat_eb.RData"),
    '--input AMR_PRS' = paste0(ctsleb_dir,"/AMR_",trait_names,"_prs_mat_eb.RData"),
    '--input all_phenotypes_file' = paste0(results_dir,"/all_phenotypes.csv"),
    '--input all_train_file' = paste0(results_dir,"/All_Train.txt"),
    '--input all_tune_file' = paste0(results_dir,"/All_Tune.txt"),
    '--input all_validation_file' = paste0(results_dir,"/All_Validation.txt"),
    '--input R_Script' = paste0(bucket_root,"/Scripts/CT_SLEB_SL.R"),
    '--env trait' = trait_names,
    '--output-recursive OUTPUT_PATH' = paste0(ctsleb_dir,"/Results/"),
    check.names = FALSE   # keep the "--flag NAME" headers verbatim
)

write.table(tasks,
            file = "score_task.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE,
            col.names = TRUE)

Writing score_task.R


In [11]:
!Rscript score_task.R

In [12]:
!gsutil -m cp CT_SLEB_SL.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://CT_SLEB_SL.R [Content-Type=application/octet-stream]...
/ [1/1 files][ 19.5 KiB/ 19.5 KiB] 100% Done                                    
Operation completed over 1 objects/19.5 KiB.                                     


In [13]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit CT_SLEB_SL.sh through the aou_dsub wrapper, with one task per row of
# score_task.txt (one row per trait, as written by score_task.R).
# The wrapper forwards these flags after its own defaults, so --image and
# --logging are each given twice.
# NOTE(review): presumably the later value wins - confirm against dsub's
# argument parsing.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 100 \
  --boot-disk-size 25 \
  --min-ram 64 \
  --timeout "96h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script CT_SLEB_SL.sh \
  --tasks score_task.txt

Job properties:
  job-id: ct-sleb-sl--williamsjacr--250516-185556-27
  job-name: ct-sleb-sl
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/135171374334310759
Launched job-id: ct-sleb-sl--williamsjacr--250516-185556-27
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'ct-sleb-sl--williamsjacr--250516-185556-27' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'ct-sleb-sl--williamsjacr--250516-185556-27' --users 'williamsjacr'


In [7]:
!dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'ct-sleb-sl--williamsjacr--250516-185556-27' --users 'williamsjacr' --status '*'

Job Name      Task  Status    Last Update
----------  ------  --------  --------------
ct-sleb-sl       1  Success   05-16 19:48:45



In [22]:
%%writefile Joint_PRS_Results.R
# Setup for the PRS evaluation: parse arguments, load the score and validation
# phenotype files, and residualize the trait on covariates.
rm(list = ls())

library(caret)
library(ranger)
library(SuperLearner)
library(dplyr)
library(boot)
library(stringr)

# Positional command-line arguments:
#   1 PRS                  tab-delimited score file with 5 columns (renamed below)
#   2 all_phenotypes_file  CSV providing IID and ancestry
#   3 all_train_file       printed only (see note below)
#   4 all_tune_file        printed only (see note below)
#   5 all_validation_file  tab-delimited validation phenotypes and covariates
#   6 trait                name of the phenotype column to evaluate
#   7 OUTPUT_PATH          directory that receives the result CSVs
PRS <- commandArgs(TRUE)[1]
print(PRS)

all_phenotypes_file <- commandArgs(TRUE)[2]
print(all_phenotypes_file)

all_train_file <- commandArgs(TRUE)[3]
print(all_train_file)

all_tune_file <- commandArgs(TRUE)[4]
print(all_tune_file)

all_validation_file <- commandArgs(TRUE)[5]
print(all_validation_file)

trait <- commandArgs(TRUE)[6]
print(trait)

OUTPUT_PATH <- commandArgs(TRUE)[7]
print(OUTPUT_PATH)


PRS_Mat_All <- read.delim(PRS)
# Fail loudly if the score file does not have the 5 columns named below.
stopifnot(ncol(PRS_Mat_All) == 5)
colnames(PRS_Mat_All) <- c("FID","IID","ALLELE_CT","NAMED_ALLELE_DOSAGE_SUM","prs")
PRS_Mat_All <- PRS_Mat_All[,c("IID","prs")]


# The training and tuning files stay in the argument list so the command line
# is unchanged, but they are not read: nothing in this script uses them.

# Attach each validation participant's score; participants without a score get NA.
pheno_valid <- read.delim(all_validation_file)
pheno_valid <- left_join(pheno_valid,PRS_Mat_All,by = "IID")


# Residualize the trait on age, age^2, sex and the first 10 PCs.
# na.exclude makes residuals() return one value per input row (NA where the
# trait OR any covariate is missing), so the residuals stay aligned with
# pheno_valid even when a covariate is missing for a row with an observed trait.
model.null <- lm(as.formula(paste0(trait,"~age+age2+sex+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10")),
                 data = pheno_valid,na.action = na.exclude)
pheno_valid$y_validation <- unname(residuals(model.null))


all_phenotypes <- read.csv(all_phenotypes_file)

# Two copies of the validation data: "raw" keeps the score as-is, "adjusted"
# has its prs column replaced further down.
pheno_validation_raw <- pheno_valid
pheno_validation_adjusted <- pheno_valid


# Ancestry-adjust the score: regress prs on PC1-PC5 (mean model), regress the
# squared residuals on the same PCs (variance model), then divide the
# residuals by the square root of the fitted variance.
pc_covariates <- pheno_validation_adjusted[,c("PC1","PC2","PC3","PC4","PC5")]

mean_frame <- data.frame(y = pheno_validation_adjusted[,"prs"],pc_covariates)
mean_fit <- lm(y~.,data = mean_frame)
prs_resid <- mean_fit$residuals

var_frame <- data.frame(y = prs_resid^2,pc_covariates)
var_fit <- lm(y~.,data = var_frame)
fitted_var <- predict(var_fit,var_frame)

# A negative fitted variance has no square root: fall back to a constant
# (intercept-only) variance model.
n_negative <- sum(fitted_var < 0)
if(n_negative > 0){
  var_fit <- lm(y~1,data = var_frame)
  fitted_var <- predict(var_fit,var_frame)
}

if(sum(sqrt(fitted_var)) != 0){
  pheno_validation_adjusted[,"prs"] <- prs_resid/sqrt(fitted_var)
}else{
  pheno_validation_adjusted[,"prs"] <- 0
}


# Split the raw and adjusted validation sets by ancestry and standardize
# within each ancestry. The residualized phenotype is scaled in both sets; the
# score is scaled only in the raw set. Results are stored as
# pheno_validation_raw_<ANC> and pheno_validation_adjusted_<ANC>.
for(ancestry_code in c("EUR","SAS","AMR","AFR","EAS","MID")){
  ancestry_ids <- all_phenotypes$IID[all_phenotypes$ancestry == ancestry_code]

  raw_subset <- pheno_validation_raw[pheno_validation_raw$IID %in% ancestry_ids,]
  adjusted_subset <- pheno_validation_adjusted[pheno_validation_adjusted$IID %in% ancestry_ids,]

  raw_subset$y_validation <- scale(raw_subset$y_validation)
  adjusted_subset$y_validation <- scale(adjusted_subset$y_validation)

  raw_subset[,"prs"] <- scale(raw_subset[,"prs"])

  assign(paste0("pheno_validation_raw_",ancestry_code),raw_subset)
  assign(paste0("pheno_validation_adjusted_",ancestry_code),adjusted_subset)
}


Beta_Boot <- function(data,indices){
  # Bootstrap statistic: slope of prs in y_validation ~ prs, fitted on the
  # rows of `data` selected by `indices`.
  model_formula <- as.formula(paste0("y_validation~","prs"))
  resampled_fit <- lm(model_formula,data = data[indices, ])
  slope <- coef(resampled_fit)[2]
  slope
}

R2_Boot <- function(data,indices){
  # Bootstrap statistic: R-squared of y_validation ~ prs, fitted on the rows
  # of `data` selected by `indices`.
  model_formula <- as.formula(paste0("y_validation~","prs"))
  resampled_fit <- lm(model_formula,data = data[indices, ])
  r_squared <- summary(resampled_fit)$r.squared
  r_squared
}

# Point estimates and bootstrap standard errors of the slope (beta) and R^2 of
# y_validation ~ prs, for every score type (raw / adjusted) and ancestry.
#
# For each combination this defines, under the names the result tables use:
#   beta_validation_<type>_<ANC>     slope point estimate
#   beta_<type>_<ANC>_boot           bootstrap replicates of the slope
#   beta_se_validation_<type>_<ANC>  SD of those replicates
#   R2_validation_<type>_<ANC>, R2_<type>_<ANC>_boot, R2_se_validation_<type>_<ANC>
#                                    the same three quantities for R^2

# Fixed seed so the bootstrap replicates and standard errors are reproducible
# from run to run.
set.seed(1234)

# Number of bootstrap replicates per statistic.
n_boot <- 10000

for(prs_type in c("raw","adjusted")){
  for(ancestry_code in c("EUR","SAS","AMR","AFR","EAS","MID")){
    validation_data <- get(paste0("pheno_validation_",prs_type,"_",ancestry_code))

    # A single fit supplies both point estimates.
    full_fit <- lm(y_validation ~ prs,data = validation_data)
    assign(paste0("beta_validation_",prs_type,"_",ancestry_code),coef(full_fit)[2])
    assign(paste0("R2_validation_",prs_type,"_",ancestry_code),summary(full_fit)$r.squared)

    boot_beta <- boot(data = validation_data, statistic = Beta_Boot, R = n_boot)
    assign(paste0("beta_",prs_type,"_",ancestry_code,"_boot"),boot_beta$t)
    assign(paste0("beta_se_validation_",prs_type,"_",ancestry_code),sd(boot_beta$t))

    boot_R2 <- boot(data = validation_data, statistic = R2_Boot, R = n_boot)
    assign(paste0("R2_",prs_type,"_",ancestry_code,"_boot"),boot_R2$t)
    assign(paste0("R2_se_validation_",prs_type,"_",ancestry_code),sd(boot_R2$t))
  }
}

# Assemble the per-ancestry summary table and the table of bootstrap
# replicates, then write both as CSV files under OUTPUT_PATH.
ancestry_labels <- c("EUR","SAS","AMR","AFR","EAS","MID")

# Collect one number per ancestry from the variables named <prefix><ANC>.
collect_by_ancestry <- function(prefix){
  vapply(paste0(prefix,ancestry_labels),
         function(var_name) unname(get(var_name)),
         numeric(1),USE.NAMES = FALSE)
}

CV_Results <- data.frame(trait = trait,ancestry = ancestry_labels,
                         beta_raw = collect_by_ancestry("beta_validation_raw_"),
                         beta_se_raw = collect_by_ancestry("beta_se_validation_raw_"),
                         R2_raw = collect_by_ancestry("R2_validation_raw_"),
                         R2_se_raw = collect_by_ancestry("R2_se_validation_raw_"),
                         beta_adjusted = collect_by_ancestry("beta_validation_adjusted_"),
                         beta_se_adjusted = collect_by_ancestry("beta_se_validation_adjusted_"),
                         R2_adjusted = collect_by_ancestry("R2_validation_adjusted_"),
                         R2_se_adjusted = collect_by_ancestry("R2_se_validation_adjusted_"))

# Column order: all raw columns, then all adjusted columns; within each,
# beta then R2 for every ancestry in ancestry_labels order.
boot_columns <- unlist(lapply(c("raw","adjusted"),function(prs_type){
  as.vector(rbind(paste0("beta_",prs_type,"_",ancestry_labels,"_boot"),
                  paste0("R2_",prs_type,"_",ancestry_labels,"_boot")))
}))
boot_replicates <- mget(boot_columns)
CV_Boot_Results <- do.call(data.frame,c(list(trait = trait),boot_replicates))

# NOTE(review): there is no separator between the trait and "Best_Betas.csv"
# (e.g. "BMIBest_Betas.csv"), unlike "_Bootstraps.csv"; kept as-is, confirm
# against whatever reads these files.
summary_path <- paste0(OUTPUT_PATH,"/",trait,"Best_Betas.csv")
bootstrap_path <- paste0(OUTPUT_PATH,"/",trait,"_Bootstraps.csv")
write.csv(CV_Results,file = summary_path,row.names = FALSE)
write.csv(CV_Boot_Results,file = bootstrap_path,row.names = FALSE)

Writing Joint_PRS_Results.R


In [23]:
%%writefile Joint_PRS_Results.sh
#!/bin/bash
# Task entry point: run the R evaluation script with the inputs supplied
# through environment variables (named after the --input / --env /
# --output-recursive columns of the tasks file).

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Quote every expansion so a value containing spaces or glob characters is
# passed to Rscript as exactly one argument.
Rscript "${R_Script}" "${PRS}" "${all_phenotypes_file}" "${all_train_file}" "${all_tune_file}" "${all_validation_file}" "${trait}" "${OUTPUT_PATH}"

Writing Joint_PRS_Results.sh


In [24]:
%%writefile score_task.R

# Build the tasks table: one row per trait, one column per task parameter.
# Only the PRS file and the trait vary between rows; the other values are
# recycled across all rows.
trait_names <- c("BMI","LDL","HDL","TC","logTG","Height")

tasks <- data.frame(
    '--input PRS'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/JointPRS/PRSs/PRS_META_",trait_names,".sscore"),
    '--input all_phenotypes_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/all_phenotypes.csv",
    '--input all_train_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Train.txt",
    '--input all_tune_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Tune.txt",
    '--input all_validation_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Validation.txt",
    '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/Joint_PRS_Results.R",
    '--env trait'=trait_names,
    '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/JointPRS/Results/",
    check.names = FALSE
)

# Tab-separated, header row kept, no quoting and no row names.
write.table(tasks,
            file="score_task.txt",
            sep='\t',
            quote=FALSE,
            row.names=FALSE,
            col.names=TRUE)

Overwriting score_task.R


In [25]:
# Run score_task.R, which writes the tasks file score_task.txt.
!Rscript score_task.R

In [26]:
# Upload the evaluation script to the bucket folder that score_task.R references as the R_Script input.
!gsutil -m cp Joint_PRS_Results.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://Joint_PRS_Results.R [Content-Type=application/octet-stream]...
/ [1/1 files][ 15.2 KiB/ 15.2 KiB] 100% Done                                    
Operation completed over 1 objects/15.2 KiB.                                     


In [27]:
%%bash --out score_batch

# Load the aou_dsub wrapper function before submitting the job.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Job-specific flags, forwarded verbatim to aou_dsub (which appends them after
# its own default flags).
dsub_args=(
  --image willja16/r_with_plink
  --disk-size 30
  --boot-disk-size 25
  --min-ram 8
  --timeout "96h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script Joint_PRS_Results.sh
  --tasks score_task.txt
)

aou_dsub "${dsub_args[@]}"

Job properties:
  job-id: joint-prs---williamsjacr--250514-130145-77
  job-name: joint-prs-results
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4362912663453334680
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/8026149672767830864
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3362018703464373505
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/5321072471225326166
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/14179213402507147645
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/17956511654413766800
Launched job-id: joint-prs---williamsjacr--250514-130145-77
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'joint-prs---williamsjacr--250514-13

In [30]:
%%writefile PROSPER_Results.R
# Setup for the PRS evaluation: parse arguments, load the score and validation
# phenotype files, and residualize the trait on covariates.
rm(list = ls())

library(caret)
library(ranger)
library(SuperLearner)
library(dplyr)
library(boot)
library(stringr)

# Positional command-line arguments:
#   1 PRS                  tab-delimited score file with 5 columns (renamed below)
#   2 all_phenotypes_file  CSV providing IID and ancestry
#   3 all_train_file       printed only (see note below)
#   4 all_tune_file        printed only (see note below)
#   5 all_validation_file  tab-delimited validation phenotypes and covariates
#   6 trait                name of the phenotype column to evaluate
#   7 OUTPUT_PATH          directory that receives the result CSVs
PRS <- commandArgs(TRUE)[1]
print(PRS)

all_phenotypes_file <- commandArgs(TRUE)[2]
print(all_phenotypes_file)

all_train_file <- commandArgs(TRUE)[3]
print(all_train_file)

all_tune_file <- commandArgs(TRUE)[4]
print(all_tune_file)

all_validation_file <- commandArgs(TRUE)[5]
print(all_validation_file)

trait <- commandArgs(TRUE)[6]
print(trait)

OUTPUT_PATH <- commandArgs(TRUE)[7]
print(OUTPUT_PATH)


PRS_Mat_All <- read.delim(PRS)
# Fail loudly if the score file does not have the 5 columns named below.
stopifnot(ncol(PRS_Mat_All) == 5)
colnames(PRS_Mat_All) <- c("FID","IID","ALLELE_CT","NAMED_ALLELE_DOSAGE_SUM","prs")
PRS_Mat_All <- PRS_Mat_All[,c("IID","prs")]


# The training and tuning files stay in the argument list so the command line
# is unchanged, but they are not read: nothing in this script uses them.

# Attach each validation participant's score; participants without a score get NA.
pheno_valid <- read.delim(all_validation_file)
pheno_valid <- left_join(pheno_valid,PRS_Mat_All,by = "IID")


# Residualize the trait on age, age^2, sex and the first 10 PCs.
# na.exclude makes residuals() return one value per input row (NA where the
# trait OR any covariate is missing), so the residuals stay aligned with
# pheno_valid even when a covariate is missing for a row with an observed trait.
model.null <- lm(as.formula(paste0(trait,"~age+age2+sex+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10")),
                 data = pheno_valid,na.action = na.exclude)
pheno_valid$y_validation <- unname(residuals(model.null))


all_phenotypes <- read.csv(all_phenotypes_file)

# Two copies of the validation data: "raw" keeps the score as-is, "adjusted"
# has its prs column replaced further down.
pheno_validation_raw <- pheno_valid
pheno_validation_adjusted <- pheno_valid


# Ancestry-adjust the score: regress prs on PC1-PC5 (mean model), regress the
# squared residuals on the same PCs (variance model), then divide the
# residuals by the square root of the fitted variance.
tmp <- data.frame(y = pheno_validation_adjusted[,"prs"],pheno_validation_adjusted[,c("PC1","PC2","PC3","PC4","PC5")])
mod <- lm(y~.,data = tmp)
R <- mod$residuals
tmp <- data.frame(y = R^2,pheno_validation_adjusted[,c("PC1","PC2","PC3","PC4","PC5")])
mod <- lm(y~.,data = tmp)
y_hat <- predict(mod,tmp)
# The linear variance model can predict negative variances, for which
# sqrt() returns NaN. Fall back to a constant (intercept-only) variance model
# in that case, as Joint_PRS_Results.R does.
if(sum(y_hat < 0) > 0){
  mod <- lm(y~1,data = tmp)
  y_hat <- predict(mod,tmp)
}
if(sum(sqrt(y_hat)) == 0){
  pheno_validation_adjusted[,"prs"] <- 0
}else{
  pheno_validation_adjusted[,"prs"] <- R/sqrt(y_hat)
}


# Split the raw and adjusted validation sets by ancestry and standardize
# within each ancestry. The residualized phenotype is scaled in both sets; the
# score is scaled only in the raw set. Results are stored as
# pheno_validation_raw_<ANC> and pheno_validation_adjusted_<ANC>.
for(ancestry_code in c("EUR","SAS","AMR","AFR","EAS","MID")){
  ancestry_ids <- all_phenotypes$IID[all_phenotypes$ancestry == ancestry_code]

  raw_subset <- pheno_validation_raw[pheno_validation_raw$IID %in% ancestry_ids,]
  adjusted_subset <- pheno_validation_adjusted[pheno_validation_adjusted$IID %in% ancestry_ids,]

  raw_subset$y_validation <- scale(raw_subset$y_validation)
  adjusted_subset$y_validation <- scale(adjusted_subset$y_validation)

  raw_subset[,"prs"] <- scale(raw_subset[,"prs"])

  assign(paste0("pheno_validation_raw_",ancestry_code),raw_subset)
  assign(paste0("pheno_validation_adjusted_",ancestry_code),adjusted_subset)
}


Beta_Boot <- function(data,indices){
  # Bootstrap statistic: slope of prs in y_validation ~ prs, fitted on the
  # rows of `data` selected by `indices`.
  model_formula <- as.formula(paste0("y_validation~","prs"))
  resampled_fit <- lm(model_formula,data = data[indices, ])
  slope <- coef(resampled_fit)[2]
  slope
}

R2_Boot <- function(data,indices){
  # Bootstrap statistic: R-squared of y_validation ~ prs, fitted on the rows
  # of `data` selected by `indices`.
  model_formula <- as.formula(paste0("y_validation~","prs"))
  resampled_fit <- lm(model_formula,data = data[indices, ])
  r_squared <- summary(resampled_fit)$r.squared
  r_squared
}

# Point estimates and bootstrap standard errors of the slope (beta) and R^2 of
# y_validation ~ prs, for every score type (raw / adjusted) and ancestry.
#
# For each combination this defines, under the names the result tables use:
#   beta_validation_<type>_<ANC>     slope point estimate
#   beta_<type>_<ANC>_boot           bootstrap replicates of the slope
#   beta_se_validation_<type>_<ANC>  SD of those replicates
#   R2_validation_<type>_<ANC>, R2_<type>_<ANC>_boot, R2_se_validation_<type>_<ANC>
#                                    the same three quantities for R^2

# Fixed seed so the bootstrap replicates and standard errors are reproducible
# from run to run.
set.seed(1234)

# Number of bootstrap replicates per statistic.
n_boot <- 10000

for(prs_type in c("raw","adjusted")){
  for(ancestry_code in c("EUR","SAS","AMR","AFR","EAS","MID")){
    validation_data <- get(paste0("pheno_validation_",prs_type,"_",ancestry_code))

    # A single fit supplies both point estimates.
    full_fit <- lm(y_validation ~ prs,data = validation_data)
    assign(paste0("beta_validation_",prs_type,"_",ancestry_code),coef(full_fit)[2])
    assign(paste0("R2_validation_",prs_type,"_",ancestry_code),summary(full_fit)$r.squared)

    boot_beta <- boot(data = validation_data, statistic = Beta_Boot, R = n_boot)
    assign(paste0("beta_",prs_type,"_",ancestry_code,"_boot"),boot_beta$t)
    assign(paste0("beta_se_validation_",prs_type,"_",ancestry_code),sd(boot_beta$t))

    boot_R2 <- boot(data = validation_data, statistic = R2_Boot, R = n_boot)
    assign(paste0("R2_",prs_type,"_",ancestry_code,"_boot"),boot_R2$t)
    assign(paste0("R2_se_validation_",prs_type,"_",ancestry_code),sd(boot_R2$t))
  }
}

# Assemble the per-ancestry summary table and the table of bootstrap
# replicates, then write both as CSV files under OUTPUT_PATH.
ancestry_labels <- c("EUR","SAS","AMR","AFR","EAS","MID")

# Collect one number per ancestry from the variables named <prefix><ANC>.
collect_by_ancestry <- function(prefix){
  vapply(paste0(prefix,ancestry_labels),
         function(var_name) unname(get(var_name)),
         numeric(1),USE.NAMES = FALSE)
}

CV_Results <- data.frame(trait = trait,ancestry = ancestry_labels,
                         beta_raw = collect_by_ancestry("beta_validation_raw_"),
                         beta_se_raw = collect_by_ancestry("beta_se_validation_raw_"),
                         R2_raw = collect_by_ancestry("R2_validation_raw_"),
                         R2_se_raw = collect_by_ancestry("R2_se_validation_raw_"),
                         beta_adjusted = collect_by_ancestry("beta_validation_adjusted_"),
                         beta_se_adjusted = collect_by_ancestry("beta_se_validation_adjusted_"),
                         R2_adjusted = collect_by_ancestry("R2_validation_adjusted_"),
                         R2_se_adjusted = collect_by_ancestry("R2_se_validation_adjusted_"))

# Column order: all raw columns, then all adjusted columns; within each,
# beta then R2 for every ancestry in ancestry_labels order.
boot_columns <- unlist(lapply(c("raw","adjusted"),function(prs_type){
  as.vector(rbind(paste0("beta_",prs_type,"_",ancestry_labels,"_boot"),
                  paste0("R2_",prs_type,"_",ancestry_labels,"_boot")))
}))
boot_replicates <- mget(boot_columns)
CV_Boot_Results <- do.call(data.frame,c(list(trait = trait),boot_replicates))

# NOTE(review): there is no separator between the trait and "Best_Betas.csv"
# (e.g. "BMIBest_Betas.csv"), unlike "_Bootstraps.csv"; kept as-is, confirm
# against whatever reads these files.
summary_path <- paste0(OUTPUT_PATH,"/",trait,"Best_Betas.csv")
bootstrap_path <- paste0(OUTPUT_PATH,"/",trait,"_Bootstraps.csv")
write.csv(CV_Results,file = summary_path,row.names = FALSE)
write.csv(CV_Boot_Results,file = bootstrap_path,row.names = FALSE)

Writing PROSPER_Results.R


In [31]:
%%writefile PROSPER_Results.sh
#!/bin/bash
# dsub task entry point: runs the R script named by ${R_Script} with the task's
# inputs as positional arguments. All variables are expected to be set in the
# task environment (see the --input/--env/--output-recursive columns of the
# tasks file); `nounset` aborts the task if one is missing.

set -o errexit
set -o nounset

# Every expansion is quoted so that a path containing whitespace or glob
# characters, or an empty value, cannot split or shift the positional arguments.
Rscript "${R_Script}" "${PRS}" "${all_phenotypes_file}" "${all_train_file}" "${all_tune_file}" "${all_validation_file}" "${trait}" "${OUTPUT_PATH}"

Writing PROSPER_Results.sh


In [32]:
%%writefile score_task.R

# Build the dsub --tasks table: one row per trait, one column per dsub flag.
# check.names = FALSE keeps the "--input NAME" style headers intact.
traits <- c("BMI","LDL","HDL","TC","logTG","Height")

# One task row (a single-row data frame) for the given trait.
task_row <- function(trait){
    data.frame(
        '--input PRS'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/PROSPER/PRSs/PRS_",trait,".sscore"),
        '--input all_phenotypes_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/all_phenotypes.csv",
        '--input all_train_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Train.txt",
        '--input all_tune_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Tune.txt",
        '--input all_validation_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Validation.txt",
        '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/PROSPER_Results.R",
        '--env trait'=trait,
        '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/PROSPER/Results/",
        check.names = FALSE
    )
}

# Stack the per-trait rows in trait order.
tasks <- do.call(rbind, lapply(traits, task_row))

# Tab-separated, unquoted, header row only (no row names).
write.table(tasks,
            file="score_task.txt",
            sep='\t',
            quote=FALSE,
            row.names=FALSE,
            col.names=TRUE)

Overwriting score_task.R


In [33]:
!Rscript score_task.R

In [34]:
!gsutil -m cp PROSPER_Results.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://PROSPER_Results.R [Content-Type=application/octet-stream]...
/ [1/1 files][ 15.1 KiB/ 15.1 KiB] 100% Done                                    
Operation completed over 1 objects/15.1 KiB.                                     


In [35]:
%%bash --out score_batch

# Load the aou_dsub wrapper function. ~/aou_dsub.bash is also written by an
# earlier %%writefile cell of this notebook.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit PROSPER_Results.sh as a dsub job driven by score_task.txt (the table
# written by score_task.R).
# The aou_dsub wrapper already passes its own --image and --logging before
# forwarding "$@", so both flags are given a second time here.
# NOTE(review): presumably the later occurrence takes precedence — confirm
# against dsub's argument handling.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 100 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --timeout "96h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script PROSPER_Results.sh \
  --tasks score_task.txt

Job properties:
  job-id: prosper-re--williamsjacr--250514-130216-09
  job-name: prosper-results
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/16373633924048282295
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/503809240254683343
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/2765906364711580764
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/2079727502755999668
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/9050702195468526515
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/13293838478095224273
Launched job-id: prosper-re--williamsjacr--250514-130216-09
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'prosper-re--williamsjacr--250514-13021

In [15]:
%%writefile OneCommonPRS.R
rm(list = ls())
library(caret)
library(ranger)
library(SuperLearner)
library(dplyr)
library(boot)
library(stringr)
library(glmnet)

# Positional command-line arguments, in the order OneCommonPRS.sh passes them
# to Rscript. Each value is bound to a global of the same name and echoed to
# the log.
arg_names <- c("PROSPER_PRSs","JointPRS_PRS","CTSLEB_AFR","CTSLEB_AMR",
               "all_phenotypes_file","all_train_file","all_tune_file","all_validation_file",
               "trait","OUTPUT_PATH")
cli_args <- commandArgs(TRUE)

for(arg_index in seq_along(arg_names)){
  assign(arg_names[arg_index],cli_args[arg_index],envir = globalenv())
  print(cli_args[arg_index])
}

# PROSPER scores: the first four columns are renamed to FID/IID/ALLELE_CT/
# NAMED_ALLELE_DOSAGE_SUM and every remaining column becomes PROSPER_PRS_<k>.
# Only IID and the score columns are kept.
PROSPER_PRSs <- read.delim(PROSPER_PRSs)
prosper_score_names <- paste0("PROSPER_PRS_",1:(ncol(PROSPER_PRSs) - 4))
colnames(PROSPER_PRSs) <- c("FID","IID","ALLELE_CT","NAMED_ALLELE_DOSAGE_SUM",prosper_score_names)
PROSPER_PRSs <- PROSPER_PRSs[,c("IID",prosper_score_names)]

# JointPRS: same layout with a single score column.
JointPRS_PRS <- read.delim(JointPRS_PRS)
colnames(JointPRS_PRS) <- c("FID","IID","ALLELE_CT","NAMED_ALLELE_DOSAGE_SUM","JointPRS_PRS")
JointPRS_PRS <- JointPRS_PRS[,c("IID","JointPRS_PRS")]

# Rename a CT-SLEB score table: columns 1-2 become FID/IID, the remaining
# score columns get `prefix` prepended, "-" is replaced by "_" in every name,
# and FID is dropped.
prefix_ctsleb_scores <- function(prs_mat,prefix){
  score_names <- paste0(prefix,colnames(prs_mat)[3:ncol(prs_mat)])
  colnames(prs_mat) <- gsub("-","_",c("FID","IID",score_names))
  prs_mat[,c("IID",colnames(prs_mat)[3:ncol(prs_mat)])]
}

# Each .RData file is expected to define `prs_mat_eb`; it is removed after use
# so the second load starts clean.
load(CTSLEB_AFR)
CTSLEB_PRS_AFR <- prefix_ctsleb_scores(prs_mat_eb,"AFR_")
rm(prs_mat_eb)

load(CTSLEB_AMR)
CTSLEB_PRS_AMR <- prefix_ctsleb_scores(prs_mat_eb,"AMR_")
rm(prs_mat_eb)

# Keep only individuals present in all four score tables (natural join on the
# shared column(s)).
PRS_All <- PROSPER_PRSs %>%
  inner_join(JointPRS_PRS) %>%
  inner_join(CTSLEB_PRS_AFR) %>%
  inner_join(CTSLEB_PRS_AMR)

colnames(PRS_All) <- gsub("-","_",colnames(PRS_All))

# Read one phenotype split and attach the PRS columns of PRS_All by IID.
# Rows of the split are kept even when no PRS row matches (left join).
read_split_with_prs <- function(split_file){
  pheno <- read.delim(split_file)
  prs_rows <- PRS_All[PRS_All$IID %in% pheno$IID,]
  left_join(pheno,prs_rows,by = "IID")
}

# Columns 1:22 are dropped to obtain the PRS-only matrices.
# NOTE(review): presumably the split files carry exactly 22 phenotype/covariate
# columns ahead of the joined PRS columns — confirm against All_*.txt.
pheno_training <- read_split_with_prs(all_train_file)
prs_train_all <- pheno_training[,-c(1:22)]

pheno_tuning <- read_split_with_prs(all_tune_file)
prs_tune_all <- pheno_tuning[,-c(1:22)]

pheno_valid <- read_split_with_prs(all_validation_file)
prs_validation_all <- pheno_valid[,-c(1:22)]


## Null Models

# Covariate-only model for the trait; its residuals are the covariate-adjusted
# phenotype used as the outcome for tuning and validation.
null_formula <- as.formula(paste0(trait,"~age+age2+sex+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10"))

# Residuals of the null model, returned with one entry per row of `pheno`.
# na.exclude makes residuals() pad rows that lm dropped (missing trait OR
# missing covariate) with NA, so the result always lines up with the input
# rows. Writing the unpadded residuals back through `!is.na(trait)` silently
# misaligns them whenever a covariate is missing.
null_residuals <- function(pheno){
  fit <- lm(null_formula,data = pheno,na.action = na.exclude)
  unname(residuals(fit))
}

y_train <- null_residuals(pheno_training)
y_tune <- null_residuals(pheno_tuning)
y_validation <- null_residuals(pheno_valid)


# Pruning step 1: drop PRS columns that are exact linear combinations of
# others. The columns are chosen on the tuning set and removed from all three
# splits so they keep identical columns.
drop <- names(prs_tune_all)[caret::findLinearCombos(prs_tune_all)$remove]

prs_train_all <- prs_train_all %>% select(-all_of(drop))
prs_tune_all <- prs_tune_all %>% select(-all_of(drop))
prs_validation_all <- prs_validation_all %>% select(-all_of(drop))


# Pruning step 2: drop one of each pair of PRS columns whose tuning-set
# correlation exceeds 0.98.
mtx <- cor(prs_tune_all)
drop <- names(prs_tune_all)[findCorrelation(mtx,cutoff=0.98)]

prs_train_all <- prs_train_all %>% select(-all_of(drop))
prs_tune_all <- prs_tune_all %>% select(-all_of(drop))
prs_validation_all <- prs_validation_all %>% select(-all_of(drop))


# Residualised outcomes, row-aligned with the phenotype tables (NA where the
# null model could not be evaluated for that individual).
pheno_tuning$y_tune <- y_tune
pheno_valid$y_validation <- y_validation

# Stack lasso and ridge fits of a continuous outcome on the PRS columns and
# express the stacked predictor as one set of linear weights.
#
# Arguments:
#   x  data frame / matrix of predictors (one row per individual).
#   y  numeric outcome; rows with NA outcome are dropped from both x and y.
# Returns:
#   list(Coefficients = <named numeric vector>): intercept followed by one
#   coefficient per column of x, obtained by regressing the ensemble's fitted
#   values on x (NA for aliased columns).
Ensemble_Function_Continuous <- function(x,y){
  observed <- !is.na(y)
  x <- as.matrix(x[observed,])
  y <- y[observed]

  # `nfolds` is spelled out in full; the abbreviated `nfold` only works through
  # R's partial argument matching.
  lasso_mod <- cv.glmnet(x,y,family = "gaussian",alpha = 1,type.measure = "mse",nfolds = 10)
  ridge_mod <- cv.glmnet(x,y,family = "gaussian",alpha = 0,type.measure = "mse",nfolds = 10)

  lasso_prediction_x <- predict(lasso_mod, x,s = "lambda.min")
  ridge_prediction_x <- predict(ridge_mod, x,s = "lambda.min")

  # Second stage: linear combination of the two base predictions.
  ensemble_mod <- lm(y~.,data = data.frame(lasso_prediction_x,ridge_prediction_x))
  ensemble_prediction_x <- ensemble_mod$fitted.values

  # Project the ensemble prediction back onto the original predictors so the
  # final score can be computed as x %*% coefficients.
  coefficients_x <- coef(lm(y~.,data.frame(y = ensemble_prediction_x,x)))
  return(list(Coefficients = coefficients_x))
}
# Binary-outcome counterpart of the lasso + ridge stack.
#
# Arguments:
#   x  data frame / matrix of predictors (one row per individual).
#   y  binary outcome; rows with NA outcome are dropped from both x and y.
# Returns:
#   list(Coefficients = <named numeric vector>): intercept plus one coefficient
#   per column of x, from a linear regression of the ensemble's link-scale
#   prediction on x.
Ensemble_Function_Binary <- function(x,y){
  has_outcome <- !is.na(y)
  x <- as.matrix(x[has_outcome,])
  y <- y[has_outcome]

  # Base learners, tuned on cross-validated AUC.
  lasso_mod <- cv.glmnet(x,y,family = "binomial",alpha = 1,type.measure = "auc")
  ridge_mod <- cv.glmnet(x,y,family = "binomial",alpha = 0,type.measure = "auc")

  lasso_prediction_x <- predict(lasso_mod, x,type = "link")
  ridge_prediction_x <- predict(ridge_mod, x,type = "link")

  # Logistic second stage on the two link-scale predictions.
  base_predictions <- data.frame(lasso_prediction_x,ridge_prediction_x)
  ensemble_mod <- glm(y~.,data = base_predictions,family = binomial())
  ensemble_prediction_x <- predict(ensemble_mod,base_predictions,type = "link")

  # Express the stacked linear predictor as weights on the original columns.
  projection <- lm(y~.,data.frame(y = ensemble_prediction_x,x))
  list(Coefficients = coef(projection))
}
# Dispatch to the continuous or binary ensemble.
#
# Arguments:
#   x, y    forwarded unchanged to the selected ensemble function.
#   family  "continuous" (default) or "binary".
# Returns:
#   whatever the selected ensemble function returns
#   (a list with element `Coefficients`).
Ensemble_Function <- function(x,y,family = c("continuous","binary")){
  # match.arg collapses the two-element default to "continuous". Comparing the
  # raw default with `==` gives a length-2 condition, which `if` rejects in
  # R >= 4.2. It also rejects unknown family values rather than silently
  # treating them as binary.
  family <- match.arg(family)
  if(family == "continuous"){
    return(Ensemble_Function_Continuous(x,y))
  }else{
    return(Ensemble_Function_Binary(x,y))
  }
}

# Fix the RNG state before the first stochastic step. cv.glmnet assigns
# cross-validation folds at random and the later bootstrap standard errors
# resample at random, so without a seed no two runs of this script agree.
# The value itself is arbitrary.
set.seed(20250516)

# Learn the ensemble weights on the tuning split only.
Results <- Ensemble_Function(x = prs_tune_all,y = pheno_tuning[,"y_tune"],family = "continuous")
# Aliased predictors come back as NA; give them zero weight so they drop out
# of the score.
Results$Coefficients[is.na(Results$Coefficients)] <- 0
write.csv(Results$Coefficients,file = paste0(OUTPUT_PATH,"/",trait,"_final_coef.csv"))

# Final score = weighted sum of the selected PRS columns (intercept excluded).
score_columns <- names(Results$Coefficients)[-1]
score_weights <- matrix(Results$Coefficients[-1],ncol = 1)
combine_prs <- function(pheno){
  as.matrix(pheno[,score_columns]) %*% score_weights
}

PRS_Train <- combine_prs(pheno_training)
PRS_Tune <- combine_prs(pheno_tuning)
PRS_Validation <- combine_prs(pheno_valid)

prs_best_train <- data.frame(IID = pheno_training$IID,prs = PRS_Train)
prs_best_tune <- data.frame(IID = pheno_tuning$IID,prs = PRS_Tune)
prs_best_validation <- data.frame(IID = pheno_valid$IID,prs = PRS_Validation)

write.table(prs_best_train,file=paste0(OUTPUT_PATH,"/",trait,"_Best_Train_All.txt"),sep = "\t",row.names = FALSE)
write.table(prs_best_tune,file=paste0(OUTPUT_PATH,"/",trait,"_Best_Tune_All.txt"),sep = "\t",row.names = FALSE)
write.table(prs_best_validation,file=paste0(OUTPUT_PATH,"/",trait,"_Best_Validation_All.txt"),sep = "\t",row.names = FALSE)


## Validation-set evaluation of the combined PRS, per ancestry.
## Produces <trait>Best_Betas.csv (point estimates + bootstrap SEs) and
## <trait>_Bootstraps.csv (all bootstrap replicates).

all_phenotypes <- read.csv(all_phenotypes_file)

pheno_validation_raw <- inner_join(pheno_valid[,c("IID","age","age2","sex",paste0("PC",1:10),"y_validation")],prs_best_validation)
pheno_validation_adjusted <- pheno_validation_raw

# Ancestry adjustment of the PRS: regress the score on PC1-PC5 to remove its
# mean dependence on ancestry, then regress the squared residuals on the same
# PCs to model the variance, and divide the residual by the predicted SD.
pc_columns <- c("PC1","PC2","PC3","PC4","PC5")
tmp <- data.frame(y = pheno_validation_adjusted[,"prs"],pheno_validation_adjusted[,pc_columns])
mod <- lm(y~.,data = tmp)
R <- mod$residuals
tmp <- data.frame(y = R^2,pheno_validation_adjusted[,pc_columns])
mod <- lm(y~.,data = tmp)
y_hat <- predict(mod,tmp)
# NOTE(review): y_hat is a linear prediction of a variance and can be negative;
# sqrt() then yields NaN and this `if` stops with a missing-value error.
# Behaviour is left unchanged here — confirm how such individuals should be
# handled.
if(sum(sqrt(y_hat)) == 0){
  pheno_validation_adjusted[,"prs"] <- 0
}else{
  pheno_validation_adjusted[,"prs"] <- R/sqrt(y_hat)
}


ancestries <- c("EUR","SAS","AMR","AFR","EAS","MID")
prs_types <- c("raw","adjusted")
n_boot <- 10000

# Bootstrap statistic: slope of y_validation on prs in the resampled rows.
Beta_Boot <- function(data,indices){
  boot_data <- data[indices, ]
  result <- coef(lm(y_validation ~ prs,data = boot_data))[2]
  return(c(result))
}

# Bootstrap statistic: R-squared of y_validation on prs in the resampled rows.
R2_Boot <- function(data,indices){
  boot_data <- data[indices, ]
  result <- summary(lm(y_validation ~ prs,data = boot_data))$r.squared
  return(c(result))
}

# Rows of `pheno` whose IID belongs to `ancestry` according to all_phenotypes,
# with the outcome standardised within that ancestry. The raw PRS is also
# standardised within ancestry; the adjusted PRS is left on its adjusted scale.
subset_validation <- function(pheno,ancestry,scale_prs){
  ancestry_ids <- all_phenotypes$IID[all_phenotypes$ancestry == ancestry]
  pheno_ancestry <- pheno[pheno$IID %in% ancestry_ids,]
  pheno_ancestry$y_validation <- scale(pheno_ancestry$y_validation)
  if(scale_prs){
    pheno_ancestry[,"prs"] <- scale(pheno_ancestry[,"prs"])
  }
  pheno_ancestry
}

# Point estimates and bootstrap replicates for one ancestry-specific table.
# The beta bootstrap runs before the R2 bootstrap so the RNG stream is
# consumed in a fixed order.
evaluate_prs <- function(pheno_ancestry){
  fit <- lm(y_validation ~ prs,data = pheno_ancestry)
  boot_beta <- boot(data = pheno_ancestry, statistic = Beta_Boot, R = n_boot)
  boot_R2 <- boot(data = pheno_ancestry, statistic = R2_Boot, R = n_boot)
  list(beta = unname(coef(fit)[2]),
       beta_se = sd(boot_beta$t),
       beta_boot = as.vector(boot_beta$t),
       R2 = summary(fit)$r.squared,
       R2_se = sd(boot_R2$t),
       R2_boot = as.vector(boot_R2$t))
}

# Evaluate raw scores for every ancestry first, then adjusted scores, each in
# the order of `ancestries`.
evaluations <- list()
for(prs_type in prs_types){
  pheno_all <- if(prs_type == "raw") pheno_validation_raw else pheno_validation_adjusted
  for(ancestry in ancestries){
    pheno_ancestry <- subset_validation(pheno_all,ancestry,scale_prs = (prs_type == "raw"))
    evaluations[[paste0(prs_type,"_",ancestry)]] <- evaluate_prs(pheno_ancestry)
  }
}

# One value per ancestry (in `ancestries` order) of a scalar result field.
collect_metric <- function(prs_type,field){
  vapply(ancestries,
         function(ancestry) evaluations[[paste0(prs_type,"_",ancestry)]][[field]],
         numeric(1),USE.NAMES = FALSE)
}

CV_Results <- data.frame(trait = trait,ancestry = ancestries,
                         beta_raw = collect_metric("raw","beta"),
                         beta_se_raw = collect_metric("raw","beta_se"),
                         R2_raw = collect_metric("raw","R2"),
                         R2_se_raw = collect_metric("raw","R2_se"),
                         beta_adjusted = collect_metric("adjusted","beta"),
                         beta_se_adjusted = collect_metric("adjusted","beta_se"),
                         R2_adjusted = collect_metric("adjusted","R2"),
                         R2_se_adjusted = collect_metric("adjusted","R2_se"))

# Bootstrap replicates, one column per (statistic, PRS type, ancestry), named
# <stat>_<type>_<ancestry>_boot with beta preceding R2 for each ancestry.
boot_columns <- list()
for(prs_type in prs_types){
  for(ancestry in ancestries){
    evaluation <- evaluations[[paste0(prs_type,"_",ancestry)]]
    boot_columns[[paste0("beta_",prs_type,"_",ancestry,"_boot")]] <- evaluation[["beta_boot"]]
    boot_columns[[paste0("R2_",prs_type,"_",ancestry,"_boot")]] <- evaluation[["R2_boot"]]
  }
}
CV_Boot_Results <- data.frame(trait = trait,as.data.frame(boot_columns))

# NOTE(review): "Best_Betas.csv" has no "_" after the trait, unlike the other
# outputs; the name is kept as-is because downstream readers may depend on it.
write.csv(CV_Results,file = paste0(OUTPUT_PATH,"/",trait,"Best_Betas.csv"),row.names = FALSE)
write.csv(CV_Boot_Results,file = paste0(OUTPUT_PATH,"/",trait,"_Bootstraps.csv"),row.names = FALSE)

Overwriting OneCommonPRS.R


In [16]:
%%writefile OneCommonPRS.sh
#!/bin/bash
# dsub task entry point: runs the R script named by ${R_Script} with the task's
# inputs as positional arguments, in the order OneCommonPRS.R reads them via
# commandArgs(TRUE)[1..10]. `nounset` aborts the task if a variable is missing.

set -o errexit
set -o nounset

# Every expansion is quoted so that a path containing whitespace or glob
# characters, or an empty value, cannot split or shift the positional arguments.
Rscript "${R_Script}" "${PROSPER_PRSs}" "${JointPRS_PRS}" "${CTSLEB_AFR}" "${CTSLEB_AMR}" "${all_phenotypes_file}" "${all_train_file}" "${all_tune_file}" "${all_validation_file}" "${trait}" "${OUTPUT_PATH}"

Overwriting OneCommonPRS.sh


In [17]:
%%writefile score_task.R

# Build the dsub --tasks table for OneCommonPRS: one row per trait, one column
# per dsub flag. check.names = FALSE keeps the "--input NAME" headers intact.
traits <- c("logTG")

# One task row (a single-row data frame) for the given trait.
task_row <- function(trait){
    data.frame(
        '--input PROSPER_PRSs'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/PROSPER/PRSs/PRS_All_",trait,".sscore"),
        '--input JointPRS_PRS'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/JointPRS/PRSs/PRS_META_",trait,".sscore"),
        '--input CTSLEB_AFR'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/CTSLEB/AFR_",trait,"_prs_mat_eb.RData"),
        '--input CTSLEB_AMR'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/CTSLEB/AMR_",trait,"_prs_mat_eb.RData"),
        '--input all_phenotypes_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/all_phenotypes.csv",
        '--input all_train_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Train.txt",
        '--input all_tune_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Tune.txt",
        '--input all_validation_file'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/All_Validation.txt",
        '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/OneCommonPRS.R",
        '--env trait'=trait,
        '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/OneCommonPRS/",
        check.names = FALSE
    )
}

# Stack the per-trait rows in trait order.
tasks <- do.call(rbind, lapply(traits, task_row))

# Tab-separated, unquoted, header row only (no row names).
write.table(tasks,
            file="score_task.txt",
            sep='\t',
            quote=FALSE,
            row.names=FALSE,
            col.names=TRUE)

Overwriting score_task.R


In [18]:
!Rscript score_task.R

In [19]:
!gsutil -m cp OneCommonPRS.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://OneCommonPRS.R [Content-Type=application/octet-stream]...
/ [1/1 files][ 20.1 KiB/ 20.1 KiB] 100% Done                                    
Operation completed over 1 objects/20.1 KiB.                                     


In [20]:
%%bash --out score_batch

# Load the aou_dsub wrapper function. ~/aou_dsub.bash is also written by an
# earlier %%writefile cell of this notebook.
source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Submit OneCommonPRS.sh as a dsub job driven by score_task.txt (the table
# written by score_task.R). This submission requests 64 GB of RAM, versus 16
# for the PROSPER_Results job.
# The aou_dsub wrapper already passes its own --image and --logging before
# forwarding "$@", so both flags are given a second time here.
# NOTE(review): presumably the later occurrence takes precedence — confirm
# against dsub's argument handling.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 100 \
  --boot-disk-size 25 \
  --min-ram 64 \
  --timeout "96h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script OneCommonPRS.sh \
  --tasks score_task.txt

Job properties:
  job-id: onecommonp--williamsjacr--250516-214332-35
  job-name: onecommonprs
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7513765309207800356
Launched job-id: onecommonp--williamsjacr--250516-214332-35
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'onecommonp--williamsjacr--250516-214332-35' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'onecommonp--williamsjacr--250516-214332-35' --users 'williamsjacr'


In [26]:
%%writefile Extract_Betas_All.R
library(dplyr)
library(stringr)
library(data.table)

## Positional command-line arguments, in the order Extract_Betas_All.sh passes
## them. Each value is echoed so the job log records what the script received.
cli_args <- commandArgs(TRUE)

## GWAS summary statistics table
GWAS_SumStats <- cli_args[1]
print(GWAS_SumStats)

## .rds file with the SNP id / rsid lookup table
snp_info <- cli_args[2]
print(snp_info)

## Per-individual PRS files for each method
PROSPER_PRSs <- cli_args[3]
print(PROSPER_PRSs)

JointPRS_PRS <- cli_args[4]
print(JointPRS_PRS)

CTSLEB_AFR_PRSs <- cli_args[5]
print(CTSLEB_AFR_PRSs)

CTSLEB_AMR_PRSs <- cli_args[6]
print(CTSLEB_AMR_PRSs)

## Per-variant weight (score) files for each method
PROSPER_Scores <- cli_args[7]
print(PROSPER_Scores)

JointPRS_Score <- cli_args[8]
print(JointPRS_Score)

CTSLEB_AFR_Scores <- cli_args[9]
print(CTSLEB_AFR_Scores)

CTSLEB_AMR_Scores <- cli_args[10]
print(CTSLEB_AMR_Scores)

## Ensemble PRS on the tuning and validation sets
RICE_CV_PRS_Tune <- cli_args[11]
print(RICE_CV_PRS_Tune)

RICE_CV_PRS_Validation <- cli_args[12]
print(RICE_CV_PRS_Validation)

## Validation-set sample table (provides the IIDs to keep)
Validation <- cli_args[13]
print(Validation)

trait <- cli_args[14]
print(trait)

BED_Full_File <- cli_args[15]
## Strip only a trailing, literal ".bed" to obtain the plink fileset prefix.
## The previous pattern ".bed" was an unanchored regex in which "." matches any
## character, so gsub() could also delete e.g. "_bed" from the middle of a path.
BED_Full_File <- sub("\\.bed$","",BED_Full_File)
print(BED_Full_File)

OUTPUT_PATH <- cli_args[16]
print(OUTPUT_PATH)

## ---- Load the per-individual PRS of every method and merge them by IID ----

## PROSPER: the first four columns are renamed to FID/IID/ALLELE_CT/
## NAMED_ALLELE_DOSAGE_SUM and every remaining column to PROSPER_PRS_<k>;
## only IID and the PRS columns are kept.
PROSPER_PRSs <- read.delim(PROSPER_PRSs)
colnames(PROSPER_PRSs) <- c("FID","IID","ALLELE_CT","NAMED_ALLELE_DOSAGE_SUM",paste0("PROSPER_PRS_",1:(ncol(PROSPER_PRSs) - 4)))
PROSPER_PRSs <- PROSPER_PRSs[,c("IID",paste0("PROSPER_PRS_",1:(ncol(PROSPER_PRSs) - 4)))]

## JointPRS: same layout but with a single PRS column (assumes exactly five columns).
JointPRS_PRS <- read.delim(JointPRS_PRS)
colnames(JointPRS_PRS) <- c("FID","IID","ALLELE_CT","NAMED_ALLELE_DOSAGE_SUM","JointPRS_PRS")
JointPRS_PRS <- JointPRS_PRS[,c("IID","JointPRS_PRS")]

## CT-SLEB (AFR): the .RData file is expected to define `prs_mat_eb`.
## PRS columns (3 onwards) are prefixed with "AFR_" and "-" is replaced by "_".
load(CTSLEB_AFR_PRSs)
CTSLEB_PRS_AFR <- prs_mat_eb;rm(prs_mat_eb)
colnames(CTSLEB_PRS_AFR) <- c("FID","IID",paste0("AFR_",colnames(CTSLEB_PRS_AFR)[3:ncol(CTSLEB_PRS_AFR)]))
colnames(CTSLEB_PRS_AFR) <- gsub("-","_",colnames(CTSLEB_PRS_AFR))
CTSLEB_PRS_AFR <- CTSLEB_PRS_AFR[,c("IID",colnames(CTSLEB_PRS_AFR)[3:ncol(CTSLEB_PRS_AFR)])]

## CT-SLEB (AMR): identical handling with the "AMR_" prefix.
load(CTSLEB_AMR_PRSs)
CTSLEB_PRS_AMR <- prs_mat_eb;rm(prs_mat_eb)
colnames(CTSLEB_PRS_AMR) <- c("FID","IID",paste0("AMR_",colnames(CTSLEB_PRS_AMR)[3:ncol(CTSLEB_PRS_AMR)]))
colnames(CTSLEB_PRS_AMR) <- gsub("-","_",colnames(CTSLEB_PRS_AMR))
CTSLEB_PRS_AMR <- CTSLEB_PRS_AMR[,c("IID",colnames(CTSLEB_PRS_AMR)[3:ncol(CTSLEB_PRS_AMR)])]

## No `by` is given, so dplyr joins on the shared column names
## (IID is the only column every table was reduced to share).
PRS_All <- inner_join(PROSPER_PRSs,JointPRS_PRS)
PRS_All <- inner_join(PRS_All,CTSLEB_PRS_AFR)
PRS_All <- inner_join(PRS_All,CTSLEB_PRS_AMR)

## Free the per-method tables now that they are merged.
rm(CTSLEB_PRS_AMR);rm(CTSLEB_PRS_AFR);rm(JointPRS_PRS);rm(PROSPER_PRSs)
gc()

colnames(PRS_All) <- gsub("-","_",colnames(PRS_All))


## ---- Load the GWAS summary statistics and attach rsIDs ----
GWAS_SumStats <- fread(GWAS_SumStats)

GWAS_SumStats <- as.data.frame(GWAS_SumStats)
## SNP is selected twice so that a copy of it can serve as the initial rs_ID;
## REF and OBS_CT are then dropped and ALT is renamed to A1.
GWAS_SumStats <- GWAS_SumStats[,c("CHR","SNP","BP","REF","ALT","OBS_CT","BETA","SE","P","SNP")]
colnames(GWAS_SumStats) <- c("CHR","SNP","BP","REF","ALT","OBS_CT","BETA","SE","P","rs_ID")
GWAS_SumStats <- GWAS_SumStats[,c("CHR","SNP","BP","ALT","BETA","SE","P","rs_ID")]
colnames(GWAS_SumStats) <- c("CHR","SNP","BP","A1","BETA","SE","P","rs_ID")

## Normalised merge key: upper-case SNP id with spaces removed.
GWAS_SumStats$SNP_Merge <- toupper(GWAS_SumStats$SNP)
GWAS_SumStats$SNP_Merge <- gsub(" ","",GWAS_SumStats$SNP_Merge)

SNP_GRCh37_38_match_update <- readRDS(snp_info)

## Build "chr<chr>:<pos38>:<allele>:<allele>" keys in both allele orders so a
## GWAS SNP id matches regardless of which allele is listed first.
SNP_GRCh37_38_match_update$unique_id1 <- paste0("chr",SNP_GRCh37_38_match_update$chr,":",SNP_GRCh37_38_match_update$pos38,":",
                                                SNP_GRCh37_38_match_update$allele1_38,":",SNP_GRCh37_38_match_update$allele2_38)
SNP_GRCh37_38_match_update$unique_id2 <- paste0("chr",SNP_GRCh37_38_match_update$chr,":",SNP_GRCh37_38_match_update$pos38,":",
                                                SNP_GRCh37_38_match_update$allele2_38,":",SNP_GRCh37_38_match_update$allele1_38)

## Apply the same normalisation as SNP_Merge.
SNP_GRCh37_38_match_update$unique_id1 <- toupper(SNP_GRCh37_38_match_update$unique_id1)
SNP_GRCh37_38_match_update$unique_id1 <- gsub(" ","",SNP_GRCh37_38_match_update$unique_id1)

SNP_GRCh37_38_match_update$unique_id2 <- toupper(SNP_GRCh37_38_match_update$unique_id2)
SNP_GRCh37_38_match_update$unique_id2 <- gsub(" ","",SNP_GRCh37_38_match_update$unique_id2)

## First pass: match on allele order 1 and overwrite rs_ID where an rsid was found.
GWAS_SumStats <- left_join(GWAS_SumStats,SNP_GRCh37_38_match_update[,c("unique_id1","rsid")],by = c("SNP_Merge"="unique_id1"))
GWAS_SumStats$rs_ID[!is.na(GWAS_SumStats$rsid)] <- GWAS_SumStats$rsid[!is.na(GWAS_SumStats$rsid)]

## Drop the helper column so the second join can add a fresh `rsid`.
GWAS_SumStats <- subset(GWAS_SumStats,select = -c(rsid))

## Second pass: match on the swapped allele order.
GWAS_SumStats <- left_join(GWAS_SumStats,SNP_GRCh37_38_match_update[,c("unique_id2","rsid")],by = c("SNP_Merge"="unique_id2"))
GWAS_SumStats$rs_ID[!is.na(GWAS_SumStats$rsid)] <- GWAS_SumStats$rsid[!is.na(GWAS_SumStats$rsid)]

## Only the variant id, effect allele and rsID are needed from here on.
## NOTE(review): a left_join adds rows when a key matches several lookup rows —
## confirm the lookup keys are unique.
GWAS_SumStats <- GWAS_SumStats[,c("SNP","A1","rs_ID")]

rm(SNP_GRCh37_38_match_update);gc()


## ---- Load the per-variant weights (score files) of every method ----

## PROSPER: first three columns become rs_ID/A1/A0, the rest PROSPER_PRS_<k>
## (same names as the PRS columns loaded above).
PROSPER_Scores <- read.delim(PROSPER_Scores)
colnames(PROSPER_Scores) <- c("rs_ID","A1","A0",paste0("PROSPER_PRS_",1:(ncol(PROSPER_Scores) - 3)))

## JointPRS: assumes exactly six columns in this order — TODO confirm.
JointPRS_Score <- read.csv(JointPRS_Score,header = TRUE)
colnames(JointPRS_Score) <- c("CHR","rs_ID","BP","A1","A0","JointPRS_PRS")

## CT-SLEB (AFR): get(load(.)) returns the object stored in the .RData file.
## The last two columns are dropped, the first two are renamed SNP/A1 and the
## weight columns get the "AFR_" prefix with "-" replaced by "_".
CTSLEB_AFR_Scores <- get(load(CTSLEB_AFR_Scores))
CTSLEB_AFR_Scores <- CTSLEB_AFR_Scores[,-c(ncol(CTSLEB_AFR_Scores) - 1,ncol(CTSLEB_AFR_Scores))]
colnames(CTSLEB_AFR_Scores) <- c("SNP","A1",paste0("AFR_",colnames(CTSLEB_AFR_Scores)[3:ncol(CTSLEB_AFR_Scores)]))
colnames(CTSLEB_AFR_Scores) <- gsub("-","_",colnames(CTSLEB_AFR_Scores))

## CT-SLEB (AMR): identical handling with the "AMR_" prefix.
CTSLEB_AMR_Scores <- get(load(CTSLEB_AMR_Scores))
CTSLEB_AMR_Scores <- CTSLEB_AMR_Scores[,-c(ncol(CTSLEB_AMR_Scores) - 1,ncol(CTSLEB_AMR_Scores))]
colnames(CTSLEB_AMR_Scores) <- c("SNP","A1",paste0("AMR_",colnames(CTSLEB_AMR_Scores)[3:ncol(CTSLEB_AMR_Scores)]))
colnames(CTSLEB_AMR_Scores) <- gsub("-","_",colnames(CTSLEB_AMR_Scores))

## ---- Align every weight to the GWAS effect allele (A1) ----
## A row whose "<id>_<A1>" pair does not occur in the GWAS table has all of its
## weights multiplied by -1. PROSPER and JointPRS are keyed by rs_ID, the two
## CT-SLEB tables by SNP. Each row mask is computed once and reused on both
## sides of the assignment.
gwas_rsid_a1_key <- paste0(GWAS_SumStats$rs_ID,"_",GWAS_SumStats$A1)
gwas_snp_a1_key <- paste0(GWAS_SumStats$SNP,"_",GWAS_SumStats$A1)

flip_rows_prosper <- !(paste0(PROSPER_Scores$rs_ID,"_",PROSPER_Scores$A1) %in% gwas_rsid_a1_key)
PROSPER_Scores[flip_rows_prosper,paste0("PROSPER_PRS_",1:(ncol(PROSPER_Scores) - 3))] <- (-1)*PROSPER_Scores[flip_rows_prosper,paste0("PROSPER_PRS_",1:(ncol(PROSPER_Scores) - 3))]

flip_rows_jointprs <- !(paste0(JointPRS_Score$rs_ID,"_",JointPRS_Score$A1) %in% gwas_rsid_a1_key)
JointPRS_Score[flip_rows_jointprs,"JointPRS_PRS"] <- (-1)*JointPRS_Score[flip_rows_jointprs,"JointPRS_PRS"]

flip_rows_ctsleb_afr <- !(paste0(CTSLEB_AFR_Scores$SNP,"_",CTSLEB_AFR_Scores$A1) %in% gwas_snp_a1_key)
CTSLEB_AFR_Scores[flip_rows_ctsleb_afr,3:ncol(CTSLEB_AFR_Scores)] <- (-1)*CTSLEB_AFR_Scores[flip_rows_ctsleb_afr,3:ncol(CTSLEB_AFR_Scores)]

flip_rows_ctsleb_amr <- !(paste0(CTSLEB_AMR_Scores$SNP,"_",CTSLEB_AMR_Scores$A1) %in% gwas_snp_a1_key)
CTSLEB_AMR_Scores[flip_rows_ctsleb_amr,3:ncol(CTSLEB_AMR_Scores)] <- (-1)*CTSLEB_AMR_Scores[flip_rows_ctsleb_amr,3:ncol(CTSLEB_AMR_Scores)]

## Remove the temporaries so the workspace matches what later steps expect.
rm(gwas_rsid_a1_key,gwas_snp_a1_key,flip_rows_prosper,flip_rows_jointprs,flip_rows_ctsleb_afr,flip_rows_ctsleb_amr)


## ---- Collapse the ensemble into one weight per variant ----
## Merge the tuning-set ensemble PRS (column `prs`) with the per-method PRS
## columns; IID is dropped so that only `prs` and the predictors remain.
Best_Tune_All <- read.delim(RICE_CV_PRS_Tune)
Best_Tune_All <- inner_join(Best_Tune_All,PRS_All)
Best_Tune_All <- subset(Best_Tune_All,select = -IID)

## Regress the ensemble PRS on every per-method PRS column.
tmp1 <- lm(prs ~.,data = Best_Tune_All) 
tmp <- coef(tmp1)
## Column vector of coefficients without the intercept; coefficients lm()
## could not estimate (NA) are treated as 0.
Beta_Star <- matrix(unname(tmp),ncol = 1)
Beta_Star <- Beta_Star[-1,,drop = FALSE]
Beta_Star[is.na(Beta_Star)] <- 0

## Per-method final weight = (variant x PRS-column weight matrix) %*% (the
## matching regression coefficients).
## NOTE(review): coefficients are picked by name membership, so this assumes the
## coefficient names equal the weight column names and appear in the same
## order — confirm, e.g. for column names lm() would back-quote.
PROSPER_Scores$Final_Prosper <- as.numeric(as.matrix(PROSPER_Scores[,paste0("PROSPER_PRS_",1:(ncol(PROSPER_Scores) - 3))])%*% Beta_Star[names(tmp)[-1] %in% paste0("PROSPER_PRS_",1:(ncol(PROSPER_Scores) - 3)),1,drop = FALSE])
JointPRS_Score$Final_JointPRS <- as.numeric(as.matrix(JointPRS_Score[,"JointPRS_PRS"])%*% Beta_Star[names(tmp)[-1] %in% "JointPRS_PRS",1,drop = FALSE])
CTSLEB_AMR_Scores$Final_CTSLEB_AMR <- as.numeric(as.matrix(CTSLEB_AMR_Scores[,colnames(CTSLEB_AMR_Scores)[3:ncol(CTSLEB_AMR_Scores)]])%*% Beta_Star[names(tmp)[-1] %in% colnames(CTSLEB_AMR_Scores)[3:ncol(CTSLEB_AMR_Scores)],1,drop = FALSE])
CTSLEB_AFR_Scores$Final_CTSLEB_AFR <- as.numeric(as.matrix(CTSLEB_AFR_Scores[,colnames(CTSLEB_AFR_Scores)[3:ncol(CTSLEB_AFR_Scores)]])%*% Beta_Star[names(tmp)[-1] %in% colnames(CTSLEB_AFR_Scores)[3:ncol(CTSLEB_AFR_Scores)],1,drop = FALSE])

## Attach the four per-method weights to the GWAS variant list. No `by` is
## given, so the joins use the shared column (rs_ID for PROSPER/JointPRS,
## SNP for CT-SLEB).
GWAS_SumStats <- left_join(GWAS_SumStats,PROSPER_Scores[,c("rs_ID","Final_Prosper")])  
GWAS_SumStats <- left_join(GWAS_SumStats,JointPRS_Score[,c("rs_ID","Final_JointPRS")]) 
GWAS_SumStats <- left_join(GWAS_SumStats,CTSLEB_AMR_Scores[,c("SNP","Final_CTSLEB_AMR")])  
GWAS_SumStats <- left_join(GWAS_SumStats,CTSLEB_AFR_Scores[,c("SNP","Final_CTSLEB_AFR")]) 

## A variant missing from a method contributes 0 for that method; the final
## BETA is the sum over the four methods.
GWAS_SumStats[is.na(GWAS_SumStats)] <- 0 
GWAS_SumStats$BETA <- GWAS_SumStats$Final_Prosper +GWAS_SumStats$Final_JointPRS +GWAS_SumStats$Final_CTSLEB_AMR +GWAS_SumStats$Final_CTSLEB_AFR

## Keep only variants with a non-zero combined weight.
GWAS_SumStats <- GWAS_SumStats[,c("SNP","A1","BETA")]
GWAS_SumStats <- GWAS_SumStats[GWAS_SumStats$BETA != 0,]

## Written with a header line; the plink2 --score call below passes `header`.
write.table(GWAS_SumStats,file = paste0(OUTPUT_PATH,"/",trait,"_Final_Score"),col.names = T,row.names = F,quote=F)

## ---- Sanity check: re-score the validation set with the collapsed weights ----
## Two-column keep file (FID fixed to 0, IID from the validation table),
## written without header or row names.
Validation <- read.delim(Validation)
Keep <- data.frame(FID = 0,IID = Validation$IID)
write.table(Keep,"Keep.txt",row.names = FALSE,col.names = FALSE)

## Score the kept samples with plink2 using the file written above; output goes
## to test_validation.sscore in the working directory.
## NOTE(review): the captured output / exit status of plink2 is discarded, so a
## failed run only surfaces when the .sscore file is read below.
system(paste0("plink2 --threads 2 --score ",OUTPUT_PATH,"/",trait,"_Final_Score cols=+scoresums,-scoreavgs header no-mean-imputation --keep Keep.txt --bfile ",BED_Full_File," --out test_validation"),intern = TRUE)

Best_Validation_All <- read.csv(RICE_CV_PRS_Validation,sep = "\t")
## comment.char="#" skips the "#"-prefixed header line, hence header=FALSE.
test_validation <- read.delim("test_validation.sscore", header=FALSE, comment.char="#")

## Compare the re-computed score with the stored ensemble PRS.
## Column 2 is presumably IID and column 5 the score sum — TODO confirm against
## the .sscore column layout. These are top-level expressions, so Rscript prints
## their values to the job log; they do not stop the script on a mismatch.
all.equal(Best_Validation_All$IID,test_validation[,2])
all.equal(Best_Validation_All$prs,test_validation[,5])
cor(Best_Validation_All$prs,test_validation[,5])

Writing Extract_Betas_All.R


In [27]:
%%writefile Extract_Betas_All.sh
#!/bin/bash

# Entry point executed by dsub for each task: forwards the task's variables
# (set by the --input/--env/--output-recursive columns of the task table) to
# the R script as 16 positional arguments, in the order Extract_Betas_All.R
# reads them.

# Abort on the first failing command and on any unset variable, so a missing
# task-table column fails loudly instead of shifting the positional arguments.
set -o errexit
set -o nounset

# Every expansion is quoted so that a value containing whitespace or glob
# characters is still passed as exactly one argument.
# BIM_Full_File and FAM_Full_File are declared as inputs in the task table but
# are intentionally not forwarded: the R script derives the plink fileset
# prefix from BED_Full_File.
Rscript "${R_Script}" \
  "${GWAS_SumStats}" \
  "${snp_info}" \
  "${PROSPER_PRSs}" \
  "${JointPRS_PRS}" \
  "${CTSLEB_AFR_PRSs}" \
  "${CTSLEB_AMR_PRSs}" \
  "${PROSPER_Scores}" \
  "${JointPRS_Score}" \
  "${CTSLEB_AFR_Scores}" \
  "${CTSLEB_AMR_Scores}" \
  "${RICE_CV_PRS_Tune}" \
  "${RICE_CV_PRS_Validation}" \
  "${Validation}" \
  "${trait}" \
  "${BED_Full_File}" \
  "${OUTPUT_PATH}"

Writing Extract_Betas_All.sh


In [28]:
%%writefile score_task.R

## Build the dsub task table for Extract_Betas_All: one row (task) per trait.
## Column names are the dsub flags themselves ("--input NAME", "--env NAME",
## "--output-recursive NAME"), so check.names = FALSE keeps them verbatim.

aou_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/"
continuous_root <- paste0(aou_root,"Results/Continuous/")
genotype_prefix <- paste0(aou_root,"Data/CommonVariants/all_chr")

## One task-table row holding every input/output location for `trait`.
build_task_row <- function(trait){
  data.frame(
    '--input GWAS_SumStats'=paste0(continuous_root,"GWAS_SummaryStats/","EUR","_",trait,"_GWAS_SumStats_Cleaned"),
    '--input snp_info'=paste0(aou_root,"Data/SNP_GRCh37_38_match_update.rds"),
    '--input PROSPER_PRSs'=paste0(continuous_root,"PROSPER/PRSs/PRS_All_",trait,".sscore"),
    '--input JointPRS_PRS'=paste0(continuous_root,"JointPRS/PRSs/PRS_META_",trait,".sscore"),
    '--input CTSLEB_AFR_PRSs'=paste0(continuous_root,"CTSLEB/AFR_",trait,"_prs_mat_eb.RData"),
    '--input CTSLEB_AMR_PRSs'=paste0(continuous_root,"CTSLEB/AMR_",trait,"_prs_mat_eb.RData"),
    '--input PROSPER_Scores'=paste0(continuous_root,"PROSPER/",trait,"/before_ensemble/score_file.txt"),
    '--input JointPRS_Score'=paste0(continuous_root,"JointPRS/Scores/JointPRS_META_Score_",trait,".csv"),
    '--input CTSLEB_AFR_Scores'=paste0(continuous_root,"CTSLEB/AFR_",trait,"_betas_eb.RData"),
    '--input CTSLEB_AMR_Scores'=paste0(continuous_root,"CTSLEB/AMR_",trait,"_betas_eb.RData"),
    '--input RICE_CV_PRS_Tune'=paste0(continuous_root,"OneCommonPRS/",trait,"_Best_Tune_All.txt"),
    '--input RICE_CV_PRS_Validation'=paste0(continuous_root,"OneCommonPRS/",trait,"_Best_Validation_All.txt"),
    '--input Validation'=paste0(aou_root,"Results/All_Validation.txt"),
    '--input BED_Full_File'=paste0(genotype_prefix,".bed"),
    '--input BIM_Full_File'=paste0(genotype_prefix,".bim"),
    '--input FAM_Full_File'=paste0(genotype_prefix,".fam"),
    '--input R_Script'=paste0(aou_root,"Scripts/Extract_Betas_All.R"),
    '--env trait'=trait,
    '--output-recursive OUTPUT_PATH'=paste0(continuous_root,"OneCommonPRS/"),
    check.names = FALSE
  )
}

tasks <- do.call(rbind,lapply(c("BMI","LDL","HDL","TC","logTG","Height"),build_task_row))

## Tab-separated, unquoted, header row included.
write.table(tasks,
            file="score_task.txt",
            sep='\t',
            quote=FALSE,
            row.names=FALSE,
            col.names=TRUE)

Overwriting score_task.R


In [29]:
# Run score_task.R to (re)write score_task.txt, the task table given to dsub via --tasks.
!Rscript score_task.R

In [30]:
# Upload the R script to the bucket location that score_task.R lists as `--input R_Script`.
!gsutil -m cp Extract_Betas_All.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://Extract_Betas_All.R [Content-Type=application/octet-stream]...
/ [1/1 files][  9.3 KiB/  9.3 KiB] 100% Done                                    
Operation completed over 1 objects/9.3 KiB.                                      


In [31]:
%%bash --out score_batch

source ~/aou_dsub.bash # Defines aou_dsub(); the file is written by the `%%writefile ~/aou_dsub.bash` cell of this notebook.

# Submit Extract_Betas_All.sh through the aou_dsub wrapper, using score_task.txt
# (one row per trait) as the dsub task table.
# NOTE(review): --image and --logging are also set inside aou_dsub(); the values
# given here are appended after them via "$@" and presumably take precedence —
# confirm against dsub's argument handling.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 700 \
  --boot-disk-size 25 \
  --min-ram 72 \
  --timeout "96h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script Extract_Betas_All.sh \
  --tasks score_task.txt

Job properties:
  job-id: extract-be--williamsjacr--250516-234309-35
  job-name: extract-betas-all
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3885708353785787379
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/17699662607382907460
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/5596598642924517590
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/6714268682014752827
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4772606334659318813
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/6504039311456878092
Launched job-id: extract-be--williamsjacr--250516-234309-35
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'extract-be--williamsjacr--250516-234

In [8]:
%%writefile NullModel.R
## NullModel.R
## Fits, for every trait and every data split (Train / Tune / Validation), a
## null model of the trait on the covariates plus the common-variant PRS, and
## saves each fitted object to
##   <OUTPUT_PATH>/<trait>_<split>_Null_Model.RData
## (object name inside every file: obj.STAAR.UKB).
rm(list = ls())

## Positional arguments, in the order NullModel.sh passes them.
## Phenotype/covariate table of each split:
all_train_file <- commandArgs(TRUE)[1]
print(all_train_file)

all_tune_file <- commandArgs(TRUE)[2]
print(all_tune_file)

all_validation_file <- commandArgs(TRUE)[3]
print(all_validation_file)

## Directory holding <trait>_Best_<split>_All.txt (IID + prs)
INPUT_PATH <- commandArgs(TRUE)[4]
print(INPUT_PATH)

OUTPUT_PATH <- commandArgs(TRUE)[5]
print(OUTPUT_PATH)

library(gdsfmt)
library(SeqArray)
library(SeqVarTools)
library(dplyr)
library(STAAR)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(Matrix)
library(SCANG)
library(STAARpipeline)

traits <- c("BMI","TC","LDL","HDL","logTG","Height")

## The phenotype tables do not depend on the trait, so each one is read once
## here instead of once per trait. The list names are the split labels used in
## both the PRS input file names and the output file names.
pheno_by_split <- list(Train = read.delim(all_train_file),
                       Tune = read.delim(all_tune_file),
                       Validation = read.delim(all_validation_file))

for(trait in traits){
  ## Same covariate set for every split; only the response changes per trait.
  null_formula <- as.formula(paste0(trait,"~age+age2+sex+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10 + prs"))
  
  for(split_name in names(pheno_by_split)){
    common_prs <- read.delim(paste0(INPUT_PATH,"/",trait,"_Best_",split_name,"_All.txt"))
    
    ## Keep only individuals that have both phenotype data and a PRS.
    pheno_with_prs <- inner_join(pheno_by_split[[split_name]],common_prs,by = "IID")
    
    obj.STAAR.UKB <- fit_nullmodel(null_formula, data = pheno_with_prs,id = "IID",kins = NULL,family = gaussian(link = "identity"))
    
    save(obj.STAAR.UKB,file = paste0(OUTPUT_PATH,"/",trait,"_",split_name,"_Null_Model.RData"))
  }
}

Overwriting NullModel.R


In [9]:
%%writefile score_task.R

## Build the single-row dsub task table for the NullModel job.
## Column names are the dsub flags themselves, so check.names = FALSE keeps
## them verbatim.

aou_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/"
results_root <- paste0(aou_root,"Results/")

tasks <- data.frame(
  '--input all_train_file'=paste0(results_root,"All_Train.txt"),
  '--input all_tune_file'=paste0(results_root,"All_Tune.txt"),
  '--input all_validation_file'=paste0(results_root,"All_Validation.txt"),
  '--input-recursive INPUT_PATH'=paste0(results_root,"Continuous/OneCommonPRS"),
  '--input R_Script'=paste0(aou_root,"Scripts/NullModel.R"),
  '--output-recursive OUTPUT_PATH'=paste0(results_root,"Continuous/NullModels"),
  check.names = FALSE
)

## Tab-separated, unquoted, header row included.
write.table(tasks,
            file="score_task.txt",
            sep='\t',
            quote=FALSE,
            row.names=FALSE,
            col.names=TRUE)

Overwriting score_task.R


In [10]:
%%writefile NullModel.sh
#!/bin/bash

# Entry point executed by dsub: forwards the task's variables (set by the
# columns of the task table) to NullModel.R as five positional arguments, in
# the order the R script reads them.

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Every expansion is quoted so that a value containing whitespace or glob
# characters is still passed as exactly one argument.
Rscript "${R_Script}" \
  "${all_train_file}" \
  "${all_tune_file}" \
  "${all_validation_file}" \
  "${INPUT_PATH}" \
  "${OUTPUT_PATH}"

Overwriting NullModel.sh


In [11]:
# Run score_task.R to (re)write score_task.txt, the task table given to dsub via --tasks.
!Rscript score_task.R

In [12]:
# Upload the R script to the bucket location that score_task.R lists as `--input R_Script`.
!gsutil -m cp NullModel.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://NullModel.R [Content-Type=application/octet-stream]...
/ [1/1 files][  2.0 KiB/  2.0 KiB] 100% Done                                    
Operation completed over 1 objects/2.0 KiB.                                      


In [13]:
%%bash --out score_batch

source ~/aou_dsub.bash # Defines aou_dsub(); the file is written by the `%%writefile ~/aou_dsub.bash` cell of this notebook.

# Submit NullModel.sh through the aou_dsub wrapper, using score_task.txt as the
# dsub task table (a single row for this job).
# NOTE(review): --image and --logging are also set inside aou_dsub(); the values
# given here are appended after them via "$@" and presumably take precedence —
# confirm against dsub's argument handling.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 100 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --timeout "96h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script NullModel.sh \
  --tasks score_task.txt

Job properties:
  job-id: nullmodel--williamsjacr--240801-214711-99
  job-name: nullmodel
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4654998437295274832
Launched job-id: nullmodel--williamsjacr--240801-214711-99
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'nullmodel--williamsjacr--240801-214711-99' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'nullmodel--williamsjacr--240801-214711-99' --users 'williamsjacr'


In [33]:
%%writefile ~/aou_dsub.bash
#!/bin/bash
# aou_dsub: wrapper around `dsub` that pre-fills the provider, project,
# network, service account, user, region and a default log location, then
# appends whatever arguments the caller supplies.
aou_dsub() {

  # Part of OWNER_EMAIL before the first "@"; a shorter user name leaves more
  # characters for the job name.
  local DSUB_USER_NAME="${OWNER_EMAIL%%@*}"

  # For AoU RWB projects the network is named "network" and the subnetwork
  # "subnetwork".
  local AOU_NETWORK=network AOU_SUBNETWORK=subnetwork

  # The default log path is stamped with the submission date and time.
  dsub \
      --provider google-cls-v2 \
      --user-project "${GOOGLE_PROJECT}" \
      --project "${GOOGLE_PROJECT}" \
      --image 'marketplace.gcr.io/google/ubuntu1804:latest' \
      --network "${AOU_NETWORK}" \
      --subnetwork "${AOU_SUBNETWORK}" \
      --service-account "$(gcloud config get-value account)" \
      --user "${DSUB_USER_NAME}" \
      --regions us-central1 \
      --logging "${WORKSPACE_BUCKET}/dsub/logs/{job-name}/{user-id}/$(date +'%Y%m%d/%H%M%S')/{job-id}-{task-id}-{task-attempt}.log" \
      "$@"
}

Overwriting /home/jupyter/aou_dsub.bash


In [34]:
!echo source ~/aou_dsub.bash >> ~/.bashrc

In [91]:
%%writefile ncRNA_Analysis.R
## Start from an empty workspace.
rm(list=ls())
gc()

## Positional arguments, in the order ncRNA_Analysis.sh passes them:
##   1 arrayid, 2 trait, 3 null-model .RData, 4 annotation catalog csv,
##   5 GDS file, 6 output directory
cli_args <- commandArgs(TRUE)

arrayid <- as.numeric(cli_args[1])

trait <- cli_args[2]
Train_Null_Model <- cli_args[3]
Annotation_name_catalog <- cli_args[4]
GDS_File <- cli_args[5]
OUTPUT_PATH <- cli_args[6]

## Echo every argument except arrayid to the job log, in argument order.
for(arg_value in list(trait,Train_Null_Model,Annotation_name_catalog,GDS_File,OUTPUT_PATH)){
  print(arg_value)
}

## load required packages
library(gdsfmt)
library(SeqArray)
library(SeqVarTools)
library(STAAR)
library(STAARpipeline)
library(STAARpipelineSummary)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(readr)
library(dplyr)
library(stringr)

## Load the fitted null model; get(load(.)) returns the object stored in the
## .RData file whatever its name.
obj_nullmodel <- get(load(Train_Null_Model))

## Parameter
## GDS node that holds the QC filter label of each variant
QC_label <- "annotation/filter"
## how missing genotypes are imputed ("mean" or "minor")
geno_missing_imputation <- "mean"
## variant class to analyse ("SNV", "Indel" or "variant")
variant_type <- "SNV"

## Annotation_dir
## GDS folder under which the functional annotation nodes are stored
Annotation_dir <- "annotation/info/FunctionalAnnotation"
## Annotation channel
## catalog table; its `name` and `dir` columns map an annotation name to its
## GDS node path relative to Annotation_dir
Annotation_name_catalog <- read.csv(Annotation_name_catalog)
## Use_annotation_weights
Use_annotation_weights <- TRUE
## Annotation name
## annotations used as weights; names absent from the catalog are skipped
Annotation_name <- c("CADD","LINSIGHT","FATHMM.XF","aPC.EpigeneticActive","aPC.EpigeneticRepressed","aPC.EpigeneticTranscription",
                     "aPC.Conservation","aPC.LocalDiversity","aPC.Mappability","aPC.TF","aPC.Protein")

## output file name
## prefix only; "_<arrayid>.Rdata" is appended when the results are saved
output_file_name <- paste0(OUTPUT_PATH,"/",trait,"_AoU_WGS_ncRNA_Train")

## ---- Map the array id to a chromosome and a batch of genes ----
## `ncRNA_gene` is not defined in this script; it is presumably a dataset made
## available by one of the attached packages (STAARpipeline) — confirm.
## Column 1 is used as the chromosome and column 2 as the gene name.

## gene number in job
gene_num_in_array <- 100 
## number of batches on every chromosome (computed on the full gene list)
group.num.allchr <- ceiling(table(ncRNA_gene[,1])/gene_num_in_array)

## chromosome = first one whose cumulative batch count reaches arrayid
chr <- which.max(arrayid <= cumsum(group.num.allchr))
group.num <- group.num.allchr[chr]

## batch index within that chromosome
if (chr == 1){
  groupid <- arrayid
}else{
  groupid <- arrayid - cumsum(group.num.allchr)[chr-1]
}

## genes of this chromosome, minus the genes excluded as long masks
ncRNA_gene_chr <- ncRNA_gene[ncRNA_gene[,1]==chr,]
ncRNA_longmasks <- c("KCNQ1OT1","AC006548.28","RP3-394A18.1","RP3-323A16.1","RP5-1039K5.19","SNHG14","LL22NC03-86G7.1")
ncRNA_gene_chr <- ncRNA_gene_chr[!ncRNA_gene_chr[,2] %in% ncRNA_longmasks,]
sub_seq_num <- dim(ncRNA_gene_chr)[1]

## Row indices of this batch, clamped to the rows that actually exist.
## The batch counts above are computed BEFORE the long-mask genes are removed,
## so the unclamped range could run past sub_seq_num (NA gene names) or, for
## the last batch, run backwards (e.g. 301:299) and re-test earlier genes.
## Whenever the batch lies fully inside the table this yields the same indices
## as before.
first_gene_row <- (groupid - 1)*gene_num_in_array + 1
last_gene_row <- min(groupid*gene_num_in_array,sub_seq_num)

if(first_gene_row <= last_gene_row){
  sub_seq_id <- first_gene_row:last_gene_row
}else{
  ## nothing left for this batch once the long-mask genes are removed
  sub_seq_id <- integer(0)
}

###########################################################
#           Main Function 
###########################################################

## Run the rare-variant association test for the ncRNA variants of one gene.
##
## Arguments
##   chr                      chromosome label, only copied into the result row
##   gene_name                gene whose ncRNA variants are tested
##   genofile                 open GDS handle (read with seqGetData / seqSetFilter)
##   obj_nullmodel            fitted null model; $id_include, $n.pheno and
##                            $use_SPA are read here
##   rare_maf_cutoff          forwarded to the STAAR test
##   rv_num_cutoff            forwarded to the STAAR test
##   QC_label                 GDS node holding the QC filter; "PASS" is kept
##   variant_type             "SNV", "Indel" or "variant"
##   geno_missing_imputation  "mean" or "minor"
##   Annotation_dir           GDS folder of the functional annotations
##   Annotation_name_catalog  table with columns `name` and `dir`
##   Use_annotation_weights   whether annotation weights are built.
##                            NOTE(review): the default c(TRUE,FALSE) is used
##                            directly in if(); callers should pass one value.
##   Annotation_name          annotation names to use as weights
##   SPA_p_filter, p_filter_cutoff  only forwarded to STAAR_Binary_SPA
##   silent                   passed to the try() calls around the tests
##
## Value
##   A one-row character matrix (gene name, chr, category, #SNV, cMAC and the
##   p-values) when the test returns a list; otherwise NULL (also when the
##   gene has more than 10000 variants or the test call fails).
ncRNA_nolongmask <- function(chr,gene_name,genofile,obj_nullmodel,
                             rare_maf_cutoff=0.01,rv_num_cutoff=2,
                             QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                             Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                             Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                             SPA_p_filter=TRUE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  ## (a null model without a use_SPA entry is treated as use_SPA = FALSE)
  if(!is.null(obj_nullmodel$use_SPA)){
    use_SPA <- obj_nullmodel$use_SPA
  }else{
    use_SPA <- FALSE
  }
  
  ## get SNV id
  ## SNVlist: logical mask over all variants that pass QC and match variant_type
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant"){
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV"){
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel"){
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ## ncRNA SNVs
  ## keep variants whose GENCODE category is one of the three ncRNA classes
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  is.in <- ((GENCODE.Category=="ncRNA_exonic")|(GENCODE.Category=="ncRNA_exonic;splicing")|(GENCODE.Category=="ncRNA_splicing"))&(SNVlist)
  
  variant.id.ncRNA <- variant.id[is.in]
  
  rm(GENCODE.Category)
  gc()
  
  ## restrict the GDS view to the ncRNA variants and the null-model samples
  seqSetFilter(genofile,variant.id=variant.id.ncRNA,sample.id=phenotype.id)
  
  rm(variant.id.ncRNA)
  gc()
  
  ## Gene annotation per variant: first ";"-separated field of GENCODE.Info,
  ## with any "(...)" suffix removed; up to three comma-separated gene names
  ## are considered.
  GENCODE.Info <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Info")]))
  GENCODE.Info.split <- strsplit(GENCODE.Info, split = "[;]")
  Gene <- as.character(sapply(GENCODE.Info.split,function(z) gsub("\\(.*\\)","",z[1])))
  
  Gene_list_1 <- as.character(sapply(strsplit(Gene,','),'[',1))
  Gene_list_2 <- as.character(sapply(strsplit(Gene,','),'[',2))
  Gene_list_3 <- as.character(sapply(strsplit(Gene,','),'[',3))
  
  rm(GENCODE.Info)
  gc()
  
  rm(GENCODE.Info.split)
  gc()
  
  ## variant ids under the current (ncRNA) filter, aligned with Gene_list_*
  variant.id.ncRNA <- seqGetData(genofile, "variant.id")
  
  seqResetFilter(genofile)
  
  ### Gene
  ## variants whose first, second or third gene name equals gene_name
  is.in <- union(which(Gene_list_1==gene_name),which(Gene_list_2==gene_name))
  is.in <- union(is.in,which(Gene_list_3==gene_name))
  
  variant.is.in <- variant.id.ncRNA[is.in]
  
  ## genes with more than 10000 selected variants are skipped
  ## (the filter was already reset above, so returning here is safe)
  if(length(variant.is.in) > 10000){
    return(NULL)
  }
  
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id
  ## index that reorders the genotype rows into the order of phenotype.id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  ## matrix_flip_mean / matrix_flip_minor are not defined in this script;
  ## presumably they come from an attached package (STAAR) — confirm.
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  ## one column per requested annotation that exists in the catalog; built only
  ## for SNVs when Use_annotation_weights is TRUE, otherwise left NULL
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    if(Use_annotation_weights){
      for(k in 1:length(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          ## missing CADD scores are set to 0
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          ## aPC.LocalDiversity contributes a second, transformed column
          ## named "aPC.LocalDiversity(-)"
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ## pvalues stays 0 (not a list) when the test call errors inside try(), which
  ## makes the function return NULL for this gene
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  ## assemble the result row: gene name, chr, category, #SNV, then cMAC and
  ## the p-values returned by the test
  results <- c()
  if(inherits(pvalues, "list")){
    results_temp <- rep(NA,4)
    results_temp[3] <- "ncRNA"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  ## name the first five columns and the final summary column(s); columns that
  ## have no name yet get the generated names "col<k>"
  if(!is.null(results)){
    if(!use_SPA){
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[dim(results)[2]] <- c("STAAR-B")
    }
  }
  
  ## leave the GDS handle unfiltered for the next gene
  seqResetFilter(genofile)
  
  return(results)
}

## Open the GDS file passed on the command line; the handle is closed with
## seqClose() after the gene loop.
agds.path <- GDS_File

genofile <- seqOpen(agds.path)

## Error-tolerant wrapper around ncRNA_nolongmask() for a single gene.
## Every argument except `ncRNA_gene_chr` (accepted but not used) is forwarded
## by name. The call runs inside try(silent=TRUE), so on failure the returned
## value is a "try-error" object instead of an R error.
gene_centric_ncRNA_dnanexus <- function(ncRNA_gene_chr,gene_name,chr,genofile,obj_nullmodel,rare_maf_cutoff,
                                        QC_label,variant_type,geno_missing_imputation,
                                        Annotation_dir,Annotation_name_catalog,
                                        Use_annotation_weights,Annotation_name,silent){
  try(
    ncRNA_nolongmask(chr=chr,
                     gene_name=gene_name,
                     genofile=genofile,
                     obj_nullmodel=obj_nullmodel,
                     rare_maf_cutoff=rare_maf_cutoff,
                     QC_label=QC_label,
                     variant_type=variant_type,
                     geno_missing_imputation=geno_missing_imputation,
                     Annotation_dir=Annotation_dir,
                     Annotation_name_catalog=Annotation_name_catalog,
                     Use_annotation_weights=Use_annotation_weights,
                     Annotation_name=Annotation_name,
                     silent=silent),
    silent=TRUE
  )
}

## ---- Test every gene of this batch and save the collected results ----

## Rare-variant MAF cutoff forwarded to the association test.
## The loop below passes `max.maf`, which was never defined in this script.
## R evaluates arguments lazily, so the "object 'max.maf' not found" error was
## only raised inside the nested try(..., silent=TRUE) calls and swallowed
## there, which made every gene silently return NULL. 0.01 is the default
## rare_maf_cutoff of ncRNA_nolongmask().
max.maf <- 0.01

results_ncRNA <- c()
for(kk in sub_seq_id){
  print(kk)
  gene_name <- ncRNA_gene_chr[kk,2]
  results <- gene_centric_ncRNA_dnanexus(ncRNA_gene_chr=ncRNA_gene_chr,gene_name = gene_name,chr=chr,genofile=genofile,obj_nullmodel=obj_nullmodel,rare_maf_cutoff=max.maf,
                                         QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                                         Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                                         Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,silent=TRUE)
  
  ## The wrapper returns a "try-error" object when the gene fails. Report it
  ## in the job log and skip it, rather than appending the error text to the
  ## results.
  if(inherits(results,"try-error")){
    message("ncRNA test failed for gene ",gene_name,": ",conditionMessage(attr(results,"condition")))
    next
  }
  
  ## NOTE(review): append() flattens each one-row result matrix into the
  ## growing vector (column names are lost); kept as is because the consumer
  ## of the saved file is not visible here — confirm whether rbind() is wanted.
  results_ncRNA <- append(results_ncRNA,results)
}

seqClose(genofile)

## Drop everything except what is needed for saving, then write
## <output_file_name>_<arrayid>.Rdata.
rm(list=setdiff(ls(), c("results_ncRNA", "output_file_name","arrayid","agds.path","trait"))); gc()
save(results_ncRNA, file = paste0(output_file_name,"_",arrayid,".Rdata"))

Overwriting ncRNA_Analysis.R


In [92]:
%%writefile ncRNA_Analysis.sh
#!/bin/bash

# Entry point executed by dsub for each task: forwards the task's variables
# (set by the columns of the task table) to ncRNA_Analysis.R as six positional
# arguments, in the order the R script reads them.

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Every expansion is quoted so that a value containing whitespace or glob
# characters is still passed as exactly one argument.
Rscript "${R_Script}" \
  "${arrayid}" \
  "${trait}" \
  "${Train_Null_Model}" \
  "${Annotation_name_catalog}" \
  "${GDS_File}" \
  "${OUTPUT_PATH}"

Overwriting ncRNA_Analysis.sh


In [93]:
%%writefile score_task.R

## Build the dsub task table for the ncRNA gene-centric analysis: one row per
## (trait, arrayid) pair. Column names are the dsub flags themselves, so
## check.names = FALSE keeps them verbatim.

bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/"
aou_root <- paste0(bucket_root,"JW/AoU_Phenotypes/")

trait_list <- c("BMI","LDL","HDL","logTG","Height","TC")

## Array ids to submit; only array id 222 is listed.
array_ids <- 222

## First arrayid that belongs to chromosome 2, 3, ..., 22 (21 boundaries).
## An arrayid below the first boundary is on chromosome 1; one at or above the
## last boundary is on chromosome 22.
chr_start_arrayid <- c(19,35,46,56,69,79,89,100,108,117,128,141,147,156,
                       166,178,191,198,208,214,218)

tasks <- data.frame(check.names = FALSE)

for(trait in trait_list){
    for(arrayid in array_ids){
        ## number of boundaries <= arrayid, plus one, is the chromosome
        chr <- findInterval(arrayid,chr_start_arrayid) + 1

        task_row <- data.frame(
            '--env arrayid'=arrayid,
            '--env trait'=trait,
            '--input Train_Null_Model'=paste0(aou_root,"Results/Continuous/NullModels/",trait,"_Train_Null_Model.RData"),
            '--input Annotation_name_catalog'=paste0(bucket_root,"dataGDS/acaf_threshold_v7/Annotation_name_catalog.csv"),
            '--input GDS_File'=paste0(bucket_root,"dataAGDS/acaf_threshold_v7/acaf_threshold.chr",chr,".gds"),
            '--input R_Script'=paste0(aou_root,"Scripts/ncRNA_Analysis.R"),
            '--output-recursive OUTPUT_PATH'=paste0(aou_root,"Results/Continuous/GeneCentric_ncRNA"),
            check.names = FALSE
        )
        tasks <- rbind(tasks,task_row)
    }
}

## Tab-separated, unquoted, header row included.
write.table(tasks,
            file="score_task.txt",
            sep='\t',
            quote=FALSE,
            row.names=FALSE,
            col.names=TRUE)

Overwriting score_task.R


In [94]:
# Run score_task.R to (re)write score_task.txt, the task table given to dsub via --tasks.
!Rscript score_task.R

In [95]:
# Upload the R script to the bucket location that score_task.R lists as `--input R_Script`.
!gsutil -m cp ncRNA_Analysis.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://ncRNA_Analysis.R [Content-Type=application/octet-stream]...
/ [1/1 files][ 10.5 KiB/ 10.5 KiB] 100% Done                                    
Operation completed over 1 objects/10.5 KiB.                                     


In [96]:
%%bash --out score_batch

source ~/aou_dsub.bash # Defines aou_dsub(); the file is written by the `%%writefile ~/aou_dsub.bash` cell of this notebook.

# Submit ncRNA_Analysis.sh through the aou_dsub wrapper, using score_task.txt
# (one row per trait/arrayid pair) as the dsub task table.
# NOTE(review): --image and --logging are also set inside aou_dsub(); the values
# given here are appended after them via "$@" and presumably take precedence —
# confirm against dsub's argument handling.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 100 \
  --boot-disk-size 25 \
  --min-ram 32 \
  --timeout "96h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script ncRNA_Analysis.sh \
  --tasks score_task.txt

Job properties:
  job-id: ncrna-anal--williamsjacr--240802-141249-95
  job-name: ncrna-analysis
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/15945330792918160714
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12474725655731960867
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10526454129034710544
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/2471146011125752468
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/16532595984810075093
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/6231998838750485626
Launched job-id: ncrna-anal--williamsjacr--240802-141249-95
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'ncrna-anal--williamsjacr--240802-141

In [98]:
%%writefile Noncoding_Analysis.R
## Start from an empty workspace and report memory use.
rm(list = ls())
gc()

## Positional command-line arguments, read once:
##   1 = job (array) index, 2 = trait name, 3 = null-model file,
##   4 = annotation-catalog CSV, 5 = GDS file, 6 = output directory
cli_args <- commandArgs(trailingOnly = TRUE)

arrayid                 <- as.numeric(cli_args[1])
trait                   <- cli_args[2]
Train_Null_Model        <- cli_args[3]
Annotation_name_catalog <- cli_args[4]
GDS_File                <- cli_args[5]
OUTPUT_PATH             <- cli_args[6]

## Echo the string arguments to the job log (arrayid is not echoed).
print(trait)
print(Train_Null_Model)
print(Annotation_name_catalog)
print(GDS_File)
print(OUTPUT_PATH)

## load required packages (attached in this exact order)
required_packages <- c("gdsfmt", "SeqArray", "SeqVarTools",
                       "STAAR", "STAARpipeline", "STAARpipelineSummary",
                       "TxDb.Hsapiens.UCSC.hg38.knownGene",
                       "readr", "dplyr", "stringr")
for(pkg in required_packages){
  library(pkg, character.only = TRUE)
}

## load() returns the name of the restored object; fetch the null model by that name
nullmodel_object_name <- load(Train_Null_Model)
obj_nullmodel <- get(nullmodel_object_name)

## Parameter
QC_label                <- "annotation/filter"
geno_missing_imputation <- "mean"
variant_type            <- "SNV"

## Annotation_dir
Annotation_dir <- "annotation/info/FunctionalAnnotation"

## Annotation channel: the path received on the command line is replaced
## by the table read from it
Annotation_name_catalog <- read.csv(Annotation_name_catalog)

## Use_annotation_weights
Use_annotation_weights <- TRUE

## Annotation name
Annotation_name <- c("CADD",
                     "LINSIGHT",
                     "FATHMM.XF",
                     "aPC.EpigeneticActive",
                     "aPC.EpigeneticRepressed",
                     "aPC.EpigeneticTranscription",
                     "aPC.Conservation",
                     "aPC.LocalDiversity",
                     "aPC.Mappability",
                     "aPC.TF",
                     "aPC.Protein")

## output file name (prefix inside OUTPUT_PATH)
output_file_name <- paste(OUTPUT_PATH, "/", trait, "_AoU_WGS_Noncoding_Train", sep = "")

###########################################################
#           Main Function 
###########################################################


## gene number in job
gene_num_in_array <- 50 
## Number of jobs (batches of gene_num_in_array genes) per chromosome, counted on
## the FULL gene table, i.e. before the long-mask genes below are removed.
## NOTE(review): genes_info is not defined in this script; presumably it is the
## gene table shipped with STAARpipeline — confirm.
group.num.allchr <- ceiling(table(genes_info[,2])/gene_num_in_array)

## Map the global job index to a chromosome: the first chromosome whose cumulative
## job count reaches arrayid.
## NOTE(review): chr is a position within table(); this assumes the position equals
## the chromosome value stored in genes_info[,2] — confirm.
chr <- which.max(arrayid <= cumsum(group.num.allchr))
group.num <- group.num.allchr[chr]

## Job index within the selected chromosome
if(chr == 1){
  groupid <- arrayid
}else{
  groupid <- arrayid - cumsum(group.num.allchr)[chr-1]
}

genes_info_chr <- genes_info[genes_info[,2]==chr,]
## Genes dropped from this job grid by name
noncoding_longmasks <- c("ARID4B","RERE","CRIM1","EML4","EML6","GLI2","ITGB6","FNDC3B","AFAP1","TRIO",
                         "NR3C1","SGK1","PRDM1","TNS3","GNA12","HOXA2","PLEC","ASAP1","CSGALNACT1","RAPGEF1",
                         "KLF6","DNAJB12","ABLIM1","ETV6","IGF1R","RMI2","CMIP","HOXB2","SEPT9","SPECC1",
                         "CIRBP","ADNP","CDC42EP3","EPAS1","MGAT5")
genes_info_chr <- genes_info_chr[!genes_info_chr[,1] %in% noncoding_longmasks,]
sub_seq_num <- dim(genes_info_chr)[1]

## Rows of genes_info_chr handled by this job.
## group.num was derived before the long-mask genes were removed, so sub_seq_num can
## be smaller than the job grid assumes. Without clamping, a non-final batch could run
## past the last row, and the final batch could start beyond it: in R, start:end with
## start > end counts DOWN, which would silently re-process earlier genes.
sub_seq_start <- (groupid - 1)*gene_num_in_array + 1
if(groupid < group.num){
  sub_seq_end <- min(groupid*gene_num_in_array, sub_seq_num)
}else{
  sub_seq_end <- sub_seq_num
}

if(sub_seq_start <= sub_seq_end){
  sub_seq_id <- sub_seq_start:sub_seq_end
}else{
  ## Nothing left for this job (all of its genes were removed, or arrayid is
  ## beyond the job grid): use an empty index set instead of a descending range.
  sub_seq_id <- integer(0)
  message("No genes assigned to arrayid ", arrayid, " (chr ", chr, ", group ", groupid, ")")
}

## Gene_Centric_Noncoding_preload
##
## Dispatcher for the gene-centric noncoding test of a single gene: validates the
## choice arguments and forwards everything to the worker for `category`.
##
## Arguments
##   chr, gene_name           chromosome and gene to test (passed through unchanged)
##   category                 one of the listed mask names; "all_categories" calls
##                            noncoding_preload()
##   genofile, obj_nullmodel  genotype file handle and fitted null model (passed through)
##   dfProm*/dfHancer* and variant.id.SNV.*
##                            preloaded promoter/enhancer tables and variant ids; only
##                            forwarded to the *_preload workers that take them
##   rare_maf_cutoff, rv_num_cutoff, QC_label, variant_type, geno_missing_imputation,
##   Annotation_dir, Annotation_name_catalog, Use_annotation_weights, Annotation_name,
##   SPA_p_filter, p_filter_cutoff, silent
##                            forwarded to the selected worker
##
## Returns the selected worker's result unchanged.
##
## NOTE(review): downstream(), upstream() and UTR() are not defined in the visible
## part of this script (presumably provided by STAARpipeline), and the *_preload
## workers other than noncoding_preload() are presumably defined elsewhere in this
## script — confirm.
Gene_Centric_Noncoding_preload <- function(chr,gene_name,category=c("all_categories","downstream","upstream","UTR","promoter_CAGE","promoter_DHS","enhancer_CAGE","enhancer_DHS"),
                                           genofile,obj_nullmodel,
                                           dfPromCAGEVarGene.SNV,variant.id.SNV.PromCAGE,
                                           dfPromrOCRsVarGene.SNV,variant.id.SNV.PromrOCRs,
                                           dfHancerCAGEVarGene.SNV,variant.id.SNV.HancerCAGE,
                                           dfHancerrOCRsVarGene.SNV,variant.id.SNV.HancerrOCRs,
                                           rare_maf_cutoff=0.01,rv_num_cutoff=2,
                                           QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                                           Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                                           Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                                           SPA_p_filter=TRUE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  category <- match.arg(category)
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  ## The declared default is the two-element vector c(TRUE,FALSE), and the workers
  ## use this value directly as an if() condition. Keep only the first element so
  ## that relying on the default does not hand a length-2 condition to if().
  Use_annotation_weights <- Use_annotation_weights[1]
  
  ## match.arg() guarantees category is one of the names below, so exactly one
  ## branch runs; arguments of the other branches are never evaluated.
  results <- switch(category,
    all_categories = noncoding_preload(chr,gene_name,genofile,obj_nullmodel,
                                       dfPromCAGEVarGene.SNV=dfPromCAGEVarGene.SNV,variant.id.SNV.PromCAGE=variant.id.SNV.PromCAGE,
                                       dfPromrOCRsVarGene.SNV=dfPromrOCRsVarGene.SNV,variant.id.SNV.PromrOCRs=variant.id.SNV.PromrOCRs,
                                       dfHancerCAGEVarGene.SNV=dfHancerCAGEVarGene.SNV,variant.id.SNV.HancerCAGE=variant.id.SNV.HancerCAGE,
                                       dfHancerrOCRsVarGene.SNV=dfHancerrOCRsVarGene.SNV,variant.id.SNV.HancerrOCRs=variant.id.SNV.HancerrOCRs,
                                       rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
                                       QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                                       Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                                       Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
                                       SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    downstream = downstream(chr,gene_name,genofile,obj_nullmodel,
                            rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
                            QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                            Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                            Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
                            SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    upstream = upstream(chr,gene_name,genofile,obj_nullmodel,
                        rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
                        QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                        Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                        Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
                        SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    UTR = UTR(chr,gene_name,genofile,obj_nullmodel,
              rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
              QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
              Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
              Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
              SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    promoter_CAGE = promoter_CAGE_preload(chr,gene_name,genofile,obj_nullmodel,
                                          dfPromCAGEVarGene.SNV=dfPromCAGEVarGene.SNV,variant.id.SNV.PromCAGE=variant.id.SNV.PromCAGE,
                                          rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
                                          QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                                          Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                                          Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
                                          SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    promoter_DHS = promoter_DHS_preload(chr,gene_name,genofile,obj_nullmodel,
                                        dfPromrOCRsVarGene.SNV=dfPromrOCRsVarGene.SNV,variant.id.SNV.PromrOCRs=variant.id.SNV.PromrOCRs,
                                        rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
                                        QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                                        Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                                        Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
                                        SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    enhancer_CAGE = enhancer_CAGE_preload(chr,gene_name,genofile,obj_nullmodel,
                                          dfHancerCAGEVarGene.SNV=dfHancerCAGEVarGene.SNV,variant.id.SNV.HancerCAGE=variant.id.SNV.HancerCAGE,
                                          rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
                                          QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                                          Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                                          Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
                                          SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    enhancer_DHS = enhancer_DHS_preload(chr,gene_name,genofile,obj_nullmodel,
                                        dfHancerrOCRsVarGene.SNV=dfHancerrOCRsVarGene.SNV,variant.id.SNV.HancerrOCRs=variant.id.SNV.HancerrOCRs,
                                        rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
                                        QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                                        Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                                        Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
                                        SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent)
  )
  
  return(results)
}

noncoding_preload <- function(chr,gene_name,genofile,obj_nullmodel,
                              dfPromCAGEVarGene.SNV,variant.id.SNV.PromCAGE,
                              dfPromrOCRsVarGene.SNV,variant.id.SNV.PromrOCRs,
                              dfHancerCAGEVarGene.SNV,variant.id.SNV.HancerCAGE,
                              dfHancerrOCRsVarGene.SNV,variant.id.SNV.HancerrOCRs,
                              rare_maf_cutoff=0.01,rv_num_cutoff=2,
                              QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                              Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                              Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                              SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA)){
    use_SPA <- obj_nullmodel$use_SPA
  }else{
    use_SPA <- FALSE
  }
  
  #####################################
  #   Gene Info
  ## get SNV id
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant"){
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV"){
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel"){
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ########################################
  #   Downstream
  ## downstream SNVs
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  is.in <- (GENCODE.Category=="downstream")&(SNVlist)
  variant.id.downstream <- variant.id[is.in]
  
  seqSetFilter(genofile,variant.id=variant.id.downstream,sample.id=phenotype.id)
  
  rm(variant.id.downstream)
  gc()
  
  GENCODE.Info <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Info")]))
  GENCODE.Info.split <- strsplit(GENCODE.Info, split = "[,]")
  variant_gene_num <- sapply(GENCODE.Info.split,function(z) length(z))
  
  variant.id.SNV <- seqGetData(genofile, "variant.id")
  variant.id.SNV <- rep(variant.id.SNV,variant_gene_num)
  
  rm(GENCODE.Info)
  gc()
  
  rm(variant_gene_num)
  gc()
  
  Gene <- as.character(unlist(GENCODE.Info.split))
  
  rm(GENCODE.Info.split)
  gc()
  
  seqResetFilter(genofile)
  
  ### Gene
  is.in <- which(Gene==gene_name)
  variant.is.in <- variant.id.SNV[is.in]
  
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    if(Use_annotation_weights){
      for(k in 1:length(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_downstream <- c()
  if(inherits(pvalues, "list")){
    results_temp <- rep(NA,4)
    results_temp[3] <- "downstream"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_downstream <- rbind(results_downstream,results_temp)
  }
  
  if(!is.null(results_downstream)){
    if(!use_SPA){
      colnames(results_downstream) <- colnames(results_downstream, do.NULL = FALSE, prefix = "col")
      colnames(results_downstream)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_downstream)[(dim(results_downstream)[2]-1):dim(results_downstream)[2]] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results_downstream) <- colnames(results_downstream, do.NULL = FALSE, prefix = "col")
      colnames(results_downstream)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_downstream)[dim(results_downstream)[2]] <- c("STAAR-B")
    }
  }
  
  seqResetFilter(genofile)
  
  ########################################
  #   Upstream
  
  is.in <- (GENCODE.Category=="upstream")&(SNVlist)
  variant.id.upstream <- variant.id[is.in]
  
  seqSetFilter(genofile,variant.id=variant.id.upstream,sample.id=phenotype.id)
  
  rm(variant.id.upstream)
  gc()
  
  GENCODE.Info <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Info")]))
  GENCODE.Info.split <- strsplit(GENCODE.Info, split = "[,]")
  variant_gene_num <- sapply(GENCODE.Info.split,function(z) length(z))
  
  variant.id.SNV <- seqGetData(genofile, "variant.id")
  variant.id.SNV <- rep(variant.id.SNV,variant_gene_num)
  
  rm(GENCODE.Info)
  gc()
  
  rm(variant_gene_num)
  gc()
  
  Gene <- as.character(unlist(GENCODE.Info.split))
  
  rm(GENCODE.Info.split)
  gc()
  
  seqResetFilter(genofile)
  
  ### Gene
  is.in <- which(Gene==gene_name)
  variant.is.in <- variant.id.SNV[is.in]
  
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    if(Use_annotation_weights){
      for(k in 1:length(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_upstream <- c()
  if(inherits(pvalues, "list")){
    results_temp <- rep(NA,4)
    results_temp[3] <- "upstream"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_upstream <- rbind(results_upstream,results_temp)
  }
  
  if(!is.null(results_upstream)){
    if(!use_SPA){
      colnames(results_upstream) <- colnames(results_upstream, do.NULL = FALSE, prefix = "col")
      colnames(results_upstream)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_upstream)[(dim(results_upstream)[2]-1):dim(results_upstream)[2]] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results_upstream) <- colnames(results_upstream, do.NULL = FALSE, prefix = "col")
      colnames(results_upstream)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_upstream)[dim(results_upstream)[2]] <- c("STAAR-B")
    }
  }
  
  seqResetFilter(genofile)
  
  ########################################################
  #                UTR
  
  is.in <- ((GENCODE.Category=="UTR3")|(GENCODE.Category=="UTR5")|(GENCODE.Category=="UTR5;UTR3"))&(SNVlist)
  variant.id.UTR <- variant.id[is.in]
  
  rm(GENCODE.Category)
  gc()
  
  seqSetFilter(genofile,variant.id=variant.id.UTR,sample.id=phenotype.id)
  
  rm(variant.id.UTR)
  gc()
  
  GENCODE.Info <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Info")]))
  GENCODE.Info.split <- strsplit(GENCODE.Info, split = "[(]")
  
  rm(GENCODE.Info)
  gc()
  
  # Gene <- as.character(sapply(GENCODE.Info.split,function(z) z[seq(1,length(z),2)]))
  Gene <- as.character(sapply(GENCODE.Info.split,function(z) z[1]))
  
  rm(GENCODE.Info.split)
  gc()
  
  variant.id.SNV <- seqGetData(genofile, "variant.id")
  
  seqResetFilter(genofile)
  
  ### Gene
  is.in <- which(Gene==gene_name)
  variant.is.in <- variant.id.SNV[is.in]
  
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    if(Use_annotation_weights){
      for(k in 1:length(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_UTR <- c()
  if(inherits(pvalues, "list")){
    results_temp <- rep(NA,4)
    results_temp[3] <- "UTR"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_UTR <- rbind(results_UTR,results_temp)
  }
  
  if(!is.null(results_UTR)){
    if(!use_SPA){
      colnames(results_UTR) <- colnames(results_UTR, do.NULL = FALSE, prefix = "col")
      colnames(results_UTR)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_UTR)[(dim(results_UTR)[2]-1):dim(results_UTR)[2]] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results_UTR) <- colnames(results_UTR, do.NULL = FALSE, prefix = "col")
      colnames(results_UTR)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_UTR)[dim(results_UTR)[2]] <- c("STAAR-B")
    }
  }
  
  seqResetFilter(genofile)
  
  #############################################
  #   Promoter-CAGE
  
  ### Gene
  is.in <- which(dfPromCAGEVarGene.SNV[,5]==gene_name)
  variant.is.in <- variant.id.SNV.PromCAGE[is.in]
  
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    if(Use_annotation_weights){
      for(k in 1:length(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_promoter_CAGE <- c()
  if(inherits(pvalues, "list")){
    results_temp <- dfPromCAGEVarGene.SNV[1,1:4]
    results_temp[3] <- "promoter_CAGE"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_promoter_CAGE <- rbind(results_promoter_CAGE,results_temp)
  }
  
  if(!is.null(results_promoter_CAGE)){
    if(!use_SPA){
      colnames(results_promoter_CAGE) <- colnames(results_promoter_CAGE, do.NULL = FALSE, prefix = "col")
      colnames(results_promoter_CAGE)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_promoter_CAGE)[(dim(results_promoter_CAGE)[2]-1):dim(results_promoter_CAGE)[2]] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results_promoter_CAGE) <- colnames(results_promoter_CAGE, do.NULL = FALSE, prefix = "col")
      colnames(results_promoter_CAGE)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_promoter_CAGE)[dim(results_promoter_CAGE)[2]] <- c("STAAR-B")
    }
  }
  
  seqResetFilter(genofile)
  
  ##################################################
  #       Promoter-DHS
  
  ### Gene
  is.in <- which(dfPromrOCRsVarGene.SNV[,5]==gene_name)
  variant.is.in <- variant.id.SNV.PromrOCRs[is.in]
  
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    if(Use_annotation_weights){
      for(k in 1:length(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_promoter_DHS <- c()
  if(inherits(pvalues, "list")){
    results_temp <- dfPromrOCRsVarGene.SNV[1,1:4]
    results_temp[3] <- "promoter_DHS"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_promoter_DHS <- rbind(results_promoter_DHS ,results_temp)
  }
  
  if(!is.null(results_promoter_DHS)){
    if(!use_SPA){
      colnames(results_promoter_DHS) <- colnames(results_promoter_DHS, do.NULL = FALSE, prefix = "col")
      colnames(results_promoter_DHS)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_promoter_DHS)[(dim(results_promoter_DHS)[2]-1):dim(results_promoter_DHS)[2]] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results_promoter_DHS) <- colnames(results_promoter_DHS, do.NULL = FALSE, prefix = "col")
      colnames(results_promoter_DHS)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_promoter_DHS)[dim(results_promoter_DHS)[2]] <- c("STAAR-B")
    }
  }
  
  seqResetFilter(genofile)
  
  ###########################################
  #        Enhancer-CAGE
  
  ### Gene
  is.in <- which(dfHancerCAGEVarGene.SNV[,5]==gene_name)
  variant.is.in <- variant.id.SNV.HancerCAGE[is.in]
  
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    if(Use_annotation_weights){
      for(k in 1:length(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_enhancer_CAGE <- c()
  if(inherits(pvalues, "list")){
    results_temp <- dfHancerCAGEVarGene.SNV[1,1:4]
    results_temp[3] <- "enhancer_CAGE"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_enhancer_CAGE <- rbind(results_enhancer_CAGE,results_temp)
  }
  
  if(!is.null(results_enhancer_CAGE)){
    if(!use_SPA){
      colnames(results_enhancer_CAGE) <- colnames(results_enhancer_CAGE, do.NULL = FALSE, prefix = "col")
      colnames(results_enhancer_CAGE)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_enhancer_CAGE)[(dim(results_enhancer_CAGE)[2]-1):dim(results_enhancer_CAGE)[2]] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results_enhancer_CAGE) <- colnames(results_enhancer_CAGE, do.NULL = FALSE, prefix = "col")
      colnames(results_enhancer_CAGE)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_enhancer_CAGE)[dim(results_enhancer_CAGE)[2]] <- c("STAAR-B")
    }
  }
  
  seqResetFilter(genofile)
  
  ##################################################
  #       Enhancer-DHS
  
  ### Gene
  is.in <- which(dfHancerrOCRsVarGene.SNV[,5]==gene_name)
  variant.is.in <- variant.id.SNV.HancerrOCRs[is.in]
  
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    if(Use_annotation_weights){
      for(k in 1:length(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_enhancer_DHS <- c()
  if(inherits(pvalues, "list")){
    results_temp <- dfHancerrOCRsVarGene.SNV[1,1:4]
    results_temp[3] <- "enhancer_DHS"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_enhancer_DHS <- rbind(results_enhancer_DHS,results_temp)
  }
  
  if(!is.null(results_enhancer_DHS)){
    if(!use_SPA){
      colnames(results_enhancer_DHS) <- colnames(results_enhancer_DHS, do.NULL = FALSE, prefix = "col")
      colnames(results_enhancer_DHS)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_enhancer_DHS)[(dim(results_enhancer_DHS)[2]-1):dim(results_enhancer_DHS)[2]] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results_enhancer_DHS) <- colnames(results_enhancer_DHS, do.NULL = FALSE, prefix = "col")
      colnames(results_enhancer_DHS)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_enhancer_DHS)[dim(results_enhancer_DHS)[2]] <- c("STAAR-B")
    }
  }
  
  seqResetFilter(genofile)
  
  ############################################
  #           results
  
  results_noncoding <- list(upstream=results_upstream,downstream=results_downstream,UTR=results_UTR,
                            promoter_CAGE=results_promoter_CAGE,promoter_DHS=results_promoter_DHS,
                            enhancer_CAGE=results_enhancer_CAGE,enhancer_DHS=results_enhancer_DHS)
  
  return(results_noncoding)
  
}

# promoter_CAGE_preload: run the rare-variant set test for the
# "promoter_CAGE" category of one gene, using a pre-built variant-to-gene table.
#
# Arguments:
#   chr, gene_name           written into the result row; gene_name also selects
#                            the rows of the variant table (column 5 == gene_name).
#   genofile                 genotype file handle passed to seqSetFilter()/seqGetData().
#   obj_nullmodel            null model; $id_include, $n.pheno and (optionally)
#                            $use_SPA are read here.
#   dfPromCAGEVarGene.SNV    variant table; column 5 holds the gene label.
#   variant.id.SNV.PromCAGE  variant ids, indexed with the same row positions
#                            as the variant table.
#   rare_maf_cutoff, rv_num_cutoff  forwarded to the test function.
#   QC_label                 accepted for interface compatibility; not used here.
#   variant_type             annotation weights are only loaded when this is "SNV".
#   geno_missing_imputation  "mean" -> matrix_flip_mean(), "minor" -> matrix_flip_minor().
#   Annotation_dir, Annotation_name_catalog, Annotation_name
#                            locate the annotation nodes read from genofile.
#   Use_annotation_weights   logical; only its first element is used.
#   SPA_p_filter, p_filter_cutoff   forwarded to STAAR_Binary_SPA() only.
#   silent                   forwarded to try() around the test call.
#
# Returns: NULL when the test call did not yield a list (for example when it
# raised an error that try() caught), otherwise a one-row result whose first
# five columns are "Gene name","Chr","Category","#SNV","cMAC".
#
# The genofile filter is reset on every exit path, including errors.
promoter_CAGE_preload <- function(chr,gene_name,genofile,obj_nullmodel,
                                  dfPromCAGEVarGene.SNV,variant.id.SNV.PromCAGE,
                                  rare_maf_cutoff=0.01,rv_num_cutoff=2,
                                  QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                                  Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                                  Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                                  SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA)){
    use_SPA <- obj_nullmodel$use_SPA
  }else{
    use_SPA <- FALSE
  }
  
  ### Gene
  is.in <- which(dfPromCAGEVarGene.SNV[,5]==gene_name)
  variant.is.in <- variant.id.SNV.PromCAGE[is.in]
  
  ## reset the filter on every exit path, including errors raised below
  on.exit(seqResetFilter(genofile),add=TRUE)
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id: position of each phenotype.id within the filtered sample ids
  id.genotype <- seqGetData(genofile,"sample.id")
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype, rows reordered to follow phenotype.id
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    ## the default is c(TRUE,FALSE); if() needs a length-one condition
    if(Use_annotation_weights[1]){
      ## seq_along() yields an empty loop when Annotation_name is NULL
      for(k in seq_along(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          ## missing CADD scores are set to 0
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          ## aPC.LocalDiversity contributes a second, transformed column named "<name>(-)"
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ## pvalues stays 0 (not a list) when the test call fails inside try()
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list")){
    ## the first table row is only a 4-slot template; all four slots are overwritten
    results_temp <- dfPromCAGEVarGene.SNV[1,1:4]
    results_temp[3] <- "promoter_CAGE"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results)){
    ## fill in placeholder names first, then label the leading and trailing columns
    colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
    colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
    n_col <- dim(results)[2]
    if(!use_SPA){
      colnames(results)[(n_col-1):n_col] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results)[n_col] <- c("STAAR-B")
    }
  }
  
  return(results)
}

# promoter_DHS_preload: run the rare-variant set test for the
# "promoter_DHS" category of one gene, using a pre-built variant-to-gene table.
#
# Arguments:
#   chr, gene_name            written into the result row; gene_name also selects
#                             the rows of the variant table (column 5 == gene_name).
#   genofile                  genotype file handle passed to seqSetFilter()/seqGetData().
#   obj_nullmodel             null model; $id_include, $n.pheno and (optionally)
#                             $use_SPA are read here.
#   dfPromrOCRsVarGene.SNV    variant table; column 5 holds the gene label.
#   variant.id.SNV.PromrOCRs  variant ids, indexed with the same row positions
#                             as the variant table.
#   rare_maf_cutoff, rv_num_cutoff  forwarded to the test function.
#   QC_label                  accepted for interface compatibility; not used here.
#   variant_type              annotation weights are only loaded when this is "SNV".
#   geno_missing_imputation   "mean" -> matrix_flip_mean(), "minor" -> matrix_flip_minor().
#   Annotation_dir, Annotation_name_catalog, Annotation_name
#                             locate the annotation nodes read from genofile.
#   Use_annotation_weights    logical; only its first element is used.
#   SPA_p_filter, p_filter_cutoff   forwarded to STAAR_Binary_SPA() only.
#   silent                    forwarded to try() around the test call.
#
# Returns: NULL when the test call did not yield a list (for example when it
# raised an error that try() caught), otherwise a one-row result whose first
# five columns are "Gene name","Chr","Category","#SNV","cMAC".
#
# The genofile filter is reset on every exit path, including errors.
promoter_DHS_preload <- function(chr,gene_name,genofile,obj_nullmodel,
                                 dfPromrOCRsVarGene.SNV,variant.id.SNV.PromrOCRs,
                                 rare_maf_cutoff=0.01,rv_num_cutoff=2,
                                 QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                                 Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                                 Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                                 SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA)){
    use_SPA <- obj_nullmodel$use_SPA
  }else{
    use_SPA <- FALSE
  }
  
  ### Gene
  is.in <- which(dfPromrOCRsVarGene.SNV[,5]==gene_name)
  variant.is.in <- variant.id.SNV.PromrOCRs[is.in]
  
  ## reset the filter on every exit path, including errors raised below
  on.exit(seqResetFilter(genofile),add=TRUE)
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id: position of each phenotype.id within the filtered sample ids
  id.genotype <- seqGetData(genofile,"sample.id")
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype, rows reordered to follow phenotype.id
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    ## the default is c(TRUE,FALSE); if() needs a length-one condition
    if(Use_annotation_weights[1]){
      ## seq_along() yields an empty loop when Annotation_name is NULL
      for(k in seq_along(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          ## missing CADD scores are set to 0
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          ## aPC.LocalDiversity contributes a second, transformed column named "<name>(-)"
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ## pvalues stays 0 (not a list) when the test call fails inside try()
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list")){
    ## the first table row is only a 4-slot template; all four slots are overwritten
    results_temp <- dfPromrOCRsVarGene.SNV[1,1:4]
    results_temp[3] <- "promoter_DHS"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results)){
    ## fill in placeholder names first, then label the leading and trailing columns
    colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
    colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
    n_col <- dim(results)[2]
    if(!use_SPA){
      colnames(results)[(n_col-1):n_col] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results)[n_col] <- c("STAAR-B")
    }
  }
  
  return(results)
}

# enhancer_CAGE_preload: run the rare-variant set test for the
# "enhancer_CAGE" category of one gene, using a pre-built variant-to-gene table.
#
# Arguments:
#   chr, gene_name             written into the result row; gene_name also selects
#                              the rows of the variant table (column 5 == gene_name).
#   genofile                   genotype file handle passed to seqSetFilter()/seqGetData().
#   obj_nullmodel              null model; $id_include, $n.pheno and (optionally)
#                              $use_SPA are read here.
#   dfHancerCAGEVarGene.SNV    variant table; column 5 holds the gene label.
#   variant.id.SNV.HancerCAGE  variant ids, indexed with the same row positions
#                              as the variant table.
#   rare_maf_cutoff, rv_num_cutoff  forwarded to the test function.
#   QC_label                   accepted for interface compatibility; not used here.
#   variant_type               annotation weights are only loaded when this is "SNV".
#   geno_missing_imputation    "mean" -> matrix_flip_mean(), "minor" -> matrix_flip_minor().
#   Annotation_dir, Annotation_name_catalog, Annotation_name
#                              locate the annotation nodes read from genofile.
#   Use_annotation_weights     logical; only its first element is used.
#   SPA_p_filter, p_filter_cutoff   forwarded to STAAR_Binary_SPA() only.
#   silent                     forwarded to try() around the test call.
#
# Returns: NULL when the test call did not yield a list (for example when it
# raised an error that try() caught), otherwise a one-row result whose first
# five columns are "Gene name","Chr","Category","#SNV","cMAC".
#
# The genofile filter is reset on every exit path, including errors.
enhancer_CAGE_preload <- function(chr,gene_name,genofile,obj_nullmodel,
                                  dfHancerCAGEVarGene.SNV,variant.id.SNV.HancerCAGE,
                                  rare_maf_cutoff=0.01,rv_num_cutoff=2,
                                  QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                                  Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                                  Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                                  SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA)){
    use_SPA <- obj_nullmodel$use_SPA
  }else{
    use_SPA <- FALSE
  }
  
  ### Gene
  is.in <- which(dfHancerCAGEVarGene.SNV[,5]==gene_name)
  variant.is.in <- variant.id.SNV.HancerCAGE[is.in]
  
  ## reset the filter on every exit path, including errors raised below
  on.exit(seqResetFilter(genofile),add=TRUE)
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id: position of each phenotype.id within the filtered sample ids
  id.genotype <- seqGetData(genofile,"sample.id")
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype, rows reordered to follow phenotype.id
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    ## the default is c(TRUE,FALSE); if() needs a length-one condition
    if(Use_annotation_weights[1]){
      ## seq_along() yields an empty loop when Annotation_name is NULL
      for(k in seq_along(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          ## missing CADD scores are set to 0
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          ## aPC.LocalDiversity contributes a second, transformed column named "<name>(-)"
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ## pvalues stays 0 (not a list) when the test call fails inside try()
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list")){
    ## the first table row is only a 4-slot template; all four slots are overwritten
    results_temp <- dfHancerCAGEVarGene.SNV[1,1:4]
    results_temp[3] <- "enhancer_CAGE"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results)){
    ## fill in placeholder names first, then label the leading and trailing columns
    colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
    colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
    n_col <- dim(results)[2]
    if(!use_SPA){
      colnames(results)[(n_col-1):n_col] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results)[n_col] <- c("STAAR-B")
    }
  }
  
  return(results)
}

# enhancer_DHS_preload: run the rare-variant set test for the
# "enhancer_DHS" category of one gene, using a pre-built variant-to-gene table.
#
# Arguments:
#   chr, gene_name              written into the result row; gene_name also selects
#                               the rows of the variant table (column 5 == gene_name).
#   genofile                    genotype file handle passed to seqSetFilter()/seqGetData().
#   obj_nullmodel               null model; $id_include, $n.pheno and (optionally)
#                               $use_SPA are read here.
#   dfHancerrOCRsVarGene.SNV    variant table; column 5 holds the gene label.
#   variant.id.SNV.HancerrOCRs  variant ids, indexed with the same row positions
#                               as the variant table.
#   rare_maf_cutoff, rv_num_cutoff  forwarded to the test function.
#   QC_label                    accepted for interface compatibility; not used here.
#   variant_type                annotation weights are only loaded when this is "SNV".
#   geno_missing_imputation     "mean" -> matrix_flip_mean(), "minor" -> matrix_flip_minor().
#   Annotation_dir, Annotation_name_catalog, Annotation_name
#                               locate the annotation nodes read from genofile.
#   Use_annotation_weights      logical; only its first element is used.
#   SPA_p_filter, p_filter_cutoff   forwarded to STAAR_Binary_SPA() only.
#   silent                      forwarded to try() around the test call.
#
# Returns: NULL when the test call did not yield a list (for example when it
# raised an error that try() caught), otherwise a one-row result whose first
# five columns are "Gene name","Chr","Category","#SNV","cMAC".
#
# The genofile filter is reset on every exit path, including errors.
enhancer_DHS_preload <- function(chr,gene_name,genofile,obj_nullmodel,
                                 dfHancerrOCRsVarGene.SNV,variant.id.SNV.HancerrOCRs,
                                 rare_maf_cutoff=0.01,rv_num_cutoff=2,
                                 QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                                 Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                                 Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                                 SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA)){
    use_SPA <- obj_nullmodel$use_SPA
  }else{
    use_SPA <- FALSE
  }
  
  ### Gene
  is.in <- which(dfHancerrOCRsVarGene.SNV[,5]==gene_name)
  variant.is.in <- variant.id.SNV.HancerrOCRs[is.in]
  
  ## reset the filter on every exit path, including errors raised below
  on.exit(seqResetFilter(genofile),add=TRUE)
  seqSetFilter(genofile,variant.id=variant.is.in,sample.id=phenotype.id)
  
  ## genotype id: position of each phenotype.id within the filtered sample ids
  id.genotype <- seqGetData(genofile,"sample.id")
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype, rows reordered to follow phenotype.id
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno))){
    if(dim(Geno)[2]>0){
      if(geno_missing_imputation=="mean"){
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor"){
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV"){
    ## the default is c(TRUE,FALSE); if() needs a length-one condition
    if(Use_annotation_weights[1]){
      ## seq_along() yields an empty loop when Annotation_name is NULL
      for(k in seq_along(Annotation_name)){
        if(Annotation_name[k]%in%Annotation_name_catalog$name){
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          ## missing CADD scores are set to 0
          if(Annotation_name[k]=="CADD"){
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          ## aPC.LocalDiversity contributes a second, transformed column named "<name>(-)"
          if(Annotation_name[k]=="aPC.LocalDiversity"){
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ## pvalues stays 0 (not a list) when the test call fails inside try()
  pvalues <- 0
  if(n_pheno == 1){
    if(!use_SPA){
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else{
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list")){
    ## the first table row is only a 4-slot template; all four slots are overwritten
    results_temp <- dfHancerrOCRsVarGene.SNV[1,1:4]
    results_temp[3] <- "enhancer_DHS"
    results_temp[2] <- chr
    results_temp[1] <- as.character(gene_name)
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA){
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else{
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results)){
    ## fill in placeholder names first, then label the leading and trailing columns
    colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
    colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
    n_col <- dim(results)[2]
    if(!use_SPA){
      colnames(results)[(n_col-1):n_col] <- c("ACAT-O","STAAR-O")
    }else{
      colnames(results)[n_col] <- c("STAAR-B")
    }
  }
  
  return(results)
}

# Error-tolerant wrapper around Gene_Centric_Noncoding_preload() for one gene.
#
# Every argument except genes_info_chr is forwarded by name to
# Gene_Centric_Noncoding_preload(); genes_info_chr is accepted but not used
# in this function. The call runs inside try(..., silent=TRUE), so a failure
# is handed back as a "try-error" object instead of stopping the caller.
#
# Returns: whatever Gene_Centric_Noncoding_preload() returns, or the
# "try-error" object produced by try().
gene_centric_noncoding_dnanexus <- function(genes_info_chr,gene_name,chr,genofile,obj_nullmodel,
                                            dfPromCAGEVarGene.SNV,variant.id.SNV.PromCAGE,
                                            dfPromrOCRsVarGene.SNV,variant.id.SNV.PromrOCRs,
                                            dfHancerCAGEVarGene.SNV,variant.id.SNV.HancerCAGE,
                                            dfHancerrOCRsVarGene.SNV,variant.id.SNV.HancerrOCRs,
                                            rare_maf_cutoff,QC_label,variant_type,geno_missing_imputation,
                                            Annotation_dir,Annotation_name_catalog,
                                            Use_annotation_weights,Annotation_name,silent){
  outcome <- try(
    Gene_Centric_Noncoding_preload(
      chr=chr,
      gene_name=gene_name,
      genofile=genofile,
      obj_nullmodel=obj_nullmodel,
      dfPromCAGEVarGene.SNV=dfPromCAGEVarGene.SNV,
      variant.id.SNV.PromCAGE=variant.id.SNV.PromCAGE,
      dfPromrOCRsVarGene.SNV=dfPromrOCRsVarGene.SNV,
      variant.id.SNV.PromrOCRs=variant.id.SNV.PromrOCRs,
      dfHancerCAGEVarGene.SNV=dfHancerCAGEVarGene.SNV,
      variant.id.SNV.HancerCAGE=variant.id.SNV.HancerCAGE,
      dfHancerrOCRsVarGene.SNV=dfHancerrOCRsVarGene.SNV,
      variant.id.SNV.HancerrOCRs=variant.id.SNV.HancerrOCRs,
      rare_maf_cutoff=rare_maf_cutoff,
      QC_label=QC_label,
      variant_type=variant_type,
      geno_missing_imputation=geno_missing_imputation,
      Annotation_dir=Annotation_dir,
      Annotation_name_catalog=Annotation_name_catalog,
      Use_annotation_weights=Use_annotation_weights,
      Annotation_name=Annotation_name,
      silent=silent
    ),
    silent=TRUE
  )
  outcome
}

#########################################################
#             Promoter_CAGE
#########################################################

# Build the promoter-CAGE variant table (dfPromCAGEVarGene.SNV) and the
# parallel variant id vector (variant.id.SNV.PromCAGE).
# Reads GDS_File, Annotation_dir, Annotation_name_catalog, QC_label and
# variant_type, which are defined outside this section.
# The statements below rely on the filter state of genofile, so their order matters.
agds.path <- GDS_File

# Open the genotype file; the same handle is reused by the sections that follow.
genofile <- seqOpen(agds.path)

# All variant ids (no filter is active yet) and the promoter regions from the hg38 gene model
varid <- seqGetData(genofile, "variant.id")
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
promGobj <- promoters(genes(txdb), upstream = 3000, downstream = 3000)

# Subset to promoter variants that are within +/-3kb of a TSS and have a CAGE signal:
# first keep variants whose CAGE annotation is non-empty, then intersect with the promoter ranges.
CAGEAnno <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="CAGE")]))
CAGEBvt <- CAGEAnno!=""
CAGEidx <- which(CAGEBvt,useNames=TRUE)
seqSetFilter(genofile,variant.id=varid[CAGEidx])
seqSetFilter(genofile,promGobj,intersect=TRUE)
# Gene label per remaining variant: the text of GENCODE.Info before the first "(", ",", ";" or "-".
# NOTE(review): anything after a "-" is dropped as well - confirm this is intended for hyphenated gene names.
CAGEpromgene <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Info")]))
CAGEGene <- unlist(lapply(strsplit(CAGEpromgene,"\\(|\\,|;|-"),`[[`,1))
## Obtain variant info (chromosome, position, ref, alt) for the currently filtered variants
CAGEvchr <- as.numeric(seqGetData(genofile,"chromosome"))
CAGEvpos <- as.numeric(seqGetData(genofile,"position"))
CAGEvref <- as.character(seqGetData(genofile,"$ref"))
CAGEvalt <- as.character(seqGetData(genofile,"$alt"))
dfPromCAGEVarGene <- data.frame(CAGEvchr,CAGEvpos,CAGEvref,CAGEvalt,CAGEGene)

# The full variant id vector is no longer needed; free it
rm(varid)
gc()

## Get SNV id
# The QC label is read under the same filter as the table above, so SNVlist
# lines up with the rows of dfPromCAGEVarGene.
filter <- seqGetData(genofile, QC_label)
# SNVlist is only defined when variant_type is "variant", "SNV" or "Indel".
# "variant": every variant whose QC label is "PASS"
if(variant_type=="variant"){
  SNVlist <- filter == "PASS"
}

# "SNV": PASS variants for which isSNV() is TRUE
if(variant_type=="SNV"){
  SNVlist <- (filter == "PASS") & isSNV(genofile)
}

# "Indel": PASS variants for which isSNV() is FALSE
if(variant_type=="Indel"){
  SNVlist <- (filter == "PASS") & (!isSNV(genofile))
}

# Variant ids under the current filter, restricted to the selected variants
variant.id <- seqGetData(genofile, "variant.id")
variant.id.SNV.PromCAGE <- variant.id[SNVlist]

# Keep the matching table rows and store position/ref/alt as character
dfPromCAGEVarGene.SNV <- dfPromCAGEVarGene[SNVlist,]
dfPromCAGEVarGene.SNV$CAGEvpos <- as.character(dfPromCAGEVarGene.SNV$CAGEvpos)
dfPromCAGEVarGene.SNV$CAGEvref <- as.character(dfPromCAGEVarGene.SNV$CAGEvref)
dfPromCAGEVarGene.SNV$CAGEvalt <- as.character(dfPromCAGEVarGene.SNV$CAGEvalt)

# Clear the filter before the next section reads from genofile
seqResetFilter(genofile)

# Drop the unfiltered table; only the .SNV subset is kept
rm(dfPromCAGEVarGene)
gc()

#########################################################
#             Promoter_DHS
#########################################################
# Builds the variant-to-gene table for promoter variants with a DHS (rOCRs) signal:
#   dfPromrOCRsVarGene.SNV   - chr / pos / ref / alt / gene label per kept variant
#   variant.id.SNV.PromrOCRs - variant ids of the same kept variants

# Promoter windows: 3 kb upstream and 3 kb downstream for every gene in txdb.
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
promGobj <- promoters(genes(txdb), upstream = 3000, downstream = 3000)

varid <- seqGetData(genofile, "variant.id")

# Step 1: keep variants whose DHS annotation is non-empty.
dhs_path <- paste0(Annotation_dir,
                   Annotation_name_catalog$dir[which(Annotation_name_catalog$name == "DHS")])
rOCRsAnno <- seqGetData(genofile, dhs_path)
rOCRsBvt <- rOCRsAnno != ""
rOCRsidx <- which(rOCRsBvt)
seqSetFilter(genofile, variant.id = varid[rOCRsidx])

# Step 2: intersect that selection with the promoter windows.
seqSetFilter(genofile, promGobj, intersect = TRUE)

# Gene label = first piece of GENCODE.Info when split on "(", ",", ";" or "-".
gencode_info_path <- paste0(Annotation_dir,
                            Annotation_name_catalog$dir[which(Annotation_name_catalog$name == "GENCODE.Info")])
rOCRspromgene <- seqGetData(genofile, gencode_info_path)
rOCRsGene <- unlist(lapply(strsplit(rOCRspromgene, "\\(|\\,|;|-"),
                           function(pieces) pieces[[1]]))

## Coordinates and alleles of the variants selected by the current filter
rOCRsvchr <- as.numeric(seqGetData(genofile, "chromosome"))
rOCRsvpos <- as.numeric(seqGetData(genofile, "position"))
rOCRsvref <- as.character(seqGetData(genofile, "$ref"))
rOCRsvalt <- as.character(seqGetData(genofile, "$alt"))
dfPromrOCRsVarGene <- data.frame(rOCRsvchr, rOCRsvpos, rOCRsvref, rOCRsvalt, rOCRsGene)

rm(varid)
gc()

## QC / variant-type mask over the variants selected by the current filter
filter <- seqGetData(genofile, QC_label)
passes_qc <- filter == "PASS"
if (variant_type == "variant") {
  SNVlist <- passes_qc
} else if (variant_type == "SNV") {
  SNVlist <- passes_qc & isSNV(genofile)
} else if (variant_type == "Indel") {
  SNVlist <- passes_qc & (!isSNV(genofile))
}

variant.id <- seqGetData(genofile, "variant.id")
variant.id.SNV.PromrOCRs <- variant.id[SNVlist]

# Keep the rows under the mask; position, ref and alt are stored as character.
dfPromrOCRsVarGene.SNV <- dfPromrOCRsVarGene[SNVlist, ]
text_cols <- c("rOCRsvpos", "rOCRsvref", "rOCRsvalt")
dfPromrOCRsVarGene.SNV[text_cols] <- lapply(dfPromrOCRsVarGene.SNV[text_cols], as.character)

# Clear the variant filter so the next section starts from the full file.
seqResetFilter(genofile)

rm(dfPromrOCRsVarGene)
gc()

#########################################################
#             Enhancer_CAGE
#########################################################
# Builds the variant-to-gene table for GeneHancer variants with a CAGE signal:
#   dfHancerCAGEVarGene.SNV   - chr / pos / ref / alt / linked gene per kept variant
#   variant.id.SNV.HancerCAGE - variant ids of the same kept variants

varid <- seqGetData(genofile, "variant.id")

# Masks over all variants: non-empty GeneHancer annotation, non-empty CAGE annotation.
genehancer_path <- paste0(Annotation_dir,
                          Annotation_name_catalog$dir[which(Annotation_name_catalog$name == "GeneHancer")])
genehancerAnno <- seqGetData(genofile, genehancer_path)
genehancer <- genehancerAnno != ""

cage_path <- paste0(Annotation_dir,
                    Annotation_name_catalog$dir[which(Annotation_name_catalog$name == "CAGE")])
CAGEAnno <- seqGetData(genofile, cage_path)
CAGE <- CAGEAnno != ""

# Keep only variants that have both annotations.
CAGEGeneHancervt <- CAGE & genehancer
CAGEGeneHanceridx <- which(CAGEGeneHancervt)
seqSetFilter(genofile, variant.id = varid[CAGEGeneHanceridx])

# GeneHancer annotation of the kept variants. The linked gene is taken as the
# text of the 4th "="-separated field up to its first ";".
genehancerSet <- seqGetData(genofile, genehancer_path)
enhancerGene <- unlist(lapply(strsplit(genehancerSet, "="),
                              function(fields) fields[[4]]))
enhancer2GENE <- unlist(lapply(strsplit(enhancerGene, ";"),
                               function(fields) fields[[1]]))

## Coordinates and alleles of the variants selected by the current filter
enhancervchr <- as.numeric(seqGetData(genofile, "chromosome"))
enhancervpos <- as.numeric(seqGetData(genofile, "position"))
enhancervref <- as.character(seqGetData(genofile, "$ref"))
enhancervalt <- as.character(seqGetData(genofile, "$alt"))
dfHancerCAGEVarGene <- data.frame(enhancervchr, enhancervpos, enhancervref, enhancervalt, enhancer2GENE)

rm(varid)
gc()

## QC / variant-type mask over the variants selected by the current filter
filter <- seqGetData(genofile, QC_label)
passes_qc <- filter == "PASS"
if (variant_type == "variant") {
  SNVlist <- passes_qc
} else if (variant_type == "SNV") {
  SNVlist <- passes_qc & isSNV(genofile)
} else if (variant_type == "Indel") {
  SNVlist <- passes_qc & (!isSNV(genofile))
}

variant.id <- seqGetData(genofile, "variant.id")
variant.id.SNV.HancerCAGE <- variant.id[SNVlist]

# Keep the rows under the mask; position, ref and alt are stored as character.
dfHancerCAGEVarGene.SNV <- dfHancerCAGEVarGene[SNVlist, ]
text_cols <- c("enhancervpos", "enhancervref", "enhancervalt")
dfHancerCAGEVarGene.SNV[text_cols] <- lapply(dfHancerCAGEVarGene.SNV[text_cols], as.character)

# Clear the variant filter so the next section starts from the full file.
seqResetFilter(genofile)

rm(dfHancerCAGEVarGene)
gc()

#########################################################
#             Enhancer_DHS
#########################################################
# Builds the variant-to-gene table for GeneHancer variants with a DHS (rOCRs) signal:
#   dfHancerrOCRsVarGene.SNV   - chr / pos / ref / alt / linked gene per kept variant
#   variant.id.SNV.HancerrOCRs - variant ids of the same kept variants

varid <- seqGetData(genofile, "variant.id")

# Masks over all variants: non-empty GeneHancer annotation, non-empty DHS annotation.
genehancer_path <- paste0(Annotation_dir,
                          Annotation_name_catalog$dir[which(Annotation_name_catalog$name == "GeneHancer")])
genehancerAnno <- seqGetData(genofile, genehancer_path)
genehancer <- genehancerAnno != ""

dhs_path <- paste0(Annotation_dir,
                   Annotation_name_catalog$dir[which(Annotation_name_catalog$name == "DHS")])
rOCRsAnno <- seqGetData(genofile, dhs_path)
rOCRs <- rOCRsAnno != ""

# Keep only variants that have both annotations.
rOCRsGeneHancervt <- rOCRs & genehancer
rOCRsGeneHanceridx <- which(rOCRsGeneHancervt)
seqSetFilter(genofile, variant.id = varid[rOCRsGeneHanceridx])

# GeneHancer annotation of the kept variants. The linked gene is taken as the
# text of the 4th "="-separated field up to its first ";".
genehancerSet <- seqGetData(genofile, genehancer_path)
enhancerGene <- unlist(lapply(strsplit(genehancerSet, "="),
                              function(fields) fields[[4]]))
enhancer2GENE <- unlist(lapply(strsplit(enhancerGene, ";"),
                               function(fields) fields[[1]]))

## Coordinates and alleles of the variants selected by the current filter
enhancervchr <- as.numeric(seqGetData(genofile, "chromosome"))
enhancervpos <- as.numeric(seqGetData(genofile, "position"))
enhancervref <- as.character(seqGetData(genofile, "$ref"))
enhancervalt <- as.character(seqGetData(genofile, "$alt"))
dfHancerrOCRsVarGene <- data.frame(enhancervchr, enhancervpos, enhancervref, enhancervalt, enhancer2GENE)

rm(varid)
gc()

## QC / variant-type mask over the variants selected by the current filter
filter <- seqGetData(genofile, QC_label)
passes_qc <- filter == "PASS"
if (variant_type == "variant") {
  SNVlist <- passes_qc
} else if (variant_type == "SNV") {
  SNVlist <- passes_qc & isSNV(genofile)
} else if (variant_type == "Indel") {
  SNVlist <- passes_qc & (!isSNV(genofile))
}

variant.id <- seqGetData(genofile, "variant.id")
variant.id.SNV.HancerrOCRs <- variant.id[SNVlist]

# Keep the rows under the mask; position, ref and alt are stored as character.
dfHancerrOCRsVarGene.SNV <- dfHancerrOCRsVarGene[SNVlist, ]
text_cols <- c("enhancervpos", "enhancervref", "enhancervalt")
dfHancerrOCRsVarGene.SNV[text_cols] <- lapply(dfHancerrOCRsVarGene.SNV[text_cols], as.character)

# Clear the variant filter before the per-gene analysis loop.
seqResetFilter(genofile)

rm(dfHancerrOCRsVarGene)
gc()

# Run the noncoding gene-centric analysis for every gene index in sub_seq_id,
# collecting one entry per gene in results_noncoding (in iteration order),
# then close the GDS file and save the collected results.
results_noncoding <- list()
count <- 1
for (kk in sub_seq_id) {
  print(kk)
  gene_name <- genes_info_chr[kk, 1]
  results <- gene_centric_noncoding_dnanexus(
    genes_info_chr = genes_info_chr, gene_name = gene_name, chr = chr,
    genofile = genofile, obj_nullmodel = obj_nullmodel,
    dfPromCAGEVarGene.SNV = dfPromCAGEVarGene.SNV, variant.id.SNV.PromCAGE = variant.id.SNV.PromCAGE,
    dfPromrOCRsVarGene.SNV = dfPromrOCRsVarGene.SNV, variant.id.SNV.PromrOCRs = variant.id.SNV.PromrOCRs,
    dfHancerCAGEVarGene.SNV = dfHancerCAGEVarGene.SNV, variant.id.SNV.HancerCAGE = variant.id.SNV.HancerCAGE,
    dfHancerrOCRsVarGene.SNV = dfHancerrOCRsVarGene.SNV, variant.id.SNV.HancerrOCRs = variant.id.SNV.HancerrOCRs,
    rare_maf_cutoff = 0.01, QC_label = QC_label, variant_type = variant_type,
    geno_missing_imputation = geno_missing_imputation,
    Annotation_dir = Annotation_dir, Annotation_name_catalog = Annotation_name_catalog,
    Use_annotation_weights = Use_annotation_weights, Annotation_name = Annotation_name,
    silent = TRUE
  )

  # The wrapper evaluates the analysis inside try(..., silent=TRUE), so a gene
  # that errors comes back as a "try-error" object with nothing printed.
  # Report it in the job log; the object is still stored exactly as before.
  if (inherits(results, "try-error")) {
    message("Noncoding analysis failed for gene ", gene_name, " (index ", kk, "): ",
            conditionMessage(attr(results, "condition")))
  }

  results_noncoding[[count]] <- results
  count <- count + 1
}

seqClose(genofile)

# Drop every object except those listed, then write the results file.
rm(list = setdiff(ls(), c("results_noncoding", "output_file_name", "arrayid", "agds.path", "trait")))
gc()
save(results_noncoding, file = paste0(output_file_name, "_", arrayid, ".Rdata"))

Overwriting Noncoding_Analysis.R


In [99]:
%%writefile Noncoding_Analysis.sh
#!/bin/bash
#
# Runs the R script named by R_Script with six positional arguments:
#   arrayid, trait, Train_Null_Model, Annotation_name_catalog, GDS_File, OUTPUT_PATH
# All of them are read from environment variables that the caller must set.

# Abort on the first failing command and on any reference to an unset variable.
set -o errexit
set -o nounset

# Every expansion is quoted so a value containing spaces or glob characters is
# passed as exactly one argument, and an empty value cannot shift the
# positions of the arguments after it.
Rscript "${R_Script}" \
    "${arrayid}" \
    "${trait}" \
    "${Train_Null_Model}" \
    "${Annotation_name_catalog}" \
    "${GDS_File}" \
    "${OUTPUT_PATH}"

Overwriting Noncoding_Analysis.sh


In [100]:
%%writefile score_task.R

# Builds the task table (one row per trait x arrayid) and writes it to
# score_task.txt as a tab-separated file with a header row. The column names
# are the literal flag strings ('--env ...', '--input ...', '--output-recursive ...').

# Exclusive upper bounds of the arrayid range for chromosomes 1..21:
# arrayid < 41 -> chr 1, 41..65 -> chr 2, ..., 366..370 -> chr 21.
# Any arrayid >= 371 maps to chromosome 22.
chr_upper_bounds <- c(41, 66, 87, 102, 120, 141, 159, 173, 189, 204, 230,
                      250, 257, 269, 281, 298, 321, 327, 355, 366, 371)

traits <- c("BMI","LDL","HDL","logTG","Height","TC")
# Only array 379 is scheduled by this script.
arrayids <- 379

bucket_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e"

# expand.grid varies its first factor fastest, so rows are ordered by trait
# and, within a trait, by arrayid.
task_grid <- expand.grid(arrayid = arrayids, trait = traits, stringsAsFactors = FALSE)

# findInterval counts how many bounds are <= arrayid; adding 1 gives the chromosome.
chr <- findInterval(task_grid$arrayid, chr_upper_bounds) + 1

tasks <- data.frame(
    '--env arrayid' = task_grid$arrayid,
    '--env trait' = task_grid$trait,
    '--input Train_Null_Model' = paste0(bucket_root, "/JW/AoU_Phenotypes/Results/Continuous/NullModels/", task_grid$trait, "_Train_Null_Model.RData"),
    '--input Annotation_name_catalog' = paste0(bucket_root, "/dataGDS/acaf_threshold_v7/Annotation_name_catalog.csv"),
    '--input GDS_File' = paste0(bucket_root, "/dataAGDS/acaf_threshold_v7/acaf_threshold.chr", chr, ".gds"),
    '--input R_Script' = paste0(bucket_root, "/JW/AoU_Phenotypes/Scripts/Noncoding_Analysis.R"),
    '--output-recursive OUTPUT_PATH' = paste0(bucket_root, "/JW/AoU_Phenotypes/Results/Continuous/GeneCentricNoncoding"),
    check.names = FALSE,
    stringsAsFactors = FALSE
)

write.table(tasks,
            file = "score_task.txt",
            row.names = FALSE, col.names = TRUE,
            sep = '\t', quote = FALSE)

Overwriting score_task.R


In [7]:
%%writefile Coding_Analysis.R
# Start from an empty workspace.
rm(list=ls())
gc()

# Positional command-line arguments (trailing args only), read once:
#   1 arrayid (converted to numeric), 2 trait, 3 Train_Null_Model,
#   4 Annotation_name_catalog, 5 GDS_File, 6 OUTPUT_PATH
# NOTE(review): arguments 3-6 are presumably file/directory paths supplied by
# the job wrapper; confirm against the task table that launches this script.
cli_args <- commandArgs(TRUE)

arrayid                 <- as.numeric(cli_args[1])
trait                   <- cli_args[2]
Train_Null_Model        <- cli_args[3]
Annotation_name_catalog <- cli_args[4]
GDS_File                <- cli_args[5]
OUTPUT_PATH             <- cli_args[6]

# Echo the string arguments to the log, in positional order.
print(trait)
print(Train_Null_Model)
print(Annotation_name_catalog)
print(GDS_File)
print(OUTPUT_PATH)

## load required packages
library(gdsfmt)
library(SeqArray)
library(SeqVarTools)
library(STAAR)
library(STAARpipeline)
library(STAARpipelineSummary)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(readr)
library(dplyr)
library(stringr)

coding <- function(chr,gene_name,genofile,obj_nullmodel,genes,
                   rare_maf_cutoff=0.01,rv_num_cutoff=2,
                   QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                   Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                   Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                   SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA))
  {
    use_SPA <- obj_nullmodel$use_SPA
  }else
  {
    use_SPA <- FALSE
  }
  
  ## get SNV id, position, REF, ALT (whole genome)
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant")
  {
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV")
  {
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel")
  {
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  rm(position)
  gc()
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Gencode
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  ## Meta.SVM.Pred
  MetaSVM_pred <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="MetaSVM")]))
  
  ################################################
  #           Coding
  ################################################
  variant.id.gene <- seqGetData(genofile, "variant.id")
  lof.in.coding <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|(GENCODE.Category=="ncRNA_splicing")|(GENCODE.Category=="ncRNA_exonic;splicing")|(GENCODE.EXONIC.Category=="nonsynonymous SNV")|(GENCODE.EXONIC.Category=="synonymous SNV")
  variant.id.gene <- variant.id.gene[lof.in.coding]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Gencode
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  ## Meta.SVM.Pred
  MetaSVM_pred <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="MetaSVM")]))
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV")
  {
    if(Use_annotation_weights)
    {
      for(k in 1:length(Annotation_name))
      {
        if(Annotation_name[k]%in%Annotation_name_catalog$name)
        {
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD")
          {
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ################################################
  #                  plof_ds
  ################################################
  variant.id.gene <- seqGetData(genofile, "variant.id")
  lof.in.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|(GENCODE.Category=="ncRNA_splicing")|(GENCODE.Category=="ncRNA_exonic;splicing")|((GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D"))
  variant.id.gene.category <- variant.id.gene[lof.in.plof]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.plof,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_plof_ds <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "plof_ds"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_plof_ds <- rbind(results_plof_ds,results_temp)
  }
  
  if(!is.null(results_plof_ds))
  {
    if(!use_SPA)
    {
      colnames(results_plof_ds) <- colnames(results_plof_ds, do.NULL = FALSE, prefix = "col")
      colnames(results_plof_ds)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_plof_ds)[(dim(results_plof_ds)[2]-1):dim(results_plof_ds)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results_plof_ds) <- colnames(results_plof_ds, do.NULL = FALSE, prefix = "col")
      colnames(results_plof_ds)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_plof_ds)[dim(results_plof_ds)[2]] <- c("STAAR-B")
      
    }
  }
  
  #####################################################
  #                      plof
  #####################################################
  lof.in.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|(GENCODE.Category=="ncRNA_splicing")|(GENCODE.Category=="ncRNA_exonic;splicing")
  variant.id.gene.category <- variant.id.gene[lof.in.plof]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.plof,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_plof <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "plof"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_plof <- rbind(results_plof,results_temp)
  }
  
  if(!is.null(results_plof))
  {
    if(!use_SPA)
    {
      colnames(results_plof) <- colnames(results_plof, do.NULL = FALSE, prefix = "col")
      colnames(results_plof)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_plof)[(dim(results_plof)[2]-1):dim(results_plof)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results_plof) <- colnames(results_plof, do.NULL = FALSE, prefix = "col")
      colnames(results_plof)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_plof)[dim(results_plof)[2]] <- c("STAAR-B")
    }
  }
  
  #############################################
  #             synonymous
  #############################################
  lof.in.synonymous <- (GENCODE.EXONIC.Category=="synonymous SNV")
  variant.id.gene.category <- variant.id.gene[lof.in.synonymous]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.synonymous,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_synonymous <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "synonymous"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_synonymous <- rbind(results_synonymous,results_temp)
  }
  
  if(!is.null(results_synonymous))
  {
    if(!use_SPA)
    {
      colnames(results_synonymous) <- colnames(results_synonymous, do.NULL = FALSE, prefix = "col")
      colnames(results_synonymous)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_synonymous)[(dim(results_synonymous)[2]-1):dim(results_synonymous)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results_synonymous) <- colnames(results_synonymous, do.NULL = FALSE, prefix = "col")
      colnames(results_synonymous)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_synonymous)[dim(results_synonymous)[2]] <- c("STAAR-B")
    }
    
  }
  
  #################################################
  #        missense
  #################################################
  lof.in.missense <- (GENCODE.EXONIC.Category=="nonsynonymous SNV")
  variant.id.gene.category <- variant.id.gene[lof.in.missense]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.missense,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "missense"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  #################################################
  #         disruptive missense
  #################################################
  lof.in.dmissense <- (GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D")
  variant.id.gene.category <- variant.id.gene[lof.in.dmissense]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.dmissense,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "disruptive_missense"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results))
  {
    if(!use_SPA)
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[dim(results)[2]] <- c("STAAR-B")
    }
    
    if(dim(results)[1]==1)
    {
      if(results[3]!="disruptive_missense")
      {
        if(!use_SPA)
        {
          results <- cbind(results,matrix(1,1,6))
          colnames(results)[(dim(results)[2]-5):dim(results)[2]] <- c("SKAT(1,25)-Disruptive","SKAT(1,1)-Disruptive","Burden(1,25)-Disruptive","Burden(1,1)-Disruptive","ACAT-V(1,25)-Disruptive","ACAT-V(1,1)-Disruptive")
          results_missense <- results
          results_ds <- c()
        }else{
          results <- cbind(results,matrix(1,1,2))
          colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("Burden(1,25)-Disruptive","Burden(1,1)-Disruptive")
          results_missense <- results
          results_ds <- c()
        }
      }else
      {
        results_missense <- c()
        results_ds <- results
        results <- c()
      }
    }
    
    if(!is.null(results))
    {
      if(dim(results)[1]==2)
      {
        if(!use_SPA)
        {
          results_m <- c(results[1,],rep(0,6))
          names(results_m)[(length(results_m)-5):length(results_m)] <- c("SKAT(1,25)-Disruptive","SKAT(1,1)-Disruptive","Burden(1,25)-Disruptive","Burden(1,1)-Disruptive","ACAT-V(1,25)-Disruptive","ACAT-V(1,1)-Disruptive")
          results_m[(length(results_m)-5):length(results_m)] <- results[2,c("SKAT(1,25)","SKAT(1,1)","Burden(1,25)","Burden(1,1)","ACAT-V(1,25)","ACAT-V(1,1)")]
          apc_num <- (length(results_m)-19)/6
          p_seq <- c(1:apc_num,1:apc_num+(apc_num+1),1:apc_num+2*(apc_num+1),1:apc_num+3*(apc_num+1),1:apc_num+4*(apc_num+1),1:apc_num+5*(apc_num+1),(6*apc_num+9):(6*apc_num+14))
          results_m["STAAR-O"] <- CCT(as.numeric(results_m[6:length(results_m)][p_seq]))
          results_m["STAAR-S(1,25)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num,6*apc_num+9)]))
          results_m["STAAR-S(1,1)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+(apc_num+1),6*apc_num+10)]))
          results_m["STAAR-B(1,25)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+2*(apc_num+1),6*apc_num+11)]))
          results_m["STAAR-B(1,1)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+3*(apc_num+1),6*apc_num+12)]))
          results_m["STAAR-A(1,25)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+4*(apc_num+1),6*apc_num+13)]))
          results_m["STAAR-A(1,1)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+5*(apc_num+1),6*apc_num+14)]))
          
          results_ds <- c()
          results_ds <- rbind(results_ds,results[2,])
          
          results <- c()
          results <- rbind(results,results_m)
        }else
        {
          results_m <- c(results[1,],rep(0,2))
          names(results_m)[(length(results_m)-1):length(results_m)] <- c("Burden(1,25)-Disruptive","Burden(1,1)-Disruptive")
          results_m[(length(results_m)-1):length(results_m)] <- results[2,c("Burden(1,25)","Burden(1,1)")]
          
          ## check whether the p-values is NA. If so, set NA equals 1.
          if(is.na(results_m[(length(results_m)-1)]))
          {
            results_m[(length(results_m)-1)] <- 1
          }
          
          if(is.na(results_m[length(results_m)]))
          {
            results_m[length(results_m)] <- 1
          }
          
          apc_num <- (length(results_m)-10)/2
          p_seq <- c(1:apc_num,1:apc_num+(apc_num+1),(length(results_m)-6):(length(results_m)-5))
          
          ## calculate STAAR-B
          pvalues_sub <- as.numeric(results_m[6:length(results_m)][p_seq])
          if(sum(is.na(pvalues_sub))>0)
          {
            if(sum(is.na(pvalues_sub))==length(pvalues_sub))
            {
              results_m["STAAR-B"] <- 1
            }else
            {
              ## not all NAs
              pvalues_sub <- pvalues_sub[!is.na(pvalues_sub)]
              if(sum(pvalues_sub[pvalues_sub<1])>0)
              {
                ## not all ones
                results_m["STAAR-B"] <- CCT(pvalues_sub[pvalues_sub<1])
                
              }else
              {
                results_m["STAAR-B"] <- 1
                
              }
            }
          }else
          {
            if(sum(pvalues_sub[pvalues_sub<1])>0)
            {
              results_m["STAAR-B"] <- CCT(pvalues_sub[pvalues_sub<1])
            }else
            {
              results_m["STAAR-B"] <- 1
            }
          }
          
          ## calculate STAAR-B(1,25)
          pvalues_sub <- as.numeric(results_m[6:length(results_m)][c(1:apc_num,(length(results_m)-6))])
          if(sum(is.na(pvalues_sub))>0)
          {
            if(sum(is.na(pvalues_sub))==length(pvalues_sub))
            {
              results_m["STAAR-B(1,25)"] <- 1
            }else
            {
              ## not all NAs
              pvalues_sub <- pvalues_sub[!is.na(pvalues_sub)]
              if(sum(pvalues_sub[pvalues_sub<1])>0)
              {
                ## not all ones
                results_m["STAAR-B(1,25)"] <- CCT(pvalues_sub[pvalues_sub<1])
                
              }else
              {
                results_m["STAAR-B(1,25)"] <- 1
                
              }
            }
          }else
          {
            if(sum(pvalues_sub[pvalues_sub<1])>0)
            {
              results_m["STAAR-B(1,25)"] <- CCT(pvalues_sub[pvalues_sub<1])
            }else
            {
              results_m["STAAR-B(1,25)"] <- 1
            }
          }
          
          ## calculate STAAR-B(1,1)
          pvalues_sub <- as.numeric(results_m[6:length(results_m)][c(1:apc_num+(apc_num+1),(length(results_m)-5))])
          if(sum(is.na(pvalues_sub))>0)
          {
            if(sum(is.na(pvalues_sub))==length(pvalues_sub))
            {
              results_m["STAAR-B(1,1)"] <- 1
            }else
            {
              ## not all NAs
              pvalues_sub <- pvalues_sub[!is.na(pvalues_sub)]
              if(sum(pvalues_sub[pvalues_sub<1])>0)
              {
                ## not all ones
                results_m["STAAR-B(1,1)"] <- CCT(pvalues_sub[pvalues_sub<1])
                
              }else
              {
                results_m["STAAR-B(1,1)"] <- 1
                
              }
            }
          }else
          {
            if(sum(pvalues_sub[pvalues_sub<1])>0)
            {
              results_m["STAAR-B(1,1)"] <- CCT(pvalues_sub[pvalues_sub<1])
            }else
            {
              results_m["STAAR-B(1,1)"] <- 1
            }
          }
          
          results_ds <- c()
          results_ds <- rbind(results_ds,results[2,])
          
          results <- c()
          results <- rbind(results,results_m)
        }
      }
    }
  }else
  {
    results <- c()
    results_ds <- c()
  }
  
  results_coding <- list(plof=results_plof,plof_ds=results_plof_ds,missense=results,disruptive_missense=results_ds,synonymous=results_synonymous)
  
  seqResetFilter(genofile)
  
  return(results_coding)
}

## Gene-centric test of putative loss-of-function (pLoF) variants in a single gene.
##
## Variants are kept when they pass the QC filter, match `variant_type`, lie between the
## start and end positions of `gene_name`, and are annotated as
## GENCODE.EXONIC.Category in {"stopgain","stoploss"} or
## GENCODE.Category in {"splicing","exonic;splicing","ncRNA_splicing","ncRNA_exonic;splicing"}.
##
## Arguments:
##   chr                      value written to the "Chr" column of the result
##   gene_name                gene to test; matched against column 1 of `genes`
##   genofile                 open genotype/annotation file read through seqGetData()/seqSetFilter()
##   obj_nullmodel            null model object; `id_include`, `n.pheno` and (if present) `use_SPA` are read
##   genes                    gene table: column 1 = gene name, column 3 = start position, column 4 = end position
##   rare_maf_cutoff,
##   rv_num_cutoff            passed through unchanged to STAAR / STAAR_Binary_SPA / MultiSTAAR
##   QC_label                 node holding the QC filter; only entries equal to "PASS" are kept
##   variant_type             "SNV", "Indel" or "variant" (no SNV/indel restriction)
##   geno_missing_imputation  "mean" -> matrix_flip_mean(), "minor" -> matrix_flip_minor()
##   Annotation_dir           prefix prepended to the `dir` entries of `Annotation_name_catalog`
##   Annotation_name_catalog  table with columns `name` and `dir` locating each annotation node
##   Use_annotation_weights   if TRUE (and variant_type is "SNV") annotation scores are read and passed
##                            to the test; only the first element is used
##   Annotation_name          names of the annotations to use as weights; names missing from the catalog are skipped
##   SPA_p_filter,
##   p_filter_cutoff          passed through unchanged to STAAR_Binary_SPA
##   silent                   forwarded to try(); TRUE suppresses the error message of a failed test
##
## Returns:
##   NULL when the test call did not return a list (e.g. it raised an error, which try() absorbs);
##   otherwise a one-row result built with rbind() whose first five columns are
##   "Gene name","Chr","Category" (= "plof"),"#SNV","cMAC" and whose last columns are
##   "ACAT-O","STAAR-O" (no SPA) or "STAAR-B" (SPA).
##
## Side effects:
##   Sets variant/sample filters on `genofile`; the filter is reset when the function exits,
##   including when it exits through an error.
plof <- function(chr,gene_name,genofile,obj_nullmodel,genes,
                 rare_maf_cutoff=0.01,rv_num_cutoff=2,
                 QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                 Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                 Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                 SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status (a null model without the flag is treated as "no SPA")
  if(!is.null(obj_nullmodel$use_SPA))
  {
    use_SPA <- obj_nullmodel$use_SPA
  }else
  {
    use_SPA <- FALSE
  }
  
  ## QC filter and variant-type mask over every variant currently visible in the file
  filter <- seqGetData(genofile, QC_label)
  SNVlist <- switch(variant_type,
                    variant = filter == "PASS",
                    SNV = (filter == "PASS") & isSNV(genofile),
                    Indel = (filter == "PASS") & (!isSNV(genofile)))
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  ## From here on the file carries a filter; make sure it is cleared on every exit path
  ## so that an error below does not leave `genofile` restricted for the next caller.
  on.exit(seqResetFilter(genofile), add=TRUE)
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## plof
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Gencode
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  
  ## narrow the in-gene variants down to the pLoF categories
  variant.id.gene <- seqGetData(genofile, "variant.id")
  lof.in.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|(GENCODE.Category=="ncRNA_splicing")|(GENCODE.Category=="ncRNA_exonic;splicing")
  variant.id.gene <- variant.id.gene[lof.in.plof]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## genotype id
  ## Map every phenotype id to its row index among the file's samples so that the
  ## genotype rows can be put in the same order as `phenotype.id`.
  id.genotype <- seqGetData(genofile,"sample.id")
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  ## Stays NULL (no annotation weights) unless variant_type is "SNV" and weights are requested.
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV")
  {
    ## The signature default is the length-2 vector c(TRUE,FALSE). `if()` requires a
    ## length-one condition (anything longer is an error since R 4.2.0), so only the
    ## first element is used, which is what older R versions did with a warning.
    Use_annotation_weights <- as.logical(Use_annotation_weights[1])
    
    if(Use_annotation_weights)
    {
      ## seq_along() yields an empty loop for Annotation_name=NULL, whereas
      ## 1:length(NULL) would iterate over c(1,0) and fail inside the loop.
      for(k in seq_along(Annotation_name))
      {
        if(Annotation_name[k]%in%Annotation_name_catalog$name)
        {
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          ## missing CADD scores are replaced by 0
          if(Annotation_name[k]=="CADD")
          {
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          ## aPC.LocalDiversity contributes two columns: the score itself and the
          ## transformed score -10*log10(1-10^(-score/10)), named with a "(-)" suffix
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ## Run the test. `pvalues` keeps the sentinel 0 when the call fails, because try()
  ## absorbs the error before the assignment happens.
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  ## Assemble the result row only when the test returned a list
  results <- c()
  if(inherits(pvalues, "list"))
  {
    ## start from the gene's row, then overwrite the first four entries
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "plof"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results))
  {
    ## keep existing column names, give unnamed columns a "col<i>" name, then label
    ## the fixed leading and trailing columns
    colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
    colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
    if(!use_SPA)
    {
      colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results)[dim(results)[2]] <- c("STAAR-B")
    }
  }
  
  ## the filter on `genofile` is reset by the on.exit() handler registered above
  return(results)
}

## Gene-centric test of putative loss-of-function plus disruptive missense variants
## ("plof_ds") in a single gene.
##
## Variants are kept when they pass the QC filter, match `variant_type`, lie between the
## start and end positions of `gene_name`, and are annotated as
## GENCODE.EXONIC.Category in {"stopgain","stoploss"}, or
## GENCODE.Category in {"splicing","exonic;splicing","ncRNA_splicing","ncRNA_exonic;splicing"}, or
## GENCODE.EXONIC.Category == "nonsynonymous SNV" together with MetaSVM == "D".
##
## Arguments:
##   chr                      value written to the "Chr" column of the result
##   gene_name                gene to test; matched against column 1 of `genes`
##   genofile                 open genotype/annotation file read through seqGetData()/seqSetFilter()
##   obj_nullmodel            null model object; `id_include`, `n.pheno` and (if present) `use_SPA` are read
##   genes                    gene table: column 1 = gene name, column 3 = start position, column 4 = end position
##   rare_maf_cutoff,
##   rv_num_cutoff            passed through unchanged to STAAR / STAAR_Binary_SPA / MultiSTAAR
##   QC_label                 node holding the QC filter; only entries equal to "PASS" are kept
##   variant_type             "SNV", "Indel" or "variant" (no SNV/indel restriction)
##   geno_missing_imputation  "mean" -> matrix_flip_mean(), "minor" -> matrix_flip_minor()
##   Annotation_dir           prefix prepended to the `dir` entries of `Annotation_name_catalog`
##   Annotation_name_catalog  table with columns `name` and `dir` locating each annotation node
##   Use_annotation_weights   if TRUE (and variant_type is "SNV") annotation scores are read and passed
##                            to the test; only the first element is used
##   Annotation_name          names of the annotations to use as weights; names missing from the catalog are skipped
##   SPA_p_filter,
##   p_filter_cutoff          passed through unchanged to STAAR_Binary_SPA
##   silent                   forwarded to try(); TRUE suppresses the error message of a failed test
##
## Returns:
##   NULL when the test call did not return a list (e.g. it raised an error, which try() absorbs);
##   otherwise a one-row result built with rbind() whose first five columns are
##   "Gene name","Chr","Category" (= "plof_ds"),"#SNV","cMAC" and whose last columns are
##   "ACAT-O","STAAR-O" (no SPA) or "STAAR-B" (SPA).
##
## Side effects:
##   Sets variant/sample filters on `genofile`; the filter is reset when the function exits,
##   including when it exits through an error.
plof_ds <- function(chr,gene_name,genofile,obj_nullmodel,genes,
                    rare_maf_cutoff=0.01,rv_num_cutoff=2,
                    QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                    Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                    Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                    SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status (a null model without the flag is treated as "no SPA")
  if(!is.null(obj_nullmodel$use_SPA))
  {
    use_SPA <- obj_nullmodel$use_SPA
  }else
  {
    use_SPA <- FALSE
  }
  
  ## QC filter and variant-type mask over every variant currently visible in the file
  filter <- seqGetData(genofile, QC_label)
  SNVlist <- switch(variant_type,
                    variant = filter == "PASS",
                    SNV = (filter == "PASS") & isSNV(genofile),
                    Indel = (filter == "PASS") & (!isSNV(genofile)))
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  ## From here on the file carries a filter; make sure it is cleared on every exit path
  ## so that an error below does not leave `genofile` restricted for the next caller.
  on.exit(seqResetFilter(genofile), add=TRUE)
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## plof_ds
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Gencode
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  ## Meta.SVM.Pred
  MetaSVM_pred <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="MetaSVM")]))
  
  ## narrow the in-gene variants down to pLoF categories or disruptive missense
  variant.id.gene <- seqGetData(genofile, "variant.id")
  lof.in.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|(GENCODE.Category=="ncRNA_splicing")|(GENCODE.Category=="ncRNA_exonic;splicing")|((GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D"))
  variant.id.gene <- variant.id.gene[lof.in.plof]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## genotype id
  ## Map every phenotype id to its row index among the file's samples so that the
  ## genotype rows can be put in the same order as `phenotype.id`.
  id.genotype <- seqGetData(genofile,"sample.id")
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  ## Stays NULL (no annotation weights) unless variant_type is "SNV" and weights are requested.
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV")
  {
    ## The signature default is the length-2 vector c(TRUE,FALSE). `if()` requires a
    ## length-one condition (anything longer is an error since R 4.2.0), so only the
    ## first element is used, which is what older R versions did with a warning.
    Use_annotation_weights <- as.logical(Use_annotation_weights[1])
    
    if(Use_annotation_weights)
    {
      ## seq_along() yields an empty loop for Annotation_name=NULL, whereas
      ## 1:length(NULL) would iterate over c(1,0) and fail inside the loop.
      for(k in seq_along(Annotation_name))
      {
        if(Annotation_name[k]%in%Annotation_name_catalog$name)
        {
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          ## missing CADD scores are replaced by 0
          if(Annotation_name[k]=="CADD")
          {
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          ## aPC.LocalDiversity contributes two columns: the score itself and the
          ## transformed score -10*log10(1-10^(-score/10)), named with a "(-)" suffix
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ## Run the test. `pvalues` keeps the sentinel 0 when the call fails, because try()
  ## absorbs the error before the assignment happens.
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else
    {
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  ## Assemble the result row only when the test returned a list
  results <- c()
  if(inherits(pvalues, "list"))
  {
    ## start from the gene's row, then overwrite the first four entries
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "plof_ds"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results))
  {
    ## keep existing column names, give unnamed columns a "col<i>" name, then label
    ## the fixed leading and trailing columns
    colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
    colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
    if(!use_SPA)
    {
      colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results)[dim(results)[2]] <- c("STAAR-B")
    }
  }
  
  ## the filter on `genofile` is reset by the on.exit() handler registered above
  return(results)
}

missense <- function(chr,gene_name,genofile,obj_nullmodel,genes,
                     rare_maf_cutoff=0.01,rv_num_cutoff=2,
                     QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                     Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                     Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                     SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA))
  {
    use_SPA <- obj_nullmodel$use_SPA
  }else
  {
    use_SPA <- FALSE
  }
  
  ## get SNV id, position, REF, ALT (whole genome)
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant")
  {
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV")
  {
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel")
  {
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  ############################################################
  #                      missense
  ############################################################
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  
  variant.id.gene <- seqGetData(genofile, "variant.id")
  lof.in.missense <- (GENCODE.EXONIC.Category=="nonsynonymous SNV")
  variant.id.gene <- variant.id.gene[lof.in.missense]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV")
  {
    if(Use_annotation_weights)
    {
      for(k in 1:length(Annotation_name))
      {
        if(Annotation_name[k]%in%Annotation_name_catalog$name)
        {
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD")
          {
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "missense"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  ############################################################
  #                      disruptive missense
  ############################################################
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Meta.SVM.Pred
  MetaSVM_pred <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="MetaSVM")]))
  
  variant.id.gene <- seqGetData(genofile, "variant.id")
  lof.in.dmissense <- (GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D")
  variant.id.gene <- variant.id.gene[lof.in.dmissense]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV")
  {
    if(Use_annotation_weights)
    {
      for(k in 1:length(Annotation_name))
      {
        if(Annotation_name[k]%in%Annotation_name_catalog$name)
        {
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD")
          {
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "disruptive_missense"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results))
  {
    if(!use_SPA)
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[dim(results)[2]] <- c("STAAR-B")
    }
    
    if(dim(results)[1]==1)
    {
      if(results[3]!="disruptive_missense")
      {
        if(!use_SPA)
        {
          results <- cbind(results,matrix(1,1,6))
          colnames(results)[(dim(results)[2]-5):dim(results)[2]] <- c("SKAT(1,25)-Disruptive","SKAT(1,1)-Disruptive","Burden(1,25)-Disruptive","Burden(1,1)-Disruptive","ACAT-V(1,25)-Disruptive","ACAT-V(1,1)-Disruptive")
        }else{
          results <- cbind(results,matrix(1,1,2))
          colnames(results)[(dim(results)[2]-5):dim(results)[2]] <- c("Burden(1,25)-Disruptive","Burden(1,1)-Disruptive")
        }
      }else
      {
        results <- c()
      }
    }
    
    if(!is.null(results))
    {
      if(dim(results)[1]==2)
      {
        if(!use_SPA)
        {
          results_m <- c(results[1,],rep(0,6))
          names(results_m)[(length(results_m)-5):length(results_m)] <- c("SKAT(1,25)-Disruptive","SKAT(1,1)-Disruptive","Burden(1,25)-Disruptive","Burden(1,1)-Disruptive","ACAT-V(1,25)-Disruptive","ACAT-V(1,1)-Disruptive")
          results_m[(length(results_m)-5):length(results_m)] <- results[2,c("SKAT(1,25)","SKAT(1,1)","Burden(1,25)","Burden(1,1)","ACAT-V(1,25)","ACAT-V(1,1)")]
          apc_num <- (length(results_m)-19)/6
          p_seq <- c(1:apc_num,1:apc_num+(apc_num+1),1:apc_num+2*(apc_num+1),1:apc_num+3*(apc_num+1),1:apc_num+4*(apc_num+1),1:apc_num+5*(apc_num+1),(6*apc_num+9):(6*apc_num+14))
          results_m["STAAR-O"] <- CCT(as.numeric(results_m[6:length(results_m)][p_seq]))
          results_m["STAAR-S(1,25)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num,6*apc_num+9)]))
          results_m["STAAR-S(1,1)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+(apc_num+1),6*apc_num+10)]))
          results_m["STAAR-B(1,25)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+2*(apc_num+1),6*apc_num+11)]))
          results_m["STAAR-B(1,1)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+3*(apc_num+1),6*apc_num+12)]))
          results_m["STAAR-A(1,25)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+4*(apc_num+1),6*apc_num+13)]))
          results_m["STAAR-A(1,1)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+5*(apc_num+1),6*apc_num+14)]))
          
          results <- c()
          results <- rbind(results,results_m)
        }else
        {
          results_m <- c(results[1,],rep(0,2))
          names(results_m)[(length(results_m)-1):length(results_m)] <- c("Burden(1,25)-Disruptive","Burden(1,1)-Disruptive")
          results_m[(length(results_m)-1):length(results_m)] <- results[2,c("Burden(1,25)","Burden(1,1)")]
          
          ## check whether the p-values is NA. If so, set NA equals 1.
          if(is.na(results_m[(length(results_m)-1)]))
          {
            results_m[(length(results_m)-1)] <- 1
          }
          
          if(is.na(results_m[length(results_m)]))
          {
            results_m[length(results_m)] <- 1
          }
          
          apc_num <- (length(results_m)-10)/2
          p_seq <- c(1:apc_num,1:apc_num+(apc_num+1),(length(results_m)-6):(length(results_m)-5))
          
          ## calculate STAAR-B
          pvalues_sub <- as.numeric(results_m[6:length(results_m)][p_seq])
          if(sum(is.na(pvalues_sub))>0)
          {
            if(sum(is.na(pvalues_sub))==length(pvalues_sub))
            {
              results_m["STAAR-B"] <- 1
            }else
            {
              ## not all NAs
              pvalues_sub <- pvalues_sub[!is.na(pvalues_sub)]
              if(sum(pvalues_sub[pvalues_sub<1])>0)
              {
                ## not all ones
                results_m["STAAR-B"] <- CCT(pvalues_sub[pvalues_sub<1])
                
              }else
              {
                results_m["STAAR-B"] <- 1
                
              }
            }
          }else
          {
            if(sum(pvalues_sub[pvalues_sub<1])>0)
            {
              results_m["STAAR-B"] <- CCT(pvalues_sub[pvalues_sub<1])
            }else
            {
              results_m["STAAR-B"] <- 1
            }
          }
          
          ## calculate STAAR-B(1,25)
          pvalues_sub <- as.numeric(results_m[6:length(results_m)][c(1:apc_num,(length(results_m)-6))])
          if(sum(is.na(pvalues_sub))>0)
          {
            if(sum(is.na(pvalues_sub))==length(pvalues_sub))
            {
              results_m["STAAR-B(1,25)"] <- 1
            }else
            {
              ## not all NAs
              pvalues_sub <- pvalues_sub[!is.na(pvalues_sub)]
              if(sum(pvalues_sub[pvalues_sub<1])>0)
              {
                ## not all ones
                results_m["STAAR-B(1,25)"] <- CCT(pvalues_sub[pvalues_sub<1])
                
              }else
              {
                results_m["STAAR-B(1,25)"] <- 1
                
              }
            }
          }else
          {
            if(sum(pvalues_sub[pvalues_sub<1])>0)
            {
              results_m["STAAR-B(1,25)"] <- CCT(pvalues_sub[pvalues_sub<1])
            }else
            {
              results_m["STAAR-B(1,25)"] <- 1
            }
          }
          
          ## calculate STAAR-B(1,1)
          pvalues_sub <- as.numeric(results_m[6:length(results_m)][c(1:apc_num+(apc_num+1),(length(results_m)-5))])
          if(sum(is.na(pvalues_sub))>0)
          {
            if(sum(is.na(pvalues_sub))==length(pvalues_sub))
            {
              results_m["STAAR-B(1,1)"] <- 1
            }else
            {
              ## not all NAs
              pvalues_sub <- pvalues_sub[!is.na(pvalues_sub)]
              if(sum(pvalues_sub[pvalues_sub<1])>0)
              {
                ## not all ones
                results_m["STAAR-B(1,1)"] <- CCT(pvalues_sub[pvalues_sub<1])
                
              }else
              {
                results_m["STAAR-B(1,1)"] <- 1
                
              }
            }
          }else
          {
            if(sum(pvalues_sub[pvalues_sub<1])>0)
            {
              results_m["STAAR-B(1,1)"] <- CCT(pvalues_sub[pvalues_sub<1])
            }else
            {
              results_m["STAAR-B(1,1)"] <- 1
            }
          }
          
          results <- c()
          results <- rbind(results,results_m)
          
        }
      }
    }
  }else
  {
    results <- c()
  }
  
  seqResetFilter(genofile)
  
  return(results)
}

disruptive_missense <- function(chr,gene_name,genofile,obj_nullmodel,genes,
                                rare_maf_cutoff=0.01,rv_num_cutoff=2,
                                QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                                Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                                Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                                SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## Gene-centric test of the "disruptive_missense" category for one gene.
  ##
  ## Steps performed:
  ##   1. keep variants whose QC_label value is "PASS" (optionally split by
  ##      isSNV) and whose position lies in [genes[kk,3], genes[kk,4]] for the
  ##      row(s) kk of `genes` whose first column equals `gene_name`;
  ##   2. keep those with GENCODE.EXONIC.Category == "nonsynonymous SNV" and
  ##      MetaSVM == "D";
  ##   3. read the dosage matrix, order its rows like obj_nullmodel$id_include
  ##      and impute missing genotypes with matrix_flip_mean/matrix_flip_minor;
  ##   4. optionally collect the annotation columns listed in Annotation_name;
  ##   5. call STAAR, STAAR_Binary_SPA or MultiSTAAR depending on
  ##      obj_nullmodel$n.pheno and obj_nullmodel$use_SPA.
  ##
  ## Arguments:
  ##   chr                     value written to the "Chr" column of the result.
  ##   gene_name               gene to look up in the first column of `genes`.
  ##   genofile                GDS handle used with seqGetData/seqSetFilter.
  ##   obj_nullmodel           null model; id_include, n.pheno and use_SPA are read.
  ##   genes                   gene table (col 1 name, col 3 start, col 4 end).
  ##   rare_maf_cutoff, rv_num_cutoff, SPA_p_filter, p_filter_cutoff
  ##                           forwarded unchanged to the test function.
  ##   QC_label                GDS node holding the QC filter.
  ##   variant_type            "SNV", "Indel" or "variant".
  ##   geno_missing_imputation "mean" or "minor".
  ##   Annotation_dir, Annotation_name_catalog
  ##                           used to build the GDS path of each annotation.
  ##   Use_annotation_weights  logical flag; only its first element is used.
  ##   Annotation_name         annotation names to use as weights (may be NULL).
  ##   silent                  passed to try() around the test call.
  ##
  ## Returns: a one-row result (built with rbind) whose first five columns are
  ##   "Gene name","Chr","Category","#SNV","cMAC" and whose last column(s) are
  ##   "ACAT-O","STAAR-O" (no SPA) or "STAAR-B" (SPA); NULL when the test call
  ##   did not return a list (for example when it raised an error).
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  ## The default c(TRUE,FALSE) is a length-2 vector; using it directly in if()
  ## is an error in R >= 4.2. Keep the historical meaning (first element wins).
  Use_annotation_weights <- isTRUE(as.logical(Use_annotation_weights[1]))
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA))
  {
    use_SPA <- obj_nullmodel$use_SPA
  }else
  {
    use_SPA <- FALSE
  }
  
  ## get SNV id, position, REF, ALT (whole genome)
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant")
  {
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV")
  {
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel")
  {
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  ## Always clear the filter when leaving, including when a later step fails,
  ## so the shared GDS handle is never left restricted to this gene.
  on.exit(seqResetFilter(genofile), add=TRUE)
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## disruptive_missense
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Meta.SVM.Pred
  MetaSVM_pred <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="MetaSVM")]))
  
  variant.id.gene <- seqGetData(genofile, "variant.id")
  lof.in.ds <- ((GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D"))
  variant.id.gene <- variant.id.gene[lof.in.ds]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  
  ## row index of every phenotype id inside the genotype sample order
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV")
  {
    if(Use_annotation_weights)
    {
      ## seq_along() so that Annotation_name=NULL means "no annotation"
      ## instead of iterating over c(1,0) and failing on if(logical(0)).
      for(k in seq_along(Annotation_name))
      {
        if(Annotation_name[k]%in%Annotation_name_catalog$name)
        {
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD")
          {
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            ## add a second column, the PHRED score of (1 - p), named "<name>(-)"
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ## pvalues stays 0 (not a list) when the test call fails inside try()
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "disruptive_missense"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results))
  {
    if(!use_SPA)
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[dim(results)[2]] <- c("STAAR-B")
    }
  }
  
  ## the filter on genofile is reset by the on.exit() handler registered above
  return(results)
}

synonymous <- function(chr,gene_name,genofile,obj_nullmodel,genes,
                       rare_maf_cutoff=0.01,rv_num_cutoff=2,
                       QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                       Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                       Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                       SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## Gene-centric test of the "synonymous" category for one gene.
  ##
  ## Steps performed:
  ##   1. keep variants whose QC_label value is "PASS" (optionally split by
  ##      isSNV) and whose position lies in [genes[kk,3], genes[kk,4]] for the
  ##      row(s) kk of `genes` whose first column equals `gene_name`;
  ##   2. keep those with GENCODE.EXONIC.Category == "synonymous SNV";
  ##   3. read the dosage matrix, order its rows like obj_nullmodel$id_include
  ##      and impute missing genotypes with matrix_flip_mean/matrix_flip_minor;
  ##   4. optionally collect the annotation columns listed in Annotation_name;
  ##   5. call STAAR, STAAR_Binary_SPA or MultiSTAAR depending on
  ##      obj_nullmodel$n.pheno and obj_nullmodel$use_SPA.
  ##
  ## Arguments:
  ##   chr                     value written to the "Chr" column of the result.
  ##   gene_name               gene to look up in the first column of `genes`.
  ##   genofile                GDS handle used with seqGetData/seqSetFilter.
  ##   obj_nullmodel           null model; id_include, n.pheno and use_SPA are read.
  ##   genes                   gene table (col 1 name, col 3 start, col 4 end).
  ##   rare_maf_cutoff, rv_num_cutoff, SPA_p_filter, p_filter_cutoff
  ##                           forwarded unchanged to the test function.
  ##   QC_label                GDS node holding the QC filter.
  ##   variant_type            "SNV", "Indel" or "variant".
  ##   geno_missing_imputation "mean" or "minor".
  ##   Annotation_dir, Annotation_name_catalog
  ##                           used to build the GDS path of each annotation.
  ##   Use_annotation_weights  logical flag; only its first element is used.
  ##   Annotation_name         annotation names to use as weights (may be NULL).
  ##   silent                  passed to try() around the test call.
  ##
  ## Returns: a one-row result (built with rbind) whose first five columns are
  ##   "Gene name","Chr","Category","#SNV","cMAC" and whose last column(s) are
  ##   "ACAT-O","STAAR-O" (no SPA) or "STAAR-B" (SPA); NULL when the test call
  ##   did not return a list (for example when it raised an error).
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  ## The default c(TRUE,FALSE) is a length-2 vector; using it directly in if()
  ## is an error in R >= 4.2. Keep the historical meaning (first element wins).
  Use_annotation_weights <- isTRUE(as.logical(Use_annotation_weights[1]))
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA))
  {
    use_SPA <- obj_nullmodel$use_SPA
  }else
  {
    use_SPA <- FALSE
  }
  
  ## get SNV id, position, REF, ALT (whole genome)
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant")
  {
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV")
  {
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel")
  {
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  ## Always clear the filter when leaving, including when a later step fails,
  ## so the shared GDS handle is never left restricted to this gene.
  on.exit(seqResetFilter(genofile), add=TRUE)
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ### synonymous
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  
  variant.id.gene <- seqGetData(genofile, "variant.id")
  lof.in.synonymous <- (GENCODE.EXONIC.Category=="synonymous SNV")
  variant.id.gene <- variant.id.gene[lof.in.synonymous]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  
  ## row index of every phenotype id inside the genotype sample order
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV")
  {
    if(Use_annotation_weights)
    {
      ## seq_along() so that Annotation_name=NULL means "no annotation"
      ## instead of iterating over c(1,0) and failing on if(logical(0)).
      for(k in seq_along(Annotation_name))
      {
        if(Annotation_name[k]%in%Annotation_name_catalog$name)
        {
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD")
          {
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            ## add a second column, the PHRED score of (1 - p), named "<name>(-)"
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ## pvalues stays 0 (not a list) when the test call fails inside try()
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "synonymous"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results))
  {
    if(!use_SPA)
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[dim(results)[2]] <- c("STAAR-B")
    }
  }
  
  ## the filter on genofile is reset by the on.exit() handler registered above
  return(results)
}

ptv <- function(chr,gene_name,genofile,obj_nullmodel,genes,
                rare_maf_cutoff=0.01,rv_num_cutoff=2,
                QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## Gene-centric test of the "ptv" category for one gene.
  ##
  ## Steps performed:
  ##   1. keep variants whose QC_label value is "PASS" (optionally split by
  ##      isSNV) and whose position lies in [genes[kk,3], genes[kk,4]] for the
  ##      row(s) kk of `genes` whose first column equals `gene_name`;
  ##   2. keep the variants selected by the category mask:
  ##        "SNV"     : GENCODE.EXONIC.Category in {"stopgain","stoploss"} or
  ##                    GENCODE.Category in {"splicing","exonic;splicing"};
  ##        "Indel"   : GENCODE.EXONIC.Category in
  ##                    {"frameshift deletion","frameshift insertion"};
  ##        "variant" : union of the two masks above;
  ##   3. read the dosage matrix, order its rows like obj_nullmodel$id_include
  ##      and impute missing genotypes with matrix_flip_mean/matrix_flip_minor;
  ##   4. optionally collect the annotation columns listed in Annotation_name;
  ##   5. call STAAR, STAAR_Binary_SPA or MultiSTAAR depending on
  ##      obj_nullmodel$n.pheno and obj_nullmodel$use_SPA.
  ##
  ## Arguments:
  ##   chr                     value written to the "Chr" column of the result.
  ##   gene_name               gene to look up in the first column of `genes`.
  ##   genofile                GDS handle used with seqGetData/seqSetFilter.
  ##   obj_nullmodel           null model; id_include, n.pheno and use_SPA are read.
  ##   genes                   gene table (col 1 name, col 3 start, col 4 end).
  ##   rare_maf_cutoff, rv_num_cutoff, SPA_p_filter, p_filter_cutoff
  ##                           forwarded unchanged to the test function.
  ##   QC_label                GDS node holding the QC filter.
  ##   variant_type            "SNV", "Indel" or "variant".
  ##   geno_missing_imputation "mean" or "minor".
  ##   Annotation_dir, Annotation_name_catalog
  ##                           used to build the GDS path of each annotation.
  ##   Use_annotation_weights  logical flag; only its first element is used.
  ##   Annotation_name         annotation names to use as weights (may be NULL).
  ##   silent                  passed to try() around the test call.
  ##
  ## Returns: a one-row result (built with rbind) whose first five columns are
  ##   "Gene name","Chr","Category","#SNV","cMAC" and whose last column(s) are
  ##   "ACAT-O","STAAR-O" (no SPA) or "STAAR-B" (SPA); NULL when the test call
  ##   did not return a list (for example when it raised an error).
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  ## The default c(TRUE,FALSE) is a length-2 vector; using it directly in if()
  ## is an error in R >= 4.2. Keep the historical meaning (first element wins).
  Use_annotation_weights <- isTRUE(as.logical(Use_annotation_weights[1]))
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA))
  {
    use_SPA <- obj_nullmodel$use_SPA
  }else
  {
    use_SPA <- FALSE
  }
  
  ## get SNV id, position, REF, ALT (whole genome)
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant")
  {
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV")
  {
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel")
  {
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  ## Always clear the filter when leaving, including when a later step fails,
  ## so the shared GDS handle is never left restricted to this gene.
  on.exit(seqResetFilter(genofile), add=TRUE)
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## plof
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Gencode
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  
  variant.id.gene <- seqGetData(genofile, "variant.id")
  
  if(variant_type=="SNV")
  {
    lof.in.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")
  }
  
  if(variant_type=="Indel")
  {
    lof.in.plof <- (GENCODE.EXONIC.Category=="frameshift deletion")|(GENCODE.EXONIC.Category=="frameshift insertion")
  }
  
  if(variant_type=="variant")
  {
    lof.in.plof.snv <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")
    lof.in.plof.indel <- (GENCODE.EXONIC.Category=="frameshift deletion")|(GENCODE.EXONIC.Category=="frameshift insertion")
    lof.in.plof <- lof.in.plof.snv|lof.in.plof.indel
  }
  
  variant.id.gene <- variant.id.gene[lof.in.plof]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  
  ## row index of every phenotype id inside the genotype sample order
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV")
  {
    if(Use_annotation_weights)
    {
      ## seq_along() so that Annotation_name=NULL means "no annotation"
      ## instead of iterating over c(1,0) and failing on if(logical(0)).
      for(k in seq_along(Annotation_name))
      {
        if(Annotation_name[k]%in%Annotation_name_catalog$name)
        {
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD")
          {
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            ## add a second column, the PHRED score of (1 - p), named "<name>(-)"
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ## pvalues stays 0 (not a list) when the test call fails inside try()
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "ptv"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results))
  {
    if(!use_SPA)
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[dim(results)[2]] <- c("STAAR-B")
    }
  }
  
  ## the filter on genofile is reset by the on.exit() handler registered above
  return(results)
}

ptv_ds <- function(chr,gene_name,genofile,obj_nullmodel,genes,
                   rare_maf_cutoff=0.01,rv_num_cutoff=2,
                   QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                   Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                   Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                   SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA))
  {
    use_SPA <- obj_nullmodel$use_SPA
  }else
  {
    use_SPA <- FALSE
  }
  
  ## get SNV id, position, REF, ALT (whole genome)
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant")
  {
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV")
  {
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel")
  {
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## plof_ds
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Gencode
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  ## Meta.SVM.Pred
  MetaSVM_pred <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="MetaSVM")]))
  
  variant.id.gene <- seqGetData(genofile, "variant.id")
  
  if(variant_type=="SNV")
  {
    lof.in.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|((GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D"))
  }
  
  if(variant_type=="Indel")
  {
    lof.in.plof <- (GENCODE.EXONIC.Category=="frameshift deletion")|(GENCODE.EXONIC.Category=="frameshift insertion")|((GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D"))
  }
  
  if(variant_type=="variant")
  {
    lof.in.plof.snv <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")
    lof.in.plof.indel <- (GENCODE.EXONIC.Category=="frameshift deletion")|(GENCODE.EXONIC.Category=="frameshift insertion")
    lof.in.plof <- lof.in.plof.snv|lof.in.plof.indel|((GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D"))
  }
  
  variant.id.gene <- variant.id.gene[lof.in.plof]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV")
  {
    if(Use_annotation_weights)
    {
      for(k in 1:length(Annotation_name))
      {
        if(Annotation_name[k]%in%Annotation_name_catalog$name)
        {
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD")
          {
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else
    {
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "ptv_ds"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results))
  {
    if(!use_SPA)
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[dim(results)[2]] <- c("STAAR-B")
    }
  }
  
  seqResetFilter(genofile)
  
  return(results)
}

coding_incl_ptv <- function(chr,gene_name,genofile,obj_nullmodel,genes,
                            rare_maf_cutoff=0.01,rv_num_cutoff=2,
                            QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                            Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                            Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                            SPA_p_filter=FALSE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## evaluate choices
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  phenotype.id <- obj_nullmodel$id_include
  n_pheno <- obj_nullmodel$n.pheno
  
  ## SPA status
  if(!is.null(obj_nullmodel$use_SPA))
  {
    use_SPA <- obj_nullmodel$use_SPA
  }else
  {
    use_SPA <- FALSE
  }
  
  ## get SNV id, position, REF, ALT (whole genome)
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant")
  {
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV")
  {
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel")
  {
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  rm(position)
  gc()
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Gencode
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  ## Meta.SVM.Pred
  MetaSVM_pred <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="MetaSVM")]))
  
  ################################################
  #           Coding
  ################################################
  variant.id.gene <- seqGetData(genofile, "variant.id")
  lof.in.coding <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|(GENCODE.Category=="ncRNA_splicing")|(GENCODE.Category=="ncRNA_exonic;splicing")|(GENCODE.EXONIC.Category=="nonsynonymous SNV")|(GENCODE.EXONIC.Category=="synonymous SNV")|(GENCODE.EXONIC.Category=="frameshift deletion")|(GENCODE.EXONIC.Category=="frameshift insertion")
  variant.id.gene <- variant.id.gene[lof.in.coding]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## Gencode_Exonic
  GENCODE.EXONIC.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Gencode
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  ## Meta.SVM.Pred
  MetaSVM_pred <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="MetaSVM")]))
  
  ## Annotation
  Anno.Int.PHRED.sub <- NULL
  Anno.Int.PHRED.sub.name <- NULL
  
  if(variant_type=="SNV")
  {
    if(Use_annotation_weights)
    {
      for(k in 1:length(Annotation_name))
      {
        if(Annotation_name[k]%in%Annotation_name_catalog$name)
        {
          Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,Annotation_name[k])
          Annotation.PHRED <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name==Annotation_name[k])]))
          
          if(Annotation_name[k]=="CADD")
          {
            Annotation.PHRED[is.na(Annotation.PHRED)] <- 0
          }
          
          if(Annotation_name[k]=="aPC.LocalDiversity")
          {
            Annotation.PHRED.2 <- -10*log10(1-10^(-Annotation.PHRED/10))
            Annotation.PHRED <- cbind(Annotation.PHRED,Annotation.PHRED.2)
            Anno.Int.PHRED.sub.name <- c(Anno.Int.PHRED.sub.name,paste0(Annotation_name[k],"(-)"))
          }
          Anno.Int.PHRED.sub <- cbind(Anno.Int.PHRED.sub,Annotation.PHRED)
        }
      }
      
      Anno.Int.PHRED.sub <- data.frame(Anno.Int.PHRED.sub)
      colnames(Anno.Int.PHRED.sub) <- Anno.Int.PHRED.sub.name
    }
  }
  
  ################################################
  #                  plof_ds
  ################################################
  variant.id.gene <- seqGetData(genofile, "variant.id")
  lof.in.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|(GENCODE.Category=="ncRNA_splicing")|(GENCODE.Category=="ncRNA_exonic;splicing")|((GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D"))
  variant.id.gene.category <- variant.id.gene[lof.in.plof]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.plof,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_plof_ds <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "plof_ds"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_plof_ds <- rbind(results_plof_ds,results_temp)
  }
  
  if(!is.null(results_plof_ds))
  {
    if(!use_SPA)
    {
      colnames(results_plof_ds) <- colnames(results_plof_ds, do.NULL = FALSE, prefix = "col")
      colnames(results_plof_ds)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_plof_ds)[(dim(results_plof_ds)[2]-1):dim(results_plof_ds)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results_plof_ds) <- colnames(results_plof_ds, do.NULL = FALSE, prefix = "col")
      colnames(results_plof_ds)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_plof_ds)[dim(results_plof_ds)[2]] <- c("STAAR-B")
      
    }
  }
  
  ################################################
  #                  ptv_ds
  ################################################
  if(variant_type=="SNV")
  {
    lof.in.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|((GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D"))
  }
  
  if(variant_type=="Indel")
  {
    lof.in.plof <- (GENCODE.EXONIC.Category=="frameshift deletion")|(GENCODE.EXONIC.Category=="frameshift insertion")|((GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D"))
  }
  
  if(variant_type=="variant")
  {
    lof.in.plof.snv <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")
    lof.in.plof.indel <- (GENCODE.EXONIC.Category=="frameshift deletion")|(GENCODE.EXONIC.Category=="frameshift insertion")
    lof.in.plof <- lof.in.plof.snv|lof.in.plof.indel|((GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D"))
  }
  
  variant.id.gene.category <- variant.id.gene[lof.in.plof]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.plof,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_ptv_ds <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "ptv_ds"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_ptv_ds <- rbind(results_ptv_ds,results_temp)
  }
  
  if(!is.null(results_ptv_ds))
  {
    if(!use_SPA)
    {
      colnames(results_ptv_ds) <- colnames(results_ptv_ds, do.NULL = FALSE, prefix = "col")
      colnames(results_ptv_ds)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_ptv_ds)[(dim(results_ptv_ds)[2]-1):dim(results_ptv_ds)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results_ptv_ds) <- colnames(results_ptv_ds, do.NULL = FALSE, prefix = "col")
      colnames(results_ptv_ds)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_ptv_ds)[dim(results_ptv_ds)[2]] <- c("STAAR-B")
      
    }
  }
  
  #####################################################
  #                      plof
  #####################################################
  lof.in.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|(GENCODE.Category=="ncRNA_splicing")|(GENCODE.Category=="ncRNA_exonic;splicing")
  variant.id.gene.category <- variant.id.gene[lof.in.plof]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.plof,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_plof <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "plof"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_plof <- rbind(results_plof,results_temp)
  }
  
  if(!is.null(results_plof))
  {
    if(!use_SPA)
    {
      colnames(results_plof) <- colnames(results_plof, do.NULL = FALSE, prefix = "col")
      colnames(results_plof)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_plof)[(dim(results_plof)[2]-1):dim(results_plof)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results_plof) <- colnames(results_plof, do.NULL = FALSE, prefix = "col")
      colnames(results_plof)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_plof)[dim(results_plof)[2]] <- c("STAAR-B")
    }
  }
  
  #####################################################
  #                      ptv
  #####################################################
  if(variant_type=="SNV")
  {
    lof.in.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")
  }
  
  if(variant_type=="Indel")
  {
    lof.in.plof <- (GENCODE.EXONIC.Category=="frameshift deletion")|(GENCODE.EXONIC.Category=="frameshift insertion")
  }
  
  if(variant_type=="variant")
  {
    lof.in.plof.snv <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")
    lof.in.plof.indel <- (GENCODE.EXONIC.Category=="frameshift deletion")|(GENCODE.EXONIC.Category=="frameshift insertion")
    lof.in.plof <- lof.in.plof.snv|lof.in.plof.indel
  }
  
  variant.id.gene.category <- variant.id.gene[lof.in.plof]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.plof,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_ptv <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "ptv"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_ptv <- rbind(results_ptv,results_temp)
  }
  
  if(!is.null(results_ptv))
  {
    if(!use_SPA)
    {
      colnames(results_ptv) <- colnames(results_ptv, do.NULL = FALSE, prefix = "col")
      colnames(results_ptv)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_ptv)[(dim(results_ptv)[2]-1):dim(results_ptv)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results_ptv) <- colnames(results_ptv, do.NULL = FALSE, prefix = "col")
      colnames(results_ptv)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_ptv)[dim(results_ptv)[2]] <- c("STAAR-B")
    }
  }
  
  #############################################
  #             synonymous
  #############################################
  lof.in.synonymous <- (GENCODE.EXONIC.Category=="synonymous SNV")
  variant.id.gene.category <- variant.id.gene[lof.in.synonymous]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.synonymous,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results_synonymous <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "synonymous"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results_synonymous <- rbind(results_synonymous,results_temp)
  }
  
  if(!is.null(results_synonymous))
  {
    if(!use_SPA)
    {
      colnames(results_synonymous) <- colnames(results_synonymous, do.NULL = FALSE, prefix = "col")
      colnames(results_synonymous)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_synonymous)[(dim(results_synonymous)[2]-1):dim(results_synonymous)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results_synonymous) <- colnames(results_synonymous, do.NULL = FALSE, prefix = "col")
      colnames(results_synonymous)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results_synonymous)[dim(results_synonymous)[2]] <- c("STAAR-B")
    }
    
  }
  
  #################################################
  #        missense
  #################################################
  lof.in.missense <- (GENCODE.EXONIC.Category=="nonsynonymous SNV")
  variant.id.gene.category <- variant.id.gene[lof.in.missense]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.missense,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  results <- c()
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "missense"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  #################################################
  #         disruptive missense
  #################################################
  lof.in.dmissense <- (GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D")
  variant.id.gene.category <- variant.id.gene[lof.in.dmissense]
  
  seqSetFilter(genofile,variant.id=variant.id.gene.category,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  # id.genotype.match <- rep(0,length(id.genotype))
  
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  ## Annotation
  Anno.Int.PHRED.sub.category <- Anno.Int.PHRED.sub[lof.in.dmissense,]
  
  pvalues <- 0
  if(n_pheno == 1)
  {
    if(!use_SPA)
    {
      try(pvalues <- STAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
    }else{
      try(pvalues <- STAAR_Binary_SPA(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff),silent=silent)
    }
  }else
  {
    try(pvalues <- MultiSTAAR(Geno,obj_nullmodel,Anno.Int.PHRED.sub.category,rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff),silent=silent)
  }
  
  if(inherits(pvalues, "list"))
  {
    results_temp <- as.vector(genes[kk,])
    results_temp[3] <- "disruptive_missense"
    results_temp[2] <- chr
    results_temp[1] <- as.character(genes[kk,1])
    results_temp[4] <- pvalues$num_variant
    
    if(!use_SPA)
    {
      results_temp <- c(results_temp,pvalues$cMAC,pvalues$results_STAAR_S_1_25,pvalues$results_STAAR_S_1_1,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_A_1_25,
                        pvalues$results_STAAR_A_1_1,pvalues$results_ACAT_O,pvalues$results_STAAR_O)
    }else
    {
      results_temp <- c(results_temp,pvalues$cMAC,
                        pvalues$results_STAAR_B_1_25,pvalues$results_STAAR_B_1_1,pvalues$results_STAAR_B)
    }
    
    results <- rbind(results,results_temp)
  }
  
  if(!is.null(results))
  {
    if(!use_SPA)
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("ACAT-O","STAAR-O")
    }else
    {
      colnames(results) <- colnames(results, do.NULL = FALSE, prefix = "col")
      colnames(results)[1:5] <- c("Gene name","Chr","Category","#SNV","cMAC")
      colnames(results)[dim(results)[2]] <- c("STAAR-B")
    }
    
    if(dim(results)[1]==1)
    {
      if(results[3]!="disruptive_missense")
      {
        if(!use_SPA)
        {
          results <- cbind(results,matrix(1,1,6))
          colnames(results)[(dim(results)[2]-5):dim(results)[2]] <- c("SKAT(1,25)-Disruptive","SKAT(1,1)-Disruptive","Burden(1,25)-Disruptive","Burden(1,1)-Disruptive","ACAT-V(1,25)-Disruptive","ACAT-V(1,1)-Disruptive")
          results_missense <- results
          results_ds <- c()
        }else{
          results <- cbind(results,matrix(1,1,2))
          colnames(results)[(dim(results)[2]-1):dim(results)[2]] <- c("Burden(1,25)-Disruptive","Burden(1,1)-Disruptive")
          results_missense <- results
          results_ds <- c()
        }
      }else
      {
        results_missense <- c()
        results_ds <- results
        results <- c()
      }
    }
    
    if(!is.null(results))
    {
      if(dim(results)[1]==2)
      {
        if(!use_SPA)
        {
          results_m <- c(results[1,],rep(0,6))
          names(results_m)[(length(results_m)-5):length(results_m)] <- c("SKAT(1,25)-Disruptive","SKAT(1,1)-Disruptive","Burden(1,25)-Disruptive","Burden(1,1)-Disruptive","ACAT-V(1,25)-Disruptive","ACAT-V(1,1)-Disruptive")
          results_m[(length(results_m)-5):length(results_m)] <- results[2,c("SKAT(1,25)","SKAT(1,1)","Burden(1,25)","Burden(1,1)","ACAT-V(1,25)","ACAT-V(1,1)")]
          apc_num <- (length(results_m)-19)/6
          p_seq <- c(1:apc_num,1:apc_num+(apc_num+1),1:apc_num+2*(apc_num+1),1:apc_num+3*(apc_num+1),1:apc_num+4*(apc_num+1),1:apc_num+5*(apc_num+1),(6*apc_num+9):(6*apc_num+14))
          results_m["STAAR-O"] <- CCT(as.numeric(results_m[6:length(results_m)][p_seq]))
          results_m["STAAR-S(1,25)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num,6*apc_num+9)]))
          results_m["STAAR-S(1,1)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+(apc_num+1),6*apc_num+10)]))
          results_m["STAAR-B(1,25)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+2*(apc_num+1),6*apc_num+11)]))
          results_m["STAAR-B(1,1)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+3*(apc_num+1),6*apc_num+12)]))
          results_m["STAAR-A(1,25)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+4*(apc_num+1),6*apc_num+13)]))
          results_m["STAAR-A(1,1)"] <- CCT(as.numeric(results_m[6:length(results_m)][c(1:apc_num+5*(apc_num+1),6*apc_num+14)]))
          
          results_ds <- c()
          results_ds <- rbind(results_ds,results[2,])
          
          results <- c()
          results <- rbind(results,results_m)
        }else
        {
          results_m <- c(results[1,],rep(0,2))
          names(results_m)[(length(results_m)-1):length(results_m)] <- c("Burden(1,25)-Disruptive","Burden(1,1)-Disruptive")
          results_m[(length(results_m)-1):length(results_m)] <- results[2,c("Burden(1,25)","Burden(1,1)")]
          
          ## check whether the p-values is NA. If so, set NA equals 1.
          if(is.na(results_m[(length(results_m)-1)]))
          {
            results_m[(length(results_m)-1)] <- 1
          }
          
          if(is.na(results_m[length(results_m)]))
          {
            results_m[length(results_m)] <- 1
          }
          
          apc_num <- (length(results_m)-10)/2
          p_seq <- c(1:apc_num,1:apc_num+(apc_num+1),(length(results_m)-6):(length(results_m)-5))
          
          ## calculate STAAR-B
          pvalues_sub <- as.numeric(results_m[6:length(results_m)][p_seq])
          if(sum(is.na(pvalues_sub))>0)
          {
            if(sum(is.na(pvalues_sub))==length(pvalues_sub))
            {
              results_m["STAAR-B"] <- 1
            }else
            {
              ## not all NAs
              pvalues_sub <- pvalues_sub[!is.na(pvalues_sub)]
              if(sum(pvalues_sub[pvalues_sub<1])>0)
              {
                ## not all ones
                results_m["STAAR-B"] <- CCT(pvalues_sub[pvalues_sub<1])
                
              }else
              {
                results_m["STAAR-B"] <- 1
                
              }
            }
          }else
          {
            if(sum(pvalues_sub[pvalues_sub<1])>0)
            {
              results_m["STAAR-B"] <- CCT(pvalues_sub[pvalues_sub<1])
            }else
            {
              results_m["STAAR-B"] <- 1
            }
          }
          
          ## calculate STAAR-B(1,25)
          pvalues_sub <- as.numeric(results_m[6:length(results_m)][c(1:apc_num,(length(results_m)-6))])
          if(sum(is.na(pvalues_sub))>0)
          {
            if(sum(is.na(pvalues_sub))==length(pvalues_sub))
            {
              results_m["STAAR-B(1,25)"] <- 1
            }else
            {
              ## not all NAs
              pvalues_sub <- pvalues_sub[!is.na(pvalues_sub)]
              if(sum(pvalues_sub[pvalues_sub<1])>0)
              {
                ## not all ones
                results_m["STAAR-B(1,25)"] <- CCT(pvalues_sub[pvalues_sub<1])
                
              }else
              {
                results_m["STAAR-B(1,25)"] <- 1
                
              }
            }
          }else
          {
            if(sum(pvalues_sub[pvalues_sub<1])>0)
            {
              results_m["STAAR-B(1,25)"] <- CCT(pvalues_sub[pvalues_sub<1])
            }else
            {
              results_m["STAAR-B(1,25)"] <- 1
            }
          }
          
          ## calculate STAAR-B(1,1)
          pvalues_sub <- as.numeric(results_m[6:length(results_m)][c(1:apc_num+(apc_num+1),(length(results_m)-5))])
          if(sum(is.na(pvalues_sub))>0)
          {
            if(sum(is.na(pvalues_sub))==length(pvalues_sub))
            {
              results_m["STAAR-B(1,1)"] <- 1
            }else
            {
              ## not all NAs
              pvalues_sub <- pvalues_sub[!is.na(pvalues_sub)]
              if(sum(pvalues_sub[pvalues_sub<1])>0)
              {
                ## not all ones
                results_m["STAAR-B(1,1)"] <- CCT(pvalues_sub[pvalues_sub<1])
                
              }else
              {
                results_m["STAAR-B(1,1)"] <- 1
                
              }
            }
          }else
          {
            if(sum(pvalues_sub[pvalues_sub<1])>0)
            {
              results_m["STAAR-B(1,1)"] <- CCT(pvalues_sub[pvalues_sub<1])
            }else
            {
              results_m["STAAR-B(1,1)"] <- 1
            }
          }
          
          results_ds <- c()
          results_ds <- rbind(results_ds,results[2,])
          
          results <- c()
          results <- rbind(results,results_m)
        }
      }
    }
  }else
  {
    results <- c()
    results_ds <- c()
  }
  
  results_coding <- list(plof=results_plof,plof_ds=results_plof_ds,missense=results,disruptive_missense=results_ds,synonymous=results_synonymous,ptv=results_ptv,ptv_ds=results_ptv_ds)
  
  seqResetFilter(genofile)
  
  return(results_coding)
}

## Run the gene-centric coding analysis of one gene for the requested coding mask.
##
## The function only validates the multiple-choice arguments, picks the rows of the
## global `genes_info` table whose second column equals `chr`, and forwards everything
## to the worker routine that matches `category`:
##   all_categories -> coding(), plof -> plof(), plof_ds -> plof_ds(),
##   missense -> missense(), disruptive_missense -> disruptive_missense(),
##   synonymous -> synonymous(), ptv -> ptv(), ptv_ds -> ptv_ds(),
##   all_categories_incl_ptv -> coding_incl_ptv()
## All remaining arguments are passed through to the worker unchanged.
##
## Returns whatever the selected worker returns.
Gene_Centric_Coding <- function(chr,gene_name,category=c("all_categories","plof","plof_ds","missense","disruptive_missense","synonymous","ptv","ptv_ds","all_categories_incl_ptv"),
                                genofile,obj_nullmodel,rare_maf_cutoff=0.01,rv_num_cutoff=2,
                                QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                                Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,
                                Use_annotation_weights=c(TRUE,FALSE),Annotation_name=NULL,
                                SPA_p_filter=TRUE,p_filter_cutoff=0.05,silent=FALSE){
  
  ## collapse each multiple-choice argument to a single validated value
  category <- match.arg(category)
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  ## gene table restricted to the requested chromosome
  genes <- genes_info[genes_info[,2]==chr,]
  
  ## match.arg() guarantees that exactly one of the branches below is taken
  results <- switch(category,
    all_categories =
      coding(chr,gene_name,genofile,obj_nullmodel,genes,
             rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
             QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
             Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
             Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
             SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    plof =
      plof(chr,gene_name,genofile,obj_nullmodel,genes,
           rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
           QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
           Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
           Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
           SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    plof_ds =
      plof_ds(chr,gene_name,genofile,obj_nullmodel,genes,
              rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
              QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
              Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
              Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
              SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    missense =
      missense(chr,gene_name,genofile,obj_nullmodel,genes,
               rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
               QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
               Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
               Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
               SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    disruptive_missense =
      disruptive_missense(chr,gene_name,genofile,obj_nullmodel,genes,
                          rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
                          QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                          Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                          Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
                          SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    synonymous =
      synonymous(chr,gene_name,genofile,obj_nullmodel,genes,
                 rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
                 QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                 Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                 Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
                 SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    ptv =
      ptv(chr,gene_name,genofile,obj_nullmodel,genes,
          rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
          QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
          Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
          Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
          SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    ptv_ds =
      ptv_ds(chr,gene_name,genofile,obj_nullmodel,genes,
             rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
             QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
             Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
             Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
             SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent),
    all_categories_incl_ptv =
      coding_incl_ptv(chr,gene_name,genofile,obj_nullmodel,genes,
                      rare_maf_cutoff=rare_maf_cutoff,rv_num_cutoff=rv_num_cutoff,
                      QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                      Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
                      Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,
                      SPA_p_filter=SPA_p_filter,p_filter_cutoff=p_filter_cutoff,silent=silent)
  )
  
  return(results)
}

## Load the null model object stored in the file named by Train_Null_Model.
obj_nullmodel <- get(load(Train_Null_Model))

## Parameter
QC_label <- "annotation/filter"
geno_missing_imputation <- "mean"
variant_type <- "SNV"

## Annotation_dir
Annotation_dir <- "annotation/info/FunctionalAnnotation"
## Annotation channel (the variable first holds the CSV path, then the parsed table)
Annotation_name_catalog <- read.csv(Annotation_name_catalog)
## Use_annotation_weights
Use_annotation_weights <- TRUE
## Annotation name
Annotation_name <- c("CADD","LINSIGHT","FATHMM.XF","aPC.EpigeneticActive","aPC.EpigeneticRepressed","aPC.EpigeneticTranscription",
                     "aPC.Conservation","aPC.LocalDiversity","aPC.Mappability","aPC.TF","aPC.Protein")

## output file name (the array id and ".Rdata" are appended when the results are saved)
output_file_name <- paste0(OUTPUT_PATH,"/",trait,"_AoU_WGS_Coding_Train")

## Chr
## Genes are processed in groups of gene_num_in_array per array job; the number of
## groups per chromosome comes from the full (unfiltered) genes_info table.
gene_num_in_array <- 50 
group.num.allchr <- ceiling(table(genes_info[,2])/gene_num_in_array)
sum(group.num.allchr)

## First chromosome whose cumulative group count reaches arrayid.
chr <- which.max(arrayid <= cumsum(group.num.allchr))
group.num <- group.num.allchr[chr]

## Position of this job among the groups of its chromosome.
if (chr == 1){
  groupid <- arrayid
}else{
  groupid <- arrayid - cumsum(group.num.allchr)[chr-1]
}

## Genes excluded from this analysis; they are removed before indexing.
coding_longmasks <- c("TTN","PCDHA2","PCDHA3","PCDHGA1","PCDHGA2","PCDHGA3","PCDHGB1","PCDHGA4","PCDHGB2","PCDHGB3")
genes_info_chr <- genes_info[genes_info[,2]==chr,]
genes_info_chr <- genes_info_chr[!genes_info_chr[,1] %in% coding_longmasks,]
sub_seq_num <- dim(genes_info_chr)[1]

## Rows of genes_info_chr handled by this job.
## group.num was derived from the unfiltered table, so once the long-mask genes are
## removed the nominal range can overrun the filtered table, or even start beyond its
## end. Clamp the upper end and use an empty range in the latter case; a bare `a:b`
## with a > b would count backwards into out-of-range rows and repeat earlier genes.
sub_seq_start <- (groupid - 1)*gene_num_in_array + 1
sub_seq_end <- min(groupid*gene_num_in_array, sub_seq_num)
if(sub_seq_start <= sub_seq_end){
  sub_seq_id <- sub_seq_start:sub_seq_end
}else{
  sub_seq_id <- integer(0)
}

agds.path <- GDS_File

genofile <- seqOpen(agds.path)

## Error-tolerant wrapper around Gene_Centric_Coding() for a single gene.
##
## The analysis is run inside try(), so a failure does not stop the caller: the
## function then returns the "try-error" object produced by try() instead of a
## result. `genes_info_chr` is part of the signature but is not used in the body;
## the remaining arguments are forwarded to Gene_Centric_Coding() by name.
gene_centric_coding_dnanexus <- function(genes_info_chr,gene_name,chr,genofile,obj_nullmodel,rare_maf_cutoff,
                                         QC_label,variant_type,geno_missing_imputation,
                                         Annotation_dir,Annotation_name_catalog,
                                         Use_annotation_weights,Annotation_name,silent){
  outcome <- try(
    Gene_Centric_Coding(
      chr=chr,
      gene_name=gene_name,
      genofile=genofile,
      obj_nullmodel=obj_nullmodel,
      rare_maf_cutoff=rare_maf_cutoff,
      QC_label=QC_label,
      variant_type=variant_type,
      geno_missing_imputation=geno_missing_imputation,
      Annotation_dir=Annotation_dir,
      Annotation_name_catalog=Annotation_name_catalog,
      Use_annotation_weights=Use_annotation_weights,
      Annotation_name=Annotation_name,
      silent=silent
    ),
    silent=TRUE  # errors are captured quietly regardless of the `silent` argument
  )
  outcome
}

## Analyse every gene assigned to this array job and collect the per-gene results
## (or the captured "try-error" for a gene that failed) in run order.
results_coding <- list()
for(slot in seq_along(sub_seq_id)){
  kk <- sub_seq_id[slot]
  print(kk)
  gene_name <- genes_info_chr[kk,1]
  results_coding[[slot]] <- gene_centric_coding_dnanexus(
    genes_info_chr=genes_info_chr,gene_name = gene_name,chr=chr,genofile=genofile,obj_nullmodel=obj_nullmodel,rare_maf_cutoff=0.01,
    QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
    Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,
    Use_annotation_weights=Use_annotation_weights,Annotation_name=Annotation_name,silent=TRUE)
}

seqClose(genofile)

## Drop everything except the objects named below before writing the output file.
objects_to_keep <- c("results_coding", "output_file_name","arrayid","agds.path","trait")
rm(list=setdiff(ls(), objects_to_keep)); gc()
save(results_coding, file = paste0(output_file_name,"_",arrayid,".Rdata"))


Overwriting Coding_Analysis.R


In [8]:
%%writefile Coding_Analysis.sh
#!/bin/bash

# Runs the coding-analysis R script with the values provided through the
# environment (R_Script, arrayid, trait, Train_Null_Model,
# Annotation_name_catalog, GDS_File, OUTPUT_PATH), in that positional order.

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Every expansion is quoted so that a value containing whitespace or glob
# characters reaches Rscript as exactly one argument.
Rscript "${R_Script}" "${arrayid}" "${trait}" "${Train_Null_Model}" "${Annotation_name_catalog}" "${GDS_File}" "${OUTPUT_PATH}"

Overwriting Coding_Analysis.sh


In [9]:
%%writefile score_task.R

## Build the dsub task table for the coding analysis: one row per (trait, arrayid)
## pair, written as a tab-separated file whose header cells are dsub flags.

## Exclusive upper arrayid bound of chromosomes 1..21; an arrayid at or above the
## last bound belongs to chromosome 22.
chr_upper_bounds <- c(41,66,87,102,120,141,159,173,189,204,230,250,257,269,281,298,321,327,355,366,371)

## arrayid varies fastest, so rows come out grouped by trait (BMI first, then Height).
job_grid <- expand.grid(arrayid = c(86,112,113,332),
                        trait = c("BMI","Height"),
                        stringsAsFactors = FALSE)

## Number of bounds <= arrayid, plus one, is the chromosome number.
chr <- findInterval(job_grid$arrayid, chr_upper_bounds) + 1

tasks <- data.frame(
    '--env arrayid'=job_grid$arrayid,
    '--env trait'=job_grid$trait,
    '--input Train_Null_Model'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/NullModels/",job_grid$trait,"_Train_Null_Model.RData"),
    '--input Annotation_name_catalog'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/dataGDS/acaf_threshold_v7/Annotation_name_catalog.csv",
    '--input GDS_File'=paste0("gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/dataAGDS/exome_v7.1/exome.chr",chr,".gds"),
    '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/Coding_Analysis.R",
    '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GeneCentricCoding",
    check.names = FALSE
)

write.table(tasks,
            file="score_task.txt",
            row.names=FALSE, col.names=TRUE,
            sep='\t', quote=FALSE)

Overwriting score_task.R


In [10]:
!Rscript score_task.R

In [11]:
!gsutil -m cp Coding_Analysis.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://Coding_Analysis.R [Content-Type=application/octet-stream]...
/ [1/1 files][129.1 KiB/129.1 KiB] 100% Done                                    
Operation completed over 1 objects/129.1 KiB.                                    


In [12]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Job-specific options; aou_dsub appends them after its own defaults, so the
# --image and --logging given here are passed in addition to the ones it sets.
dsub_options=(
  --image willja16/r_with_plink
  --disk-size 100
  --boot-disk-size 25
  --min-ram 64
  --timeout "96h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script Coding_Analysis.sh
  --tasks score_task.txt
)

aou_dsub "${dsub_options[@]}"

Job properties:
  job-id: coding-ana--williamsjacr--240805-152344-37
  job-name: coding-analysis
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/17061318534041850195
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/840692259032739568
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/9318378643191275524
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/15924022795533132610
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/3860031643969366395
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/18348738718336721892
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10678717588606309768
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/9692262135783927507
Launched job

In [46]:
%%writefile Coding_Analysis_Summary.R
rm(list=ls())
gc()

library(data.table)

## Directory holding the per-job .Rdata files of the coding analysis.
INPUT_PATH <- commandArgs(TRUE)[1]
print(INPUT_PATH)

## Directory receiving one summary CSV per trait.
OUTPUT_PATH <- commandArgs(TRUE)[2]
print(OUTPUT_PATH)

## Number of per-job result files read for every trait.
gene_centric_coding_jobs_num <- 379

## Coding masks that are carried into the summary table.
coding_categories <- c("plof","plof_ds","missense","disruptive_missense","synonymous")

## Concatenate one named column across all masks of a single gene result.
## A mask without results (NULL) contributes nothing.
collect_column <- function(x, column){
  do.call(c, lapply(coding_categories, function(category) unlist(x[[category]][,column])))
}

## Flatten the result of one gene into a data frame with one row per mask result.
## STAARB holds the "STAAR-B(1,1)" column and Burden_1_1 the "Burden(1,1)" column.
combine_jake <- function(x){
  a <- data.frame(Gene = collect_column(x,"Gene name"),
                  Chr = collect_column(x,"Chr"),
                  Category = collect_column(x,"Category"),
                  Number_SNV = collect_column(x,"#SNV"),
                  Burden_1_1 = collect_column(x,"Burden(1,1)"),
                  STAARB = collect_column(x,"STAAR-B(1,1)"))
  return(a)
}

for(trait in c("BMI","TC","HDL","LDL","logTG","Height")){
  gene_centric_results_name <- paste0(trait,"_AoU_WGS_Coding_Train")
  
  ## Gather the per-gene results of all array jobs into one flat list.
  results_coding_genome <- NULL
  for (i in 1:gene_centric_coding_jobs_num){
    results <- get(load(paste0(INPUT_PATH,"/",gene_centric_results_name,"_",i,".Rdata")))
    results_coding_genome <- c(results_coding_genome, results)
  }
  
  ## The analysis stores a "try-error" object for any gene that failed. Such an
  ## entry has no mask components, and `$`/`[[` on it would abort the summary,
  ## so failed genes are reported and left out.
  failed <- vapply(results_coding_genome, inherits, logical(1), what = "try-error")
  if(any(failed)){
    message(trait,": skipping ",sum(failed)," gene result(s) that failed in the coding analysis")
    results_coding_genome <- results_coding_genome[!failed]
  }
  
  results_coding_genome <- lapply(results_coding_genome, combine_jake)
  results_coding_genome <- rbindlist(results_coding_genome)
  results_coding_genome$Number_SNV <- as.numeric(results_coding_genome$Number_SNV)
  ## keep only masks with fewer than 2000 variants
  results_coding_genome <- results_coding_genome[results_coding_genome$Number_SNV < 2000,]
  
  write.csv(results_coding_genome,file = paste0(OUTPUT_PATH,"/",trait,"_coding_sig.csv"),row.names = FALSE)
}

Overwriting Coding_Analysis_Summary.R


In [47]:
%%writefile Coding_Analysis_Summary.sh
#!/bin/bash

# Runs the summary R script on INPUT_PATH and writes to OUTPUT_PATH; all three
# values are provided through the environment.

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Every expansion is quoted so that a value containing whitespace or glob
# characters reaches Rscript as exactly one argument.
Rscript "${R_Script}" "${INPUT_PATH}" "${OUTPUT_PATH}"

Overwriting Coding_Analysis_Summary.sh


In [48]:
%%writefile score_task.R

## Task table for the summary job: a single row whose header cells are dsub
## flags, written as a tab-separated file.
tasks <- data.frame(
    '--input-recursive INPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GeneCentricCoding",
    '--input R_Script'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/Coding_Analysis_Summary.R",
    '--output-recursive OUTPUT_PATH'="gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Results/Continuous/GeneCentricCoding",
    check.names = FALSE
)

write.table(tasks,
            file="score_task.txt",
            row.names=FALSE, col.names=TRUE,
            sep='\t', quote=FALSE)

Overwriting score_task.R


In [49]:
!Rscript score_task.R

In [50]:
!gsutil -m cp Coding_Analysis_Summary.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://Coding_Analysis_Summary.R [Content-Type=application/octet-stream]...
/ [1/1 files][  2.3 KiB/  2.3 KiB] 100% Done                                    
Operation completed over 1 objects/2.3 KiB.                                      


In [51]:
%%bash --out score_batch

source ~/aou_dsub.bash # This file was created via notebook 01_dsub_setup.ipynb.

# Job-specific options; aou_dsub appends them after its own defaults, so the
# --image and --logging given here are passed in addition to the ones it sets.
dsub_options=(
  --image willja16/r_with_plink
  --disk-size 100
  --boot-disk-size 25
  --min-ram 16
  --timeout "96h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script Coding_Analysis_Summary.sh
  --tasks score_task.txt
)

aou_dsub "${dsub_options[@]}"

Job properties:
  job-id: coding-ana--williamsjacr--240805-203638-20
  job-name: coding-analysis-summary
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/18328105492588917735
Launched job-id: coding-ana--williamsjacr--240805-203638-20
1 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'coding-ana--williamsjacr--240805-203638-20' --users 'williamsjacr' --status '*'
To cancel the job, run:
  ddel --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'coding-ana--williamsjacr--240805-203638-20' --users 'williamsjacr'
Waiting for job to complete...
Monitoring for failed tasks to retry...
*** This dsub process must continue running to retry failed tasks.


In [None]:
%%writefile Single_RareVariant_PRS.R
rm(list = ls())
library(gdsfmt)
library(SeqArray)
library(SeqVarTools)
library(STAAR)
library(STAARpipeline)
library(STAARpipelineSummary)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(readr)
library(dplyr)
library(caret)
library(ranger)
library(SuperLearner)
library(dplyr)
library(boot)
library(stringr)
library(glmnet)

## Per-sample rare-variant burden of one gene for one coding mask.
##
## Arguments:
##   chr, gene_name   chromosome and gene; the gene is looked up in the global
##                    `genes_info` table (column 1 = name, 2 = chromosome,
##                    3/4 = the position bounds used to select variants).
##   category         coding mask: "plof", "plof_ds", "missense",
##                    "disruptive_missense" or "synonymous".
##   genofile         open GDS handle; its filter is changed while the function
##                    runs and is reset when the function exits.
##   obj_nullmodel    object whose `id_include` lists the samples to use.
##   rare_maf_cutoff  variants with 0 < MAF < rare_maf_cutoff are summed.
##   rv_num_cutoff    accepted for signature compatibility; not used in the body.
##   QC_label, variant_type, Annotation_dir, Annotation_name_catalog
##                    locate the QC filter and annotation nodes in the GDS file.
##   geno_missing_imputation  "mean" or "minor"; selects matrix_flip_mean() or
##                    matrix_flip_minor() for the genotype matrix.
##   silent           not used in the body.
##
## Returns a one-column matrix with one row per entry of
## obj_nullmodel$id_include (in that order): the row sums of the rare-variant
## genotype columns. When the mask contains exactly one variant, a column of
## zeros is returned instead.
Gene_Centric_Coding_G_Star <- function(chr,gene_name,category=c("plof","plof_ds","missense","disruptive_missense","synonymous"),
                                       genofile,obj_nullmodel,rare_maf_cutoff=0.01,rv_num_cutoff=2,
                                       QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                                       Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,silent=FALSE){
  ## evaluate choices
  category <- match.arg(category)
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  genes <- genes_info[genes_info[,2]==chr,]  
  
  phenotype.id <- obj_nullmodel$id_include
  ## get SNV id, position, REF, ALT (whole genome)
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant"){
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV"){
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel"){
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  ## From here on the filter of the caller's GDS handle is modified. Reset it on
  ## every exit path: the single-variant early return below and any error
  ## included, not only the normal return at the end.
  on.exit(seqResetFilter(genofile), add = TRUE)
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## plof
  ## Gencode_Exonic
  GENCODE.EXONIC.Category  <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Gencode
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  ## Meta.SVM.Pred
  MetaSVM_pred <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="MetaSVM")]))
  
  variant.id.gene <- seqGetData(genofile, "variant.id")  
  
  ## Building blocks shared by the masks below.
  is.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|(GENCODE.Category=="ncRNA_splicing")|(GENCODE.Category=="ncRNA_exonic;splicing")
  is.disruptive <- (GENCODE.EXONIC.Category=="nonsynonymous SNV")&(MetaSVM_pred=="D")
  
  ## Variants of the gene that belong to the requested mask
  ## (match.arg() guarantees one of the five names).
  in.mask <- switch(category,
                    plof = is.plof,
                    plof_ds = is.plof|is.disruptive,
                    missense = (GENCODE.EXONIC.Category=="nonsynonymous SNV"),
                    disruptive_missense = is.disruptive,
                    synonymous = (GENCODE.EXONIC.Category=="synonymous SNV"))
  variant.id.gene <- variant.id.gene[in.mask]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## genotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  
  ## Row of the genotype matrix for each phenotype id, in phenotype order.
  id.genotype.merge <- data.frame(id.genotype,index=seq(1,length(id.genotype)))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)))
  {
    if(dim(Geno)[2]>0)
    {
      if(geno_missing_imputation=="mean")
      {
        Geno <- matrix_flip_mean(Geno)$Geno
      }
      if(geno_missing_imputation=="minor")
      {
        Geno <- matrix_flip_minor(Geno)$Geno
      }
    }
  }
  
  genotype <- Geno
  
  ## A mask with a single variant contributes an all-zero burden.
  if(dim(genotype)[2] == 1){
    return(matrix(0,nrow = dim(genotype)[1],ncol = 1))
  }
  
  if(!is.null(attr(class(genotype), "package")) && attr(class(genotype), "package") == "Matrix"){
    genotype <- as.matrix(genotype)
  }
  genotype <- matrix_flip(genotype)
  MAF <- genotype$MAF
  RV_label <- as.vector((MAF<rare_maf_cutoff)&(MAF>0))
  Geno_rare <- genotype$Geno[,RV_label]
  G <- Geno_rare
  rm(Geno_rare)
  gc()
  
  ## Selecting a single column drops the matrix to a vector; restore the shape.
  if(is.null(dim(G))){
    G <- matrix(G,ncol = 1)
  }
  
  ## Row sums of the rare-variant columns, kept as an n x 1 matrix.
  C <- G%*%matrix(1,nrow=ncol(G),ncol = 1)
  
  return(C)
}

## Positional command-line arguments, in the order the calling shell script passes them.
cli_args <- commandArgs(TRUE)

trait <- cli_args[1]                      # phenotype column name
Train_Pvals <- cli_args[2]                # CSV of gene-centric coding results
Train_NullModel <- cli_args[3]            # RData file: training null model
Tune_NullModel <- cli_args[4]             # RData file: tuning null model
Validation_NullModel <- cli_args[5]       # RData file: validation null model
Annotation_name_catalog <- cli_args[6]    # CSV path (replaced by its contents further down)
INPUT_PATH <- cli_args[7]                 # directory holding exome.chr<chr>.gds
all_phenotypes_file <- cli_args[8]        # CSV with IID and ancestry
all_train_file <- cli_args[9]             # tab-delimited phenotypes, training split
all_tune_file <- cli_args[10]             # tab-delimited phenotypes, tuning split
all_validation_file <- cli_args[11]       # tab-delimited phenotypes, validation split
OUTPUT_PATH <- cli_args[12]               # directory that receives the result files

## Echo every argument to the job log, in positional order.
for(arg_value in list(trait,Train_Pvals,Train_NullModel,Tune_NullModel,Validation_NullModel,
                      Annotation_name_catalog,INPUT_PATH,all_phenotypes_file,all_train_file,
                      all_tune_file,all_validation_file,OUTPUT_PATH)){
  print(arg_value)
}

## Gene-centric coding results from the training set; keep masks with STAARB <= 1e-03.
## (STAARB is presumably the STAAR-B p-value column -- TODO confirm.)
Train_PVals_All <- read.csv(Train_Pvals)
# which() drops rows whose STAARB is NA. Plain logical indexing would turn every NA into an
# all-NA row, and such a row later yields a GDS path for chromosome "NA".
Train_PVals_All <- Train_PVals_All[which(Train_PVals_All$STAARB <= 1e-03),]

## Null Model
# Each .RData file is expected to hold a single object; get(load(.)) fetches it by name.
obj_nullmodel_train <- get(load(Train_NullModel))
obj_nullmodel_tune <- get(load(Tune_NullModel))
obj_nullmodel_validation <- get(load(Validation_NullModel))

# Combined object: the training null model, but with the sample IDs of all three splits
# (train, then tune, then validation) so genotypes are extracted for everyone in one pass.
obj_nullmodel <- obj_nullmodel_train
obj_nullmodel$id_include <- c(obj_nullmodel_train$id_include,obj_nullmodel_tune$id_include,obj_nullmodel_validation$id_include)

## Parameter
QC_label <- "annotation/filter"
geno_missing_imputation <- "mean"
variant_type <- "SNV"

## Annotation_dir
Annotation_dir <- "annotation/info/FunctionalAnnotation"
## Annotation channel
# The variable held the CSV path until here; from now on it holds the table itself.
Annotation_name_catalog <- read.csv(Annotation_name_catalog)

## One burden column per selected gene/mask, for every sample in obj_nullmodel$id_include.
if(nrow(Train_PVals_All) == 0){
  # Fail with a clear message instead of looping over 1:0 and opening "exome.chrNA.gds".
  stop("No gene-centric coding masks pass the STAARB <= 1e-03 filter; nothing to build a PRS from.")
}

G_star_gene_centric_coding <- list()

for(i in seq_len(nrow(Train_PVals_All))){
  ## Chr
  chr <- Train_PVals_All$Chr[i]
  ## Gene name
  gene_name <- Train_PVals_All$Gene[i]
  ## Coding mask
  category <- Train_PVals_All$Category[i]
  
  ### gds file
  gds.path <- paste0(INPUT_PATH,"/exome.chr",chr,".gds")
  genofile <- seqOpen(gds.path)
  
  # finally= guarantees the GDS handle is released even when the extraction fails.
  G_star_gene_centric_coding[[i]] <- tryCatch(
    Gene_Centric_Coding_G_Star(chr=chr,gene_name=gene_name,category=category,
                               genofile=genofile,obj_nullmodel=obj_nullmodel,rare_maf_cutoff=0.01,rv_num_cutoff=2,
                               QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                               Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,silent=FALSE),
    finally = seqClose(genofile))
} 

## Bind the per-gene columns into one samples x genes matrix.
G_star_gene_centric_coding <- do.call(cbind,G_star_gene_centric_coding)

## Keep a gene only if more than 10 entries are non-zero and its column total exceeds 10.
n_nonzero <- apply(G_star_gene_centric_coding,2,function(x){sum(x != 0)})
column_total <- colSums(G_star_gene_centric_coding)
col_keep <- (n_nonzero > 10) & (column_total > 10)

G_star_gene_centric_coding <- G_star_gene_centric_coding[,col_keep,drop = FALSE]
# Keep the result table row-aligned with the remaining matrix columns.
Train_PVals_All <- Train_PVals_All[col_keep,]

## Split the rows back into the three sample sets.
ids_gstar <- obj_nullmodel$id_include
in_train <- ids_gstar %in% obj_nullmodel_train$id_include
in_tune <- ids_gstar %in% obj_nullmodel_tune$id_include
in_validation <- ids_gstar %in% obj_nullmodel_validation$id_include

G_star_gene_centric_coding_train <- G_star_gene_centric_coding[in_train,]
G_star_gene_centric_coding_tune <- G_star_gene_centric_coding[in_tune,]
G_star_gene_centric_coding_vad <- G_star_gene_centric_coding[in_validation,]

rm(G_star_gene_centric_coding)

## Covariates of the per-split null phenotype model.
null_covariates <- "age+age2+sex+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10"

# Build the burden design matrix and the covariate-adjusted phenotype for one split.
#   split_ids   sample IDs of the rows of G_split, in row order
#   G_split     burden matrix (samples x genes) of this split
#   pheno_file  tab-delimited phenotype file with IID, the trait column and the covariates
#   resid_name  name of the column that receives the phenotype residual
# Returns list(pheno = phenotype table joined with the burden columns,
#              X     = burden matrix in the row order of pheno).
Prepare_Split <- function(split_ids,G_split,pheno_file,resid_name){
  burden <- data.frame(IID = split_ids,G_split)
  # Select the burden columns by name rather than by position, so the result does not
  # depend on how many columns the phenotype file happens to have.
  burden_cols <- setdiff(colnames(burden),"IID")
  
  pheno <- read.delim(pheno_file)
  pheno <- inner_join(pheno,burden,by = "IID")
  X <- as.matrix(pheno[,burden_cols,drop = FALSE])
  
  # na.exclude makes residuals() return one value per row of pheno, with NA wherever the
  # trait OR any covariate is missing, so residuals stay aligned with their samples.
  null_fit <- lm(as.formula(paste0(trait,"~",null_covariates)),data = pheno,na.action = na.exclude)
  pheno[[resid_name]] <- as.numeric(residuals(null_fit))
  
  list(pheno = pheno,X = X)
}

split_train <- Prepare_Split(ids_gstar[ids_gstar %in% obj_nullmodel_train$id_include],
                             G_star_gene_centric_coding_train,all_train_file,"y_train")
pheno_train <- split_train$pheno
X_train <- split_train$X
rm(split_train)
# Progress markers kept so the job log reads as before.
print("Made it")
print("Made it")

split_tune <- Prepare_Split(ids_gstar[ids_gstar %in% obj_nullmodel_tune$id_include],
                            G_star_gene_centric_coding_tune,all_tune_file,"y_tune")
pheno_tune <- split_tune$pheno
X_tune <- split_tune$X
rm(split_tune)

split_valid <- Prepare_Split(ids_gstar[ids_gstar %in% obj_nullmodel_validation$id_include],
                             G_star_gene_centric_coding_vad,all_validation_file,"y_validation")
pheno_valid <- split_valid$pheno
X_valid <- split_valid$X
rm(split_valid)

print("Made it")

## Training-set fits of the phenotype residual on the gene burden columns:
## lasso and ridge (whole glmnet path) plus an unpenalised least-squares fit.
y_train_resid <- pheno_train[,"y_train"]
lasso_train <- glmnet(X_train,y_train_resid,family = "gaussian",alpha = 1)
ridge_train <- glmnet(X_train,y_train_resid,family = "gaussian",alpha = 0)
lm_train <- lm.fit(cbind(1,X_train),y_train_resid)
# Coefficients lm.fit could not estimate come back as NA; treat them as zero weight.
lm_train$coefficients[is.na(lm_train$coefficients)] <- 0

## Genes x models table of weights: every lasso model, every ridge model, then the lm fit
## (its intercept dropped).
lasso_beta <- as.matrix(lasso_train$beta)
ridge_beta <- as.matrix(ridge_train$beta)
beta_matrix <- as.data.frame(cbind(lasso_beta,ridge_beta,lm_train$coefficients[-1]))
colnames(beta_matrix) <- c(paste0("lasso_prs",1:ncol(lasso_beta)),
                           paste0("ridge_prs",1:ncol(ridge_beta)),
                           paste0("lm_prs",1))

# Prefix the first four columns of the gene table so each weight row is labelled.
beta_matrix <- cbind(Train_PVals_All[,c(1:4)],beta_matrix)

print("Made it")

## Score the tuning and validation samples with every training-set model.
lasso_prs_tune <- predict(lasso_train,X_tune)
ridge_prs_tune <- predict(ridge_train,X_tune)
lm_prs_tune <- as.numeric(cbind(1,X_tune)%*%matrix(lm_train$coefficients,ncol = 1))

lasso_prs_vad <- predict(lasso_train,X_valid)
ridge_prs_vad <- predict(ridge_train,X_valid)
lm_prs_vad <- as.numeric(cbind(1,X_valid)%*%matrix(lm_train$coefficients,ncol = 1))

## One column per candidate PRS, named like the columns of beta_matrix
## (lasso_prs<k>, ridge_prs<k>, lm_prs1). The per-method *_tune_dat / *_valid_dat data
## frames that used to be built here were never read again and have been removed.
all_prs_tune <- cbind(lasso_prs_tune,ridge_prs_tune,lm_prs_tune)
colnames(all_prs_tune) <- c(paste0("lasso_prs",1:ncol(lasso_prs_tune)),paste0("ridge_prs",1:ncol(ridge_prs_tune)),"lm_prs1")
all_prs_valid <- cbind(lasso_prs_vad,ridge_prs_vad,lm_prs_vad)
colnames(all_prs_valid) <- c(paste0("lasso_prs",1:ncol(lasso_prs_vad)),paste0("ridge_prs",1:ncol(ridge_prs_vad)),"lm_prs1")

all_prs_tune <- as.data.frame(all_prs_tune)
all_prs_valid <- as.data.frame(all_prs_valid)

## Prune redundant candidate scores. Every decision is made on the tuning set and the same
## columns are then removed from the validation set. Columns are dropped by name with base
## indexing, which behaves the same whether drop_cols is empty or not.

## 1) Scores whose correlation is NA with every other score (typically a constant column).
mtx <- cor(all_prs_tune)
drop_cols <- names(all_prs_tune)[apply(mtx,2,function(x){sum(is.na(x))}) == (nrow(mtx) - 1)]

all_prs_tune <- all_prs_tune[,!(names(all_prs_tune) %in% drop_cols),drop = FALSE]
all_prs_valid <- all_prs_valid[,!(names(all_prs_valid) %in% drop_cols),drop = FALSE]

## 2) Scores flagged by caret::findCorrelation at a pairwise cutoff of 0.98.
mtx <- cor(all_prs_tune)
drop_cols <- findCorrelation(mtx,cutoff=0.98)
drop_cols <- names(all_prs_tune)[drop_cols]

all_prs_tune <- all_prs_tune[,!(names(all_prs_tune) %in% drop_cols),drop = FALSE]
all_prs_valid <- all_prs_valid[,!(names(all_prs_valid) %in% drop_cols),drop = FALSE]

## 3) Scores that are exact linear combinations of others (caret::findLinearCombos).
drop_cols <- findLinearCombos(all_prs_tune)$remove
drop_cols <- names(data.frame(all_prs_tune))[drop_cols]

all_prs_tune <- all_prs_tune[,!(names(all_prs_tune) %in% drop_cols),drop = FALSE]
all_prs_valid <- all_prs_valid[,!(names(all_prs_valid) %in% drop_cols),drop = FALSE]

print("Made it")


  # Ensemble weights for a continuous outcome.
  #   x  table/matrix of candidate scores (one column per score)
  #   y  outcome; rows where y is NA are dropped from both x and y
  # Fits lasso and ridge glmnet paths of y on x, picks the fitted model whose in-sample
  # prediction has the highest R-squared against y, and regresses that prediction on the
  # columns of x. Returns list(Coefficients = <intercept followed by one weight per column>).
  Ensemble_Function_Continuous <- function(x,y){
    observed <- !is.na(y)
    x <- as.matrix(x[observed,])
    y <- y[observed]
    
    lasso_fit <- glmnet(x,y,family = "gaussian",alpha = 1)
    ridge_fit <- glmnet(x,y,family = "gaussian",alpha = 0)
    
    # In-sample predictions of every model on both paths, lasso columns first.
    candidates <- cbind(predict(lasso_fit,x),predict(ridge_fit,x))
    
    # R-squared of y regressed on each candidate prediction.
    R2_Vector <- vapply(seq_len(ncol(candidates)),function(j){
      summary(lm(y~x_try,data = data.frame(y = y, x_try = candidates[,j])))$r.squared
    },numeric(1))
    
    # Express the best prediction as a linear function of the input columns.
    best_prediction <- candidates[,which.max(R2_Vector)]
    list(Coefficients = coef(lm(y~.,data.frame(y = best_prediction,x))))
  }
  # Ensemble weights for a binary outcome.
  #   x  table/matrix of candidate scores (one column per score)
  #   y  outcome; rows where y is NA are dropped from both x and y
  # Fits lasso and ridge binomial glmnet paths of y on x, picks the fitted model whose
  # in-sample prediction has the highest AUC (roc.binary), and regresses that prediction on
  # the columns of x. Returns list(Coefficients = <intercept followed by one weight per column>).
  Ensemble_Function_Binary <- function(x,y){
    observed <- !is.na(y)
    x <- as.matrix(x[observed,])
    y <- y[observed]
    
    lasso_fit <- glmnet(x,y,family = "binomial",alpha = 1)
    ridge_fit <- glmnet(x,y,family = "binomial",alpha = 0)
    
    # In-sample predictions of every model on both paths, lasso columns first.
    candidates <- cbind(predict(lasso_fit,x),predict(ridge_fit,x))
    
    # AUC of each candidate prediction against y.
    AUC_Vector <- numeric(ncol(candidates))
    for(j in seq_len(ncol(candidates))){
      roc_obj <- roc.binary(status = "y",
                            variable = "x_try",
                            confounders = "~1",
                            data = data.frame(y = y, x_try = candidates[,j]),
                            precision=seq(0.05,0.95, by=0.05))
      AUC_Vector[j] <- roc_obj$auc
    }
    
    # Express the best prediction as a linear function of the input columns.
    best_prediction <- candidates[,which.max(AUC_Vector)]
    list(Coefficients = coef(lm(y~.,data.frame(y = best_prediction,x))))
  }
  # Dispatch to the ensemble fitter that matches the outcome type.
  #   x, y    passed through unchanged to the chosen fitter
  #   family  "continuous" (the default) or "binary"
  # Returns whatever the chosen fitter returns (a list with element Coefficients).
  Ensemble_Function <- function(x,y,family = c("continuous","binary")){
    # match.arg collapses the two-element default to "continuous"; comparing the raw default
    # inside if() is an error on current R. Unknown values are rejected rather than silently
    # treated as "binary".
    family <- match.arg(family)
    if(family == "continuous"){
      return(Ensemble_Function_Continuous(x,y))
    }
    Ensemble_Function_Binary(x,y)
  }

## Structure of the pruned candidate-score table, for the job log.
## (str() prints by itself; wrapping it in print() only added a stray "NULL" line.)
str(all_prs_tune)

## Learn the ensemble weights of the remaining candidate scores on the tuning set.
Results <- Ensemble_Function(x = all_prs_tune,y = pheno_tune[,"y_tune"],family = "continuous")
# A weight that lm could not estimate contributes nothing.
Results$Coefficients[is.na(Results$Coefficients)] <- 0

print("Made it")

# Names and weights of the candidate scores; the intercept (first coefficient) is dropped.
prs_names <- names(Results$Coefficients)[-1]
prs_weights <- matrix(Results$Coefficients[-1],ncol = 1)

## Per-gene weight of the final PRS: ensemble-weighted sum of the per-model gene weights.
Final_Coefficients <- data.frame(beta_matrix[,1:4],BETA = as.matrix(beta_matrix[,prs_names,drop = FALSE]) %*% prs_weights)
print("Made it")
write.csv(Final_Coefficients,file = paste0(OUTPUT_PATH,"/",trait,"_final_coef.csv"),row.names = FALSE)

## Final PRS of every tuning and validation sample.
PRS_Tune <- as.matrix(all_prs_tune[,prs_names,drop = FALSE]) %*% prs_weights
PRS_Validation <- as.matrix(all_prs_valid[,prs_names,drop = FALSE]) %*% prs_weights

print("Made it")

PRS_Tune <- data.frame(IID = pheno_tune$IID,PRS = PRS_Tune)
write.csv(PRS_Tune,paste0(OUTPUT_PATH,"/",trait,"_PRS_Tune.csv"),row.names = FALSE)
PRS_Validation <- data.frame(IID = pheno_valid$IID,PRS = PRS_Validation)
write.csv(PRS_Validation,paste0(OUTPUT_PATH,"/",trait,"_PRS_Validation.csv"),row.names = FALSE)


all_phenotypes <- read.csv(all_phenotypes_file)

## Validation table: covariates, phenotype residual and the final PRS on its raw scale.
validation_cols <- c("IID","age","age2","sex",paste0("PC",1:10),"y_validation")
RV_PRS_raw <- inner_join(pheno_valid[,validation_cols],PRS_Validation)
RV_PRS_adjusted <- RV_PRS_raw

## PC-adjusted PRS: remove the mean trend of the PRS along PC1-PC5, model the squared
## residuals on the same PCs, and divide each residual by its fitted standard deviation.
adjust_pcs <- c("PC1","PC2","PC3","PC4","PC5")

mean_fit <- lm(y~.,data = data.frame(y = RV_PRS_adjusted[,"PRS"],RV_PRS_adjusted[,adjust_pcs]))
prs_resid <- mean_fit$residuals

variance_data <- data.frame(y = prs_resid^2,RV_PRS_adjusted[,adjust_pcs])
variance_fit <- lm(y~.,data = variance_data)
variance_hat <- predict(variance_fit,variance_data)
if(sum(variance_hat < 0) > 0){
  # A negative fitted variance has no square root: fall back to one common variance.
  variance_fit <- lm(y~1,data = variance_data)
  variance_hat <- predict(variance_fit,variance_data)
}

# With an all-zero fitted variance the adjusted score is set to 0 rather than divided by 0.
RV_PRS_adjusted[,"PRS"] <- if(sum(sqrt(variance_hat)) == 0) 0 else prs_resid/sqrt(variance_hat)


## ---------------------------------------------------------------------------
## Ancestry-stratified evaluation of the PRS in the validation set
## ---------------------------------------------------------------------------
## For every ancestry and both PRS versions (raw, PC-adjusted) the standardised phenotype
## residual is regressed on the PRS. The slope and R-squared are reported together with
## bootstrap standard errors, and all bootstrap replicates are saved as well.
## NOTE: no RNG seed is set here, so the bootstrap draws differ from run to run.

# Ancestry groups, in the row order of *Best_Betas.csv and column order of *_Bootstraps.csv.
ancestry_labels <- c("EUR","SAS","AMR","AFR","EAS","MID")
# Number of bootstrap replicates behind every standard error.
n_boot <- 10000

# boot() statistic: slope of y_validation on PRS in the resampled rows.
Beta_Boot <- function(data,indices){
  boot_data <- data[indices, ]
  result <- coef(lm(as.formula(paste0("y_validation~","PRS")),data = boot_data))[2]
  return(c(result))
}

# boot() statistic: R-squared of y_validation regressed on PRS in the resampled rows.
R2_Boot <- function(data,indices){
  boot_data <- data[indices, ]
  result <- summary(lm(as.formula(paste0("y_validation~","PRS")),data = boot_data))$r.squared
  return(c(result))
}

# Rows of prs_data that belong to one ancestry, with y_validation (and, when scale_prs is
# TRUE, PRS) standardised within that ancestry.
Subset_Ancestry <- function(prs_data,ancestry,scale_prs){
  ancestry_ids <- all_phenotypes$IID[all_phenotypes$ancestry == ancestry]
  out <- prs_data[prs_data$IID %in% ancestry_ids,]
  out$y_validation <- scale(out$y_validation)
  if(scale_prs){
    out$PRS <- scale(out$PRS)
  }
  return(out)
}

# Slope and R-squared of y_validation ~ PRS with bootstrap standard errors.
#   prs_data  table with columns y_validation and PRS
#   label     text identifying the group in the warning message
# Returns list(beta, beta_se, beta_boot, R2, R2_se, R2_boot); the *_boot entries hold the
# n_boot bootstrap replicates. With fewer than two complete rows everything is NA, so one
# empty ancestry group no longer aborts the whole trait.
Evaluate_PRS <- function(prs_data,label){
  n_complete <- sum(!is.na(prs_data$y_validation) & !is.na(prs_data$PRS))
  if(n_complete < 2){
    warning("Fewer than 2 complete observations for ",label,"; reporting NA.")
    na_boot <- rep(NA_real_,n_boot)
    return(list(beta = NA_real_,beta_se = NA_real_,beta_boot = na_boot,
                R2 = NA_real_,R2_se = NA_real_,R2_boot = na_boot))
  }
  
  fit <- lm(y_validation~PRS,data = prs_data)
  # The slope is bootstrapped before R-squared, matching the order of the random draws
  # of the original stanza-per-ancestry code.
  boot_beta <- boot(data = prs_data, statistic = Beta_Boot, R = n_boot)
  boot_R2 <- boot(data = prs_data, statistic = R2_Boot, R = n_boot)
  
  list(beta = unname(coef(fit)[2]),
       beta_se = sd(boot_beta$t),
       beta_boot = as.numeric(boot_beta$t),
       R2 = summary(fit)$r.squared,
       R2_se = sd(boot_R2$t),
       R2_boot = as.numeric(boot_R2$t))
}

## Evaluate raw scores first, then adjusted scores, each in ancestry_labels order.
## Only the raw PRS is standardised within ancestry, as before.
prs_versions <- list(raw = RV_PRS_raw,adjusted = RV_PRS_adjusted)
prs_eval <- list(raw = list(),adjusted = list())
for(prs_type in names(prs_versions)){
  for(anc in ancestry_labels){
    anc_data <- Subset_Ancestry(prs_versions[[prs_type]],anc,scale_prs = (prs_type == "raw"))
    prs_eval[[prs_type]][[anc]] <- Evaluate_PRS(anc_data,paste(prs_type,anc))
  }
}

# One scalar statistic per ancestry, in ancestry_labels order.
Collect_Stat <- function(prs_type,stat){
  vapply(ancestry_labels,function(anc){prs_eval[[prs_type]][[anc]][[stat]]},numeric(1),USE.NAMES = FALSE)
}

RV_PRS_Results <- data.frame(trait = trait,ancestry = ancestry_labels, 
                             beta_raw = Collect_Stat("raw","beta"), 
                             beta_se_raw = Collect_Stat("raw","beta_se"), 
                             R2_raw = Collect_Stat("raw","R2"),
                             R2_se_raw = Collect_Stat("raw","R2_se"),
                             beta_adjusted = Collect_Stat("adjusted","beta"), 
                             beta_se_adjusted = Collect_Stat("adjusted","beta_se"), 
                             R2_adjusted = Collect_Stat("adjusted","R2"),
                             R2_se_adjusted = Collect_Stat("adjusted","R2_se"))

## Bootstrap replicates, one column per statistic:
## beta_<type>_<ancestry>_boot followed by R2_<type>_<ancestry>_boot.
boot_columns <- list()
for(prs_type in names(prs_versions)){
  for(anc in ancestry_labels){
    boot_columns[[paste0("beta_",prs_type,"_",anc,"_boot")]] <- prs_eval[[prs_type]][[anc]]$beta_boot
    boot_columns[[paste0("R2_",prs_type,"_",anc,"_boot")]] <- prs_eval[[prs_type]][[anc]]$R2_boot
  }
}
RV_Boot_Results <- data.frame(trait = trait,boot_columns)

# The first file name has no "_" between the trait and "Best_Betas"; it is kept unchanged
# so that existing consumers of these outputs still find the file.
write.csv(RV_PRS_Results,file = paste0(OUTPUT_PATH,"/",trait,"Best_Betas.csv"),row.names = FALSE)
write.csv(RV_Boot_Results,file = paste0(OUTPUT_PATH,"/",trait,"_Bootstraps.csv"),row.names = FALSE)


In [12]:
%%writefile Single_RareVariant_PRS.sh
#!/bin/bash

# dsub task script: runs the rare-variant PRS R script for one trait.
# Every variable below is supplied by dsub from the matching column of the tasks file
# (--env / --input / --input-recursive / --output-recursive).

set -o errexit
set -o nounset

# Expansions are quoted so each value reaches Rscript as exactly one argument, even if a
# path contains spaces or glob characters.
Rscript "${R_Script}" "${trait}" "${Train_Pvals}" "${Train_NullModel}" "${Tune_NullModel}" "${Validation_NullModel}" "${Annotation_name_catalog}" "${INPUT_PATH}" "${all_phenotypes_file}" "${all_train_file}" "${all_tune_file}" "${all_validation_file}" "${OUTPUT_PATH}"

Writing Single_RareVariant_PRS.sh


In [13]:
%%writefile score_task.R

# Build score_task.txt, the dsub --tasks table: one row (one dsub task) per trait. Each
# column header is a dsub flag followed by a variable name; each cell is that variable's
# value for the task.

# Google Cloud Storage locations shared by all tasks.
bucket <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e"
pheno_dir <- paste0(bucket,"/JW/AoU_Phenotypes")
results_dir <- paste0(pheno_dir,"/Results")
continuous_dir <- paste0(results_dir,"/Continuous")

traits <- c("BMI","LDL","HDL","logTG","TC","Height")

# One-row task table for a single trait; only the p-value file and the three null-model
# files depend on the trait.
Task_Row <- function(trait){
  data.frame(
    '--env trait'=trait,
    '--input-recursive INPUT_PATH'=paste0(bucket,"/dataAGDS/exome_v7.1"),
    '--input R_Script'=paste0(pheno_dir,"/Scripts/Single_RareVariant_PRS.R"),
    '--input Train_Pvals'=paste0(continuous_dir,"/GeneCentricCoding/",trait,"_coding_sig.csv"),
    '--input Train_NullModel'=paste0(continuous_dir,"/NullModels/",trait,"_Train_Null_Model.RData"),
    '--input Tune_NullModel'=paste0(continuous_dir,"/NullModels/",trait,"_Tune_Null_Model.RData"),
    '--input Validation_NullModel'=paste0(continuous_dir,"/NullModels/",trait,"_Validation_Null_Model.RData"),
    '--input Annotation_name_catalog'=paste0(bucket,"/dataGDS/acaf_threshold_v7/Annotation_name_catalog.csv"),
    '--input all_phenotypes_file'=paste0(results_dir,"/all_phenotypes.csv"),
    '--input all_train_file'=paste0(results_dir,"/All_Train.txt"),
    '--input all_tune_file'=paste0(results_dir,"/All_Tune.txt"),
    '--input all_validation_file'=paste0(results_dir,"/All_Validation.txt"),
    '--output-recursive OUTPUT_PATH'=paste0(continuous_dir,"/RareVariant_PRS"),
    check.names = FALSE
  )
}

tasks <- do.call(rbind,lapply(traits,Task_Row))

# Tab-separated, unquoted, with a header row.
write.table(tasks, 
            file="score_task.txt", 
            row.names=FALSE, col.names=TRUE, 
            sep='\t', quote=FALSE)

Overwriting score_task.R


In [14]:
# Run the task-table script written by the previous cell; it writes score_task.txt into the working directory.
!Rscript score_task.R

In [15]:
# Upload the R script to the Scripts/ folder that score_task.R lists as '--input R_Script' for every task.
!gsutil -m cp Single_RareVariant_PRS.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://Single_RareVariant_PRS.R [Content-Type=application/octet-stream]...
/ [1/1 files][ 27.8 KiB/ 27.8 KiB] 100% Done                                    
Operation completed over 1 objects/27.8 KiB.                                     


In [16]:
%%bash --out score_batch

# Load the aou_dsub wrapper function into this shell.
source ~/aou_dsub.bash # Written by the `%%writefile ~/aou_dsub.bash` cell near the top of this notebook.

# Submit one dsub task per row of score_task.txt; every task runs Single_RareVariant_PRS.sh.
# The flags below are forwarded to dsub after the wrapper's own defaults; --image and
# --logging are also set inside aou_dsub.
# NOTE(review): presumably the later occurrence wins for those two flags - confirm against dsub.
aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 100 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --timeout "96h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script Single_RareVariant_PRS.sh \
  --tasks score_task.txt

Job properties:
  job-id: single-rar--williamsjacr--250511-005419-70
  job-name: single-rarevariant-prs
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4588539315175791449
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10993787666976377948
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4946105226847872983
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10491198882476459573
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4444183596653775443
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/716398368260328813
Launched job-id: single-rar--williamsjacr--250511-005419-70
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'single-rar--williamsjacr--25051

In [8]:
%%writefile RareVariantCoefficients.R
rm(list = ls())
library(gdsfmt)
library(SeqArray)
library(SeqVarTools)
library(STAAR)
library(STAARpipeline)
library(STAARpipelineSummary)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(readr)
library(dplyr)
library(caret)
library(ranger)
library(SuperLearner)
library(dplyr)
library(boot)
library(stringr)
library(glmnet)

# Rare-variant burden score for one coding mask of one gene.
#
# Selects the variants of `gene_name` on chromosome `chr` that pass the QC filter, match
# `variant_type`, lie between the gene's start and end positions and belong to the
# functional `category`; then sums each sample's genotype (as returned by matrix_flip)
# over the selected variants whose MAF is strictly between 0 and `rare_maf_cutoff`.
#
# Arguments
#   chr, gene_name           gene to score; looked up in `genes_info` (column 1 = gene name,
#                            2 = chromosome, 3 = start, 4 = end). `genes_info` is not defined
#                            in this script -- presumably the gene table attached by
#                            STAARpipeline; TODO confirm.
#   category                 coding mask: "plof", "plof_ds", "missense",
#                            "disruptive_missense" or "synonymous".
#   genofile                 open SeqArray GDS handle for chromosome `chr`.
#   obj_nullmodel            object whose $id_include lists the samples (and row order) to return.
#   rare_maf_cutoff          upper MAF bound (exclusive) for a variant to be included.
#   rv_num_cutoff, silent    accepted for signature compatibility; not used in the body.
#   QC_label                 GDS node with the per-variant QC filter (only "PASS" is kept).
#   variant_type             "SNV", "Indel" (PASS and not SNV) or "variant" (any PASS variant).
#   geno_missing_imputation  "mean" or "minor": selects matrix_flip_mean or matrix_flip_minor
#                            for imputing missing genotypes.
#   Annotation_dir           GDS folder of the functional annotations.
#   Annotation_name_catalog  data frame mapping an annotation `name` to its GDS sub-path `dir`.
#
# Value
#   One-column matrix with one row per entry of obj_nullmodel$id_include. When exactly one
#   variant is selected the column is all zeros.
#
# Any filter this function sets on `genofile` is cleared again on exit, including on the
# single-variant early return and on errors.
Gene_Centric_Coding_G_Star <- function(chr,gene_name,category=c("plof","plof_ds","missense","disruptive_missense","synonymous"),
                                       genofile,obj_nullmodel,rare_maf_cutoff=0.01,rv_num_cutoff=2,
                                       QC_label="annotation/filter",variant_type=c("SNV","Indel","variant"),geno_missing_imputation=c("mean","minor"),
                                       Annotation_dir="annotation/info/FunctionalAnnotation",Annotation_name_catalog,silent=FALSE){
  ## evaluate choices
  category <- match.arg(category)
  variant_type <- match.arg(variant_type)
  geno_missing_imputation <- match.arg(geno_missing_imputation)
  
  genes <- genes_info[genes_info[,2]==chr,]  
  
  phenotype.id <- obj_nullmodel$id_include
  ## get SNV id, position, REF, ALT (whole genome)
  filter <- seqGetData(genofile, QC_label)
  if(variant_type=="variant"){
    SNVlist <- filter == "PASS"
  }
  
  if(variant_type=="SNV"){
    SNVlist <- (filter == "PASS") & isSNV(genofile)
  }
  
  if(variant_type=="Indel"){
    SNVlist <- (filter == "PASS") & (!isSNV(genofile))
  }
  
  position <- as.numeric(seqGetData(genofile, "position"))
  variant.id <- seqGetData(genofile, "variant.id")
  
  rm(filter)
  gc()
  
  ### Gene
  kk <- which(genes[,1]==gene_name)
  
  sub_start_loc <- genes[kk,3]
  sub_end_loc <- genes[kk,4]
  
  is.in <- (SNVlist)&(position>=sub_start_loc)&(position<=sub_end_loc)
  variant.id.gene <- variant.id[is.in]
  
  # From here on the handle carries a filter; make sure it is removed however we leave.
  on.exit(seqResetFilter(genofile),add = TRUE)
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## Annotations of the variants inside the gene
  ## Gencode_Exonic
  GENCODE.EXONIC.Category  <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.EXONIC.Category")]))
  ## Gencode
  GENCODE.Category <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="GENCODE.Category")]))
  ## Meta.SVM.Pred
  MetaSVM_pred <- seqGetData(genofile, paste0(Annotation_dir,Annotation_name_catalog$dir[which(Annotation_name_catalog$name=="MetaSVM")]))
  
  variant.id.gene <- seqGetData(genofile, "variant.id")  
  
  ## Membership of every in-gene variant in the requested coding mask
  is.plof <- (GENCODE.EXONIC.Category=="stopgain")|(GENCODE.EXONIC.Category=="stoploss")|(GENCODE.Category=="splicing")|(GENCODE.Category=="exonic;splicing")|(GENCODE.Category=="ncRNA_splicing")|(GENCODE.Category=="ncRNA_exonic;splicing")
  is.missense <- (GENCODE.EXONIC.Category=="nonsynonymous SNV")
  is.disruptive <- is.missense&(MetaSVM_pred=="D")
  in.mask <- switch(category,
                    plof = is.plof,
                    plof_ds = is.plof|is.disruptive,
                    missense = is.missense,
                    disruptive_missense = is.disruptive,
                    synonymous = (GENCODE.EXONIC.Category=="synonymous SNV"))
  variant.id.gene <- variant.id.gene[in.mask]
  
  seqSetFilter(genofile,variant.id=variant.id.gene,sample.id=phenotype.id)
  
  ## genotype id: row of the dosage matrix that belongs to each phenotype id
  id.genotype <- seqGetData(genofile,"sample.id")
  
  id.genotype.merge <- data.frame(id.genotype,index=seq_along(id.genotype))
  phenotype.id.merge <- data.frame(phenotype.id)
  phenotype.id.merge <- dplyr::left_join(phenotype.id.merge,id.genotype.merge,by=c("phenotype.id"="id.genotype"))
  id.genotype.match <- phenotype.id.merge$index
  
  ## Genotype, re-ordered to the order of phenotype.id
  Geno <- seqGetData(genofile, "$dosage")
  Geno <- Geno[id.genotype.match,,drop=FALSE]
  
  ## impute missing
  if(!is.null(dim(Geno)) && dim(Geno)[2]>0){
    if(geno_missing_imputation=="mean"){
      Geno <- matrix_flip_mean(Geno)$Geno
    }
    if(geno_missing_imputation=="minor"){
      Geno <- matrix_flip_minor(Geno)$Geno
    }
  }
  
  genotype <- Geno
  
  # A mask with exactly one variant is not scored: return an all-zero column.
  if(dim(genotype)[2] == 1){
    return(matrix(0,nrow = dim(genotype)[1],ncol = 1))
  }
  
  if(!is.null(attr(class(genotype), "package")) && attr(class(genotype), "package") == "Matrix"){
    genotype <- as.matrix(genotype)
  }
  genotype <- matrix_flip(genotype)
  MAF <- genotype$MAF
  RV_label <- as.vector((MAF<rare_maf_cutoff)&(MAF>0))
  # drop = FALSE keeps a samples x variants matrix even when a single variant remains.
  G <- genotype$Geno[,RV_label,drop = FALSE]
  
  ## Burden: per-sample sum over the selected variants (all zeros when none is selected)
  C <- G%*%matrix(1,nrow=ncol(G),ncol = 1)
  
  return(C)
}

## ---------------------------------------------------------------------------
## Driver: build one rare-variant burden column per significant gene/category
## mask, then regress the validation rare-variant PRS on those columns to
## recover a per-mask coefficient, written to <OUTPUT_PATH>/<trait>_final_coef.csv.
##
## Positional arguments (same order as RareVariantCoefficients.sh passes them):
##   1 trait                    2 Train_Pvals             3 Train_NullModel
##   4 Tune_NullModel           5 Validation_NullModel    6 Annotation_name_catalog
##   7 INPUT_PATH               8 RV_PRS                  9 OUTPUT_PATH
## ---------------------------------------------------------------------------
cli_args <- commandArgs(TRUE)

trait <- cli_args[1]
print(trait)

Train_Pvals <- cli_args[2]
print(Train_Pvals)

Train_NullModel <- cli_args[3]
print(Train_NullModel)

Tune_NullModel <- cli_args[4]
print(Tune_NullModel)

Validation_NullModel <- cli_args[5]
print(Validation_NullModel)

Annotation_name_catalog <- cli_args[6]
print(Annotation_name_catalog)

INPUT_PATH <- cli_args[7]
print(INPUT_PATH)

RV_PRS <- cli_args[8]
print(RV_PRS)

OUTPUT_PATH <- cli_args[9]
print(OUTPUT_PATH)

## Keep only the masks whose STAARB p-value passes the screening threshold.
Train_PVals_All <- read.csv(Train_Pvals)
Train_PVals_All <- Train_PVals_All[Train_PVals_All$STAARB <= 1e-03,]

## Fail early with a clear message: with zero rows the loop below has nothing
## to do and the later cbind/lm steps cannot produce coefficients.
if(nrow(Train_PVals_All) == 0){
  stop("No gene-centric coding masks with STAARB <= 1e-03 for trait ",trait,"; no coefficients to estimate.")
}

## Null Model
## load() restores the saved object and returns its name; get() fetches it.
obj_nullmodel_train <- get(load(Train_NullModel))
obj_nullmodel_tune <- get(load(Tune_NullModel))
obj_nullmodel_validation <- get(load(Validation_NullModel))

## Reuse the training null model but widen its sample list to train + tune + validation.
obj_nullmodel <- obj_nullmodel_train
obj_nullmodel$id_include <- c(obj_nullmodel_train$id_include,obj_nullmodel_tune$id_include,obj_nullmodel_validation$id_include)

## Parameter
QC_label <- "annotation/filter"
geno_missing_imputation <- "mean"
variant_type <- "SNV"

## Annotation_dir
Annotation_dir <- "annotation/info/FunctionalAnnotation"
## Annotation channel
Annotation_name_catalog <- read.csv(Annotation_name_catalog)

G_star_gene_centric_coding <- list()

## seq_len() (not 1:n) so an empty table can never iterate over c(1, 0).
for(i in seq_len(nrow(Train_PVals_All))){
  ## Chr
  chr <- Train_PVals_All$Chr[i]
  ## Gene name
  gene_name <- Train_PVals_All$Gene[i]
  ## Coding mask
  category <- Train_PVals_All$Category[i]
  
  ### gds file
  gds.path <- paste0(INPUT_PATH,"/exome.chr",chr,".gds")
  genofile <- seqOpen(gds.path)
  
  G_star_gene_centric_coding[[i]] <- Gene_Centric_Coding_G_Star(chr=chr,gene_name=gene_name,category=category ,
                                                                genofile,obj_nullmodel,rare_maf_cutoff=0.01,rv_num_cutoff=2,
                                                                QC_label=QC_label,variant_type=variant_type,geno_missing_imputation=geno_missing_imputation,
                                                                Annotation_dir=Annotation_dir,Annotation_name_catalog=Annotation_name_catalog,silent=FALSE) 
  seqClose(genofile) 
} 

## One column per mask.
G_star_gene_centric_coding <- do.call(cbind,G_star_gene_centric_coding)

## Keep a mask only if more than 10 rows are non-zero and its column sum exceeds 10.
## (This flag marks the columns to KEEP.)
col_keep <- apply(G_star_gene_centric_coding,2,function(x){sum(x != 0)}) > 10 & colSums(G_star_gene_centric_coding) > 10 
G_star_gene_centric_coding <- G_star_gene_centric_coding[,col_keep,drop = FALSE]

Train_PVals_All <- Train_PVals_All[col_keep,]

print(str(G_star_gene_centric_coding))

## NOTE(review): the row subsets below assume the rows of the burden matrix
## follow the order of obj_nullmodel$id_include - confirm against
## Gene_Centric_Coding_G_Star, which is defined above this section.
ids_gstar <- obj_nullmodel$id_include

## drop = FALSE keeps these as matrices even when a single mask survives the filter.
G_star_gene_centric_coding_train <- G_star_gene_centric_coding[ids_gstar %in% obj_nullmodel_train$id_include,,drop = FALSE]
G_star_gene_centric_coding_tune <- G_star_gene_centric_coding[ids_gstar %in% obj_nullmodel_tune$id_include,,drop = FALSE]
G_star_gene_centric_coding_vad <- G_star_gene_centric_coding[ids_gstar %in% obj_nullmodel_validation$id_include,,drop = FALSE]

rm(G_star_gene_centric_coding)



## Regress the validation PRS on the burden columns; the slopes (intercept
## dropped) become the per-mask coefficients.
X_valid <- data.frame(IID = ids_gstar[ids_gstar %in% obj_nullmodel_validation$id_include],G_star_gene_centric_coding_vad)
RV_PRS <- read.csv(RV_PRS)
tmp <- inner_join(RV_PRS[,c("IID","PRS")],X_valid,by = "IID")
tmp <- subset(tmp,select = -c(IID))
Train_PVals_All$Beta <- coef(lm(PRS~.,tmp))[-1]
## Coefficients lm() could not estimate (NA) and numerically negligible ones are set to 0.
Train_PVals_All$Beta[is.na(Train_PVals_All$Beta)] <- 0
Train_PVals_All$Beta[abs(Train_PVals_All$Beta) < 1e-10] <- 0

write.csv(Train_PVals_All,file = paste0(OUTPUT_PATH,"/",trait,"_final_coef.csv"),row.names = FALSE)

Writing RareVariantCoefficients.R


In [9]:
%%writefile RareVariantCoefficients.sh
#!/bin/bash
# dsub task entry point: forwards the task's variables to the R script as
# positional arguments, in the order the R script reads them.

# Abort on the first failing command and on any unset variable.
set -o errexit
set -o nounset

# Every expansion is quoted so a value containing spaces or glob characters
# is passed as exactly one argument.
Rscript "${R_Script}" "${trait}" "${Train_Pvals}" "${Train_NullModel}" "${Tune_NullModel}" "${Validation_NullModel}" "${Annotation_name_catalog}" "${INPUT_PATH}" "${RV_PRS}" "${OUTPUT_PATH}"

Writing RareVariantCoefficients.sh


In [10]:
%%writefile score_task.R

## Build the dsub --tasks table: one row per trait, one column per dsub flag.
## The column names are the literal dsub flags, so check.names = FALSE is
## required to keep them unmodified.
traits <- c("BMI","LDL","HDL","logTG","TC","Height")

bucket_uri <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e"
results_root <- paste0(bucket_uri,"/JW/AoU_Phenotypes/Results/Continuous")
null_model_dir <- paste0(results_root,"/NullModels/")

## Constructed in one vectorised call (scalars are recycled across traits)
## instead of growing the data frame with rbind() inside a loop.
## NOTE(review): INPUT_PATH lives under "dataAGDS" while the annotation
## catalog lives under "dataGDS" - both are reproduced as written; confirm.
tasks <- data.frame(
  '--env trait'=traits,
  '--input-recursive INPUT_PATH'=paste0(bucket_uri,"/dataAGDS/exome_v7.1"),
  '--input R_Script'=paste0(bucket_uri,"/JW/AoU_Phenotypes/Scripts/RareVariantCoefficients.R"),
  '--input Train_Pvals'=paste0(results_root,"/GeneCentricCoding/",traits,"_coding_sig.csv"),
  '--input Train_NullModel'=paste0(null_model_dir,traits,"_Train_Null_Model.RData"),
  '--input Tune_NullModel'=paste0(null_model_dir,traits,"_Tune_Null_Model.RData"),
  '--input Validation_NullModel'=paste0(null_model_dir,traits,"_Validation_Null_Model.RData"),
  '--input Annotation_name_catalog'=paste0(bucket_uri,"/dataGDS/acaf_threshold_v7/Annotation_name_catalog.csv"),
  '--input RV_PRS'=paste0(results_root,"/RareVariant_PRS/",traits,"_PRS_Validation.csv"),
  '--output-recursive OUTPUT_PATH'=paste0(results_root,"/RareVariant_PRS"),
  check.names = FALSE
)

## Tab-separated with a header row and no quoting.
write.table(tasks,
            file="score_task.txt",
            row.names=FALSE, col.names=TRUE,
            sep='\t', quote=FALSE)

Overwriting score_task.R


In [11]:
# Run the script written above to generate score_task.txt, the table passed to dsub via --tasks.
!Rscript score_task.R

In [12]:
# Upload the R script to the bucket folder that the task table's `--input R_Script` column points to.
!gsutil -m cp RareVariantCoefficients.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://RareVariantCoefficients.R [Content-Type=application/octet-stream]...
/ [1/1 files][  8.4 KiB/  8.4 KiB] 100% Done                                    
Operation completed over 1 objects/8.4 KiB.                                      


In [13]:
%%bash --out score_batch

# Submit the rare-variant coefficient job: one dsub task per row of
# score_task.txt, each running RareVariantCoefficients.sh.
# The cell's stdout is captured into the Python variable `score_batch`.
#
# NOTE(review): aou_dsub already passes its own --image and --logging before
# "$@", so the --image/--logging given below are repeated flags; presumably
# the later value takes effect - confirm against dsub's argument handling.

source ~/aou_dsub.bash # Written by the `%%writefile ~/aou_dsub.bash` cell earlier in this notebook (the original note also credits 01_dsub_setup.ipynb).

aou_dsub \
  --image willja16/r_with_plink \
  --disk-size 100 \
  --boot-disk-size 25 \
  --min-ram 16 \
  --timeout "96h" \
  --logging "${WORKSPACE_BUCKET}/data/logging" \
  --script RareVariantCoefficients.sh \
  --tasks score_task.txt

Job properties:
  job-id: rarevarian--williamsjacr--250511-125417-59
  job-name: rarevariantcoefficients
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/10641838240056700816
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/16025562658320565953
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12772089216612072498
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7742665512195031476
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/11176103684187754673
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7247589663043056178
Launched job-id: rarevarian--williamsjacr--250511-125417-59
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'rarevarian--williamsjacr--2

In [33]:
%%writefile Common_Plus_Rare.R
rm(list = ls())
library(readr)
library(dplyr)
library(caret)
library(ranger)
library(SuperLearner)
library(dplyr)
library(boot)
library(stringr)
library(glmnet)

## Positional command-line arguments: the k-th argument is stored in the
## variable named by the k-th entry below, and each value is echoed to the log.
arg_names <- c("trait",
               "Tune_Data",
               "Validation_Data",
               "CommonVariantPRS_Tune",
               "RareVariantPRS_Tune",
               "CommonVariantPRS_Validation",
               "RareVariantPRS_Validation",
               "JointPRS",
               "PROSPER_PRS",
               "CTSLEB_PRS",
               "all_phenotypes_file",
               "OUTPUT_PATH")
cli_args <- commandArgs(TRUE)
for(arg_index in seq_along(arg_names)){
  assign(arg_names[arg_index],cli_args[arg_index])
  print(cli_args[arg_index])
}

## Read a five-column score file, label its columns, and keep the sample ID
## together with the score column (named `score_name`).
Read_Score_File <- function(path,score_name){
  scores <- read.delim(path)
  colnames(scores) <- c("FID","IID","ALLELE_CT","NAMED_ALLELE_DOSAGE_SUM",score_name)
  scores[,c("IID",score_name)]
}

## Benchmark scores. Each path variable is replaced by the table read from it.
JointPRS <- Read_Score_File(JointPRS,"JointPRS_PRS")
PROSPER_PRS <- Read_Score_File(PROSPER_PRS,"PROSPER_PRS")

CTSLEB_PRS <- read.delim(CTSLEB_PRS,sep = "\t",header = TRUE)
colnames(CTSLEB_PRS) <- c("IID","CTSLEB_PRS")

## Tuning set: phenotypes joined with the common- and rare-variant scores.
## The joins use dplyr's default key (all shared column names).
CV_PRS_Tune <- read.delim(CommonVariantPRS_Tune)
colnames(CV_PRS_Tune) <- c("IID","CV_PRS")
RV_PRS_Tune <- read.csv(RareVariantPRS_Tune)
colnames(RV_PRS_Tune) <- c("IID","RV_PRS")
pheno_tune <- Reduce(inner_join,list(read.delim(Tune_Data),CV_PRS_Tune,RV_PRS_Tune))


## Validation set: same two scores plus the three benchmark scores, joined in
## the order CV, RV, JointPRS, PROSPER, CT-SLEB.
CV_PRS_Validation <- read.delim(CommonVariantPRS_Validation)
colnames(CV_PRS_Validation) <- c("IID","CV_PRS")
RV_PRS_Validation <- read.csv(RareVariantPRS_Validation)
colnames(RV_PRS_Validation) <- c("IID","RV_PRS")
pheno_validation <- Reduce(inner_join,
                           list(read.delim(Validation_Data),
                                CV_PRS_Validation,
                                RV_PRS_Validation,
                                JointPRS,
                                PROSPER_PRS,
                                CTSLEB_PRS))


## Residualise the trait on age, age^2, sex and the first 10 PCs, separately
## in the tuning and the validation set.
##
## na.action = na.exclude makes residuals() return one value per input row,
## with NA wherever the trait OR any covariate is missing. This keeps the
## residuals aligned with the data frame even when a covariate is NA, and it
## avoids relying on `$residual` partially matching `$residuals`.
covariate_formula <- as.formula(paste0(trait,"~age+age2+sex+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10"))

model.null <- lm(covariate_formula,data=pheno_tune,na.action = na.exclude)
pheno_tune$y_tune <- as.numeric(residuals(model.null))

model.null <- lm(covariate_formula,data=pheno_validation,na.action = na.exclude)
pheno_validation$y_validation <- as.numeric(residuals(model.null))

## Combined score: weights for the common- and rare-variant PRS are fitted on
## the tuning residuals and then applied to the validation set.
RICE_Model <- lm(y_tune ~ CV_PRS + RV_PRS,data = pheno_tune)
pheno_validation$PRS <- predict(RICE_Model,pheno_validation)

## Two copies of the validation table: one left as is ("raw"), one whose score
## columns are standardised against the first five PCs ("adjusted").
CV_RV_PRS_raw <- pheno_validation
CV_RV_PRS_adjusted <- pheno_validation

## The PC columns are never modified by the loop, so they are extracted once.
ancestry_pcs <- CV_RV_PRS_adjusted[,c("PC1","PC2","PC3","PC4","PC5")]

for(score_name in c("CV_PRS","RV_PRS","PRS","JointPRS_PRS","PROSPER_PRS","CTSLEB_PRS")){
  ## Step 1: remove the PC-predicted mean of the score.
  mean_fit <- lm(y~.,data = data.frame(y = CV_RV_PRS_adjusted[,score_name],ancestry_pcs))
  score_resid <- mean_fit$residuals

  ## Step 2: model the squared residuals on the PCs to get a PC-dependent variance.
  variance_data <- data.frame(y = score_resid^2,ancestry_pcs)
  variance_hat <- predict(lm(y~.,data = variance_data),variance_data)

  ## A negative predicted variance has no square root; use an
  ## intercept-only variance model for this score instead.
  if(sum(variance_hat < 0) > 0){
    variance_hat <- predict(lm(y~1,data = variance_data),variance_data)
  }

  ## Step 3: divide by the predicted standard deviation (all zeros -> score set to 0).
  CV_RV_PRS_adjusted[,score_name] <- if(sum(sqrt(variance_hat)) == 0) 0 else score_resid/sqrt(variance_hat)
}

## Split the raw and adjusted validation tables by ancestry and standardise
## columns within each ancestry. One loop replaces the per-ancestry copies of
## the same statements; it creates exactly the same variables:
##   CV_RV_PRS_raw_<ANC> and CV_RV_PRS_adjusted_<ANC>
## for <ANC> in EUR, SAS, AMR, AFR, EAS, MID.
all_phenotypes <- read.csv(all_phenotypes_file)

## In the raw tables the outcome and every score are scaled; in the adjusted
## tables only the outcome is scaled (the scores are left as produced by the
## PC-adjustment step).
raw_scaled_columns <- c("y_validation","RV_PRS","CV_PRS","PRS","JointPRS_PRS","PROSPER_PRS","CTSLEB_PRS")

for(anc in c("EUR","SAS","AMR","AFR","EAS","MID")){
  anc_ids <- all_phenotypes$IID[all_phenotypes$ancestry == anc]

  raw_anc <- CV_RV_PRS_raw[CV_RV_PRS_raw$IID %in% anc_ids,]
  adjusted_anc <- CV_RV_PRS_adjusted[CV_RV_PRS_adjusted$IID %in% anc_ids,]

  for(scaled_col in raw_scaled_columns){
    raw_anc[[scaled_col]] <- scale(raw_anc[[scaled_col]])
  }
  adjusted_anc$y_validation <- scale(adjusted_anc$y_validation)

  ## Publish under the names the rest of the script expects.
  assign(paste0("CV_RV_PRS_raw_",anc),raw_anc)
  assign(paste0("CV_RV_PRS_adjusted_",anc),adjusted_anc)
}
rm(raw_anc,adjusted_anc,anc_ids)



## Bootstrap statistic for boot(): coefficient of CV_PRS in the joint model
## y_validation ~ CV_PRS + RV_PRS, refitted on the rows selected by `indices`.
## Returns a length-one named numeric (second coefficient, after the intercept).
Beta_CV_Boot <- function(data,indices){
  resampled <- data[indices, ]
  joint_fit <- lm(y_validation~CV_PRS + RV_PRS,data = resampled)
  coef(joint_fit)[2]
}

## Bootstrap statistic for boot(): coefficient of RV_PRS in the joint model
## y_validation ~ CV_PRS + RV_PRS, refitted on the rows selected by `indices`.
## Returns a length-one named numeric (third coefficient).
Beta_RV_Boot <- function(data,indices){
  resampled <- data[indices, ]
  joint_fit <- lm(y_validation~CV_PRS + RV_PRS,data = resampled)
  coef(joint_fit)[3]
}

## Bootstrap statistic for boot(): R-squared of y_validation ~ PRS on the rows
## selected by `indices`.
R2_Boot <- function(data,indices){
  resampled <- data[indices, ]
  prs_fit <- lm(y_validation~PRS,data = resampled)
  summary(prs_fit)$r.squared
}

## Bootstrap statistic for boot(): on the rows selected by `indices`, the
## R-squared of y_validation ~ PRS minus the R-squared obtained with each
## benchmark score. Returns an unnamed numeric of length 3, in the order
## CTSLEB_PRS, PROSPER_PRS, JointPRS_PRS.
R2_Comparison_Boot <- function(data,indices){
  resampled <- data[indices, ]
  ## R-squared of the single-predictor model y_validation ~ <score_column>.
  r2_for <- function(score_column){
    summary(lm(reformulate(score_column,response = "y_validation"),data = resampled))$r.squared
  }
  rice_r2 <- r2_for("PRS")
  c(rice_r2 - r2_for("CTSLEB_PRS"),
    rice_r2 - r2_for("PROSPER_PRS"),
    rice_r2 - r2_for("JointPRS_PRS"))
}

## Point estimates and bootstrap replicates for ONE ancestry-specific table.
##
## data   : data frame with y_validation, CV_PRS, RV_PRS, PRS and the three
##          benchmark score columns used by R2_Comparison_Boot.
## n_boot : number of bootstrap replicates passed to boot().
##
## Returns a list with the joint-model coefficients of CV_PRS and RV_PRS, the
## R-squared of y_validation ~ PRS, and the replicate matrices (`$t`) of the
## four bootstrap runs. The boot() calls are issued in the order
## CV beta, RV beta, R2, R2 comparison.
Bootstrap_Validation_Metrics <- function(data,n_boot = 10000){
  joint_coef <- coef(lm(y_validation~CV_PRS + RV_PRS,data = data))
  boot_CV <- boot(data = data, statistic = Beta_CV_Boot, R = n_boot)
  boot_RV <- boot(data = data, statistic = Beta_RV_Boot, R = n_boot)
  R2_point <- summary(lm(y_validation~PRS,data = data))$r.squared
  boot_R2 <- boot(data = data, statistic = R2_Boot, R = n_boot)
  boot_comparison <- boot(data = data, statistic = R2_Comparison_Boot, R = n_boot)
  list(beta_CV = joint_coef[2],
       beta_RV = joint_coef[3],
       R2 = R2_point,
       beta_CV_boot = boot_CV$t,
       beta_RV_boot = boot_RV$t,
       R2_boot = boot_R2$t,
       R2_comparison_boot = boot_comparison$t)
}

## Run the helper for every score type x ancestry and publish the results
## under the variable names used by the result tables further down, e.g. for
## tag "raw_EUR":
##   beta_CV_validation_raw_EUR, beta_CV_raw_EUR_boot, beta_CV_se_validation_raw_EUR,
##   beta_RV_validation_raw_EUR, beta_RV_raw_EUR_boot, beta_RV_se_validation_raw_EUR,
##   R2_validation_raw_EUR,      R2_raw_EUR_boot,      R2_se_validation_raw_EUR,
##   R2_comparison_raw_EUR_boot
## Standard errors are the standard deviation of the bootstrap replicates.
##
## NOTE(review): no set.seed() call precedes these bootstrap draws in this
## script, so the replicates and standard errors vary from run to run.
for(score_type in c("raw","adjusted")){
  for(anc in c("EUR","SAS","AMR","AFR","EAS","MID")){
    tag <- paste0(score_type,"_",anc)
    metrics <- Bootstrap_Validation_Metrics(get(paste0("CV_RV_PRS_",tag)))

    assign(paste0("beta_CV_validation_",tag),metrics$beta_CV)
    assign(paste0("beta_CV_",tag,"_boot"),metrics$beta_CV_boot)
    assign(paste0("beta_CV_se_validation_",tag),sd(metrics$beta_CV_boot))

    assign(paste0("beta_RV_validation_",tag),metrics$beta_RV)
    assign(paste0("beta_RV_",tag,"_boot"),metrics$beta_RV_boot)
    assign(paste0("beta_RV_se_validation_",tag),sd(metrics$beta_RV_boot))

    assign(paste0("R2_validation_",tag),metrics$R2)
    assign(paste0("R2_",tag,"_boot"),metrics$R2_boot)
    assign(paste0("R2_se_validation_",tag),sd(metrics$R2_boot))

    ## One column per benchmark: RICE R2 minus the benchmark's R2.
    comparison_boot <- as.data.frame(metrics$R2_comparison_boot)
    colnames(comparison_boot) <- paste0("R2_",tag,c("_RICE_vs_CTSLEB","_RICE_vs_PROSPER","_RICE_vs_JointPRS"))
    assign(paste0("R2_comparison_",tag,"_boot"),comparison_boot)
  }
}
rm(metrics,comparison_boot)

CV_PRS_Results <- data.frame(trait = trait,ancestry = c("EUR","SAS","AMR","AFR","EAS","MID"), 
                             beta_raw = c(beta_CV_validation_raw_EUR,beta_CV_validation_raw_SAS,beta_CV_validation_raw_AMR,beta_CV_validation_raw_AFR,beta_CV_validation_raw_EAS,beta_CV_validation_raw_MID), 
                             beta_se_raw = c(beta_CV_se_validation_raw_EUR,beta_CV_se_validation_raw_SAS,beta_CV_se_validation_raw_AMR,beta_CV_se_validation_raw_AFR,beta_CV_se_validation_raw_EAS,beta_CV_se_validation_raw_MID), 
                             R2_raw = c(R2_validation_raw_EUR,R2_validation_raw_SAS,R2_validation_raw_AMR,R2_validation_raw_AFR,R2_validation_raw_EAS,R2_validation_raw_MID),
                             R2_se_raw = c(R2_se_validation_raw_EUR,R2_se_validation_raw_SAS,R2_se_validation_raw_AMR,R2_se_validation_raw_AFR,R2_se_validation_raw_EAS,R2_se_validation_raw_MID),
                             beta_adjusted = c(beta_CV_validation_adjusted_EUR,beta_CV_validation_adjusted_SAS,beta_CV_validation_adjusted_AMR,beta_CV_validation_adjusted_AFR,beta_CV_validation_adjusted_EAS,beta_CV_validation_adjusted_MID), 
                             beta_se_adjusted = c(beta_CV_se_validation_adjusted_EUR,beta_CV_se_validation_adjusted_SAS,beta_CV_se_validation_adjusted_AMR,beta_CV_se_validation_adjusted_AFR,beta_CV_se_validation_adjusted_EAS,beta_CV_se_validation_adjusted_MID), 
                             R2_adjusted = c(R2_validation_adjusted_EUR,R2_validation_adjusted_SAS,R2_validation_adjusted_AMR,R2_validation_adjusted_AFR,R2_validation_adjusted_EAS,R2_validation_adjusted_MID),
                             R2_se_adjusted = c(R2_se_validation_adjusted_EUR,R2_se_validation_adjusted_SAS,R2_se_validation_adjusted_AMR,R2_se_validation_adjusted_AFR,R2_se_validation_adjusted_EAS,R2_se_validation_adjusted_MID))

# Gather the CV *_boot objects computed earlier in the script into one table.
# Columns: the trait label, then a (beta_CV, R2) pair for each ancestry, with
# all raw pairs first and all adjusted pairs after. Unnamed arguments keep
# their own object names as column names.
CV_Boot_Results <- data.frame(
  trait = trait,
  # raw
  beta_CV_raw_EUR_boot, R2_raw_EUR_boot,
  beta_CV_raw_SAS_boot, R2_raw_SAS_boot,
  beta_CV_raw_AMR_boot, R2_raw_AMR_boot,
  beta_CV_raw_AFR_boot, R2_raw_AFR_boot,
  beta_CV_raw_EAS_boot, R2_raw_EAS_boot,
  beta_CV_raw_MID_boot, R2_raw_MID_boot,
  # adjusted
  beta_CV_adjusted_EUR_boot, R2_adjusted_EUR_boot,
  beta_CV_adjusted_SAS_boot, R2_adjusted_SAS_boot,
  beta_CV_adjusted_AMR_boot, R2_adjusted_AMR_boot,
  beta_CV_adjusted_AFR_boot, R2_adjusted_AFR_boot,
  beta_CV_adjusted_EAS_boot, R2_adjusted_EAS_boot,
  beta_CV_adjusted_MID_boot, R2_adjusted_MID_boot
)

# Gather the R2_comparison_*_boot objects computed earlier in the script into
# one table: the trait label, then one raw column per ancestry followed by one
# adjusted column per ancestry.
Comparison_Boot_Results <- data.frame(
  trait = trait,
  # raw
  R2_comparison_raw_EUR_boot,
  R2_comparison_raw_SAS_boot,
  R2_comparison_raw_AMR_boot,
  R2_comparison_raw_AFR_boot,
  R2_comparison_raw_EAS_boot,
  R2_comparison_raw_MID_boot,
  # adjusted
  R2_comparison_adjusted_EUR_boot,
  R2_comparison_adjusted_SAS_boot,
  R2_comparison_adjusted_AMR_boot,
  R2_comparison_adjusted_AFR_boot,
  R2_comparison_adjusted_EAS_boot,
  R2_comparison_adjusted_MID_boot
)

# Assemble the RV summary table: one row per ancestry label, holding the raw
# and adjusted beta_RV / R2 values and their standard errors taken from the
# *_validation_* objects computed earlier in the script.
# NOTE(review): the four R2 columns use the same R2_*validation_* objects as
# the R2 columns of CV_PRS_Results; only the beta columns are RV-specific.
# Presumably the R2 is shared between the two tables on purpose - confirm.
RV_PRS_Results <- data.frame(
  trait = trait,
  ancestry = c("EUR","SAS","AMR","AFR","EAS","MID"),
  beta_raw = c(
    beta_RV_validation_raw_EUR, beta_RV_validation_raw_SAS, beta_RV_validation_raw_AMR,
    beta_RV_validation_raw_AFR, beta_RV_validation_raw_EAS, beta_RV_validation_raw_MID
  ),
  beta_se_raw = c(
    beta_RV_se_validation_raw_EUR, beta_RV_se_validation_raw_SAS, beta_RV_se_validation_raw_AMR,
    beta_RV_se_validation_raw_AFR, beta_RV_se_validation_raw_EAS, beta_RV_se_validation_raw_MID
  ),
  R2_raw = c(
    R2_validation_raw_EUR, R2_validation_raw_SAS, R2_validation_raw_AMR,
    R2_validation_raw_AFR, R2_validation_raw_EAS, R2_validation_raw_MID
  ),
  R2_se_raw = c(
    R2_se_validation_raw_EUR, R2_se_validation_raw_SAS, R2_se_validation_raw_AMR,
    R2_se_validation_raw_AFR, R2_se_validation_raw_EAS, R2_se_validation_raw_MID
  ),
  beta_adjusted = c(
    beta_RV_validation_adjusted_EUR, beta_RV_validation_adjusted_SAS, beta_RV_validation_adjusted_AMR,
    beta_RV_validation_adjusted_AFR, beta_RV_validation_adjusted_EAS, beta_RV_validation_adjusted_MID
  ),
  beta_se_adjusted = c(
    beta_RV_se_validation_adjusted_EUR, beta_RV_se_validation_adjusted_SAS, beta_RV_se_validation_adjusted_AMR,
    beta_RV_se_validation_adjusted_AFR, beta_RV_se_validation_adjusted_EAS, beta_RV_se_validation_adjusted_MID
  ),
  R2_adjusted = c(
    R2_validation_adjusted_EUR, R2_validation_adjusted_SAS, R2_validation_adjusted_AMR,
    R2_validation_adjusted_AFR, R2_validation_adjusted_EAS, R2_validation_adjusted_MID
  ),
  R2_se_adjusted = c(
    R2_se_validation_adjusted_EUR, R2_se_validation_adjusted_SAS, R2_se_validation_adjusted_AMR,
    R2_se_validation_adjusted_AFR, R2_se_validation_adjusted_EAS, R2_se_validation_adjusted_MID
  )
)

# Gather the RV *_boot objects computed earlier in the script into one table.
# Columns: the trait label, then a (beta_RV, R2) pair for each ancestry, with
# all raw pairs first and all adjusted pairs after. The R2_*_boot objects are
# the same ones referenced by CV_Boot_Results.
RV_Boot_Results <- data.frame(
  trait = trait,
  # raw
  beta_RV_raw_EUR_boot, R2_raw_EUR_boot,
  beta_RV_raw_SAS_boot, R2_raw_SAS_boot,
  beta_RV_raw_AMR_boot, R2_raw_AMR_boot,
  beta_RV_raw_AFR_boot, R2_raw_AFR_boot,
  beta_RV_raw_EAS_boot, R2_raw_EAS_boot,
  beta_RV_raw_MID_boot, R2_raw_MID_boot,
  # adjusted
  beta_RV_adjusted_EUR_boot, R2_adjusted_EUR_boot,
  beta_RV_adjusted_SAS_boot, R2_adjusted_SAS_boot,
  beta_RV_adjusted_AMR_boot, R2_adjusted_AMR_boot,
  beta_RV_adjusted_AFR_boot, R2_adjusted_AFR_boot,
  beta_RV_adjusted_EAS_boot, R2_adjusted_EAS_boot,
  beta_RV_adjusted_MID_boot, R2_adjusted_MID_boot
)

# Write the five result tables as CSV files (no row names) under OUTPUT_PATH.
# NOTE(review): the two "Best_Betas" names have no "_" between the trait and
# "Best_Betas.csv" (giving "CV_<trait>Best_Betas.csv"), unlike the
# "_Bootstraps.csv" names. Kept unchanged here - confirm whether anything
# downstream relies on the current spelling before renaming.
write_result_csv <- function(result_table, file_name) {
  write.csv(result_table, file = paste0(OUTPUT_PATH,"/",file_name), row.names = FALSE)
}

write_result_csv(Comparison_Boot_Results, paste0(trait,"_Comparison_Bootstraps.csv"))
write_result_csv(CV_PRS_Results, paste0("CV_",trait,"Best_Betas.csv"))
write_result_csv(CV_Boot_Results, paste0("CV_",trait,"_Bootstraps.csv"))
write_result_csv(RV_PRS_Results, paste0("RV_",trait,"Best_Betas.csv"))
write_result_csv(RV_Boot_Results, paste0("RV_",trait,"_Bootstraps.csv"))


Writing Common_Plus_Rare.R


In [34]:
%%writefile Common_Plus_Rare.sh
#!/bin/bash

# dsub task script: runs the R script at ${R_Script} with the trait name, the
# input file paths and the output directory as 12 positional arguments, in the
# order the R script reads them via commandArgs(). All variables are expected
# to be provided in the environment by dsub from the columns of the tasks file.

# Abort on the first failing command and on any unset variable, so a missing
# task column fails the job instead of shifting the positional arguments.
set -o errexit
set -o nounset

# Every expansion is quoted so that a value containing whitespace or glob
# characters is still passed to Rscript as exactly one argument.
Rscript "${R_Script}" \
  "${trait}" \
  "${Tune_Data}" \
  "${Validation_Data}" \
  "${CommonVariantPRS_Tune}" \
  "${RareVariantPRS_Tune}" \
  "${CommonVariantPRS_Validation}" \
  "${RareVariantPRS_Validation}" \
  "${JointPRS}" \
  "${PROSPER_PRS}" \
  "${CTSLEB_PRS}" \
  "${all_phenotypes_file}" \
  "${OUTPUT_PATH}"

Writing Common_Plus_Rare.sh


In [35]:
%%writefile score_task.R

# Build the dsub tasks file (score_task.txt): one row per trait, one column per
# dsub flag. Column names are the literal flag/variable pairs, so
# check.names = FALSE is required to keep the dashes and spaces intact.

traits <- c("BMI","LDL","HDL","logTG","TC","Height")

# Shared bucket prefixes, defined once so every path stays in sync.
aou_root <- "gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes"
results_dir <- paste0(aou_root, "/Results")
continuous_dir <- paste0(results_dir, "/Continuous")

# Built in a single vectorised call instead of rbind-ing inside a loop:
# paste0() is vectorised over `traits`, and length-one values are recycled by
# data.frame() to one copy per trait.
tasks <- data.frame(
  '--env trait' = traits,
  '--input Tune_Data' = paste0(results_dir, "/All_Tune.txt"),
  '--input Validation_Data' = paste0(results_dir, "/All_Validation.txt"),
  '--input CommonVariantPRS_Tune' = paste0(continuous_dir, "/OneCommonPRS/", traits, "_Best_Tune_All.txt"),
  '--input RareVariantPRS_Tune' = paste0(continuous_dir, "/RareVariant_PRS/", traits, "_PRS_Tune.csv"),
  '--input CommonVariantPRS_Validation' = paste0(continuous_dir, "/OneCommonPRS/", traits, "_Best_Validation_All.txt"),
  '--input RareVariantPRS_Validation' = paste0(continuous_dir, "/RareVariant_PRS/", traits, "_PRS_Validation.csv"),
  '--input JointPRS' = paste0(continuous_dir, "/JointPRS/PRSs/PRS_META_", traits, ".sscore"),
  '--input PROSPER_PRS' = paste0(continuous_dir, "/PROSPER/PRSs/PRS_", traits, ".sscore"),
  '--input CTSLEB_PRS' = paste0(continuous_dir, "/CTSLEB/Results/", traits, "_Best_Validation_All.txt"),
  '--input R_Script' = paste0(aou_root, "/Scripts/Common_Plus_Rare.R"),
  '--input all_phenotypes_file' = paste0(results_dir, "/all_phenotypes.csv"),
  '--output-recursive OUTPUT_PATH' = paste0(continuous_dir, "/Common_Plus_Rare/"),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Tab-separated, unquoted, with a header row and no row names.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
write.table(tasks,
            file = "score_task.txt",
            row.names = FALSE, col.names = TRUE,
            sep = '\t', quote = FALSE)

Overwriting score_task.R


In [36]:
# Run the generator script written by the previous cell; it writes the dsub
# tasks file score_task.txt into the current working directory.
!Rscript score_task.R

In [37]:
# Upload the local Common_Plus_Rare.R to the bucket's Scripts/ folder, which is
# the location the tasks file passes to each task as the R_Script input.
!gsutil -m cp Common_Plus_Rare.R gs://fc-secure-797107a7-4402-4122-941c-9a486e0d633e/JW/AoU_Phenotypes/Scripts/

Copying file://Common_Plus_Rare.R [Content-Type=application/octet-stream]...
/ [1/1 files][ 27.6 KiB/ 27.6 KiB] 100% Done                                    
Operation completed over 1 objects/27.6 KiB.                                     


In [38]:
%%bash --out score_batch

# Load the aou_dsub wrapper function (written to ~/aou_dsub.bash by the
# %%writefile cell near the top of this notebook).
source ~/aou_dsub.bash

# Options forwarded to dsub through the wrapper, which appends them after its
# own defaults.
# NOTE(review): the wrapper already passes --image and --logging; both are
# repeated here with different values. Presumably the later occurrence takes
# effect - confirm against dsub's argument handling.
submit_opts=(
  --image willja16/r_with_plink
  --disk-size 100
  --boot-disk-size 25
  --min-ram 16
  --timeout "96h"
  --logging "${WORKSPACE_BUCKET}/data/logging"
  --script Common_Plus_Rare.sh
  --tasks score_task.txt
)

# Submits one task per row of score_task.txt.
aou_dsub "${submit_opts[@]}"

Job properties:
  job-id: common-plu--williamsjacr--250516-234338-22
  job-name: common-plus-rare
  user-id: williamsjacr
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/7209186891319781713
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/13861044095870257966
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/13852388308955964713
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/4893768840273172404
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/1490482200950451986
Provider internal-id (operation): projects/52933917155/locations/us-central1/operations/12978605675624031067
Launched job-id: common-plu--williamsjacr--250516-234338-22
6 task(s)
To check the status, run:
  dstat --provider google-cls-v2 --project terra-vpc-sc-804f445b --location us-central1 --jobs 'common-plu--williamsjacr--250516-23