# Preliminaries

In [None]:
library(tidyverse)
library(reticulate)

fapi <- import("firecloud.api")
tnu <- import("terra_notebook_utils")

# Terra workspace coordinates, read from the notebook runtime's environment.
# Each value is the empty string when the corresponding variable is not set.
ws_namespace <- Sys.getenv("WORKSPACE_NAMESPACE", unset = "")
ws_name <- Sys.getenv("WORKSPACE_NAME", unset = "")
ws_bucket <- Sys.getenv("WORKSPACE_BUCKET", unset = "")

In [None]:
# # Install bcftools
# bcftools_install_cmds <- paste(
#     "wget https://github.com/samtools/bcftools/releases/download/1.15.1/bcftools-1.15.1.tar.bz2",
#     "bzip2 -d bcftools-1.15.1.tar.bz2",
#     "tar xvf bcftools-1.15.1.tar",
#     "cd bcftools-1.15.1",
#     "./configure",
#     "make",
#     sep=" && "
# )
# system(bcftools_install_cmds)

# Import and subset TOPMed genotype files

## Basic info about the variants of interest

In [None]:
# Pilot variants of interest: chromosome, base-pair position and rsID.
# NOTE(review): the genome build of POS is not stated here -- confirm it
# matches the build of the genotype files being subset.
snp_info_df <- tribble(
    ~CHR, ~POS, ~rsID,
    2, 121657822, "rs2862183",
    17, 36804493, "rs295849",
    20, 33426108, "rs141588480",
    7, 121864095, "rs77810251"
)

# Every output below lives under genotypes/. Create the directory up front so
# the writes do not fail on a fresh environment; this is a no-op if it exists.
dir.create("genotypes", showWarnings = FALSE)

write_csv(snp_info_df, "genotypes/snp_info.csv")

# One rsID per line
pilot_snps <- snp_info_df$rsID
write(pilot_snps, "genotypes/pilot_snps.txt")

# Region/locus file will be used in bcftools subsetting command:
# headerless, tab-separated "chr<CHR>" and POS columns
snp_info_df %>%
    mutate(CHR = paste0("chr", CHR)) %>%
    select(CHR, POS) %>%
    write_tsv("genotypes/pilot_snp_regions.txt", col_names=FALSE)

## Import and subset TOPMed-wide VCF files

In [None]:
# The workspace's "reference_file" data table holds references to the TOPMed
# genotype file locations. Fetch it as TSV text through the FireCloud API...
ref_file_tsv <- fapi$get_entities_tsv(
    ws_namespace,
    ws_name,
    "reference_file",
    model = "flexible"
)

# ...and parse the response body into a tibble, letting readr guess the
# column types.
ref_file_df <- read_tsv(ref_file_tsv$text, col_types = cols())

In [None]:
# Distinct chromosomes that carry at least one pilot SNP, in first-seen order
pilot_chroms <- unique(snp_info_df[["CHR"]])

# for (chr in pilot_chroms) {  # Loop through relevant chromosomes, extracting variants of interest
#     fn <- paste0("freeze.9b.chr", chr, ".pass_and_fail.gtonly.minDP0.bcf")
#     if (!file.exists(fn)) {
#         drs_uri <- ref_file_df$`pfb:ga4gh_drs_uri`[ref_file_df$`pfb:file_name` == fn]
#         print(drs_uri)
#         tnu$drs$copy(drs_uri, fn)
#     }

#     # Create index for .bcf file
#     bcftools_idx_cmd <- paste0(
#         "bcftools-1.15.1/bcftools index --threads 2 ", fn
#     )
#     if (!file.exists(paste0(fn, ".csi"))) {
#         print(bcftools_idx_cmd)
#         system(bcftools_idx_cmd)
#     }

#     # Run bcftools command to extract variants of interest
#     bcftools_cmd <- paste0(
#         "bcftools-1.15.1/bcftools view",
#         " -R pilot_snp_regions.txt",
#     #     " -i 'ID=@pilot_snps.txt'",
#         " --no-header",
#         " -o genotypes/pilot_snps_chr", chr, ".vcf",
#         " ", fn
#     )
#     print(bcftools_cmd)
#     system(bcftools_cmd) 
    
#     # Remove very large .bcf files
#     rm_cmd <- paste0("rm ", fn)
#     system(rm_cmd)
#     system(paste0("mv ", fn, ".csi genotypes/"))  # Keep index file for potential later use
# }

In [None]:
# Use header to get sample IDs to link to genotypes in VCF

# chr <- 22
# fn <- paste0("freeze.9b.chr", chr, ".pass_and_fail.gtonly.minDP0.bcf")
#     if (!file.exists(fn)) {
#         drs_uri <- ref_file_df$`pfb:ga4gh_drs_uri`[ref_file_df$`pfb:file_name` == fn]
#         print(drs_uri)
#         tnu$drs$copy(drs_uri, fn)
#     }
# system(paste0(
#     "bcftools-1.15.1/bcftools head ", fn,
#     " | tail -n 1",
#     " | tr '\t' '\n'",
#     " > genotypes/pilot_header.txt"
# ))
# rm_cmd <- paste0("rm ", fn)
# system(rm_cmd)

In [None]:
# Column names of the VCF header line, one whitespace-delimited token each
vcf_header <- scan("genotypes/pilot_header.txt", what=character())

# The first nine tokens are the fixed VCF columns (#CHROM ... FORMAT); anything
# after them is a sample ID. Fail loudly on a truncated or empty header instead
# of letting a descending index sequence return the wrong elements.
if (length(vcf_header) < 10) {
    stop(
        "genotypes/pilot_header.txt has no sample columns (",
        length(vcf_header), " fields read)"
    )
}
vcf_samples <- vcf_header[-seq_len(9)]

## Collect genotypes and export

In [None]:
# Extract alternate-allele dosages for a single variant from a headerless,
# tab-delimited VCF file.
#
# Args:
#   vcf_fn: Path to the VCF file (one variant record per line).
#   rsID:   Identifier of the variant to extract. Only the first element is
#           used and any names on it are ignored.
#
# Returns:
#   An integer vector with one entry per sample column (columns 10 onward),
#   in file order: 0, 1 or 2 copies of the alternate allele. Phased calls
#   ("0|1") and "1/0" are accepted; any other genotype (e.g. "./.") is NA.
#
# Errors when the file does not exist, when the identifier does not resolve
# to exactly one record, or when the record lacks the nine fixed VCF columns.
read_vcf <- function(vcf_fn, rsID) {
    rsID <- as.character(rsID)[[1]]
    if (!file.exists(vcf_fn)) {
        stop("VCF file not found: ", vcf_fn, call. = FALSE)
    }

    records <- readLines(vcf_fn)

    # Whole-word match, so that e.g. "rs123" does not also select "rs1234"
    hits <- records[grepl(paste0("\\b", rsID, "\\b"), records, perl = TRUE)]

    if (rsID == "rs77810251") {
        # Special case: narrow to the record(s) containing "G<TAB>A"
        # (presumably REF=G / ALT=A among several records for this ID --
        # TODO confirm). If nothing matches, the unfiltered hits are kept.
        g_to_a <- hits[grepl("G\tA", hits, fixed = TRUE)]
        if (length(g_to_a) > 0) {
            hits <- g_to_a
        }
    }

    # Zero hits would yield meaningless output and several hits would mix
    # fields from different records, so insist on exactly one.
    if (length(hits) != 1L) {
        stop(
            "Expected exactly one record matching '", rsID, "' in ", vcf_fn,
            ", found ", length(hits),
            call. = FALSE
        )
    }

    fields <- strsplit(hits, "\t", fixed = TRUE)[[1]]
    if (length(fields) < 9L) {
        stop(
            "Record for '", rsID, "' in ", vcf_fn,
            " has fewer than the 9 fixed VCF columns",
            call. = FALSE
        )
    }

    # Drop the nine fixed columns; the remainder are per-sample genotypes
    vcf_genos <- fields[-seq_len(9L)]

    # Phase is irrelevant for dosage, so treat "a|b" the same as "a/b"
    vcf_genos <- gsub("|", "/", vcf_genos, fixed = TRUE)
    dosage_by_gt <- c("0/0" = 0L, "0/1" = 1L, "1/0" = 1L, "1/1" = 2L)

    # Unrecognised genotypes index to NA; as.integer() also strips the names
    as.integer(dosage_by_gt[vcf_genos])
}

In [None]:
# Read the dosage vector of every pilot SNP from its per-chromosome VCF,
# giving an unnamed list with one integer vector per row of snp_info_df.
# Iterating over the columns directly (instead of apply() over rows) avoids
# coercing the tibble to a character matrix, so CHR is pasted as a plain
# number and needs no whitespace trimming.
snp_vec_list <- Map(
    function(chr, rsid) {
        vcf_fn <- paste0("genotypes/pilot_snps_chr", chr, ".vcf")
        read_vcf(vcf_fn, rsid)
    },
    snp_info_df$CHR,
    snp_info_df$rsID
)

In [None]:
# Every SNP must contribute exactly one dosage per sample. Without this check
# rbind() would recycle a shorter vector and misalign genotypes with samples.
stopifnot(
    length(snp_vec_list) == nrow(snp_info_df),
    all(lengths(snp_vec_list) == length(vcf_samples))
)

geno_mat_full <- do.call(rbind, snp_vec_list)  # SNPs x samples
colnames(geno_mat_full) <- vcf_samples
rownames(geno_mat_full) <- snp_info_df$rsID

# Transpose to samples x SNPs and move the sample IDs from the row names into
# an NWD_ID column
geno_df <- as_tibble(t(geno_mat_full), rownames="NWD_ID")

In [None]:
# Persist the sample-by-SNP dosage table locally
saveRDS(geno_df, "genotypes/analysis_genotypes.rds")

# Copy the whole genotypes/ directory to the workspace bucket. An empty bucket
# name would produce a malformed gsutil command, so stop early in that case.
if (!nzchar(ws_bucket)) {
    stop("WORKSPACE_BUCKET is not set; cannot upload genotypes/ to the workspace bucket")
}
upload_status <- system(paste("gsutil cp -r genotypes", ws_bucket))

# system() returns the command's exit status; surface a failed upload instead
# of letting it pass unnoticed
if (upload_status != 0) {
    warning("gsutil upload of genotypes/ exited with status ", upload_status)
}

# Archive

In [None]:
# if (!require("vcfR")) install.packages("vcfR")
# library(vcfR)

In [None]:
# vcf <- read.vcfR("genotypes/pilot_snps_chr7.vcf")
# vcf@gt[1:5, 1:5]