# PCSK9 vs. APOC3

This notebook records the [BRV](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4459641/) analysis results for PCSK9 vs. LDL and APOC3 vs. TG.

## Extract PCSK9 + APOC3

In [1]:
# dplyr supplies filter()/select() and the %>% pipe; data.table supplies fwrite().
library(dplyr)
library(data.table)
# The extraction cells below use paths relative to this directory
# (./exome, ./hrc, ./topmed_v3, ./pcsk9, ./apoc3).
setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




### Exome

In [2]:
# Exome: for each gene (chr1 -> PCSK9, chr11 -> APOC3) and each MAF cutoff,
# keep the annotation rows of that gene and write
#   * the filtered annotation table, and
#   * a header-less list of variant IDs (ID_hg38).
# The same is done for the "_cadd" version of the annotation file.
for (chr in c(1, 11)) {
    if (chr == 1) {
        gene <- "PCSK9"
    } else {
        gene <- "APOC3"
    }
    gene_dir <- tolower(gene)

    for (maf in c(0.01, 0.005, 0.001)) {
        # File-name tag: the MAF with its decimal point removed (0.01 -> "001").
        maf_c <- gsub(".", "", as.character(maf), fixed = TRUE)

        # LOF + missense annotation
        annot_in <- sprintf("./exome/ukb23156_c%i.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense.csv.gz", chr, maf_c)
        annot <- filter(read.csv(annot_in), Gene.refGene == gene)
        fwrite(annot, sprintf("./%s/%s_exome_maf%s_annot.csv.gz", gene_dir, gene, maf_c))
        fwrite(
            select(annot, ID_hg38),
            sprintf("./%s/%s_exome_maf%s_extractlist_snplist", gene_dir, gene, maf_c),
            col.names = FALSE
        )

        # "_cadd" annotation
        cadd_in <- sprintf("./exome/ukb23156_c%i.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense_cadd.csv.gz", chr, maf_c)
        annot_cadd <- filter(read.csv(cadd_in), Gene.refGene == gene)
        fwrite(annot_cadd, sprintf("./%s/%s_exome_maf%s_cadd_annot.csv.gz", gene_dir, gene, maf_c))
        fwrite(
            select(annot_cadd, ID_hg38),
            sprintf("./%s/%s_exome_maf%s_cadd_extractlist_snplist", gene_dir, gene, maf_c),
            col.names = FALSE
        )
    }
}

### HRC

In [3]:
# HRC: per gene (chr1 -> PCSK9, chr11 -> APOC3), Rsq tag (3, 8) and MAF cutoff,
# keep the gene's annotation rows and write the filtered table plus a
# header-less list of hg19 variant IDs.
for (chr in c(1, 11)) {
    gene <- if (chr == 1) "PCSK9" else "APOC3"
    gene_dir <- tolower(gene)

    for (rsq in c(3, 8)) {
        for (maf in c(0.01, 0.005, 0.001)) {
            # MAF with the decimal point dropped, as used in the file names.
            maf_c <- gsub(".", "", as.character(maf), fixed = TRUE)

            annot_in <- sprintf("./hrc/hrc_chr%i_rsq0%i_hg19_hg38_maf%s_LOF_missense_annot.csv.gz", chr, rsq, maf_c)
            annot <- filter(read.csv(annot_in), Gene.refGene == gene)

            fwrite(annot, sprintf("./%s/%s_hrc_rsq0%i_maf%s_annot.csv.gz", gene_dir, gene, rsq, maf_c))
            fwrite(
                select(annot, ID_hg19),
                sprintf("./%s/%s_hrc_rsq0%i_maf%s_extractlist_snplist", gene_dir, gene, rsq, maf_c),
                col.names = FALSE
            )
        }
    }
}

In [4]:
# HRC, "_cadd" annotation files: same gene / Rsq / MAF grid as the plain HRC
# cell, writing the "_cadd" annotation table and its hg19 ID list.
for (chr in c(1, 11)) {
    gene <- if (chr == 1) "PCSK9" else "APOC3"
    gene_dir <- tolower(gene)

    for (rsq in c(3, 8)) {
        for (maf in c(0.01, 0.005, 0.001)) {
            # MAF with the decimal point dropped, as used in the file names.
            maf_c <- gsub(".", "", as.character(maf), fixed = TRUE)

            cadd_in <- sprintf("./hrc/hrc_chr%i_rsq0%i_hg19_hg38_maf%s_LOF_missense_cadd_annot.csv.gz", chr, rsq, maf_c)
            annot_cadd <- filter(read.csv(cadd_in), Gene.refGene == gene)

            fwrite(annot_cadd, sprintf("./%s/%s_hrc_rsq0%i_maf%s_cadd_annot.csv.gz", gene_dir, gene, rsq, maf_c))
            fwrite(
                select(annot_cadd, ID_hg19),
                sprintf("./%s/%s_hrc_rsq0%i_maf%s_cadd_extractlist_snplist", gene_dir, gene, rsq, maf_c),
                col.names = FALSE
            )
        }
    }
}

### TOPMed

In [8]:
# TOPMed v3: per gene (chr1 -> PCSK9, chr11 -> APOC3), Rsq tag (3, 8) and MAF
# cutoff, keep the gene's annotation rows and write the filtered table plus a
# header-less list of hg38 variant IDs.
for (chr in c(1, 11)) {
    gene <- if (chr == 1) "PCSK9" else "APOC3"
    gene_dir <- tolower(gene)

    for (rsq in c(3, 8)) {
        for (maf in c(0.01, 0.005, 0.001)) {
            # MAF with the decimal point dropped, as used in the file names.
            maf_c <- gsub(".", "", as.character(maf), fixed = TRUE)

            annot_in <- sprintf("./topmed_v3/topmed_chr%i_rsq0%i_hg38_hg38_maf%s_LOF_missense_annot.csv.gz", chr, rsq, maf_c)
            annot <- filter(read.csv(annot_in), Gene.refGene == gene)

            fwrite(annot, sprintf("./%s/%s_topmed_v3_rsq0%i_maf%s_annot.csv.gz", gene_dir, gene, rsq, maf_c))
            fwrite(
                select(annot, ID_hg38),
                sprintf("./%s/%s_topmed_v3_rsq0%i_maf%s_extractlist_snplist", gene_dir, gene, rsq, maf_c),
                col.names = FALSE
            )
        }
    }
}

In [9]:
# TOPMed v3, "_cadd" annotation files: same gene / Rsq / MAF grid as the plain
# TOPMed cell, writing the "_cadd" annotation table and its hg38 ID list.
for (chr in c(1, 11)) {
    gene <- if (chr == 1) "PCSK9" else "APOC3"
    gene_dir <- tolower(gene)

    for (rsq in c(3, 8)) {
        for (maf in c(0.01, 0.005, 0.001)) {
            # MAF with the decimal point dropped, as used in the file names.
            maf_c <- gsub(".", "", as.character(maf), fixed = TRUE)

            cadd_in <- sprintf("./topmed_v3/topmed_chr%i_rsq0%i_hg38_hg38_maf%s_LOF_missense_cadd_annot.csv.gz", chr, rsq, maf_c)
            annot_cadd <- filter(read.csv(cadd_in), Gene.refGene == gene)

            fwrite(annot_cadd, sprintf("./%s/%s_topmed_v3_rsq0%i_maf%s_cadd_annot.csv.gz", gene_dir, gene, rsq, maf_c))
            fwrite(
                select(annot_cadd, ID_hg38),
                sprintf("./%s/%s_topmed_v3_rsq0%i_maf%s_cadd_extractlist_snplist", gene_dir, gene, rsq, maf_c),
                col.names = FALSE
            )
        }
    }
}

### Extract

In [17]:
cd ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/pcsk9

module load Plink/2.00a

# PCSK9 (chr1): pull the listed variants out of each source and export
# them with --export A (alongside the plink file set).
#   i = MAF tag suffix (maf001 / maf0005 / maf0001), j = Rsq tag (rsq03 / rsq08)
for i in 1 05 01; do
    maf="maf00${i}"

    # Exome calls (bed/bim/fam input)
    plink2 \
        --bfile "../exome/ukb23156_c1_${maf}_LOF_missense_extracted" \
        --extract "PCSK9_exome_${maf}_extractlist_snplist" \
        --make-bed --export A \
        --out "PCSK9_exome_${maf}_extracted"

    for j in 3 8; do
        rsq="rsq0${j}"

        # HRC-imputed data (--bpfile input)
        plink2 \
            --bpfile "../hrc/hrc_chr1_${rsq}_${maf}_LOF_missense_extracted" \
            --extract "PCSK9_hrc_${rsq}_${maf}_extractlist_snplist" \
            --make-bpgen --sort-vars \
            --export A \
            --out "PCSK9_hrc_${rsq}_${maf}_extracted"

        # TOPMed-imputed data (--bpfile input)
        plink2 \
            --bpfile "../topmed_v3/topmed_chr1_${rsq}_${maf}_LOF_missense_extracted" \
            --extract "PCSK9_topmed_v3_${rsq}_${maf}_extractlist_snplist" \
            --make-bpgen --sort-vars \
            --export A \
            --out "PCSK9_topmed_v3_${rsq}_${maf}_extracted"
    done
done

PLINK v2.00a4LM 64-bit Intel (11 Apr 2023)     www.cog-genomics.org/plink/2.0/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to PCSK9_exome_maf001_extracted.log.
Options in effect:
  --bfile ../exome/ukb23156_c1_maf001_LOF_missense_extracted
  --export A
  --extract PCSK9_exome_maf001_extractlist_snplist
  --make-bed
  --out PCSK9_exome_maf001_extracted

Start time: Fri Jul  5 13:50:10 2024
515666 MiB RAM detected, ~490443 available; reserving 257833 MiB for main
workspace.
Using up to 2 compute threads.
168206 samples (0 females, 0 males, 168206 ambiguous; 168206 founders) loaded
from ../exome/ukb23156_c1_maf001_LOF_missense_extracted.fam.
478357 variants loaded from
../exome/ukb23156_c1_maf001_LOF_missense_extracted.bim.
Note: No phenotype data present.
--extract: 394 variants remaining.
394 variants remaining after main filters.
Writing PCSK9_exome_maf001_extracted.fam ... done.
Writing PCSK9_exome_maf001_extracted.bim ... done.
Writing PCSK9

In [18]:
# PCSK9 (chr1), "_cadd" variant lists: same extraction as the plain PCSK9 cell
# but driven by the *_cadd_extractlist_snplist files and written to *_cadd_extracted.
# Runs in the pcsk9/ directory (relative ../exome, ../hrc, ../topmed_v3 inputs).
#   i = MAF tag suffix (maf001 / maf0005 / maf0001), j = Rsq tag (rsq03 / rsq08)
for i in 1 05 01; do
    # Exome calls (bed/bim/fam input)
    plink2 \
        --bfile ../exome/ukb23156_c1_maf00${i}_LOF_missense_extracted \
        --extract PCSK9_exome_maf00${i}_cadd_extractlist_snplist \
        --make-bed --export A \
        --out PCSK9_exome_maf00${i}_cadd_extracted

    for j in 3 8; do
        # HRC-imputed data
        plink2 \
            --bpfile ../hrc/hrc_chr1_rsq0${j}_maf00${i}_LOF_missense_extracted \
            --extract PCSK9_hrc_rsq0${j}_maf00${i}_cadd_extractlist_snplist \
            --make-bpgen --sort-vars \
            --export A \
            --out PCSK9_hrc_rsq0${j}_maf00${i}_cadd_extracted

        # TOPMed-imputed data; the input path is written once ("../topmed_v3/"),
        # matching the other extraction cells, instead of "../topmed_v3/../topmed_v3/".
        plink2 \
            --bpfile ../topmed_v3/topmed_chr1_rsq0${j}_maf00${i}_LOF_missense_extracted \
            --extract PCSK9_topmed_v3_rsq0${j}_maf00${i}_cadd_extractlist_snplist \
            --make-bpgen --sort-vars \
            --export A \
            --out PCSK9_topmed_v3_rsq0${j}_maf00${i}_cadd_extracted
    done
done

PLINK v2.00a4LM 64-bit Intel (11 Apr 2023)     www.cog-genomics.org/plink/2.0/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to PCSK9_exome_maf001_cadd_extracted.log.
Options in effect:
  --bfile ../exome/ukb23156_c1_maf001_LOF_missense_extracted
  --export A
  --extract PCSK9_exome_maf001_cadd_extractlist_snplist
  --make-bed
  --out PCSK9_exome_maf001_cadd_extracted

Start time: Fri Jul  5 13:52:41 2024
515666 MiB RAM detected, ~490275 available; reserving 257833 MiB for main
workspace.
Using up to 2 compute threads.
168206 samples (0 females, 0 males, 168206 ambiguous; 168206 founders) loaded
from ../exome/ukb23156_c1_maf001_LOF_missense_extracted.fam.
478357 variants loaded from
../exome/ukb23156_c1_maf001_LOF_missense_extracted.bim.
Note: No phenotype data present.
--extract: 200 variants remaining.
200 variants remaining after main filters.
Writing PCSK9_exome_maf001_cadd_extracted.fam ... done.
Writing PCSK9_exome_maf001_cadd_extracted.bi

In [19]:
cd ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/apoc3

# APOC3 (chr11): pull the listed variants out of each source and export them
# with --export A (alongside the plink file set).
#   i = MAF tag suffix (maf001 / maf0005 / maf0001), j = Rsq tag (rsq03 / rsq08)
for i in 1 05 01; do
    maf="maf00${i}"

    # Exome calls (bed/bim/fam input)
    plink2 \
        --bfile "../exome/ukb23156_c11_${maf}_LOF_missense_extracted" \
        --extract "APOC3_exome_${maf}_extractlist_snplist" \
        --make-bed --export A \
        --out "APOC3_exome_${maf}_extracted"

    for j in 3 8; do
        # Imputed panels, HRC first and then TOPMed. The directory and output tag
        # are "hrc" / "topmed_v3"; the input file prefix drops the "_v3" suffix.
        for panel in hrc topmed_v3; do
            prefix="${panel%_v3}"
            plink2 \
                --bpfile "../${panel}/${prefix}_chr11_rsq0${j}_${maf}_LOF_missense_extracted" \
                --extract "APOC3_${panel}_rsq0${j}_${maf}_extractlist_snplist" \
                --make-bpgen --sort-vars \
                --export A \
                --out "APOC3_${panel}_rsq0${j}_${maf}_extracted"
        done
    done
done

PLINK v2.00a4LM 64-bit Intel (11 Apr 2023)     www.cog-genomics.org/plink/2.0/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to APOC3_exome_maf001_extracted.log.
Options in effect:
  --bfile ../exome/ukb23156_c11_maf001_LOF_missense_extracted
  --export A
  --extract APOC3_exome_maf001_extractlist_snplist
  --make-bed
  --out APOC3_exome_maf001_extracted

Start time: Fri Jul  5 13:52:52 2024
515666 MiB RAM detected, ~490212 available; reserving 257833 MiB for main
workspace.
Using up to 2 compute threads.
168206 samples (0 females, 0 males, 168206 ambiguous; 168206 founders) loaded
from ../exome/ukb23156_c11_maf001_LOF_missense_extracted.fam.
293784 variants loaded from
../exome/ukb23156_c11_maf001_LOF_missense_extracted.bim.
Note: No phenotype data present.
--extract: 60 variants remaining.
60 variants remaining after main filters.
Writing APOC3_exome_maf001_extracted.fam ... done.
Writing APOC3_exome_maf001_extracted.bim ... done.
Writing APOC

In [20]:
# APOC3 (chr11), "_cadd" variant lists: same extraction as the plain APOC3 cell
# but driven by the *_cadd_extractlist_snplist files and written to *_cadd_extracted.
# Runs in the apoc3/ directory (relative ../exome, ../hrc, ../topmed_v3 inputs).
#   i = MAF tag suffix (maf001 / maf0005 / maf0001), j = Rsq tag (rsq03 / rsq08)
for i in 1 05 01; do
    # Exome calls (bed/bim/fam input)
    plink2 \
        --bfile ../exome/ukb23156_c11_maf00${i}_LOF_missense_extracted \
        --extract APOC3_exome_maf00${i}_cadd_extractlist_snplist \
        --make-bed --export A \
        --out APOC3_exome_maf00${i}_cadd_extracted

    for j in 3 8; do
        # HRC-imputed data
        plink2 \
            --bpfile ../hrc/hrc_chr11_rsq0${j}_maf00${i}_LOF_missense_extracted \
            --extract APOC3_hrc_rsq0${j}_maf00${i}_cadd_extractlist_snplist \
            --make-bpgen --sort-vars \
            --export A \
            --out APOC3_hrc_rsq0${j}_maf00${i}_cadd_extracted

        # TOPMed-imputed data. The input must be the chr11 file, like the exome
        # and HRC inputs above; this previously pointed at topmed_chr1_*, which
        # is the PCSK9 chromosome and does not match the APOC3 extract list.
        plink2 \
            --bpfile ../topmed_v3/topmed_chr11_rsq0${j}_maf00${i}_LOF_missense_extracted \
            --extract APOC3_topmed_v3_rsq0${j}_maf00${i}_cadd_extractlist_snplist \
            --make-bpgen --sort-vars \
            --export A \
            --out APOC3_topmed_v3_rsq0${j}_maf00${i}_cadd_extracted
    done
done

PLINK v2.00a4LM 64-bit Intel (11 Apr 2023)     www.cog-genomics.org/plink/2.0/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to APOC3_exome_maf001_cadd_extracted.log.
Options in effect:
  --bfile ../exome/ukb23156_c11_maf001_LOF_missense_extracted
  --export A
  --extract APOC3_exome_maf001_cadd_extractlist_snplist
  --make-bed
  --out APOC3_exome_maf001_cadd_extracted

Start time: Fri Jul  5 13:53:33 2024
515666 MiB RAM detected, ~490265 available; reserving 257833 MiB for main
workspace.
Using up to 2 compute threads.
168206 samples (0 females, 0 males, 168206 ambiguous; 168206 founders) loaded
from ../exome/ukb23156_c11_maf001_LOF_missense_extracted.fam.
293784 variants loaded from
../exome/ukb23156_c11_maf001_LOF_missense_extracted.bim.
Note: No phenotype data present.
--extract: 26 variants remaining.
26 variants remaining after main filters.
Writing APOC3_exome_maf001_cadd_extracted.fam ... done.
Writing APOC3_exome_maf001_cadd_extracted.b

: 13

## Make merged dataset

### PCSK9 - exome + topmed + hrc

In [1]:
# Work inside the PCSK9 folder: the merge cells that follow read and write
# PCSK9_*.raw files by bare file name.
setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/pcsk9")

# dplyr/tidyverse: filter(), arrange(), select(), left_join(), stringr helpers;
# data.table: fwrite().
library(dplyr)
library(data.table)
library(tidyverse)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.5.1     [32m✔[39m [34mpurrr  [39m 1.0.2
[32m✔[39m [34mtibble [39m 3.2.1     [32m✔[39m [34mstringr[39m 1.5.1
[32m✔[39m [34mtidyr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 1.0.0
[32m✔[39m [34mreadr  [39m 2.1.5     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdata.table[39m::[32mbetween()[39m masks [34mdplyr[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m       masks [34mstats[39m::filter()
[31m✖[39m [34mdata.table[39m::[32mfirst()[39m   masks

In [70]:
# Merge the per-source PCSK9 dosage tables (exome, HRC, TOPMed v3) into one
# table per (Rsq, MAF, CADD) combination and write it as
# PCSK9_hrc_topmed_v3_exome_rsq0<rsq>_maf00<maf><cadd>_extracted.raw.
# A source whose .raw file is missing is skipped.

# Sample-level columns that open every .raw table; also used as join keys.
key_cols <- c("FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE")

# Rebuild variant IDs from the headers of the variant columns (everything
# after the first `n_key` columns): keep the text before the first "_" and
# turn "." back into ":".
# NOTE(review): presumably this strips the allele suffix added by
# `plink2 --export A` and undoes read.table()'s check.names replacement of ":".
restore_variant_ids <- function(dosage, n_key = 6L) {
    variant_idx <- which(seq_len(ncol(dosage)) > n_key)
    if (length(variant_idx) > 0) {
        ids <- sub("_.*$", "", colnames(dosage)[variant_idx])
        colnames(dosage)[variant_idx] <- gsub(".", ":", ids, fixed = TRUE)
    }
    dosage
}

# Read an exome .raw table, sorted by FID/IID, with variant IDs restored.
read_exome_dosage <- function(path) {
    read.table(path, sep = "\t", header = TRUE) %>%
        arrange(FID, IID) %>%
        restore_variant_ids()
}

# Read an imputed (HRC / TOPMed) .raw table. Its IID column holds "<FID>_<IID>",
# which is split back into integer FID and IID; only the key columns and the
# variants named in `snplist` are kept.
read_imputed_dosage <- function(path, snplist) {
    dosage <- restore_variant_ids(read.table(path, sep = "\t", header = TRUE))
    id_parts <- stringr::str_split(dosage$IID, pattern = "_", simplify = TRUE)
    dosage$FID <- as.integer(id_parts[, 1])
    dosage$IID <- as.integer(id_parts[, 2])
    dosage %>%
        arrange(FID, IID) %>%
        # all_of(): the IDs come from an external vector and must all be present
        select(all_of(c(key_cols, as.character(snplist))))
}

# The sample table is the same for every combination, so read it once.
sample_ids <- read.table("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/id_only.csv", sep = ",", header = TRUE)

for (rsq in c(3, 8)) {
    for (maf in c("1", "05", "01")) {
        for (cadd in c("", "_cadd")) {
            print(sprintf("==========RSQ 0.%d; MAF 00%s; CADD filtering:%s", rsq, maf, cadd))

            annot_fname <- sprintf("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed_exome/hrc_topmed_v3_exome_168206ids_rsq0%d_maf00%s%s_annot.csv.gz",
                                   rsq, maf, cadd)
            annot_gene <- read.table(annot_fname, sep = ",", header = TRUE) %>%
                filter(Gene.refGene == "PCSK9") %>%
                unique()

            exome_fname <- sprintf("PCSK9_exome_maf00%s%s_extracted.raw", maf, cadd)
            hrc_fname <- sprintf("PCSK9_hrc_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd)
            topmed_fname <- sprintf("PCSK9_topmed_v3_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd)

            # Variants the merged annotation attributes to each imputed panel.
            hrc_snplist <- annot_gene %>% filter(source == "hrc") %>% pull(ID_hg19)
            topmed_snplist <- annot_gene %>% filter(source == "topmed") %>% pull(ID)

            full_df <- sample_ids

            ## exome matrix
            # Counted per iteration so the summary never reports a stale
            # `exome` from an earlier combination when the file is missing.
            n_exome <- 0L
            if (file.exists(exome_fname)) {
                exome <- read_exome_dosage(exome_fname)
                n_exome <- ncol(exome) - length(key_cols)
                # Join on the sample keys only, never on a shared variant column.
                full_df <- left_join(full_df, exome, by = key_cols)
            }

            ## hrc matrix
            if (file.exists(hrc_fname)) {
                hrc <- read_imputed_dosage(hrc_fname, hrc_snplist)
                full_df <- left_join(full_df, hrc, by = key_cols)
            }

            ## topmed matrix
            if (file.exists(topmed_fname)) {
                topmed <- read_imputed_dosage(topmed_fname, topmed_snplist)
                full_df <- left_join(full_df, topmed, by = key_cols)
            }

            fwrite(full_df, sprintf("PCSK9_hrc_topmed_v3_exome_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd))

            # Sanity check: the per-source counts should add up to Annot_gene
            # and to the number of variant columns in the merged table.
            print(sprintf("HRC_snplist: %d; TOPMed_v3_snplist: %d; Exome: %d; Annot_gene: %d; Full dataframe: %d",
                          length(hrc_snplist), length(topmed_snplist), n_exome, nrow(annot_gene), ncol(full_df) - length(key_cols)))
            rm(full_df)
        }
    }
}



[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 2; TOPMed_v3_snplist: 16; Exome: 394; Annot_gene: 412; Full dataframe: 412"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 1; TOPMed_v3_snplist: 3; Exome: 200; Annot_gene: 204; Full dataframe: 204"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 2; TOPMed_v3_snplist: 16; Exome: 394; Annot_gene: 412; Full dataframe: 412"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 1; TOPMed_v3_snplist: 3; Exome: 200; Annot_gene: 204; Full dataframe: 204"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 2; TOPMed_v3_snplist: 16; Exome: 393; Annot_gene: 411; Full dataframe: 411"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 1; TOPMed_v3_snplist: 3; Exome: 199; Annot_gene: 203; Full dataframe: 203"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 394; Annot_gene: 394; Full dataframe: 394"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 200; Annot_gene: 200; Full dataframe: 200"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 394; Annot_gene: 394; Full dataframe: 394"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 200; Annot_gene: 200; Full dataframe: 200"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 393; Annot_gene: 393; Full dataframe: 393"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 199; Annot_gene: 199; Full dataframe: 199"


### PCSK9 - topmed + hrc

In [82]:
# Merge the imputed PCSK9 dosage tables (HRC, TOPMed v3) into one table per
# (Rsq, MAF, CADD) combination and write it as
# PCSK9_hrc_topmed_v3_rsq0<rsq>_maf00<maf><cadd>_extracted.raw.
# A source whose .raw file is missing is skipped.

# Sample-level columns that open every .raw table; also used as join keys.
key_cols <- c("FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE")

# Rebuild variant IDs from the headers of the variant columns (everything
# after the first `n_key` columns): keep the text before the first "_" and
# turn "." back into ":".
# NOTE(review): presumably this strips the allele suffix added by
# `plink2 --export A` and undoes read.table()'s check.names replacement of ":".
restore_variant_ids <- function(dosage, n_key = 6L) {
    variant_idx <- which(seq_len(ncol(dosage)) > n_key)
    if (length(variant_idx) > 0) {
        ids <- sub("_.*$", "", colnames(dosage)[variant_idx])
        colnames(dosage)[variant_idx] <- gsub(".", ":", ids, fixed = TRUE)
    }
    dosage
}

# Read an imputed (HRC / TOPMed) .raw table. Its IID column holds "<FID>_<IID>",
# which is split back into integer FID and IID; only the key columns and the
# variants named in `snplist` are kept.
read_imputed_dosage <- function(path, snplist) {
    dosage <- restore_variant_ids(read.table(path, sep = "\t", header = TRUE))
    id_parts <- stringr::str_split(dosage$IID, pattern = "_", simplify = TRUE)
    dosage$FID <- as.integer(id_parts[, 1])
    dosage$IID <- as.integer(id_parts[, 2])
    dosage %>%
        arrange(FID, IID) %>%
        # all_of(): the IDs come from an external vector and must all be present
        select(all_of(c(key_cols, as.character(snplist))))
}

# The sample table is the same for every combination, so read it once.
sample_ids <- read.table("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/id_only.csv", sep = ",", header = TRUE)

for (rsq in c(3, 8)) {
    for (maf in c("1", "05", "01")) {
        for (cadd in c("", "_cadd")) {
            print(sprintf("==========RSQ 0.%d; MAF 00%s; CADD filtering: %s", rsq, maf, cadd))
            annot_fname <- sprintf("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/hrc_topmed_v3_168206ids_rsq0%d_maf00%s%s_annot.csv.gz",
                                   rsq, maf, cadd)
            annot_gene <- read.table(annot_fname, sep = ",", header = TRUE) %>%
                filter(Gene.refGene == "PCSK9") %>%
                unique()

            hrc_fname <- sprintf("PCSK9_hrc_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd)
            topmed_fname <- sprintf("PCSK9_topmed_v3_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd)

            # Variants the merged annotation attributes to each imputed panel.
            hrc_snplist <- annot_gene %>% filter(source == "hrc") %>% pull(ID_hg19)
            topmed_snplist <- annot_gene %>% filter(source == "topmed") %>% pull(ID)

            full_df <- sample_ids

            ## hrc matrix
            if (file.exists(hrc_fname)) {
                hrc <- read_imputed_dosage(hrc_fname, hrc_snplist)
                # Join on the sample keys only, never on a shared variant column.
                full_df <- left_join(full_df, hrc, by = key_cols)
            }

            ## topmed matrix
            if (file.exists(topmed_fname)) {
                topmed <- read_imputed_dosage(topmed_fname, topmed_snplist)
                full_df <- left_join(full_df, topmed, by = key_cols)
            }

            fwrite(full_df, sprintf("PCSK9_hrc_topmed_v3_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd))

            # Sanity check: the per-source counts should add up to Annot_gene
            # and to the number of variant columns in the merged table.
            print(sprintf("HRC_snplist: %d; TOPMed_v3_snplist: %d; Annot_gene: %d; Full dataframe: %d",
                          length(hrc_snplist), length(topmed_snplist), nrow(annot_gene), ncol(full_df) - length(key_cols)))
        }
    }
}



[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 10; TOPMed_v3_snplist: 115; Annot_gene: 125; Full dataframe: 125"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 4; TOPMed_v3_snplist: 57; Annot_gene: 61; Full dataframe: 61"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 10; TOPMed_v3_snplist: 115; Annot_gene: 125; Full dataframe: 125"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 4; TOPMed_v3_snplist: 57; Annot_gene: 61; Full dataframe: 61"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 10; TOPMed_v3_snplist: 114; Annot_gene: 124; Full dataframe: 124"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 4; TOPMed_v3_snplist: 56; Annot_gene: 60; Full dataframe: 60"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 4; TOPMed_v3_snplist: 34; Annot_gene: 38; Full dataframe: 38"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 1; TOPMed_v3_snplist: 15; Annot_gene: 16; Full dataframe: 16"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 4; TOPMed_v3_snplist: 34; Annot_gene: 38; Full dataframe: 38"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 1; TOPMed_v3_snplist: 15; Annot_gene: 16; Full dataframe: 16"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 4; TOPMed_v3_snplist: 33; Annot_gene: 37; Full dataframe: 37"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 1; TOPMed_v3_snplist: 14; Annot_gene: 15; Full dataframe: 15"


### APOC3 - exome + topmed + hrc

In [83]:
# Work inside the APOC3 folder: the merge cell that follows reads and writes
# APOC3_*.raw files by bare file name.
setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/apoc3")

# dplyr/tidyverse: filter(), arrange(), select(), left_join(), stringr helpers;
# data.table: fwrite().
# NOTE(review): genio is not referenced by the cells visible here; confirm it is still needed.
library(dplyr)
library(genio)
library(data.table)
library(tidyverse)

In [84]:
# Merge the per-source APOC3 dosage tables (exome, HRC, TOPMed v3) into one
# table per (Rsq, MAF, CADD) combination and write it as
# APOC3_hrc_topmed_v3_exome_rsq0<rsq>_maf00<maf><cadd>_extracted.raw.
# A source whose .raw file is missing is skipped.

# Sample-level columns that open every .raw table; also used as join keys.
key_cols <- c("FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE")

# Rebuild variant IDs from the headers of the variant columns (everything
# after the first `n_key` columns): keep the text before the first "_" and
# turn "." back into ":".
# NOTE(review): presumably this strips the allele suffix added by
# `plink2 --export A` and undoes read.table()'s check.names replacement of ":".
restore_variant_ids <- function(dosage, n_key = 6L) {
    variant_idx <- which(seq_len(ncol(dosage)) > n_key)
    if (length(variant_idx) > 0) {
        ids <- sub("_.*$", "", colnames(dosage)[variant_idx])
        colnames(dosage)[variant_idx] <- gsub(".", ":", ids, fixed = TRUE)
    }
    dosage
}

# Read an exome .raw table, sorted by FID/IID, with variant IDs restored.
read_exome_dosage <- function(path) {
    read.table(path, sep = "\t", header = TRUE) %>%
        arrange(FID, IID) %>%
        restore_variant_ids()
}

# Read an imputed (HRC / TOPMed) .raw table. Its IID column holds "<FID>_<IID>",
# which is split back into integer FID and IID; only the key columns and the
# variants named in `snplist` are kept.
read_imputed_dosage <- function(path, snplist) {
    dosage <- restore_variant_ids(read.table(path, sep = "\t", header = TRUE))
    id_parts <- stringr::str_split(dosage$IID, pattern = "_", simplify = TRUE)
    dosage$FID <- as.integer(id_parts[, 1])
    dosage$IID <- as.integer(id_parts[, 2])
    dosage %>%
        arrange(FID, IID) %>%
        # all_of(): the IDs come from an external vector and must all be present
        select(all_of(c(key_cols, as.character(snplist))))
}

# The sample table is the same for every combination, so read it once.
sample_ids <- read.table("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/id_only.csv", sep = ",", header = TRUE)

for (rsq in c(3, 8)) {
    for (maf in c("1", "05", "01")) {
        for (cadd in c("", "_cadd")) {
            print(sprintf("==========RSQ 0.%d; MAF 00%s; CADD filtering:%s", rsq, maf, cadd))

            annot_fname <- sprintf("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed_exome/hrc_topmed_v3_exome_168206ids_chr11_rsq0%d_maf00%s%s_annot.csv.gz",
                                   rsq, maf, cadd)
            annot_gene <- read.table(annot_fname, sep = ",", header = TRUE) %>%
                filter(Gene.refGene == "APOC3") %>%
                unique()

            exome_fname <- sprintf("APOC3_exome_maf00%s%s_extracted.raw", maf, cadd)
            hrc_fname <- sprintf("APOC3_hrc_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd)
            topmed_fname <- sprintf("APOC3_topmed_v3_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd)

            # Variants the merged annotation attributes to each imputed panel.
            hrc_snplist <- annot_gene %>% filter(source == "hrc") %>% pull(ID_hg19)
            topmed_snplist <- annot_gene %>% filter(source == "topmed") %>% pull(ID)

            full_df <- sample_ids

            ## exome matrix
            # Counted per iteration so the summary never reports a stale
            # `exome` from an earlier combination when the file is missing.
            n_exome <- 0L
            if (file.exists(exome_fname)) {
                exome <- read_exome_dosage(exome_fname)
                n_exome <- ncol(exome) - length(key_cols)
                # Join on the sample keys only, never on a shared variant column.
                full_df <- left_join(full_df, exome, by = key_cols)
            }

            ## hrc matrix
            if (file.exists(hrc_fname)) {
                hrc <- read_imputed_dosage(hrc_fname, hrc_snplist)
                full_df <- left_join(full_df, hrc, by = key_cols)
            }

            ## topmed matrix
            if (file.exists(topmed_fname)) {
                topmed <- read_imputed_dosage(topmed_fname, topmed_snplist)
                full_df <- left_join(full_df, topmed, by = key_cols)
            }

            fwrite(full_df, sprintf("APOC3_hrc_topmed_v3_exome_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd))

            # Sanity check: the per-source counts should add up to Annot_gene
            # and to the number of variant columns in the merged table.
            print(sprintf("HRC_snplist: %d; TOPMed_v3_snplist: %d; Exome: %d; Annot_gene: %d; Full dataframe: %d",
                          length(hrc_snplist), length(topmed_snplist), n_exome, nrow(annot_gene), ncol(full_df) - length(key_cols)))
        }
    }
}



[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 5; Exome: 60; Annot_gene: 65; Full dataframe: 65"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 2; Exome: 26; Annot_gene: 28; Full dataframe: 28"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 5; Exome: 60; Annot_gene: 65; Full dataframe: 65"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 2; Exome: 26; Annot_gene: 28; Full dataframe: 28"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 5; Exome: 59; Annot_gene: 64; Full dataframe: 64"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 2; Exome: 25; Annot_gene: 27; Full dataframe: 27"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 60; Annot_gene: 60; Full dataframe: 60"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 26; Annot_gene: 26; Full dataframe: 26"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 60; Annot_gene: 60; Full dataframe: 60"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 26; Annot_gene: 26; Full dataframe: 26"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 59; Annot_gene: 59; Full dataframe: 59"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 0; Exome: 25; Annot_gene: 25; Full dataframe: 25"


### APOC3 - topmed + hrc

In [85]:
# Build the combined HRC + TOPMed genotype table for APOC3.
# For every rsq threshold, MAF cutoff and CADD setting, the APOC3 variants
# listed in the merged annotation are pulled from the per-panel plink .raw
# files, joined onto the sample table in id_only.csv, and written out as
# APOC3_hrc_topmed_v3_*_extracted.raw (comma-separated, via fwrite).

# Leading sample columns shared by id_only.csv and the .raw files; these are
# the keys the joins below use.
sample_cols <- c("FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE")

for(rsq in c(3, 8)){
    for(maf in c("1", "05", "01")){
        for(cadd in c("", "_cadd")){
            print(sprintf("==========RSQ 0.%d; MAF 00%s; CADD filtering: %s", rsq, maf, cadd))
            annot_fname <- sprintf("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/hrc_topmed_v3_168206ids_chr11_rsq0%d_maf00%s%s_annot.csv.gz", 
                                   rsq, maf, cadd)
            annot_gene <- read.table(annot_fname, sep = ",", header = TRUE) %>% filter(Gene.refGene == "APOC3") %>% unique()
            
            full_df <- read.table("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/id_only.csv", sep = ",", header = TRUE)
            
            hrc_fname <- sprintf("APOC3_hrc_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd)
            topmed_fname <- sprintf("APOC3_topmed_v3_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd)
            
            # Variants are attributed to one panel by the `source` column;
            # HRC variants are looked up by ID_hg19, TOPMed variants by ID.
            hrc_snplist <- annot_gene %>% filter(source == "hrc") %>% pull(ID_hg19)
            topmed_snplist <- annot_gene %>% filter(source == "topmed") %>% pull(ID)

            ## hrc matrix
            if(file.exists(hrc_fname)){
                hrc <- read.table(hrc_fname, sep = "\t", header = TRUE)

                # Genotype columns are everything after the six sample
                # columns; this is empty (not c(7, 6)) for a file without
                # variants.
                hrc_cols <- seq_len(ncol(hrc))[-seq_along(sample_cols)]
                # Headers look like <id>_<allele>, with ":" turned into "." by
                # read.table(); keep the part before the first "_" and restore
                # the colons so the names match the annotation IDs.
                hrc_names <- sub("_.*$", "", colnames(hrc)[hrc_cols])
                colnames(hrc)[hrc_cols] <- gsub(".", ":", hrc_names, fixed = TRUE)

                # IID is stored as "<FID>_<IID>"; split it back into integers.
                fid_iid <- hrc$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
                hrc$FID <- as.integer(fid_iid[,1])
                hrc$IID <- as.integer(fid_iid[,2])

                # all_of() errors if an annotated variant is absent from the
                # file instead of treating the vector ambiguously.
                hrc <- hrc %>% arrange(FID, IID) %>% select(all_of(c(sample_cols, hrc_snplist)))
                full_df <- left_join(full_df, hrc, by = sample_cols)
            }
            
            ## topmed matrix
            if(file.exists(topmed_fname)){
                topmed <- read.table(topmed_fname, sep = "\t", header = TRUE)

                topmed_cols <- seq_len(ncol(topmed))[-seq_along(sample_cols)]
                topmed_names <- sub("_.*$", "", colnames(topmed)[topmed_cols])
                colnames(topmed)[topmed_cols] <- gsub(".", ":", topmed_names, fixed = TRUE)

                fid_iid <- topmed$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
                topmed$FID <- as.integer(fid_iid[,1])
                topmed$IID <- as.integer(fid_iid[,2])

                topmed <- topmed %>% arrange(FID, IID) %>% select(all_of(c(sample_cols, topmed_snplist)))
                full_df <- left_join(full_df, topmed, by = sample_cols)
            }
            
            fwrite(full_df, sprintf("APOC3_hrc_topmed_v3_rsq0%d_maf00%s%s_extracted.raw", rsq, maf, cadd))
            
            # Sanity check: the number of genotype columns written should
            # equal the number of annotated APOC3 variants.
            print(sprintf("HRC_snplist: %d; TOPMed_v3_snplist: %d; Annot_gene: %d; Full dataframe: %d", 
                          length(hrc_snplist), length(topmed_snplist), nrow(annot_gene), ncol(full_df)-6))
        }
    }
}



[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 2; TOPMed_v3_snplist: 16; Annot_gene: 18; Full dataframe: 18"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 2; TOPMed_v3_snplist: 8; Annot_gene: 10; Full dataframe: 10"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 2; TOPMed_v3_snplist: 16; Annot_gene: 18; Full dataframe: 18"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 2; TOPMed_v3_snplist: 8; Annot_gene: 10; Full dataframe: 10"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 2; TOPMed_v3_snplist: 15; Annot_gene: 17; Full dataframe: 17"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 2; TOPMed_v3_snplist: 7; Annot_gene: 9; Full dataframe: 9"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 1; TOPMed_v3_snplist: 3; Annot_gene: 4; Full dataframe: 4"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 1; TOPMed_v3_snplist: 3; Annot_gene: 4; Full dataframe: 4"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 1; TOPMed_v3_snplist: 3; Annot_gene: 4; Full dataframe: 4"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`
[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 1; TOPMed_v3_snplist: 3; Annot_gene: 4; Full dataframe: 4"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 3; Annot_gene: 3; Full dataframe: 3"


[1m[22mJoining with `by = join_by(FID, IID, PAT, MAT, SEX, PHENOTYPE)`


[1] "HRC_snplist: 0; TOPMed_v3_snplist: 3; Annot_gene: 3; Full dataframe: 3"


## Phenotype Analysis

We use the Burden of Rare Variants ([BRV](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4459641/)) test to analyze rare-variant aggregate association for PCSK9 and APOC3. For the phenotype analysis, we control for age, sex, and the first two principal components (PC1 and PC2).

### Functions and Packages

In [90]:
library(dplyr)
library(genio)
library(data.table)
library(tidyverse)

In [91]:
# Earlier phenotype sources, kept for reference only:
# mi_pheno <- read.csv("/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/Unrealted_White_EU_Both_Exo_Impu_5209_MI_cases.phe.csv")
# mi_y <- mi_pheno %>% arrange(FID) %>% pull(MI)

###########################################
# trig_ldl_pheno <- read.csv("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/Trigly_LDL_Log10_168206ind.phe.csv") %>% arrange(FID)
# tg_y <- trig_ldl_pheno %>% arrange(FID) %>% pull(logarithm_base10_Trigli)
# ldl_y <- trig_ldl_pheno  %>% arrange(FID) %>% pull(f.30780.0.0)

###########################################
# LDL: phenotype vector, sample IDs and covariates, all sorted by FID.
ldl_df <- arrange(
    read.csv("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/UKBB_LDL_159759inds_4PCs.pheno", sep = "\t"),
    FID
)
ldl_y <- ldl_df[["f.30780.0.0"]]
ldl_id <- ldl_df[["IID"]]
ldl_cov <- select(ldl_df, sex, age, PC1, PC2)

# TG (log10 triglycerides): same layout as the LDL objects above.
tg_df <- arrange(
    read.csv("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/UKBB_TG_159759inds_4PCs.pheno", sep = "\t"),
    FID
)
tg_y <- tg_df[["logarithm_base10_Trigli"]]
tg_id <- tg_df[["IID"]]
tg_cov <- select(tg_df, sex, age, PC1, PC2)

# Only the extracted vectors and covariate tables are needed from here on.
rm(ldl_df, tg_df)

In [92]:
# Burden of Rare Variants (BRV) test.
#
# Collapses a genotype matrix into one burden score per individual and
# regresses the phenotype on that score with glm(), adjusting for age, sex,
# PC1 and PC2.
#
# Arguments:
#   X_mat   genotype matrix (individuals in rows, variants in columns), or a
#           vector for a single variant.
#   y       phenotype vector, one entry per row of X_mat, in the same order.
#   cov     data frame (or list) with elements age, sex, PC1 and PC2.
#   option  glm family name; defaults to "binomial", callers with a
#           quantitative trait pass "gaussian".
#
# Returns a list with `zstat` (the test statistic of the burden term: a z
# value for binomial fits, a t value for gaussian fits) and `pval`. Both are
# NA when the burden term cannot be estimated (e.g. the burden is constant).
BRV <- function(X_mat, y, cov, option = "binomial"){
    X_mat <- as.matrix(X_mat)
    y <- as.vector(y)

    # Genotypes and phenotype are paired by position, so the counts must agree.
    if(nrow(X_mat) != length(y)){
        stop(sprintf("X_mat has %d rows but y has %d values", nrow(X_mat), length(y)),
             call. = FALSE)
    }

    ## count number of rare variants for each individual
    if(ncol(X_mat) == 1){
        # single variant: keep missing genotypes as NA so glm() drops them
        burden <- as.vector(X_mat)
    } else {
        # several variants: missing genotypes contribute 0 to the sum
        burden <- rowSums(X_mat, na.rm = TRUE)
    }
    
    age <- cov$age
    sex <- cov$sex
    pc1 <- cov$PC1
    pc2 <- cov$PC2

    ## obtain p value from the regression of y on the burden score
    mat.fit <- glm(y ~ burden + age + sex + pc1 + pc2, family = option)
    coef_tab <- coef(summary(mat.fit))

    # summary.glm() omits coefficients that could not be estimated, so the
    # burden row is looked up by name; a positional [2, ] would silently
    # return the `age` row when the burden term is dropped.
    if(!("burden" %in% rownames(coef_tab))){
        return(list(zstat = NA_real_, pval = NA_real_))
    }

    zstat <- coef_tab["burden", 3]
    pval <- coef_tab["burden", 4]

    return(list(zstat = zstat, pval = pval))
}

### PCSK9 v.s. LDL

In [95]:
# The PCSK9 cells below open their input files by bare file name, so they are
# resolved against this working directory.
setwd('/mnt/vast/hpc/csg/tl3031/imputation-rvtest/analysis/imputation_aggregated_analysis/pcsk9')

In [103]:
# BRV burden test of PCSK9 variants against LDL for each MAF cutoff and
# imputation-quality (rsq) threshold, across five variant sources: exome,
# HRC, TOPMed, HRC+TOPMed and HRC+TOPMed+exome. One row per source is
# collected in `result_df` (source, rsq, MAF, number of variants, p-value).
result_df <- data.frame(data.frame(matrix(ncol = 5, nrow = 0)))
colnames(result_df) <- c("data", "rsqs", "mafs", "num_var", "pvals")

# Fallbacks for a missing input file. A 0-column matrix makes num_var report 0
# (a bare matrix() is 1 x 1 and would report one variant), and a fresh list
# avoids assigning into a result_* object that may not exist yet.
missing_X <- matrix(nrow = 0, ncol = 0)
missing_result <- list(zstat = NA_real_, pval = NA_real_)

# NOTE(review): samples are kept with FID %in% ldl_id although ldl_id is pulled
# from the IID column, and genotype rows are paired with ldl_y by sort order
# only. Confirm that FID == IID and that every file covers the same samples.
# NOTE(review): 2 - X presumably turns the exported allele count into the
# rare-allele count; confirm against the plink export settings.
for(maf in c("1", "05", "01")){
    exome_fname <- sprintf("PCSK9_exome_maf00%s_extracted.raw", maf)
    if(file.exists(exome_fname)){
        exome <- read.table(exome_fname, sep = "\t", header = TRUE) %>% arrange(FID, IID)
        # drop the six leading sample columns, keep genotype columns only
        exome_X <- exome %>% filter(FID %in% ldl_id) %>% select(-c(1:6)) %>% as.matrix()
        result_exome <- BRV(2 - exome_X, ldl_y, ldl_cov,"gaussian")
    } else {
        exome_X <- missing_X
        result_exome <- missing_result
    }

    for(rsq in c(3, 8)){
        hrc_fname <- sprintf("PCSK9_hrc_rsq0%d_maf00%s_extracted.raw", rsq, maf)
        topmed_fname <- sprintf("PCSK9_topmed_v3_rsq0%d_maf00%s_extracted.raw", rsq, maf)
        hrc_topmed_fname <- sprintf("PCSK9_hrc_topmed_v3_rsq0%d_maf00%s_extracted.raw", rsq, maf)
        hrc_topmed_exome_fname <- sprintf("PCSK9_hrc_topmed_v3_exome_rsq0%d_maf00%s_extracted.raw", rsq, maf)

        if(file.exists(hrc_fname)){
            hrc <- read.table(hrc_fname, sep = "\t", header = TRUE)
            
            # IID is stored as "<FID>_<IID>"; split it back into integers
            fid_iid <- hrc$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
            hrc$FID <- as.integer(fid_iid[,1])
            hrc$IID <- as.integer(fid_iid[,2])
            hrc_X <- hrc %>% arrange(FID, IID) %>% filter(FID %in% ldl_id) %>% select(-c(1:6)) %>% as.matrix()

            result_hrc <- BRV(2 - hrc_X, ldl_y, ldl_cov, "gaussian")

        } else {
            hrc_X <- missing_X
            result_hrc <- missing_result
        }
        
        if(file.exists(topmed_fname)){
            topmed <- read.table(topmed_fname, sep = "\t", header = TRUE)
            
            fid_iid <- topmed$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
            topmed$FID <- as.integer(fid_iid[,1])
            topmed$IID <- as.integer(fid_iid[,2])

            topmed_X <- topmed %>% arrange(FID, IID) %>% filter(FID %in% ldl_id) %>% select(-c(1:6)) %>% as.matrix()
            result_topmed <- BRV(2 - topmed_X, ldl_y, ldl_cov, "gaussian")
        } else {
            topmed_X <- missing_X
            result_topmed <- missing_result
        }

        # the merged tables are comma-separated and already carry integer IDs
        if(file.exists(hrc_topmed_fname)){
            hrc_topmed <- read.table(hrc_topmed_fname, sep = ",", header = TRUE)
            hrc_topmed_X <- hrc_topmed %>% arrange(FID, IID) %>% filter(FID %in% ldl_id) %>% select(-c(1:6)) %>% as.matrix()
            result_hrc_topmed <- BRV(2 - hrc_topmed_X, ldl_y, ldl_cov, "gaussian")
        } else {
            hrc_topmed_X <- missing_X
            result_hrc_topmed <- missing_result
        }

        if(file.exists(hrc_topmed_exome_fname)){
            hrc_topmed_exome <- read.table(hrc_topmed_exome_fname, sep = ",", header = TRUE)
            hrc_topmed_exome_X <- hrc_topmed_exome %>% arrange(FID, IID) %>% filter(FID %in% ldl_id) %>% select(-c(1:6)) %>% as.matrix()
            result_hrc_topmed_exome <- BRV(2 - hrc_topmed_exome_X, ldl_y, ldl_cov, "gaussian")
        } else {
            hrc_topmed_exome_X <- missing_X
            result_hrc_topmed_exome <- missing_result
        }

        data <- c("hrc", "topmed", "hrc_topmed", "hrc_topmed_exome")
        rsqs <- rep(rsq/10, 4)
        mafs <- rep(paste0("0.0", maf), 4)
        pvals <- c(result_hrc$pval, result_topmed$pval, result_hrc_topmed$pval, result_hrc_topmed_exome$pval)
        num_var <- c(ncol(hrc_X), ncol(topmed_X), ncol(hrc_topmed_X), ncol(hrc_topmed_exome_X))
        sub_df <- data.frame(data, rsqs, mafs, num_var, pvals)
        result_df <- rbind(result_df, sub_df)
    }
    # the exome result does not depend on rsq, so it is added once per MAF
    result_df <- rbind(data.frame(data = "exome", rsqs = "NA", mafs = paste0("0.0", maf), num_var = ncol(exome_X), pvals = result_exome$pval), result_df)
}

In [104]:
# Display the results sorted by MAF cutoff (largest first), then by data
# source in a fixed order.
arrange(
    mutate(result_df, data = factor(data, levels = c("exome", "hrc", "topmed", "hrc_topmed", "hrc_topmed_exome"))),
    desc(as.numeric(mafs)),
    data
)

data,rsqs,mafs,num_var,pvals
<fct>,<chr>,<chr>,<int>,<dbl>
exome,,0.01,394,5.473985e-71
hrc,0.3,0.01,16,1.944983e-12
hrc,0.8,0.01,6,6.993942e-11
topmed,0.3,0.01,124,2.403941e-38
topmed,0.8,0.01,34,7.39923e-30
hrc_topmed,0.3,0.01,125,1.146106e-39
hrc_topmed,0.8,0.01,38,5.148051e-32
hrc_topmed_exome,0.3,0.01,412,8.865311e-71
hrc_topmed_exome,0.8,0.01,394,5.473985e-71
exome,,0.005,394,5.473985e-71


In [106]:
# BRV burden test of CADD-filtered PCSK9 variants against LDL for each MAF
# cutoff and imputation-quality (rsq) threshold, across five variant sources:
# exome, HRC, TOPMed, HRC+TOPMed and HRC+TOPMed+exome. One row per source is
# collected in `result_df` (source, rsq, MAF, number of variants, p-value).
result_df <- data.frame(data.frame(matrix(ncol = 5, nrow = 0)))
colnames(result_df) <- c("data", "rsqs", "mafs", "num_var", "pvals")

# Fallbacks for a missing input file. A 0-column matrix makes num_var report 0
# (a bare matrix() is 1 x 1 and would report one variant), and a fresh list
# avoids assigning into a result_* object that may not exist yet.
missing_X <- matrix(nrow = 0, ncol = 0)
missing_result <- list(zstat = NA_real_, pval = NA_real_)

# NOTE(review): samples are kept with FID %in% ldl_id although ldl_id is pulled
# from the IID column, and genotype rows are paired with ldl_y by sort order
# only. Confirm that FID == IID and that every file covers the same samples.
# NOTE(review): 2 - X presumably turns the exported allele count into the
# rare-allele count; confirm against the plink export settings.
for(maf in c("1", "05", "01")){
    exome_fname <- sprintf("PCSK9_exome_maf00%s_cadd_extracted.raw", maf)
    if(file.exists(exome_fname)){
        exome <- read.table(exome_fname, sep = "\t", header = TRUE) %>% arrange(FID, IID)
        # drop the six leading sample columns, keep genotype columns only
        exome_X <- exome %>% filter(FID %in% ldl_id) %>% select(-c(1:6)) %>% as.matrix()
        result_exome <- BRV(2 - exome_X, ldl_y, ldl_cov, "gaussian")
    } else {
        exome_X <- missing_X
        result_exome <- missing_result
    }

    for(rsq in c(3, 8)){
        hrc_fname <- sprintf("PCSK9_hrc_rsq0%d_maf00%s_cadd_extracted.raw", rsq, maf)
        topmed_fname <- sprintf("PCSK9_topmed_v3_rsq0%d_maf00%s_cadd_extracted.raw", rsq, maf)
        hrc_topmed_fname <- sprintf("PCSK9_hrc_topmed_v3_rsq0%d_maf00%s_cadd_extracted.raw", rsq, maf)
        hrc_topmed_exome_fname <- sprintf("PCSK9_hrc_topmed_v3_exome_rsq0%d_maf00%s_cadd_extracted.raw", rsq, maf)

        if(file.exists(hrc_fname)){
            hrc <- read.table(hrc_fname, sep = "\t", header = TRUE)
            
            # IID is stored as "<FID>_<IID>"; split it back into integers
            fid_iid <- hrc$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
            hrc$FID <- as.integer(fid_iid[,1])
            hrc$IID <- as.integer(fid_iid[,2])
            hrc_X <- hrc %>% arrange(FID, IID) %>% filter(FID %in% ldl_id) %>% select(-c(1:6)) %>% as.matrix()

            result_hrc <- BRV(2 - hrc_X, ldl_y, ldl_cov,"gaussian")

        } else {
            hrc_X <- missing_X
            result_hrc <- missing_result
        }
        
        if(file.exists(topmed_fname)){
            topmed <- read.table(topmed_fname, sep = "\t", header = TRUE)
            
            fid_iid <- topmed$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
            topmed$FID <- as.integer(fid_iid[,1])
            topmed$IID <- as.integer(fid_iid[,2])

            topmed_X <- topmed %>% arrange(FID, IID) %>% filter(FID %in% ldl_id) %>% select(-c(1:6)) %>% as.matrix()
            result_topmed <- BRV(2 - topmed_X, ldl_y, ldl_cov,"gaussian")
        } else {
            topmed_X <- missing_X
            result_topmed <- missing_result
        }

        # the merged tables are comma-separated and already carry integer IDs
        if(file.exists(hrc_topmed_fname)){
            hrc_topmed <- read.table(hrc_topmed_fname, sep = ",", header = TRUE)
            hrc_topmed_X <- hrc_topmed %>% arrange(FID, IID) %>% filter(FID %in% ldl_id) %>% select(-c(1:6)) %>% as.matrix()
            result_hrc_topmed <- BRV(2 - hrc_topmed_X, ldl_y, ldl_cov,"gaussian")
        } else {
            hrc_topmed_X <- missing_X
            result_hrc_topmed <- missing_result
        }

        if(file.exists(hrc_topmed_exome_fname)){
            hrc_topmed_exome <- read.table(hrc_topmed_exome_fname, sep = ",", header = TRUE)
            hrc_topmed_exome_X <- hrc_topmed_exome %>% arrange(FID, IID) %>% filter(FID %in% ldl_id) %>% select(-c(1:6)) %>% as.matrix()
            result_hrc_topmed_exome <- BRV(2 - hrc_topmed_exome_X, ldl_y, ldl_cov,"gaussian")
        } else {
            hrc_topmed_exome_X <- missing_X
            result_hrc_topmed_exome <- missing_result
        }

        data <- c("hrc", "topmed", "hrc_topmed", "hrc_topmed_exome")
        rsqs <- rep(rsq/10, 4)
        mafs <- rep(paste0("0.0", maf), 4)
        pvals <- c(result_hrc$pval, result_topmed$pval, result_hrc_topmed$pval, result_hrc_topmed_exome$pval)
        num_var <- c(ncol(hrc_X), ncol(topmed_X), ncol(hrc_topmed_X), ncol(hrc_topmed_exome_X))
        sub_df <- data.frame(data, rsqs, mafs, num_var, pvals)
        result_df <- rbind(result_df, sub_df)
    }
    # the exome result does not depend on rsq, so it is added once per MAF
    result_df <- rbind(data.frame(data = "exome", rsqs = "NA", mafs = paste0("0.0", maf), num_var = ncol(exome_X), pvals = result_exome$pval), result_df)
}

In [107]:
# Display the results sorted by MAF cutoff (largest first), then by data
# source in a fixed order.
arrange(
    mutate(result_df, data = factor(data, levels = c("exome", "hrc", "topmed", "hrc_topmed", "hrc_topmed_exome"))),
    desc(as.numeric(mafs)),
    data
)

data,rsqs,mafs,num_var,pvals
<fct>,<chr>,<chr>,<int>,<dbl>
exome,,0.01,200,8.246876e-93
hrc,0.3,0.01,6,1.912681e-12
hrc,0.8,0.01,2,4.563676e-15
topmed,0.3,0.01,61,1.287643e-54
topmed,0.8,0.01,15,1.227744e-44
hrc_topmed,0.3,0.01,61,3.7547040000000004e-54
hrc_topmed,0.8,0.01,16,3.68366e-45
hrc_topmed_exome,0.3,0.01,204,2.554497e-93
hrc_topmed_exome,0.8,0.01,200,8.246876e-93
exome,,0.005,200,8.246876e-93


#### PCSK9: chr1:55052701:C:T (hg38)

In [108]:
# pcsk9_id_maf001 <- read.csv("PCSK9_exome_maf001_annot.csv.gz")$ID_hg38
# pcsk9_id_maf0001 <- read.csv("PCSK9_exome_maf0001_annot.csv.gz")$ID_hg38
# setdiff(pcsk9_id_maf001, pcsk9_id_maf0001)

# read.csv("PCSK9_exome_maf001_annot.csv.gz") %>% filter(ID_hg38 == 'chr1:55052701:C:T')

We are interested in this variant because the p-value increases substantially once it is removed. Therefore, we conduct a single-variant association test for this variant alone, using both the exome and the imputed data.

In [109]:
# ## exome data
# exome_bim <- read_bim("PCSK9_exome_maf001_extracted.bim")
# exome_fam <- read_fam("PCSK9_exome_maf001_extracted.fam")
# exome <- read_bed("PCSK9_exome_maf001_extracted.bed", exome_bim$id, exome_fam$id)

# exome_X <- data.frame(ID = names(exome["chr1:55052701:C:T",]), value=exome["chr1:55052701:C:T",], row.names=NULL)
# exome_X <- exome_X %>% arrange(as.numeric(ID)) %>% filter(ID %in% ldl_id) %>% select(-ID) %>% as.matrix()

# exome.fit <- glm(ldl_y ~ exome_X + ldl_cov$age + ldl_cov$sex + ldl_cov$PC1 + ldl_cov$PC2, family = "gaussian")
# coef(summary(exome.fit))[2, 4]

In [110]:
# read.csv("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/hrc_topmed_v3_168206ids_rsq03_maf001_annot.csv.gz") %>%
#     filter(ID_hg38 == "chr1:55052701:C:T")

# ## hrc
# hrc <- read.csv("PCSK9_hrc_rsq03_maf001_extracted.traw") %>% filter(SNP == "chr1:55518374:C:T")
# hrc_X <- format_traw(hrc, ldl_id, "hrc")

# hrc.fit <- glm(ldl_y ~ hrc_X + ldl_cov$age + ldl_cov$sex + ldl_cov$PC1 + ldl_cov$PC2, family = "gaussian")
# coef(summary(hrc.fit))[2, 4]

# ## topmed
# topmed <- read.csv("PCSK9_topmed_rsq03_maf001_extracted.traw") %>% filter(SNP == "chr1:55052701:C:T")
# topmed_X <- format_traw(topmed, ldl_id, "topmed")

# topmed.fit <- glm(ldl_y ~ topmed_X + ldl_cov$age + ldl_cov$sex + ldl_cov$PC1 + ldl_cov$PC2, family = "gaussian")
# coef(summary(topmed.fit))[2, 4]

### APOC3 v.s. TG

In [114]:
# The APOC3 cells open their input files by bare file name, so they are
# resolved against this working directory.
setwd('/mnt/vast/hpc/csg/tl3031/imputation-rvtest/analysis/imputation_aggregated_analysis/apoc3')

# BRV burden test of APOC3 variants against TG for each MAF cutoff and
# imputation-quality (rsq) threshold, across five variant sources: exome,
# HRC, TOPMed, HRC+TOPMed and HRC+TOPMed+exome. One row per source is
# collected in `result_df` (source, rsq, MAF, number of variants, p-value).
result_df <- data.frame(data.frame(matrix(ncol = 5, nrow = 0)))
colnames(result_df) <- c("data", "rsqs", "mafs", "num_var", "pvals")

# Fallbacks for a missing input file. A 0-column matrix makes num_var report 0
# (a bare matrix() is 1 x 1 and would report one variant), and a fresh list
# avoids assigning into a result_* object that may not exist yet.
missing_X <- matrix(nrow = 0, ncol = 0)
missing_result <- list(zstat = NA_real_, pval = NA_real_)

# NOTE(review): samples are kept with FID %in% tg_id although tg_id is pulled
# from the IID column, and genotype rows are paired with tg_y by sort order
# only. Confirm that FID == IID and that every file covers the same samples.
# NOTE(review): 2 - X presumably turns the exported allele count into the
# rare-allele count; confirm against the plink export settings.
for(maf in c("1", "05", "01")){
    exome_fname <- sprintf("APOC3_exome_maf00%s_extracted.raw", maf)
    if(file.exists(exome_fname)){
        exome <- read.table(exome_fname, sep = "\t", header = TRUE)
        # drop the six leading sample columns, keep genotype columns only
        exome_X <- exome %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()
        result_exome <- BRV(2 - exome_X, tg_y, tg_cov,"gaussian")
    } else {
        exome_X <- missing_X
        result_exome <- missing_result
    }

    for(rsq in c(3, 8)){
        hrc_fname <- sprintf("APOC3_hrc_rsq0%d_maf00%s_extracted.raw", rsq, maf)
        topmed_fname <- sprintf("APOC3_topmed_v3_rsq0%d_maf00%s_extracted.raw", rsq, maf)
        hrc_topmed_fname <- sprintf("APOC3_hrc_topmed_v3_rsq0%d_maf00%s_extracted.raw", rsq, maf)
        hrc_topmed_exome_fname <- sprintf("APOC3_hrc_topmed_v3_exome_rsq0%d_maf00%s_extracted.raw", rsq, maf)

        if(file.exists(hrc_fname)){
            hrc <- read.table(hrc_fname, sep = "\t", header = TRUE)
            
            # IID is stored as "<FID>_<IID>"; split it back into integers
            fid_iid <- hrc$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
            hrc$FID <- as.integer(fid_iid[,1])
            hrc$IID <- as.integer(fid_iid[,2])
            hrc_X <- hrc %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()

            result_hrc <- BRV(2 - hrc_X, tg_y, tg_cov,"gaussian")

        } else {
            hrc_X <- missing_X
            result_hrc <- missing_result
        }
        
        if(file.exists(topmed_fname)){
            topmed <- read.table(topmed_fname, sep = "\t", header = TRUE)
            
            fid_iid <- topmed$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
            topmed$FID <- as.integer(fid_iid[,1])
            topmed$IID <- as.integer(fid_iid[,2])

            topmed_X <- topmed %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()
            result_topmed <- BRV(2 - topmed_X, tg_y, tg_cov,"gaussian")
        } else {
            topmed_X <- missing_X
            result_topmed <- missing_result
        }

        # the merged tables are comma-separated and already carry integer IDs
        if(file.exists(hrc_topmed_fname)){
            hrc_topmed <- read.table(hrc_topmed_fname, sep = ",", header = TRUE)
            hrc_topmed_X <- hrc_topmed %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()
            result_hrc_topmed <- BRV(2 - hrc_topmed_X, tg_y, tg_cov,"gaussian")
        } else {
            hrc_topmed_X <- missing_X
            result_hrc_topmed <- missing_result
        }

        if(file.exists(hrc_topmed_exome_fname)){
            hrc_topmed_exome <- read.csv(hrc_topmed_exome_fname, sep = ",", header = TRUE)
            hrc_topmed_exome_X <- hrc_topmed_exome %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()
            result_hrc_topmed_exome <- BRV(2 - hrc_topmed_exome_X, tg_y, tg_cov,"gaussian")
        } else {
            hrc_topmed_exome_X <- missing_X
            result_hrc_topmed_exome <- missing_result
        }

        data <- c("hrc", "topmed", "hrc_topmed", "hrc_topmed_exome")
        rsqs <- rep(rsq/10, 4)
        mafs <- rep(paste0("0.0", maf), 4)
        pvals <- c(result_hrc$pval, result_topmed$pval, result_hrc_topmed$pval, result_hrc_topmed_exome$pval)
        num_var <- c(ncol(hrc_X), ncol(topmed_X), ncol(hrc_topmed_X), ncol(hrc_topmed_exome_X))
        sub_df <- data.frame(data, rsqs, mafs, num_var, pvals)
        result_df <- rbind(result_df, sub_df)
    }
    # the exome result does not depend on rsq, so it is added once per MAF
    result_df <- rbind(data.frame(data = "exome", rsqs = "NA", mafs = paste0("0.0", maf), num_var = ncol(exome_X), pvals = result_exome$pval), result_df)
}

In [115]:
# Display the results sorted by MAF cutoff (largest first), then by data
# source in a fixed order.
arrange(
    mutate(result_df, data = factor(data, levels = c("exome", "hrc", "topmed", "hrc_topmed", "hrc_topmed_exome"))),
    desc(as.numeric(mafs)),
    data
)

data,rsqs,mafs,num_var,pvals
<fct>,<chr>,<chr>,<int>,<dbl>
exome,,0.01,60,3.0734e-234
hrc,0.3,0.01,3,3.376964e-182
hrc,0.8,0.01,2,9.967896e-170
topmed,0.3,0.01,18,1.342452e-190
topmed,0.8,0.01,4,1.761751e-183
hrc_topmed,0.3,0.01,18,4.1451029999999996e-187
hrc_topmed,0.8,0.01,4,6.4134e-183
hrc_topmed_exome,0.3,0.01,65,3.666208e-231
hrc_topmed_exome,0.8,0.01,60,3.0734e-234
exome,,0.005,60,3.0734e-234


In [118]:
# BRV burden test of CADD-filtered APOC3 variants against TG for each MAF
# cutoff and imputation-quality (rsq) threshold, across five variant sources:
# exome, HRC, TOPMed, HRC+TOPMed and HRC+TOPMed+exome. One row per source is
# collected in `result_df` (source, rsq, MAF, number of variants, p-value).
result_df <- data.frame(data.frame(matrix(ncol = 5, nrow = 0)))
colnames(result_df) <- c("data", "rsqs", "mafs", "num_var", "pvals")

# Fallbacks for a missing input file. A 0-column matrix makes num_var report 0
# (a bare matrix() is 1 x 1 and would report one variant), and a fresh list
# avoids assigning into a result_* object that may not exist yet.
missing_X <- matrix(nrow = 0, ncol = 0)
missing_result <- list(zstat = NA_real_, pval = NA_real_)

# NOTE(review): samples are kept with FID %in% tg_id although tg_id is pulled
# from the IID column, and genotype rows are paired with tg_y by sort order
# only. Confirm that FID == IID and that every file covers the same samples.
# NOTE(review): 2 - X presumably turns the exported allele count into the
# rare-allele count; confirm against the plink export settings.
for(maf in c("1", "05", "01")){
    exome_fname <- sprintf("APOC3_exome_maf00%s_cadd_extracted.raw", maf)
    if(file.exists(exome_fname)){
        exome <- read.table(exome_fname, sep = "\t", header = TRUE)
        # drop the six leading sample columns, keep genotype columns only
        exome_X <- exome %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()
        result_exome <- BRV(2 - exome_X, tg_y, tg_cov,"gaussian")
    } else {
        exome_X <- missing_X
        result_exome <- missing_result
    }

    for(rsq in c(3, 8)){
        hrc_fname <- sprintf("APOC3_hrc_rsq0%d_maf00%s_cadd_extracted.raw", rsq, maf)
        topmed_fname <- sprintf("APOC3_topmed_v3_rsq0%d_maf00%s_cadd_extracted.raw", rsq, maf)
        hrc_topmed_fname <- sprintf("APOC3_hrc_topmed_v3_rsq0%d_maf00%s_cadd_extracted.raw", rsq, maf)
        hrc_topmed_exome_fname <- sprintf("APOC3_hrc_topmed_v3_exome_rsq0%d_maf00%s_cadd_extracted.raw", rsq, maf)

        if(file.exists(hrc_fname)){
            hrc <- read.table(hrc_fname, sep = "\t", header = TRUE)
            # strip the "_<allele>" suffix from the genotype column names
            hrc_names <- colnames(hrc)[7: dim(hrc)[2]] %>% stringr::str_split(pattern = "_", simplify = TRUE)
            colnames(hrc)[7:dim(hrc)[2]] <- hrc_names[,1]

            # IID is stored as "<FID>_<IID>"; split it back into integers
            fid_iid <- hrc$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
            hrc$FID <- as.integer(fid_iid[,1])
            hrc$IID <- as.integer(fid_iid[,2])
            hrc_X <- hrc %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()

            result_hrc <- BRV(2 - hrc_X, tg_y, tg_cov,"gaussian")

        } else {
            hrc_X <- missing_X
            result_hrc <- missing_result
        }
        
        if(file.exists(topmed_fname)){
            topmed <- read.csv(topmed_fname, sep = "\t", header = TRUE)
            topmed_names <- colnames(topmed)[7: dim(topmed)[2]] %>% stringr::str_split(pattern = "_", simplify = TRUE)
            colnames(topmed)[7:dim(topmed)[2]] <- topmed_names[,1]

            fid_iid <- topmed$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
            topmed$FID <- as.integer(fid_iid[,1])
            topmed$IID <- as.integer(fid_iid[,2])

            topmed_X <- topmed %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()
            result_topmed <- BRV(2 - topmed_X, tg_y, tg_cov,"gaussian")
        } else {
            topmed_X <- missing_X
            result_topmed <- missing_result
        }

        # the merged tables are comma-separated and already carry integer IDs
        if(file.exists(hrc_topmed_fname)){
            hrc_topmed <- read.table(hrc_topmed_fname, sep = ",", header = TRUE)
            hrc_topmed_X <- hrc_topmed %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()
            result_hrc_topmed <- BRV(2 - hrc_topmed_X, tg_y, tg_cov,"gaussian")
        } else {
            hrc_topmed_X <- missing_X
            result_hrc_topmed <- missing_result
        }

        if(file.exists(hrc_topmed_exome_fname)){
            hrc_topmed_exome <- read.table(hrc_topmed_exome_fname, sep = ",", header = TRUE)
            hrc_topmed_exome_X <- hrc_topmed_exome %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()
            result_hrc_topmed_exome <- BRV(2 - hrc_topmed_exome_X, tg_y, tg_cov,"gaussian")
        } else {
            hrc_topmed_exome_X <- missing_X
            result_hrc_topmed_exome <- missing_result
        }

        data <- c("hrc", "topmed", "hrc_topmed", "hrc_topmed_exome")
        rsqs <- rep(rsq/10, 4)
        mafs <- rep(paste0("0.0", maf), 4)
        pvals <- c(result_hrc$pval, result_topmed$pval, result_hrc_topmed$pval, result_hrc_topmed_exome$pval)
        num_var <- c(ncol(hrc_X), ncol(topmed_X), ncol(hrc_topmed_X), ncol(hrc_topmed_exome_X))
        sub_df <- data.frame(data, rsqs, mafs, num_var, pvals)
        result_df <- rbind(result_df, sub_df)
    }
    # the exome result does not depend on rsq, so it is added once per MAF
    result_df <- rbind(data.frame(data = "exome", rsqs = "NA", mafs = paste0("0.0", maf), num_var = ncol(exome_X), pvals = result_exome$pval), result_df)
}

In [119]:
# Display the results sorted by MAF cutoff (largest first), then by data
# source in a fixed order.
arrange(
    mutate(result_df, data = factor(data, levels = c("exome", "hrc", "topmed", "hrc_topmed", "hrc_topmed_exome"))),
    desc(as.numeric(mafs)),
    data
)

data,rsqs,mafs,num_var,pvals
<fct>,<chr>,<chr>,<int>,<dbl>
exome,,0.01,26,1.0503770000000001e-262
hrc,0.3,0.01,3,3.376964e-182
hrc,0.8,0.01,2,9.967896e-170
topmed,0.3,0.01,10,4.038258e-197
topmed,0.8,0.01,4,1.761751e-183
hrc_topmed,0.3,0.01,10,4.5971540000000004e-194
hrc_topmed,0.8,0.01,4,6.4134e-183
hrc_topmed_exome,0.3,0.01,28,8.537625e-260
hrc_topmed_exome,0.8,0.01,26,1.0503770000000001e-262
exome,,0.005,26,1.0503770000000001e-262


#### APOC3: chr11:116830638

We specifically looked into this variant, for the same reason we did for the PCSK9 variant

In [279]:
# Find the APOC3 variants that are present at the MAF 0.01 cutoff but not at
# the MAF 0.001 cutoff, then show the imputation quality (R2) of the variant
# of interest in both panels.
maf001_annot <- read.csv("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed_exome/hrc_topmed_v3_exome_168206ids_chr11_rsq03_maf001_annot.csv.gz")
maf0001_annot <- read.csv("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed_exome/hrc_topmed_v3_exome_168206ids_chr11_rsq03_maf0001_annot.csv.gz")

annot_001 <- pull(filter(maf001_annot, Gene.refGene == "APOC3"), ID)
annot_0001 <- pull(filter(maf0001_annot, Gene.refGene == "APOC3"), ID)

# variants lost when tightening the MAF cutoff
setdiff(annot_001, annot_0001)

# first seven annotation columns plus the per-panel R2 for the variant
select(
    filter(maf001_annot, ID == 'chr11:116830638:G:A'),
    c(1:7, R2_hrc, R2_topmed)
)

Chr,Start,End,Ref,Alt,ID_hg38,ID_hg19,R2_hrc,R2_topmed
<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
11,116830638,116830638,G,A,chr11:116830638:G:A,chr11:116701354:G:A,0.836202,0.853641


In [258]:
# Single-variant test of chr11:116830638:G:A against TG in the exome data.
# The .raw file is tab-delimited (the burden loop reads this same file with
# sep = "\t"), so it must not be parsed with read.csv()'s default comma.
exome <- read.table("APOC3_exome_maf001_extracted.raw", sep = "\t", header = TRUE)

# Genotype headers look like <id>_<allele>, with ":" turned into "." on
# import; keep the part before "_" and restore the colons so the variant can
# be selected by its ID.
exome_names <- colnames(exome)[7: dim(exome)[2]] %>% stringr::str_split(pattern = "_", simplify = TRUE)
colnames(exome)[7:dim(exome)[2]] <- stringr::str_replace_all(exome_names[,1], "\\.", ":")

# drop the six leading sample columns, keep genotype columns only
exome_X <- exome %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()

result_exome <- BRV(2 - exome_X[,'chr11:116830638:G:A'], tg_y, tg_cov,"gaussian")
result_exome

In [271]:
# Single-variant test of the same APOC3 variant against TG in the HRC-imputed
# data, where it is identified by its hg19 ID (chr11:116701354:G:A).
# The .raw file is tab-delimited (the burden loop reads this same file with
# sep = "\t"), so it must not be parsed with read.csv()'s default comma.
hrc <- read.table("APOC3_hrc_rsq03_maf001_extracted.raw", sep = "\t", header = TRUE)

# Genotype headers look like <id>_<allele>, with ":" turned into "." on
# import; keep the part before "_" and restore the colons so the variant can
# be selected by its ID.
hrc_names <- colnames(hrc)[7: dim(hrc)[2]] %>% stringr::str_split(pattern = "_", simplify = TRUE)
colnames(hrc)[7:dim(hrc)[2]] <- stringr::str_replace_all(hrc_names[,1], "\\.", ":")

# IID is stored as "<FID>_<IID>"; split it back into integers
fid_iid <- hrc$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
hrc$FID <- as.integer(fid_iid[,1])
hrc$IID <- as.integer(fid_iid[,2])

hrc_X <- hrc %>% arrange(FID, IID) %>% filter(FID %in% tg_id) %>% select(-c(1:6)) %>% as.matrix()

result_hrc <- BRV(2 - hrc_X[,'chr11:116701354:G:A'], tg_y, tg_cov,"gaussian")
result_hrc

In [274]:
# Single-variant BRV test of chr11:116830638:G:A using the TOPMed-imputed
# genotypes.
#
# check.names = FALSE keeps the variant-ID headers verbatim. With the default
# (TRUE) every ":" in a header is rewritten to ".", so the lookup by
# "chr11:116830638:G:A" further down could never match a column.
# NOTE(review): PLINK .raw exports are usually whitespace-delimited -- confirm
# this file really is comma-separated.
topmed <- read.csv("APOC3_topmed_v3_rsq03_maf001_extracted.raw", check.names = FALSE)

# Columns 1-6 are per-sample fields; the rest are genotype columns. Keep only
# the part of each genotype header before the first "_" (presumably this drops
# a counted-allele suffix -- verify against the file header).
topmed_names <- colnames(topmed)[-(1:6)] %>%
    stringr::str_split(pattern = "_", simplify = TRUE)
colnames(topmed)[-(1:6)] <- topmed_names[, 1]

# IID is split on "_" and its two pieces become integer FID and IID.
fid_iid <- topmed$IID %>% stringr::str_split(pattern = "_", simplify = TRUE)
topmed$FID <- as.integer(fid_iid[, 1])
topmed$IID <- as.integer(fid_iid[, 2])

# Order samples by FID/IID and keep those in tg_id.
# Assumes tg_y / tg_cov are in the same sample order -- TODO confirm.
topmed_X <- topmed %>%
    arrange(FID, IID) %>%
    filter(FID %in% tg_id) %>%
    select(-c(1:6)) %>%
    as.matrix()

# `2 - x` presumably flips which allele is counted (assumes 0-2 coding).
result_topmed <- BRV(2 - topmed_X[, "chr11:116830638:G:A"], tg_y, tg_cov, "gaussian")
result_topmed

In [5]:
# Load the HRC + TOPMed chr11 annotation (rsq03 / maf001 file) and keep only
# the rows whose Gene.refGene is APOC3.
annot_001 <- filter(
    read.csv("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/hrc_topmed_v3_168206ids_chr11_rsq03_maf001_annot.csv.gz"),
    Gene.refGene == "APOC3"
)

In [6]:
# Display annot_001 as this cell's output.
annot_001

Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,ExonicFunc.refGene,MAF_nfe_exome,REVEL_score,ID_hg19,ID,Function,R2_hrc,R2_topmed,R2,source,RawScore,PHRED,ID_hg38
<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>,<chr>
11,116830637,116830637,C,T,exonic;splicing,APOC3,stopgain,0.0006,,chr11:116701353:C:T,chr11:116830637:C:T,LoF,0.542862,0.458809,0.542862,hrc,5.613917,34.0,chr11:116830637:C:T
11,116830638,116830638,G,A,splicing,APOC3,.,0.0023,,chr11:116701354:G:A,chr11:116830638:G:A,splicing,0.836202,0.853641,0.853641,topmed,4.947405,33.0,chr11:116830638:G:A
11,116830844,116830844,G,A,exonic,APOC3,nonsynonymous SNV,0.0002,0.653,chr11:116701560:G:A,chr11:116830844:G:A,missense,0.973659,0.963205,0.973659,hrc,2.818346,23.2,chr11:116830844:G:A
11,116830568,116830568,A,G,splicing,APOC3,.,6.169e-05,0.277,,chr11:116830568:A:G,splicing,0.0,0.383569,0.383569,topmed,4.960331,33.0,chr11:116830568:A:G
11,116830592,116830592,C,T,exonic,APOC3,nonsynonymous SNV,8.807e-06,0.38,,chr11:116830592:C:T,missense,0.0,0.591272,0.591272,topmed,2.265048,21.3,chr11:116830592:C:T
11,116830593,116830593,G,A,exonic,APOC3,nonsynonymous SNV,1.761e-05,0.342,,chr11:116830593:G:A,missense,0.0,0.881843,0.881843,topmed,2.287065,21.5,chr11:116830593:G:A
11,116830620,116830620,C,T,exonic,APOC3,nonsynonymous SNV,3.525e-05,0.062,,chr11:116830620:C:T,missense,0.0,0.656993,0.656993,topmed,0.402428,5.454,chr11:116830620:C:T
11,116830622,116830622,C,T,exonic,APOC3,nonsynonymous SNV,0.0,0.189,,chr11:116830622:C:T,missense,0.0,0.597868,0.597868,topmed,2.078035,19.85,chr11:116830622:C:T
11,116830808,116830808,T,G,exonic,APOC3,nonsynonymous SNV,,0.157,,chr11:116830808:T:G,missense,0.0,0.570624,0.570624,topmed,0.188174,2.99,chr11:116830808:T:G
11,116830823,116830823,A,G,exonic,APOC3,nonsynonymous SNV,8.86e-06,0.26,,chr11:116830823:A:G,missense,0.0,0.749498,0.749498,topmed,-1.343152,0.001,chr11:116830823:A:G
