# PCSK9 vs. APOC3

This notebook records the [BRV](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4459641/) analysis results for PCSK9 vs. LDL and APOC3 vs. TG.

## Extract PCSK9 + APOC3

In [2]:
# Packages for the extraction cells below: dplyr verbs (filter/select) and
# data.table I/O (fread/fwrite).
library(dplyr)
library(data.table)
# Every relative path used by the extraction cells ("./exome", "./hrc",
# "./topmed", "./pcsk9", "./apoc3") is resolved against this directory.
setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




### Exome

In [5]:
# Subset the exome LOF/missense annotation tables to the two target genes.
# For every MAF cutoff this writes, for both the plain and the CADD-filtered
# input, the gene-level annotation table and a headerless list of its
# ID_hg38 values (one variant ID per line).
for (chr in c(1, 11)) {
    # chromosome 1 is paired with PCSK9; the other chromosome (11) with APOC3
    if (chr == 1) {
        gene <- "PCSK9"
    } else {
        gene <- "APOC3"
    }
    gene_dir <- tolower(gene)

    for (maf in c(0.01, 0.005, 0.001)) {
        # file-name tag: the MAF with its decimal point dropped (0.01 -> "001")
        maf_c <- gsub(".", "", as.character(maf), fixed = TRUE)

        # plain LOF + missense annotation
        annot_src <- sprintf("./exome/ukb23156_c%i.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense.csv.gz", chr, maf_c)
        annot <- filter(fread(annot_src), Gene.refGene == gene)
        fwrite(annot, sprintf("./%s/%s_exome_maf%s_annot.csv.gz", gene_dir, gene, maf_c))
        fwrite(select(annot, ID_hg38),
               sprintf("./%s/%s_exome_maf%s_extractlist_snplist", gene_dir, gene, maf_c),
               col.names = FALSE)

        # CADD-filtered annotation
        cadd_src <- sprintf("./exome/ukb23156_c%i.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense_cadd.csv.gz", chr, maf_c)
        annot_cadd <- filter(fread(cadd_src), Gene.refGene == gene)
        fwrite(annot_cadd, sprintf("./%s/%s_exome_maf%s_cadd_annot.csv.gz", gene_dir, gene, maf_c))
        fwrite(select(annot_cadd, ID_hg38),
               sprintf("./%s/%s_exome_maf%s_cadd_extractlist_snplist", gene_dir, gene, maf_c),
               col.names = FALSE)
    }
}

### HRC

In [6]:
# For each target gene, rsq tag (3, 8) and MAF cutoff, keep only that gene's
# rows of the HRC LOF/missense annotation, then save the subset together with
# a headerless list of its ID_hg19 values.
for (chr in c(1, 11)) {
    # chromosome 1 is paired with PCSK9; the other chromosome (11) with APOC3
    gene <- if (chr == 1) "PCSK9" else "APOC3"
    out_dir <- tolower(gene)

    for (rsq in c(3, 8)) {
        for (maf in c(0.01, 0.005, 0.001)) {
            # MAF tag used in file names: decimal point removed (0.005 -> "0005")
            maf_c <- gsub(".", "", as.character(maf), fixed = TRUE)

            src <- sprintf("./hrc/hrc_chr%i_rsq0%i_hg19_hg38_maf%s_LOF_missense_annot.csv.gz", chr, rsq, maf_c)
            annot <- filter(fread(src), Gene.refGene == gene)

            fwrite(annot, sprintf("./%s/%s_hrc_rsq0%i_maf%s_annot.csv.gz", out_dir, gene, rsq, maf_c))
            fwrite(select(annot, ID_hg19),
                   sprintf("./%s/%s_hrc_rsq0%i_maf%s_extractlist_snplist", out_dir, gene, rsq, maf_c),
                   col.names = FALSE)
        }
    }
}

In [7]:
# CADD-filtered counterpart of the HRC extraction: per gene, rsq tag and MAF
# cutoff, write the gene's rows of the "_cadd" annotation and a headerless
# list of their ID_hg19 values.
for (chr in c(1, 11)) {
    # chromosome 1 is paired with PCSK9; the other chromosome (11) with APOC3
    gene <- if (chr == 1) "PCSK9" else "APOC3"
    out_dir <- tolower(gene)

    for (rsq in c(3, 8)) {
        for (maf in c(0.01, 0.005, 0.001)) {
            # MAF tag used in file names: decimal point removed (0.001 -> "0001")
            maf_c <- gsub(".", "", as.character(maf), fixed = TRUE)

            src <- sprintf("./hrc/hrc_chr%i_rsq0%i_hg19_hg38_maf%s_LOF_missense_cadd_annot.csv.gz", chr, rsq, maf_c)
            annot_cadd <- filter(fread(src), Gene.refGene == gene)

            fwrite(annot_cadd, sprintf("./%s/%s_hrc_rsq0%i_maf%s_cadd_annot.csv.gz", out_dir, gene, rsq, maf_c))
            fwrite(select(annot_cadd, ID_hg19),
                   sprintf("./%s/%s_hrc_rsq0%i_maf%s_cadd_extractlist_snplist", out_dir, gene, rsq, maf_c),
                   col.names = FALSE)
        }
    }
}

### TOPMed

In [8]:
# For each target gene, rsq tag (3, 8) and MAF cutoff, keep only that gene's
# rows of the TOPMed LOF/missense annotation, then save the subset together
# with a headerless list of its ID_hg38 values.
for (chr in c(1, 11)) {
    # chromosome 1 is paired with PCSK9; the other chromosome (11) with APOC3
    gene <- if (chr == 1) "PCSK9" else "APOC3"
    out_dir <- tolower(gene)

    for (rsq in c(3, 8)) {
        for (maf in c(0.01, 0.005, 0.001)) {
            # MAF tag used in file names: decimal point removed (0.01 -> "001")
            maf_c <- gsub(".", "", as.character(maf), fixed = TRUE)

            src <- sprintf("./topmed/topmed_chr%i_rsq0%i_hg19_hg38_maf%s_LOF_missense_annot.csv.gz", chr, rsq, maf_c)
            annot <- filter(fread(src), Gene.refGene == gene)

            fwrite(annot, sprintf("./%s/%s_topmed_rsq0%i_maf%s_annot.csv.gz", out_dir, gene, rsq, maf_c))
            fwrite(select(annot, ID_hg38),
                   sprintf("./%s/%s_topmed_rsq0%i_maf%s_extractlist_snplist", out_dir, gene, rsq, maf_c),
                   col.names = FALSE)
        }
    }
}

In [9]:
# CADD-filtered counterpart of the TOPMed extraction: per gene, rsq tag and
# MAF cutoff, write the gene's rows of the "_cadd" annotation and a headerless
# list of their ID_hg38 values.
for (chr in c(1, 11)) {
    # chromosome 1 is paired with PCSK9; the other chromosome (11) with APOC3
    gene <- if (chr == 1) "PCSK9" else "APOC3"
    out_dir <- tolower(gene)

    for (rsq in c(3, 8)) {
        for (maf in c(0.01, 0.005, 0.001)) {
            # MAF tag used in file names: decimal point removed (0.005 -> "0005")
            maf_c <- gsub(".", "", as.character(maf), fixed = TRUE)

            src <- sprintf("./topmed/topmed_chr%i_rsq0%i_hg19_hg38_maf%s_LOF_missense_cadd_annot.csv.gz", chr, rsq, maf_c)
            annot_cadd <- filter(fread(src), Gene.refGene == gene)

            fwrite(annot_cadd, sprintf("./%s/%s_topmed_rsq0%i_maf%s_cadd_annot.csv.gz", out_dir, gene, rsq, maf_c))
            fwrite(select(annot_cadd, ID_hg38),
                   sprintf("./%s/%s_topmed_rsq0%i_maf%s_cadd_extractlist_snplist", out_dir, gene, rsq, maf_c),
                   col.names = FALSE)
        }
    }
}

### Extract

In [3]:
# Pull the PCSK9 variants (plain snplists) out of the chromosome 1 exome, HRC
# and TOPMed genotype sets, once per MAF tag and, for the two imputed panels,
# once per rsq tag. Imputed subsets are also exported as A-transpose tables.
cd ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/pcsk9

module load Plink/2.00a

for i in 1 05 01; do
    maf="maf00${i}"

    plink2 \
        --bfile "../exome/ukb23156_c1_${maf}_LOF_missense_extracted" \
        --extract "PCSK9_exome_${maf}_extractlist_snplist" \
        --make-bed \
        --out "PCSK9_exome_${maf}_extracted"

    for j in 3 8; do
        tag="rsq0${j}_${maf}"
        # identical command for both imputed panels: HRC first, then TOPMed
        for panel in hrc topmed; do
            plink2 \
                --bpfile "../${panel}/${panel}_chr1_${tag}_LOF_missense_extracted" \
                --extract "PCSK9_${panel}_${tag}_extractlist_snplist" \
                --make-bpgen --sort-vars \
                --export A-transpose \
                --out "PCSK9_${panel}_${tag}_extracted"
        done
    done
done

PLINK v2.00a4LM 64-bit Intel (11 Apr 2023)     www.cog-genomics.org/plink/2.0/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to PCSK9_exome_maf001_extracted.log.
Options in effect:
  --bfile ../exome/ukb23156_c1_maf001_LOF_missense_extracted
  --extract PCSK9_exome_maf001_extractlist_snplist
  --make-bed
  --out PCSK9_exome_maf001_extracted

Start time: Tue Mar  5 11:50:54 2024
257483 MiB RAM detected, ~148510 available; reserving 128741 MiB for main
workspace.
Allocated 40734 MiB successfully, after larger attempt(s) failed.
Using up to 64 threads (change this with --threads).
168206 samples (0 females, 0 males, 168206 ambiguous; 168206 founders) loaded
from ../exome/ukb23156_c1_maf001_LOF_missense_extracted.fam.
478357 variants loaded from
../exome/ukb23156_c1_maf001_LOF_missense_extracted.bim.
Note: No phenotype data present.
--extract: 394 variants remaining.
394 variants remaining after main filters.
Writing PCSK9_exome_maf001_extracted.fam

In [4]:
# Same extraction as the plain PCSK9 run, but driven by the "_cadd" snplists
# and writing "_cadd_extracted" outputs. Runs in the current directory.
for i in 1 05 01; do
    maf="maf00${i}"

    plink2 \
        --bfile "../exome/ukb23156_c1_${maf}_LOF_missense_extracted" \
        --extract "PCSK9_exome_${maf}_cadd_extractlist_snplist" \
        --make-bed \
        --out "PCSK9_exome_${maf}_cadd_extracted"

    for j in 3 8; do
        tag="rsq0${j}_${maf}"
        # identical command for both imputed panels: HRC first, then TOPMed
        for panel in hrc topmed; do
            plink2 \
                --bpfile "../${panel}/${panel}_chr1_${tag}_LOF_missense_extracted" \
                --extract "PCSK9_${panel}_${tag}_cadd_extractlist_snplist" \
                --make-bpgen --sort-vars \
                --export A-transpose \
                --out "PCSK9_${panel}_${tag}_cadd_extracted"
        done
    done
done

PLINK v2.00a4LM 64-bit Intel (11 Apr 2023)     www.cog-genomics.org/plink/2.0/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to PCSK9_exome_maf001_cadd_extracted.log.
Options in effect:
  --bfile ../exome/ukb23156_c1_maf001_LOF_missense_extracted
  --extract PCSK9_exome_maf001_cadd_extractlist_snplist
  --make-bed
  --out PCSK9_exome_maf001_cadd_extracted

Start time: Tue Mar  5 11:51:22 2024
257483 MiB RAM detected, ~169645 available; reserving 128741 MiB for main
workspace.
Allocated 40734 MiB successfully, after larger attempt(s) failed.
Using up to 64 threads (change this with --threads).
168206 samples (0 females, 0 males, 168206 ambiguous; 168206 founders) loaded
from ../exome/ukb23156_c1_maf001_LOF_missense_extracted.fam.
478357 variants loaded from
../exome/ukb23156_c1_maf001_LOF_missense_extracted.bim.
Note: No phenotype data present.
--extract: 200 variants remaining.
200 variants remaining after main filters.
Writing PCSK9_exome_maf00

In [5]:
# Pull the APOC3 variants (plain snplists) out of the chromosome 11 exome, HRC
# and TOPMed genotype sets, once per MAF tag and, for the two imputed panels,
# once per rsq tag. Imputed subsets are also exported as A-transpose tables.
cd ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/apoc3

for i in 1 05 01; do
    maf="maf00${i}"

    plink2 \
        --bfile "../exome/ukb23156_c11_${maf}_LOF_missense_extracted" \
        --extract "APOC3_exome_${maf}_extractlist_snplist" \
        --make-bed \
        --out "APOC3_exome_${maf}_extracted"

    for j in 3 8; do
        tag="rsq0${j}_${maf}"
        # identical command for both imputed panels: HRC first, then TOPMed
        for panel in hrc topmed; do
            plink2 \
                --bpfile "../${panel}/${panel}_chr11_${tag}_LOF_missense_extracted" \
                --extract "APOC3_${panel}_${tag}_extractlist_snplist" \
                --make-bpgen --sort-vars \
                --export A-transpose \
                --out "APOC3_${panel}_${tag}_extracted"
        done
    done
done

PLINK v2.00a4LM 64-bit Intel (11 Apr 2023)     www.cog-genomics.org/plink/2.0/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to APOC3_exome_maf001_extracted.log.
Options in effect:
  --bfile ../exome/ukb23156_c11_maf001_LOF_missense_extracted
  --extract APOC3_exome_maf001_extractlist_snplist
  --make-bed
  --out APOC3_exome_maf001_extracted

Start time: Tue Mar  5 11:51:29 2024
257483 MiB RAM detected, ~148366 available; reserving 128741 MiB for main
workspace.
Allocated 40734 MiB successfully, after larger attempt(s) failed.
Using up to 64 threads (change this with --threads).
168206 samples (0 females, 0 males, 168206 ambiguous; 168206 founders) loaded
from ../exome/ukb23156_c11_maf001_LOF_missense_extracted.fam.
293784 variants loaded from
../exome/ukb23156_c11_maf001_LOF_missense_extracted.bim.
Note: No phenotype data present.
--extract: 60 variants remaining.
60 variants remaining after main filters.
Writing APOC3_exome_maf001_extracted.fa

In [6]:
# Same extraction as the plain APOC3 run, but driven by the "_cadd" snplists
# and writing "_cadd_extracted" outputs. Runs in the current directory.
for i in 1 05 01; do
    maf="maf00${i}"

    plink2 \
        --bfile "../exome/ukb23156_c11_${maf}_LOF_missense_extracted" \
        --extract "APOC3_exome_${maf}_cadd_extractlist_snplist" \
        --make-bed \
        --out "APOC3_exome_${maf}_cadd_extracted"

    for j in 3 8; do
        tag="rsq0${j}_${maf}"
        # identical command for both imputed panels: HRC first, then TOPMed
        for panel in hrc topmed; do
            plink2 \
                --bpfile "../${panel}/${panel}_chr11_${tag}_LOF_missense_extracted" \
                --extract "APOC3_${panel}_${tag}_cadd_extractlist_snplist" \
                --make-bpgen --sort-vars \
                --export A-transpose \
                --out "APOC3_${panel}_${tag}_cadd_extracted"
        done
    done
done

PLINK v2.00a4LM 64-bit Intel (11 Apr 2023)     www.cog-genomics.org/plink/2.0/
(C) 2005-2023 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to APOC3_exome_maf001_cadd_extracted.log.
Options in effect:
  --bfile ../exome/ukb23156_c11_maf001_LOF_missense_extracted
  --extract APOC3_exome_maf001_cadd_extractlist_snplist
  --make-bed
  --out APOC3_exome_maf001_cadd_extracted

Start time: Tue Mar  5 11:51:33 2024
257483 MiB RAM detected, ~138087 available; reserving 128741 MiB for main
workspace.
Allocated 40734 MiB successfully, after larger attempt(s) failed.
Using up to 64 threads (change this with --threads).
168206 samples (0 females, 0 males, 168206 ambiguous; 168206 founders) loaded
from ../exome/ukb23156_c11_maf001_LOF_missense_extracted.fam.
293784 variants loaded from
../exome/ukb23156_c11_maf001_LOF_missense_extracted.bim.
Note: No phenotype data present.
--extract: 26 variants remaining.
26 variants remaining after main filters.
Writing APOC3_exome_maf0

## Make merged dataset

### PCSK9 - exome + topmed + hrc

In [7]:
# The PCSK9 merge cells below read and write their files relative to this
# directory (the plink2 outputs named PCSK9_*_extracted.*).
setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/pcsk9")

# dplyr / tidyverse: data-frame verbs, remove_rownames(), stringr helpers.
# genio: read_bim() / read_fam() / read_bed() for the plink binary filesets.
# data.table: fread() / fwrite().
library(dplyr)
library(genio)
library(data.table)
library(tidyverse)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mggplot2  [39m 3.4.3     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mpurrr    [39m 1.0.1     [32m✔[39m [34mtidyr    [39m 1.3.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdata.table[39m::[32mbetween()[39m masks [34mdplyr[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m       masks [34mstats[39m::filter()
[31m✖[

In [8]:
# Build, for every (rsq tag, MAF tag, CADD on/off) combination, one PCSK9
# genotype table that stacks the exome variants with the HRC and TOPMed
# variants listed for PCSK9 in the combined annotation file, write it to
# PCSK9_hrc_topmed_exome_*_extracted.traw, and print the row counts.
for (rsq in c(3, 8)) {
    for (maf in c("1", "05", "01")) {
        for (cadd in c("", "_cadd")) {
            print(sprintf("==========RSQ 0.%d; MAF 00%s; CADD filtering:%s", rsq, maf, cadd))

            # Start every iteration from a clean state. Presence of an imputed
            # table is tracked with NULL instead of exists(), which would also
            # find objects left behind by earlier iterations or other cells
            # and silently merge stale data when a .traw file is missing.
            hrc_gene <- NULL
            topmed_gene <- NULL

            ## exome matrix
            fname <- sprintf("PCSK9_exome_maf00%s%s_extracted", maf, cadd)
            bim <- read_bim(paste0(fname, ".bim"))
            fam <- read_fam(paste0(fname, ".fam"))
            bed <- read_bed(paste0(fname, ".bed"), bim$id, fam$id)
            # Sort columns by name so they line up with the sorted imputed
            # tables; drop = FALSE keeps a matrix even for a single variant.
            bed <- bed[, order(colnames(bed)), drop = FALSE]

            annot_fname <- sprintf("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed_exome/hrc_topmed_exome_168206ids_rsq0%d_maf00%s%s_annot.csv.gz",
                                   rsq, maf, cadd)
            annot_gene <- fread(annot_fname) %>% filter(Gene.refGene == "PCSK9")

            # Variant IDs to keep from each imputed panel (HRC rows are keyed
            # by ID_hg19, TOPMed rows by ID).
            hrc_snplist <- annot_gene %>% filter(source == "hrc") %>% pull(ID_hg19)
            topmed_snplist <- annot_gene %>% filter(source == "topmed") %>% pull(ID)

            hrc_fname <- sprintf("PCSK9_hrc_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd)
            if (file.exists(hrc_fname)) {
                hrc <- fread(hrc_fname)
                # Columns 7 onward are renamed to the 2nd "_"-separated piece
                # of their original header.
                hrc_names <- stringr::str_split(colnames(hrc)[7:ncol(hrc)], pattern = "_", simplify = TRUE)
                colnames(hrc)[7:ncol(hrc)] <- hrc_names[, 2]

                hrc_gene <- hrc %>%
                    filter(SNP %in% hrc_snplist) %>%
                    remove_rownames() %>%
                    select(-c("CHR", "(C)M", "POS", "COUNTED", "ALT", "SNP"))
                # Recode every value v as 2 - v with columns sorted by name
                # (presumably switches which allele is counted; confirm
                # against the plink2 export settings).
                hrc_gene <- 2 - select(hrc_gene, sort(colnames(hrc_gene)))
            }

            topmed_fname <- sprintf("PCSK9_topmed_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd)
            if (file.exists(topmed_fname)) {
                topmed <- fread(topmed_fname)
                # Same renaming as for HRC, but the 3rd "_"-separated piece
                # of each header is kept.
                topmed_names <- stringr::str_split(colnames(topmed)[7:ncol(topmed)], pattern = "_", simplify = TRUE)
                colnames(topmed)[7:ncol(topmed)] <- topmed_names[, 3]

                topmed_gene <- topmed %>%
                    filter(SNP %in% topmed_snplist) %>%
                    remove_rownames() %>%
                    select(-c("CHR", "(C)M", "POS", "COUNTED", "ALT", "SNP"))
                topmed_gene <- 2 - select(topmed_gene, sort(colnames(topmed_gene)))
            }

            # Stack whichever imputed tables were actually loaded under the
            # exome rows; with neither, the exome matrix is written as is.
            if (!is.null(hrc_gene) && !is.null(topmed_gene)) {
                bed <- rbind(bed, hrc_gene, topmed_gene)
            } else if (!is.null(hrc_gene)) {
                bed <- rbind(bed, hrc_gene)
            } else if (!is.null(topmed_gene)) {
                bed <- rbind(bed, topmed_gene)
            }
            fwrite(bed, sprintf("PCSK9_hrc_topmed_exome_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd))

            print(sprintf("HRC_snplist: %d; TOPMed_snplist: %d; Exome: %d; Annot_gene: %d; Bed: %d",
                          length(hrc_snplist), length(topmed_snplist), nrow(bim), nrow(annot_gene), nrow(bed)))
            rm(bed, hrc_gene, topmed_gene)
        }
    }
}



Reading: PCSK9_exome_maf001_extracted.bim

Reading: PCSK9_exome_maf001_extracted.fam

Reading: PCSK9_exome_maf001_extracted.bed



[1] "HRC_snplist: 2; TOPMed_snplist: 7; Exome: 394; Annot_gene: 403; Bed: 403"


Reading: PCSK9_exome_maf001_cadd_extracted.bim

Reading: PCSK9_exome_maf001_cadd_extracted.fam

Reading: PCSK9_exome_maf001_cadd_extracted.bed



[1] "HRC_snplist: 1; TOPMed_snplist: 1; Exome: 200; Annot_gene: 202; Bed: 202"


Reading: PCSK9_exome_maf0005_extracted.bim

Reading: PCSK9_exome_maf0005_extracted.fam

Reading: PCSK9_exome_maf0005_extracted.bed



[1] "HRC_snplist: 2; TOPMed_snplist: 7; Exome: 394; Annot_gene: 403; Bed: 403"


Reading: PCSK9_exome_maf0005_cadd_extracted.bim

Reading: PCSK9_exome_maf0005_cadd_extracted.fam

Reading: PCSK9_exome_maf0005_cadd_extracted.bed



[1] "HRC_snplist: 1; TOPMed_snplist: 1; Exome: 200; Annot_gene: 202; Bed: 202"


Reading: PCSK9_exome_maf0001_extracted.bim

Reading: PCSK9_exome_maf0001_extracted.fam

Reading: PCSK9_exome_maf0001_extracted.bed



[1] "HRC_snplist: 2; TOPMed_snplist: 7; Exome: 393; Annot_gene: 402; Bed: 402"


Reading: PCSK9_exome_maf0001_cadd_extracted.bim

Reading: PCSK9_exome_maf0001_cadd_extracted.fam

Reading: PCSK9_exome_maf0001_cadd_extracted.bed



[1] "HRC_snplist: 1; TOPMed_snplist: 1; Exome: 199; Annot_gene: 201; Bed: 201"


Reading: PCSK9_exome_maf001_extracted.bim

Reading: PCSK9_exome_maf001_extracted.fam

Reading: PCSK9_exome_maf001_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 2; Exome: 394; Annot_gene: 396; Bed: 396"


Reading: PCSK9_exome_maf001_cadd_extracted.bim

Reading: PCSK9_exome_maf001_cadd_extracted.fam

Reading: PCSK9_exome_maf001_cadd_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 1; Exome: 200; Annot_gene: 201; Bed: 201"


Reading: PCSK9_exome_maf0005_extracted.bim

Reading: PCSK9_exome_maf0005_extracted.fam

Reading: PCSK9_exome_maf0005_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 2; Exome: 394; Annot_gene: 396; Bed: 396"


Reading: PCSK9_exome_maf0005_cadd_extracted.bim

Reading: PCSK9_exome_maf0005_cadd_extracted.fam

Reading: PCSK9_exome_maf0005_cadd_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 1; Exome: 200; Annot_gene: 201; Bed: 201"


Reading: PCSK9_exome_maf0001_extracted.bim

Reading: PCSK9_exome_maf0001_extracted.fam

Reading: PCSK9_exome_maf0001_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 2; Exome: 393; Annot_gene: 395; Bed: 395"


Reading: PCSK9_exome_maf0001_cadd_extracted.bim

Reading: PCSK9_exome_maf0001_cadd_extracted.fam

Reading: PCSK9_exome_maf0001_cadd_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 1; Exome: 199; Annot_gene: 200; Bed: 200"


### PCSK9 - topmed + hrc

In [10]:
# For every (rsq tag, MAF tag, CADD on/off) combination, stack the PCSK9
# variants of the HRC and TOPMed .traw exports that are listed in the
# HRC+TOPMed annotation file, and print the row counts as a sanity check.
# Both .traw files are required here: fread() stops the cell if one is missing.
for (rsq in c(3, 8)) {
    for (maf in c("1", "05", "01")) {
        for (cadd in c("", "_cadd")) {
            print(sprintf("==========RSQ 0.%d; MAF 00%s; CADD filtering:%s", rsq, maf, cadd))
            annot_fname <- sprintf("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/hrc_topmed_168206ids_rsq0%d_maf00%s%s_annot.csv.gz",
                                   rsq, maf, cadd)
            annot_gene <- fread(annot_fname) %>% filter(Gene.refGene == "PCSK9")

            # Variant IDs to keep from each panel (HRC rows are keyed by
            # ID_hg19, TOPMed rows by ID).
            hrc_snplist <- annot_gene %>% filter(source == "hrc") %>% pull(ID_hg19)
            topmed_snplist <- annot_gene %>% filter(source == "topmed") %>% pull(ID)

            hrc_fname <- sprintf("PCSK9_hrc_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd)
            hrc <- fread(hrc_fname)
            # Columns 7 onward are renamed to the 2nd "_"-separated piece of
            # their original header.
            hrc_names <- stringr::str_split(colnames(hrc)[7:ncol(hrc)], pattern = "_", simplify = TRUE)
            colnames(hrc)[7:ncol(hrc)] <- hrc_names[, 2]

            hrc_gene <- hrc %>%
                filter(SNP %in% hrc_snplist) %>%
                remove_rownames() %>%
                select(-c("CHR", "(C)M", "POS", "COUNTED", "ALT", "SNP"))
            # Recode every value v as 2 - v with columns sorted by name
            # (presumably switches which allele is counted; confirm against
            # the plink2 export settings).
            hrc_gene <- 2 - select(hrc_gene, sort(colnames(hrc_gene)))

            topmed_fname <- sprintf("PCSK9_topmed_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd)
            topmed <- fread(topmed_fname)
            # Same renaming, but the 3rd "_"-separated piece is kept.
            topmed_names <- stringr::str_split(colnames(topmed)[7:ncol(topmed)], pattern = "_", simplify = TRUE)
            colnames(topmed)[7:ncol(topmed)] <- topmed_names[, 3]

            topmed_gene <- topmed %>%
                filter(SNP %in% topmed_snplist) %>%
                remove_rownames() %>%
                select(-c("CHR", "(C)M", "POS", "COUNTED", "ALT", "SNP"))
            topmed_gene <- 2 - select(topmed_gene, sort(colnames(topmed_gene)))

            # Both tables are always built above, so they are stacked
            # unconditionally (the former exists() fallbacks were unreachable).
            bed <- rbind(hrc_gene, topmed_gene)
            # fwrite(bed, sprintf("PCSK9_hrc_topmed_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd))

            print(sprintf("HRC_snplist: %d; TOPMed_snplist: %d; Annot_gene: %d; Bed: %d",
                          length(hrc_snplist), length(topmed_snplist), nrow(annot_gene), nrow(bed)))

            # Remove the per-iteration tables so they cannot be picked up by
            # exists()-based checks in other cells of this notebook.
            rm(bed, hrc_gene, topmed_gene)
        }
    }
}

[1] "HRC_snplist: 6; TOPMed_snplist: 104; Annot_gene: 110; Bed: 110"
[1] "HRC_snplist: 2; TOPMed_snplist: 60; Annot_gene: 62; Bed: 62"
[1] "HRC_snplist: 6; TOPMed_snplist: 104; Annot_gene: 110; Bed: 110"
[1] "HRC_snplist: 2; TOPMed_snplist: 60; Annot_gene: 62; Bed: 62"
[1] "HRC_snplist: 6; TOPMed_snplist: 103; Annot_gene: 109; Bed: 109"
[1] "HRC_snplist: 2; TOPMed_snplist: 59; Annot_gene: 61; Bed: 61"
[1] "HRC_snplist: 3; TOPMed_snplist: 45; Annot_gene: 48; Bed: 48"
[1] "HRC_snplist: 1; TOPMed_snplist: 25; Annot_gene: 26; Bed: 26"
[1] "HRC_snplist: 3; TOPMed_snplist: 45; Annot_gene: 48; Bed: 48"
[1] "HRC_snplist: 1; TOPMed_snplist: 25; Annot_gene: 26; Bed: 26"
[1] "HRC_snplist: 3; TOPMed_snplist: 44; Annot_gene: 47; Bed: 47"
[1] "HRC_snplist: 1; TOPMed_snplist: 24; Annot_gene: 25; Bed: 25"


### APOC3 - exome + topmed + hrc

In [11]:
# The APOC3 merge cells below read and write their files relative to this
# directory (the plink2 outputs named APOC3_*_extracted.*).
setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/apoc3")

# dplyr / tidyverse: data-frame verbs, remove_rownames(), stringr helpers.
# genio: read_bim() / read_fam() / read_bed() for the plink binary filesets.
# data.table: fread() / fwrite().
library(dplyr)
library(genio)
library(data.table)
library(tidyverse)

In [12]:
# Build, for every (rsq tag, MAF tag, CADD on/off) combination, one APOC3
# genotype table that stacks the exome variants with the HRC and TOPMed
# variants listed for APOC3 in the combined chr11 annotation file, and print
# the row counts. Writing the merged table is currently disabled (see the
# commented-out fwrite below).
for (rsq in c(3, 8)) {
    for (maf in c("1", "05", "01")) {
        for (cadd in c("", "_cadd")) {
            print(sprintf("==========RSQ 0.%d; MAF 00%s; CADD filtering:%s", rsq, maf, cadd))

            # Start every iteration from a clean state. Presence of an imputed
            # table is tracked with NULL instead of exists(), which would also
            # find objects left behind by earlier iterations or other cells
            # and silently merge stale data when a .traw file is missing.
            hrc_gene <- NULL
            topmed_gene <- NULL

            ## exome matrix
            fname <- sprintf("APOC3_exome_maf00%s%s_extracted", maf, cadd)
            bim <- read_bim(paste0(fname, ".bim"))
            fam <- read_fam(paste0(fname, ".fam"))
            bed <- read_bed(paste0(fname, ".bed"), bim$id, fam$id)
            # Sort columns by name so they line up with the sorted imputed
            # tables; drop = FALSE keeps a matrix even for a single variant.
            bed <- bed[, order(colnames(bed)), drop = FALSE]

            annot_fname <- sprintf("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed_exome/hrc_topmed_exome_168206ids_chr11_rsq0%d_maf00%s%s_annot.csv.gz",
                                   rsq, maf, cadd)
            annot_gene <- fread(annot_fname) %>% filter(Gene.refGene == "APOC3")

            # Variant IDs to keep from each imputed panel (HRC rows are keyed
            # by ID_hg19, TOPMed rows by ID).
            hrc_snplist <- annot_gene %>% filter(source == "hrc") %>% pull(ID_hg19)
            topmed_snplist <- annot_gene %>% filter(source == "topmed") %>% pull(ID)

            hrc_fname <- sprintf("APOC3_hrc_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd)
            if (file.exists(hrc_fname)) {
                hrc <- fread(hrc_fname)
                # Columns 7 onward are renamed to the 2nd "_"-separated piece
                # of their original header.
                hrc_names <- stringr::str_split(colnames(hrc)[7:ncol(hrc)], pattern = "_", simplify = TRUE)
                colnames(hrc)[7:ncol(hrc)] <- hrc_names[, 2]

                hrc_gene <- hrc %>%
                    filter(SNP %in% hrc_snplist) %>%
                    remove_rownames() %>%
                    select(-c("CHR", "(C)M", "POS", "COUNTED", "ALT", "SNP"))
                # Recode every value v as 2 - v with columns sorted by name
                # (presumably switches which allele is counted; confirm
                # against the plink2 export settings).
                hrc_gene <- 2 - select(hrc_gene, sort(colnames(hrc_gene)))
            }

            topmed_fname <- sprintf("APOC3_topmed_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd)
            if (file.exists(topmed_fname)) {
                topmed <- fread(topmed_fname)
                # Same renaming as for HRC, but the 3rd "_"-separated piece
                # of each header is kept.
                topmed_names <- stringr::str_split(colnames(topmed)[7:ncol(topmed)], pattern = "_", simplify = TRUE)
                colnames(topmed)[7:ncol(topmed)] <- topmed_names[, 3]

                topmed_gene <- topmed %>%
                    filter(SNP %in% topmed_snplist) %>%
                    remove_rownames() %>%
                    select(-c("CHR", "(C)M", "POS", "COUNTED", "ALT", "SNP"))
                topmed_gene <- 2 - select(topmed_gene, sort(colnames(topmed_gene)))
            }

            # Stack whichever imputed tables were actually loaded under the
            # exome rows; with neither, the exome matrix is kept as is.
            if (!is.null(hrc_gene) && !is.null(topmed_gene)) {
                bed <- rbind(bed, hrc_gene, topmed_gene)
            } else if (!is.null(hrc_gene)) {
                bed <- rbind(bed, hrc_gene)
            } else if (!is.null(topmed_gene)) {
                bed <- rbind(bed, topmed_gene)
            }
            # fwrite(bed, sprintf("APOC3_hrc_topmed_exome_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd))

            print(sprintf("HRC_snplist: %d; TOPMed_snplist: %d; Exome: %d; Annot_gene: %d; Bed: %d",
                          length(hrc_snplist), length(topmed_snplist), nrow(bim), nrow(annot_gene), nrow(bed)))
            rm(bed, hrc_gene, topmed_gene)
        }
    }
}



Reading: APOC3_exome_maf001_extracted.bim

Reading: APOC3_exome_maf001_extracted.fam

Reading: APOC3_exome_maf001_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 3; Exome: 60; Annot_gene: 63; Bed: 63"


Reading: APOC3_exome_maf001_cadd_extracted.bim

Reading: APOC3_exome_maf001_cadd_extracted.fam

Reading: APOC3_exome_maf001_cadd_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 0; Exome: 26; Annot_gene: 26; Bed: 26"


Reading: APOC3_exome_maf0005_extracted.bim

Reading: APOC3_exome_maf0005_extracted.fam

Reading: APOC3_exome_maf0005_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 3; Exome: 60; Annot_gene: 63; Bed: 63"


Reading: APOC3_exome_maf0005_cadd_extracted.bim

Reading: APOC3_exome_maf0005_cadd_extracted.fam

Reading: APOC3_exome_maf0005_cadd_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 0; Exome: 26; Annot_gene: 26; Bed: 26"


Reading: APOC3_exome_maf0001_extracted.bim

Reading: APOC3_exome_maf0001_extracted.fam

Reading: APOC3_exome_maf0001_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 3; Exome: 59; Annot_gene: 62; Bed: 62"


Reading: APOC3_exome_maf0001_cadd_extracted.bim

Reading: APOC3_exome_maf0001_cadd_extracted.fam

Reading: APOC3_exome_maf0001_cadd_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 0; Exome: 25; Annot_gene: 25; Bed: 25"


Reading: APOC3_exome_maf001_extracted.bim

Reading: APOC3_exome_maf001_extracted.fam

Reading: APOC3_exome_maf001_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 0; Exome: 60; Annot_gene: 60; Bed: 60"


Reading: APOC3_exome_maf001_cadd_extracted.bim

Reading: APOC3_exome_maf001_cadd_extracted.fam

Reading: APOC3_exome_maf001_cadd_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 0; Exome: 26; Annot_gene: 26; Bed: 26"


Reading: APOC3_exome_maf0005_extracted.bim

Reading: APOC3_exome_maf0005_extracted.fam

Reading: APOC3_exome_maf0005_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 0; Exome: 60; Annot_gene: 60; Bed: 60"


Reading: APOC3_exome_maf0005_cadd_extracted.bim

Reading: APOC3_exome_maf0005_cadd_extracted.fam

Reading: APOC3_exome_maf0005_cadd_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 0; Exome: 26; Annot_gene: 26; Bed: 26"


Reading: APOC3_exome_maf0001_extracted.bim

Reading: APOC3_exome_maf0001_extracted.fam

Reading: APOC3_exome_maf0001_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 0; Exome: 59; Annot_gene: 59; Bed: 59"


Reading: APOC3_exome_maf0001_cadd_extracted.bim

Reading: APOC3_exome_maf0001_cadd_extracted.fam

Reading: APOC3_exome_maf0001_cadd_extracted.bed



[1] "HRC_snplist: 0; TOPMed_snplist: 0; Exome: 25; Annot_gene: 25; Bed: 25"


### APOC3 - topmed + hrc

In [13]:
# For every (rsq tag, MAF tag, CADD on/off) combination, stack the APOC3
# variants of the HRC and TOPMed .traw exports that are listed in the chr11
# HRC+TOPMed annotation file, and print the row counts as a sanity check.
# Either .traw file may be absent; a combination with neither reports Bed: 0.
for (rsq in c(3, 8)) {
    for (maf in c("1", "05", "01")) {
        for (cadd in c("", "_cadd")) {
            print(sprintf("==========RSQ 0.%d; MAF 00%s; CADD filtering: %s", rsq, maf, cadd))

            # Start every iteration from a clean state. Presence of a table is
            # tracked with NULL instead of exists(), which would also find
            # objects left behind by earlier iterations or other cells.
            hrc_gene <- NULL
            topmed_gene <- NULL

            annot_fname <- sprintf("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/hrc_topmed_168206ids_chr11_rsq0%d_maf00%s%s_annot.csv.gz",
                                   rsq, maf, cadd)
            annot_gene <- fread(annot_fname) %>% filter(Gene.refGene == "APOC3")

            # Variant IDs to keep from each panel (HRC rows are keyed by
            # ID_hg19, TOPMed rows by ID).
            hrc_snplist <- annot_gene %>% filter(source == "hrc") %>% pull(ID_hg19)
            topmed_snplist <- annot_gene %>% filter(source == "topmed") %>% pull(ID)

            hrc_fname <- sprintf("APOC3_hrc_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd)
            if (file.exists(hrc_fname)) {
                hrc <- fread(hrc_fname)
                # Columns 7 onward are renamed to the 2nd "_"-separated piece
                # of their original header.
                hrc_names <- stringr::str_split(colnames(hrc)[7:ncol(hrc)], pattern = "_", simplify = TRUE)
                colnames(hrc)[7:ncol(hrc)] <- hrc_names[, 2]

                hrc_gene <- hrc %>%
                    filter(SNP %in% hrc_snplist) %>%
                    remove_rownames() %>%
                    select(-c("CHR", "(C)M", "POS", "COUNTED", "ALT", "SNP"))
                # Recode every value v as 2 - v with columns sorted by name
                # (presumably switches which allele is counted; confirm
                # against the plink2 export settings).
                hrc_gene <- 2 - select(hrc_gene, sort(colnames(hrc_gene)))
            }

            topmed_fname <- sprintf("APOC3_topmed_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd)
            if (file.exists(topmed_fname)) {
                topmed <- fread(topmed_fname)
                # Same renaming, but the 3rd "_"-separated piece is kept.
                topmed_names <- stringr::str_split(colnames(topmed)[7:ncol(topmed)], pattern = "_", simplify = TRUE)
                colnames(topmed)[7:ncol(topmed)] <- topmed_names[, 3]

                topmed_gene <- topmed %>%
                    filter(SNP %in% topmed_snplist) %>%
                    remove_rownames() %>%
                    select(-c("CHR", "(C)M", "POS", "COUNTED", "ALT", "SNP"))
                topmed_gene <- 2 - select(topmed_gene, sort(colnames(topmed_gene)))
            }

            # Combine whatever was loaded. When neither file exists `bed`
            # stays NULL instead of failing on an undefined object.
            if (!is.null(hrc_gene) && !is.null(topmed_gene)) {
                bed <- rbind(hrc_gene, topmed_gene)
            } else if (!is.null(hrc_gene)) {
                bed <- hrc_gene
            } else {
                bed <- topmed_gene
            }

            # Writing is currently disabled; re-enable only for non-NULL `bed`.
            # fwrite(bed, sprintf("APOC3_hrc_topmed_rsq0%d_maf00%s%s_extracted.traw", rsq, maf, cadd))

            n_bed <- if (is.null(bed)) 0L else nrow(bed)
            print(sprintf("HRC_snplist: %d; TOPMed_snplist: %d; Annot_gene: %d; Bed: %d",
                          length(hrc_snplist), length(topmed_snplist), nrow(annot_gene), n_bed))
            rm(bed, hrc_gene, topmed_gene)
        }
    }
}

[1] "HRC_snplist: 1; TOPMed_snplist: 14; Annot_gene: 15; Bed: 15"
[1] "HRC_snplist: 1; TOPMed_snplist: 8; Annot_gene: 9; Bed: 9"
[1] "HRC_snplist: 1; TOPMed_snplist: 14; Annot_gene: 15; Bed: 15"
[1] "HRC_snplist: 1; TOPMed_snplist: 8; Annot_gene: 9; Bed: 9"
[1] "HRC_snplist: 1; TOPMed_snplist: 13; Annot_gene: 14; Bed: 14"
[1] "HRC_snplist: 1; TOPMed_snplist: 7; Annot_gene: 8; Bed: 8"
[1] "HRC_snplist: 1; TOPMed_snplist: 5; Annot_gene: 6; Bed: 6"
[1] "HRC_snplist: 1; TOPMed_snplist: 4; Annot_gene: 5; Bed: 5"
[1] "HRC_snplist: 1; TOPMed_snplist: 5; Annot_gene: 6; Bed: 6"
[1] "HRC_snplist: 1; TOPMed_snplist: 4; Annot_gene: 5; Bed: 5"
[1] "HRC_snplist: 0; TOPMed_snplist: 4; Annot_gene: 4; Bed: 4"
[1] "HRC_snplist: 0; TOPMed_snplist: 3; Annot_gene: 3; Bed: 3"


## Phenotype Analysis

We use the Burden of Rare Variants ([BRV](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4459641/)) test to analyze rare-variant aggregate association for PCSK9 and APOC3. In the phenotype analysis, we adjust for age, sex, and the first two principal components (PC1 and PC2).

### Functions and Packages

In [14]:
library(dplyr)
library(genio)
library(data.table)
library(tidyverse)

In [15]:
# Earlier phenotype sources, kept for reference only (not used below):
# mi_pheno <- read.csv("/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/Unrealted_White_EU_Both_Exo_Impu_5209_MI_cases.phe.csv")
# mi_y <- mi_pheno %>% arrange(FID) %>% pull(MI)
#
# trig_ldl_pheno <- read.csv("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/Trigly_LDL_Log10_168206ind.phe.csv") %>% arrange(FID)
# tg_y <- trig_ldl_pheno %>% arrange(FID) %>% pull(logarithm_base10_Trigli)
# ldl_y <- trig_ldl_pheno  %>% arrange(FID) %>% pull(f.30780.0.0)

# Covariate columns passed to the association models.
# LDL phenotype: table sorted by FID, then split into outcome, IDs, covariates.
ldl_df <- arrange(
    read.csv("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/UKBB_LDL_159759inds_4PCs.pheno", sep = "\t"),
    FID
)
ldl_y <- ldl_df[["f.30780.0.0"]]
ldl_id <- ldl_df[["IID"]]
ldl_cov <- ldl_df[, c("sex", "age", "PC1", "PC2")]

# TG phenotype: same layout, outcome is the log10 triglyceride column.
tg_df <- arrange(
    read.csv("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/UKBB_TG_159759inds_4PCs.pheno", sep = "\t"),
    FID
)
tg_y <- tg_df[["logarithm_base10_Trigli"]]
tg_id <- tg_df[["IID"]]
tg_cov <- tg_df[, c("sex", "age", "PC1", "PC2")]

# The full tables are no longer needed once the pieces are extracted.
rm(list = c("ldl_df", "tg_df"))

In [16]:
# The BRV function
#
# Burden of Rare Variants test: collapses a genotype matrix into one burden
# score per individual (the row sum across variants) and regresses the
# phenotype on that score together with age, sex, PC1 and PC2 using glm().
#
# Arguments:
#   X_mat  - genotype matrix or data frame, one row per individual and one
#            column per variant (coerced with as.matrix()).
#   y      - phenotype vector, in the same row order as X_mat.
#   cov    - data frame providing the columns age, sex, PC1 and PC2.
#   option - family passed to glm(); the default "binomial" fits a logistic
#            regression, "gaussian" a linear regression.
#
# Returns:
#   list(zstat, pval): test statistic and p-value of the burden term. Both are
#   NA when the burden term is missing from the coefficient table, which
#   happens when glm() cannot estimate it (e.g. a constant burden score).
BRV <- function(X_mat, y, cov, option = "binomial"){
    X_mat <- as.matrix(X_mat)
    y <- as.vector(y)

    ## count number of rare variants for each individual
    if(ncol(X_mat) == 1){
        # single variant: the column itself is the burden score; missing
        # genotypes stay NA here (they are not counted as zero)
        burden <- X_mat[, 1]
    } else {
        burden <- rowSums(X_mat, na.rm = TRUE)
    }

    age <- cov$age
    sex <- cov$sex
    pc1 <- cov$PC1
    pc2 <- cov$PC2

    ## obtain the statistic and p value of the burden term from the regression
    mat.fit <- glm(y ~ burden + age + sex + pc1 + pc2, family = option)
    coef_table <- coef(summary(mat.fit))

    # Look the burden row up by name: aliased coefficients are left out of the
    # summary table, so a positional `[2, ]` would silently return the row of
    # the next covariate instead.
    if(!("burden" %in% rownames(coef_table))){
        return(list(zstat = NA_real_, pval = NA_real_))
    }

    list(zstat = coef_table["burden", 3], pval = coef_table["burden", 4])
}

In [17]:
## Formatting imputed data matrix
#
# Turns a variant-by-sample dosage table into a sample-by-variant matrix that
# is restricted to the samples in `id` and ordered by numeric sample ID.
#
# Arguments:
#   df     - dosage table with one row per variant.
#   id     - sample IDs to keep.
#   option - "merge" (default) for the merged tables, whose first column after
#            transposing (V1) holds the sample ID; any other value for a raw
#            .traw table, whose first six columns are variant metadata and
#            whose remaining column names carry the sample ID as the second
#            "_"-separated piece.
#
# Returns:
#   A matrix with one row per retained sample and one column per variant. In
#   the raw-traw branch the values are converted with `2 - X`; the merged
#   branch returns the values unchanged.
format_traw <- function(df, id, option="merge"){
    if(option != "merge"){
        X <- df %>%
            select(-c(1:6)) %>%
            t() %>% as.data.frame() %>%
            tibble::rownames_to_column("FID_full")
        # vapply() pins the result to one character ID per column name
        FID <- vapply(strsplit(X$FID_full, "_"), `[`, character(1), 2)
        X <- X %>%
            mutate(FID = FID) %>%
            # sort numerically, like the merged branch, so that the row order
            # does not depend on the IDs all having the same number of digits
            arrange(as.numeric(FID)) %>%
            filter(FID %in% id) %>%
            select(-FID_full, -FID) %>%
            as.matrix()
        X <- 2 - X
    } else {
        X <- df %>%
            t() %>% as.data.frame() %>%
            arrange(as.numeric(V1)) %>%
            filter(V1 %in% id) %>%
            select(-V1) %>%
            as.matrix()
    }

    return(X)
}

### PCSK9 v.s. LDL

In [18]:
# Work inside the directory that holds the extracted PCSK9 genotype files.
setwd("/mnt/vast/hpc/csg/tl3031/imputation-rvtest/analysis/imputation_aggregated_analysis/pcsk9")

# Empty results table; rows are added per data source, Rsq and MAF setting.
result_df <- data.frame(data.frame(matrix(ncol = 5, nrow = 0)))
colnames(result_df) <- c("data", "rsqs", "mafs", "num_var", "pvals")

# maf completes the "maf00%s" part of the file names (maf001, maf0005, maf0001)
for(maf in c("1", "05", "01")){ 
    # Exome genotypes: transpose to samples x variants, sort by numeric sample
    # ID and keep the samples listed in ldl_id before running the burden test.
    # NOTE(review): row alignment with ldl_y relies on both being sorted by ID
    # and covering the same individuals - confirm.
    exome_bim <- read_bim(sprintf("PCSK9_exome_maf00%s_extracted.bim", maf))
    exome_fam <- read_fam(sprintf("PCSK9_exome_maf00%s_extracted.fam", maf))
    exome <- read_bed(sprintf("PCSK9_exome_maf00%s_extracted.bed", maf), exome_bim$id, exome_fam$id)
    exome_X <- exome %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("FID") %>% 
            mutate(FID = as.numeric(FID)) %>% arrange(FID) %>% filter(FID %in% ldl_id) %>% select(-FID) %>% as.matrix()
    result_exome <- BRV(exome_X, ldl_y, ldl_cov, "gaussian")
    
    # rsq is the digit used in the "rsq0%d" file names; rsq/10 is reported below
    for(rsq in c(3, 8)){
        hrc_fname <- sprintf("PCSK9_hrc_rsq0%d_maf00%s_extracted.traw", rsq, maf)
        topmed_fname <- sprintf("PCSK9_topmed_rsq0%d_maf00%s_extracted.traw", rsq, maf)
        hrc_topmed_fname <- sprintf("PCSK9_hrc_topmed_rsq0%d_maf00%s_extracted.traw", rsq, maf)
        hrc_topmed_exome_fname <- sprintf("PCSK9_hrc_topmed_exome_rsq0%d_maf00%s_extracted.traw", rsq, maf)

        # Single-panel files go through the raw-traw branch of format_traw
        # (any option other than "merge").
        hrc <- fread(hrc_fname)
        hrc_X <- format_traw(hrc, ldl_id, "hrc")
        result_hrc <- BRV(hrc_X, ldl_y, ldl_cov, "gaussian")

        topmed <- fread(topmed_fname)
        topmed_X <- format_traw(topmed, ldl_id, "topmed")
        result_topmed <- BRV(topmed_X, ldl_y, ldl_cov, "gaussian")

        # Combined files use the default "merge" branch of format_traw.
        hrc_topmed <- fread(hrc_topmed_fname)
        hrc_topmed_X <- format_traw(hrc_topmed, ldl_id)
        result_hrc_topmed <- BRV(hrc_topmed_X, ldl_y, ldl_cov, "gaussian")

        hrc_topmed_exome <- fread(hrc_topmed_exome_fname)
        hrc_topmed_exome_X <- format_traw(hrc_topmed_exome, ldl_id)
        result_hrc_topmed_exome <- BRV(hrc_topmed_exome_X, ldl_y, ldl_cov, "gaussian")

        # One result row per data source for this Rsq / MAF combination.
        data <- c("hrc", "topmed", "hrc_topmed", "hrc_topmed_exome")
        rsqs <- rep(rsq/10, 4)
        mafs <- rep(paste0("0.0", maf), 4)
        pvals <- c(result_hrc$pval, result_topmed$pval, result_hrc_topmed$pval, result_hrc_topmed_exome$pval)
        num_var <- c(ncol(hrc_X), ncol(topmed_X), ncol(hrc_topmed_X), ncol(hrc_topmed_exome_X))
        sub_df <- data.frame(data, rsqs, mafs, num_var, pvals)
        result_df <- rbind(result_df, sub_df)
    }
    # The exome row has no Rsq filter; it is stored as the string "NA" and put in front.
    result_df <- rbind(data.frame(data = "exome", rsqs = "NA", mafs = paste0("0.0", maf), num_var = ncol(exome_X), pvals = result_exome$pval), result_df)
}

Reading: PCSK9_exome_maf001_extracted.bim

Reading: PCSK9_exome_maf001_extracted.fam

Reading: PCSK9_exome_maf001_extracted.bed

Reading: PCSK9_exome_maf0005_extracted.bim

Reading: PCSK9_exome_maf0005_extracted.fam

Reading: PCSK9_exome_maf0005_extracted.bed

Reading: PCSK9_exome_maf0001_extracted.bim

Reading: PCSK9_exome_maf0001_extracted.fam

Reading: PCSK9_exome_maf0001_extracted.bed



In [19]:
# Show the results ordered by MAF threshold (largest first) and, within each
# threshold, by data source in a fixed display order.
arrange(
    mutate(
        result_df,
        data = factor(data, levels = c("exome", "hrc", "topmed", "hrc_topmed", "hrc_topmed_exome"))
    ),
    desc(as.numeric(mafs)),
    data
)

data,rsqs,mafs,num_var,pvals
<fct>,<chr>,<chr>,<int>,<dbl>
exome,,0.01,394,5.473985e-71
hrc,0.3,0.01,16,1.944983e-12
hrc,0.8,0.01,6,6.993942e-11
topmed,0.3,0.01,109,1.3840750000000001e-39
topmed,0.8,0.01,47,1.769128e-34
hrc_topmed,0.3,0.01,110,1.1218179999999999e-38
hrc_topmed,0.8,0.01,48,1.972979e-34
hrc_topmed_exome,0.3,0.01,403,1.684064e-70
hrc_topmed_exome,0.8,0.01,396,7.159298e-71
exome,,0.005,394,5.473985e-71


In [20]:
# PCSK9 vs. LDL burden tests on the CADD-filtered variant sets ("_cadd" files).
# Empty results table; the column is named "mafs" to match the rows appended
# below and the display cell that sorts on `mafs`.
result_df <- data.frame(data.frame(matrix(ncol = 5, nrow = 0)))
colnames(result_df) <- c("data", "rsqs", "mafs", "num_var", "pvals")

# maf completes the "maf00%s" part of the file names (maf001, maf0005, maf0001)
for(maf in c("1", "05", "01")){
    # Exome genotypes: transpose to samples x variants, sort by numeric sample
    # ID and keep the samples listed in ldl_id before running the burden test.
    exome_bim <- read_bim(sprintf("PCSK9_exome_maf00%s_cadd_extracted.bim", maf))
    exome_fam <- read_fam(sprintf("PCSK9_exome_maf00%s_cadd_extracted.fam", maf))
    exome <- read_bed(sprintf("PCSK9_exome_maf00%s_cadd_extracted.bed", maf), exome_bim$id, exome_fam$id)
    exome_X <- exome %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("FID") %>%
            mutate(FID = as.numeric(FID)) %>% arrange(FID) %>% filter(FID %in% ldl_id) %>% select(-FID) %>% as.matrix()
    result_exome <- BRV(exome_X, ldl_y, ldl_cov, "gaussian")

    # rsq is the digit used in the "rsq0%d" file names; rsq/10 is reported below
    for(rsq in c(3, 8)){
        hrc_fname <- sprintf("PCSK9_hrc_rsq0%d_maf00%s_cadd_extracted.traw", rsq, maf)
        topmed_fname <- sprintf("PCSK9_topmed_rsq0%d_maf00%s_cadd_extracted.traw", rsq, maf)
        hrc_topmed_fname <- sprintf("PCSK9_hrc_topmed_rsq0%d_maf00%s_cadd_extracted.traw", rsq, maf)
        hrc_topmed_exome_fname <- sprintf("PCSK9_hrc_topmed_exome_rsq0%d_maf00%s_cadd_extracted.traw", rsq, maf)

        # Single-panel files go through the raw-traw branch of format_traw.
        hrc <- fread(hrc_fname)
        hrc_X <- format_traw(hrc, ldl_id, "hrc")
        result_hrc <- BRV(hrc_X, ldl_y, ldl_cov, "gaussian")

        topmed <- fread(topmed_fname)
        topmed_X <- format_traw(topmed, ldl_id, "topmed")
        result_topmed <- BRV(topmed_X, ldl_y, ldl_cov, "gaussian")

        # Combined files use the default "merge" branch of format_traw.
        hrc_topmed <- fread(hrc_topmed_fname)
        hrc_topmed_X <- format_traw(hrc_topmed, ldl_id)
        result_hrc_topmed <- BRV(hrc_topmed_X, ldl_y, ldl_cov, "gaussian")

        hrc_topmed_exome <- fread(hrc_topmed_exome_fname)
        hrc_topmed_exome_X <- format_traw(hrc_topmed_exome, ldl_id)
        result_hrc_topmed_exome <- BRV(hrc_topmed_exome_X, ldl_y, ldl_cov, "gaussian")

        # One result row per data source for this Rsq / MAF combination.
        data <- c("hrc", "topmed", "hrc_topmed", "hrc_topmed_exome")
        rsqs <- rep(rsq/10, 4)
        mafs <- rep(paste0("0.0", maf), 4)
        pvals <- c(result_hrc$pval, result_topmed$pval, result_hrc_topmed$pval, result_hrc_topmed_exome$pval)
        num_var <- c(ncol(hrc_X), ncol(topmed_X), ncol(hrc_topmed_X), ncol(hrc_topmed_exome_X))
        sub_df <- data.frame(data, rsqs, mafs, num_var, pvals)
        result_df <- rbind(result_df, sub_df)
    }
    # The exome row has no Rsq filter; it is stored as the string "NA" and put in front.
    result_df <- rbind(data.frame(data = "exome", rsqs = "NA", mafs = paste0("0.0", maf), num_var = ncol(exome_X), pvals = result_exome$pval), result_df)
}

Reading: PCSK9_exome_maf001_cadd_extracted.bim

Reading: PCSK9_exome_maf001_cadd_extracted.fam

Reading: PCSK9_exome_maf001_cadd_extracted.bed

Reading: PCSK9_exome_maf0005_cadd_extracted.bim

Reading: PCSK9_exome_maf0005_cadd_extracted.fam

Reading: PCSK9_exome_maf0005_cadd_extracted.bed

Reading: PCSK9_exome_maf0001_cadd_extracted.bim

Reading: PCSK9_exome_maf0001_cadd_extracted.fam

Reading: PCSK9_exome_maf0001_cadd_extracted.bed



In [22]:
# Show the CADD-filtered results ordered by MAF threshold (largest first) and,
# within each threshold, by data source in a fixed display order.
arrange(
    mutate(
        result_df,
        data = factor(data, levels = c("exome", "hrc", "topmed", "hrc_topmed", "hrc_topmed_exome"))
    ),
    desc(as.numeric(mafs)),
    data
)

data,rsqs,mafs,num_var,pvals
<fct>,<chr>,<chr>,<int>,<dbl>
exome,,0.01,200,8.246876e-93
hrc,0.3,0.01,6,1.912681e-12
hrc,0.8,0.01,2,4.563676e-15
topmed,0.3,0.01,61,2.297162e-50
topmed,0.8,0.01,25,6.112257e-45
hrc_topmed,0.3,0.01,62,1.269456e-50
hrc_topmed,0.8,0.01,26,1.855966e-45
hrc_topmed_exome,0.3,0.01,202,6.59813e-93
hrc_topmed_exome,0.8,0.01,201,8.034693e-93
exome,,0.005,200,8.246876e-93


#### PCSK9: chr1:55052701:C:T (hg38)

In [30]:
# Variant IDs (hg38) annotated in the maf001 and maf0001 exome sets.
pcsk9_id_maf001 <- fread("PCSK9_exome_maf001_annot.csv.gz")$ID_hg38
pcsk9_id_maf0001 <- fread("PCSK9_exome_maf0001_annot.csv.gz")$ID_hg38
# Variants present in the maf001 set but absent from the maf0001 set.
setdiff(pcsk9_id_maf001, pcsk9_id_maf0001)

# Print the annotation record of the variant of interest.
fread("PCSK9_exome_maf001_annot.csv.gz") %>% filter(ID_hg38 == 'chr1:55052701:C:T')

Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,ExonicFunc.refGene,MAF_nfe_exome,REVEL_score,ID_hg38,ID,Function,RawScore,PHRED
<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
1,55052701,55052701,C,T,exonic,PCSK9,nonsynonymous SNV,0.001,0.615,chr1:55052701:C:T,chr1:55052701:C:T,missense,3.457351,24.6


We are interested in this variant because the p-value increases substantially after it is removed. We therefore conduct a single-variant association test for this variant alone, using both the exome and the imputed data.

In [31]:
## exome data
# Single-variant test of chr1:55052701:C:T against LDL using the exome calls.
exome_bim <- read_bim("PCSK9_exome_maf001_extracted.bim")
exome_fam <- read_fam("PCSK9_exome_maf001_extracted.fam")
exome <- read_bed("PCSK9_exome_maf001_extracted.bed", exome_bim$id, exome_fam$id)

# Take the variant's row, pair each value with its sample ID, sort by numeric
# ID and keep the samples in ldl_id; the result is a one-column matrix.
exome_X <- data.frame(ID = names(exome["chr1:55052701:C:T",]), value=exome["chr1:55052701:C:T",], row.names=NULL)
exome_X <- exome_X %>% arrange(as.numeric(ID)) %>% filter(ID %in% ldl_id) %>% select(-ID) %>% as.matrix()

# Linear model with the same covariates as BRV (age, sex, PC1, PC2).
exome.fit <- glm(ldl_y ~ exome_X + ldl_cov$age + ldl_cov$sex + ldl_cov$PC1 + ldl_cov$PC2, family = "gaussian")
# p-value from the second row (the genotype term) of the coefficient table
coef(summary(exome.fit))[2, 4]

Reading: PCSK9_exome_maf001_extracted.bim

Reading: PCSK9_exome_maf001_extracted.fam

Reading: PCSK9_exome_maf001_extracted.bed



In [33]:
## hrc
# Print the annotation record of the variant, looked up by its hg19 ID.
fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/hrc_168206ids_chr1_rsq03_maf001_annot.csv.gz") %>%
filter(ID_hg19 == "chr1:55518374:C:T")

# Keep only this variant from the HRC file and format it as a one-column
# sample matrix restricted to ldl_id (raw-traw branch of format_traw).
hrc <- fread("PCSK9_hrc_rsq03_maf001_extracted.traw") %>% filter(SNP == "chr1:55518374:C:T")
hrc_X <- format_traw(hrc, ldl_id, "hrc")

# Linear model with the same covariates as BRV (age, sex, PC1, PC2).
hrc.fit <- glm(ldl_y ~ hrc_X + ldl_cov$age + ldl_cov$sex + ldl_cov$PC1 + ldl_cov$PC2, family = "gaussian")
# p-value from the second row (the genotype term) of the coefficient table
coef(summary(hrc.fit))[2, 4]

Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,ExonicFunc.refGene,MAF_nfe_exome,REVEL_score,ID_hg38,ID_hg19,ID,Function,RawScore,PHRED,R2
<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
1,55052701,55052701,C,T,exonic,PCSK9,nonsynonymous SNV,0.001,0.615,chr1:55052701:C:T,chr1:55518374:C:T,chr1:55052701:C:T,missense,3.341231,24.5,0.677861


In [34]:
# Print the annotation record of the variant in the TOPMed annotation file.
# NOTE(review): the filter is on the ID_hg19 column but uses the hg38
# coordinate; the printed record shows ID_hg19 equal to ID_hg38 in this file -
# confirm that this is intended.
fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/topmed_168206ids_chr1_rsq03_maf001_annot.csv.gz") %>%
filter(ID_hg19 == "chr1:55052701:C:T")

# Keep only this variant from the TOPMed file and format it as a one-column
# sample matrix restricted to ldl_id (raw-traw branch of format_traw).
topmed <- fread("PCSK9_topmed_rsq03_maf001_extracted.traw") %>% filter(SNP == "chr1:55052701:C:T")
topmed_X <- format_traw(topmed, ldl_id, "topmed")

# Linear model with the same covariates as BRV (age, sex, PC1, PC2).
topmed.fit <- glm(ldl_y ~ topmed_X + ldl_cov$age + ldl_cov$sex + ldl_cov$PC1 + ldl_cov$PC2, family = "gaussian")
# p-value from the second row (the genotype term) of the coefficient table
coef(summary(topmed.fit))[2, 4]

Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,ExonicFunc.refGene,MAF_nfe_exome,REVEL_score,ID_hg38,ID_hg19,ID,Function,RawScore,PHRED,R2
<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
1,55052701,55052701,C,T,exonic,PCSK9,nonsynonymous SNV,0.001,0.615,chr1:55052701:C:T,chr1:55052701:C:T,chr1:55052701:C:T,missense,3.457351,24.6,0.815682


#### PCSK9: HRC dataset inspection

We observe that a smaller number of variants yields a more significant p-value. Therefore, we further investigate the variants in the HRC dataset.

In [35]:
# hg19 IDs of the PCSK9 variants in the HRC rsq03 / maf001 CADD annotation file.
annot <- fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc/hrc_chr1_rsq03_hg19_hg38_maf001_LOF_missense_cadd_annot.csv.gz")
pcsk9_hrc_rsq03 <- annot %>% filter(Gene.refGene == "PCSK9") %>% pull(ID_hg19)

In [36]:
# hg19 IDs of the PCSK9 variants in the HRC rsq08 / maf001 CADD annotation file.
annot <- fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc/hrc_chr1_rsq08_hg19_hg38_maf001_LOF_missense_cadd_annot.csv.gz")
pcsk9_hrc_rsq08 <- annot %>% filter(Gene.refGene == "PCSK9") %>% pull(ID_hg19)

In [37]:
# Single-variant tests for the PCSK9 variants that are in the rsq03 list but
# not in the rsq08 list.
annot <- fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/hrc_168206ids_chr1_rsq03_maf001_annot.csv.gz")
diff_lst <- setdiff(pcsk9_hrc_rsq03, pcsk9_hrc_rsq08)
hrc_mat <- fread("PCSK9_hrc_rsq03_maf001_extracted.traw")

# Empty results table; one row is appended per variant.
df <- data.frame(data.frame(matrix(ncol = 4, nrow = 0)))
colnames(df) <- c("variant", "rsqs", "functions", "pvals")
for(snp in diff_lst){
    # Annotation record (ID, R2, Function) of the current variant.
    snp_info <- annot %>% filter(ID_hg19 == snp)
    
    # One-column genotype matrix for this variant, then the same linear model
    # as the burden test (LDL on genotype plus age, sex, PC1, PC2).
    hrc_X <- format_traw(hrc_mat %>% filter(SNP == snp), ldl_id, "hrc")
    hrc.fit <- glm(ldl_y ~ hrc_X + ldl_cov$age + ldl_cov$sex + ldl_cov$PC1 + ldl_cov$PC2, family = "gaussian")
    
    # pvals is taken from the second row (the genotype term) of the coefficient table.
    subdf <- data.frame(variant = snp_info$ID, 
                        rsqs = snp_info$R2,
                        functions = snp_info$Function,
                        pvals = coef(summary(hrc.fit))[2, 4])
    df <- rbind(df, subdf) 
}

In [38]:
# Print the per-variant results table.
df

variant,rsqs,functions,pvals
<chr>,<dbl>,<chr>,<dbl>
chr1:55046549:C:G,0.652429,LoF,0.5391590013
chr1:55052701:C:T,0.677861,missense,0.0001473467
chr1:55056028:C:A,0.605831,missense,0.5496690045
chr1:55057403:C:T,0.53673,missense,0.5307091797


In [39]:
# Single-variant tests for the PCSK9 variants in the rsq08 list. Despite its
# name, diff_lst holds the full rsq08 list here, not a set difference.
annot <- fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/hrc_168206ids_chr1_rsq03_maf001_annot.csv.gz")
diff_lst <- pcsk9_hrc_rsq08
hrc_mat <- fread("PCSK9_hrc_rsq03_maf001_extracted.traw")

# Empty results table; one row is appended per variant.
df <- data.frame(data.frame(matrix(ncol = 4, nrow = 0)))
colnames(df) <- c("variant", "rsqs", "functions", "pvals")
for(snp in diff_lst){
    # Annotation record (ID, R2, Function) of the current variant.
    snp_info <- annot %>% filter(ID_hg19 == snp)
    
    # One-column genotype matrix for this variant, then the same linear model
    # as the burden test (LDL on genotype plus age, sex, PC1, PC2).
    hrc_X <- format_traw(hrc_mat %>% filter(SNP == snp), ldl_id, "hrc")
    hrc.fit <- glm(ldl_y ~ hrc_X + ldl_cov$age + ldl_cov$sex + ldl_cov$PC1 + ldl_cov$PC2, family = "gaussian")
    
    # pvals is taken from the second row (the genotype term) of the coefficient table.
    subdf <- data.frame(variant = snp_info$ID, 
                        rsqs = snp_info$R2,
                        functions = snp_info$Function,
                        pvals = coef(summary(hrc.fit))[2, 4])
    df <- rbind(df, subdf) 
}

In [40]:
# Print the per-variant results table.
df

variant,rsqs,functions,pvals
<chr>,<dbl>,<chr>,<dbl>
chr1:55063542:C:A,0.825654,LoF,0.1193677
chr1:55061557:G:A,0.85097,splicing,1.200053e-14


### APOC3 v.s. TG

In [41]:
# Switch to the APOC3 directory; the APOC3 file names below are relative to it.
setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/apoc3")

In [42]:
# APOC3 vs. TG burden tests for every data source, Rsq and MAF setting.
# Empty results table; rows are added per data source, Rsq and MAF setting.
result_df <- data.frame(data.frame(matrix(ncol = 5, nrow = 0)))
colnames(result_df) <- c("data", "rsqs", "mafs", "num_var", "pvals")

# maf completes the "maf00%s" part of the file names (maf001, maf0005, maf0001)
for(maf in c("1", "05", "01")){
    # Exome genotypes: transpose to samples x variants, sort by numeric sample
    # ID and keep the samples listed in tg_id before running the burden test.
    exome_bim <- read_bim(sprintf("APOC3_exome_maf00%s_extracted.bim", maf))
    exome_fam <- read_fam(sprintf("APOC3_exome_maf00%s_extracted.fam", maf))
    exome <- read_bed(sprintf("APOC3_exome_maf00%s_extracted.bed", maf), exome_bim$id, exome_fam$id)
    exome_X <- exome %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("FID") %>%
            mutate(FID = as.numeric(FID)) %>% arrange(FID) %>% filter(FID %in% tg_id) %>% select(-FID) %>% as.matrix()
    result_exome <- BRV(exome_X, tg_y, tg_cov,"gaussian")

    # rsq is the digit used in the "rsq0%d" file names; rsq/10 is reported below
    for(rsq in c(3, 8)){
        hrc_fname <- sprintf("APOC3_hrc_rsq0%d_maf00%s_extracted.traw", rsq, maf)
        topmed_fname <- sprintf("APOC3_topmed_rsq0%d_maf00%s_extracted.traw", rsq, maf)
        hrc_topmed_fname <- sprintf("APOC3_hrc_topmed_rsq0%d_maf00%s_extracted.traw", rsq, maf)
        hrc_topmed_exome_fname <- sprintf("APOC3_hrc_topmed_exome_rsq0%d_maf00%s_extracted.traw", rsq, maf)

        # A file may be missing for a given setting. The fallback is then a
        # 0-column matrix (num_var = 0) and a fresh NA result. A bare matrix()
        # is 1 x 1, which would be reported as one variant, and assigning
        # `$pval` on a result object that was never created would fail.
        if(file.exists(hrc_fname)){
            hrc <- fread(hrc_fname)
            hrc_X <- format_traw(hrc, tg_id, "hrc")
            result_hrc <- BRV(hrc_X, tg_y, tg_cov, "gaussian")
        } else {
            hrc_X <- matrix(nrow = 0, ncol = 0)
            result_hrc <- list(zstat = NA, pval = NA)
        }

        if(file.exists(topmed_fname)){
            topmed <- fread(topmed_fname)
            topmed_X <- format_traw(topmed, tg_id, "topmed")
            result_topmed <- BRV(topmed_X, tg_y, tg_cov, "gaussian")
        } else {
            topmed_X <- matrix(nrow = 0, ncol = 0)
            result_topmed <- list(zstat = NA, pval = NA)
        }

        # Combined files use the default "merge" branch of format_traw.
        if(file.exists(hrc_topmed_fname)){
            hrc_topmed <- fread(hrc_topmed_fname)
            hrc_topmed_X <- format_traw(hrc_topmed, tg_id)
            result_hrc_topmed <- BRV(hrc_topmed_X, tg_y, tg_cov, "gaussian")
        } else {
            hrc_topmed_X <- matrix(nrow = 0, ncol = 0)
            result_hrc_topmed <- list(zstat = NA, pval = NA)
        }

        if(file.exists(hrc_topmed_exome_fname)){
            hrc_topmed_exome <- fread(hrc_topmed_exome_fname)
            hrc_topmed_exome_X <- format_traw(hrc_topmed_exome, tg_id)
            result_hrc_topmed_exome <- BRV(hrc_topmed_exome_X, tg_y, tg_cov, "gaussian")
        } else {
            hrc_topmed_exome_X <- matrix(nrow = 0, ncol = 0)
            result_hrc_topmed_exome <- list(zstat = NA, pval = NA)
        }

        # One result row per data source for this Rsq / MAF combination.
        data <- c("hrc", "topmed", "hrc_topmed", "hrc_topmed_exome")
        rsqs <- rep(rsq/10, 4)
        mafs <- rep(paste0("0.0", maf), 4)
        pvals <- c(result_hrc$pval, result_topmed$pval, result_hrc_topmed$pval, result_hrc_topmed_exome$pval)
        num_var <- c(ncol(hrc_X), ncol(topmed_X), ncol(hrc_topmed_X), ncol(hrc_topmed_exome_X))
        sub_df <- data.frame(data, rsqs, mafs, num_var, pvals)
        result_df <- rbind(result_df, sub_df)
    }
    # The exome row has no Rsq filter; it is stored as the string "NA" and put in front.
    result_df <- rbind(data.frame(data = "exome", rsqs = "NA", mafs = paste0("0.0", maf), num_var = ncol(exome_X), pvals = result_exome$pval), result_df)
}

Reading: APOC3_exome_maf001_extracted.bim

Reading: APOC3_exome_maf001_extracted.fam

Reading: APOC3_exome_maf001_extracted.bed

Reading: APOC3_exome_maf0005_extracted.bim

Reading: APOC3_exome_maf0005_extracted.fam

Reading: APOC3_exome_maf0005_extracted.bed

Reading: APOC3_exome_maf0001_extracted.bim

Reading: APOC3_exome_maf0001_extracted.fam

Reading: APOC3_exome_maf0001_extracted.bed



In [43]:
# Show the APOC3 results ordered by MAF threshold (largest first) and, within
# each threshold, by data source in a fixed display order.
arrange(
    mutate(
        result_df,
        data = factor(data, levels = c("exome", "hrc", "topmed", "hrc_topmed", "hrc_topmed_exome"))
    ),
    desc(as.numeric(mafs)),
    data
)

data,rsqs,mafs,num_var,pvals
<fct>,<chr>,<chr>,<int>,<dbl>
exome,,0.01,60,3.0734e-234
hrc,0.3,0.01,3,3.376964e-182
hrc,0.8,0.01,2,9.967896e-170
topmed,0.3,0.01,15,4.292663e-185
topmed,0.8,0.01,5,7.820732e-178
hrc_topmed,0.3,0.01,15,2.3820230000000002e-188
hrc_topmed,0.8,0.01,6,1.89171e-187
hrc_topmed_exome,0.3,0.01,63,2.277336e-234
hrc_topmed_exome,0.8,0.01,60,3.0734e-234
exome,,0.005,60,3.0734e-234


In [44]:
# APOC3 vs. TG burden tests on the CADD-filtered variant sets ("_cadd" files).
# Empty results table; rows are added per data source, Rsq and MAF setting.
result_df <- data.frame(data.frame(matrix(ncol = 5, nrow = 0)))
colnames(result_df) <- c("data", "rsqs", "mafs", "num_var", "pvals")

# maf completes the "maf00%s" part of the file names (maf001, maf0005, maf0001)
for(maf in c("1", "05", "01")){
    # Exome genotypes: transpose to samples x variants, sort by numeric sample
    # ID and keep the samples listed in tg_id before running the burden test.
    exome_bim <- read_bim(sprintf("APOC3_exome_maf00%s_cadd_extracted.bim", maf))
    exome_fam <- read_fam(sprintf("APOC3_exome_maf00%s_cadd_extracted.fam", maf))
    exome <- read_bed(sprintf("APOC3_exome_maf00%s_cadd_extracted.bed", maf), exome_bim$id, exome_fam$id)
    exome_X <- exome %>% t() %>% as.data.frame() %>% tibble::rownames_to_column("FID") %>%
            mutate(FID = as.numeric(FID)) %>% arrange(FID) %>% filter(FID %in% tg_id) %>% select(-FID) %>% as.matrix()
    result_exome <- BRV(exome_X, tg_y, tg_cov, "gaussian")

    # rsq is the digit used in the "rsq0%d" file names; rsq/10 is reported below
    for(rsq in c(3, 8)){
        hrc_fname <- sprintf("APOC3_hrc_rsq0%d_maf00%s_cadd_extracted.traw", rsq, maf)
        topmed_fname <- sprintf("APOC3_topmed_rsq0%d_maf00%s_cadd_extracted.traw", rsq, maf)
        hrc_topmed_fname <- sprintf("APOC3_hrc_topmed_rsq0%d_maf00%s_cadd_extracted.traw", rsq, maf)
        hrc_topmed_exome_fname <- sprintf("APOC3_hrc_topmed_exome_rsq0%d_maf00%s_cadd_extracted.traw", rsq, maf)

        # A file may be missing for a given setting. The fallback is then a
        # 0-column matrix (num_var = 0) and a fresh NA result. A bare matrix()
        # is 1 x 1, which would be reported as one variant, and assigning
        # `$pval` on a result object that was never created would fail.
        if(file.exists(hrc_fname)){
            hrc <- fread(hrc_fname)
            hrc_X <- format_traw(hrc, tg_id, "hrc")
            result_hrc <- BRV(hrc_X, tg_y, tg_cov, "gaussian")
        } else {
            hrc_X <- matrix(nrow = 0, ncol = 0)
            result_hrc <- list(zstat = NA, pval = NA)
        }

        if(file.exists(topmed_fname)){
            topmed <- fread(topmed_fname)
            topmed_X <- format_traw(topmed, tg_id, "topmed")
            result_topmed <- BRV(topmed_X, tg_y, tg_cov, "gaussian")
        } else {
            topmed_X <- matrix(nrow = 0, ncol = 0)
            result_topmed <- list(zstat = NA, pval = NA)
        }

        # Combined files use the default "merge" branch of format_traw.
        if(file.exists(hrc_topmed_fname)){
            hrc_topmed <- fread(hrc_topmed_fname)
            hrc_topmed_X <- format_traw(hrc_topmed, tg_id)
            result_hrc_topmed <- BRV(hrc_topmed_X, tg_y, tg_cov, "gaussian")
        } else {
            hrc_topmed_X <- matrix(nrow = 0, ncol = 0)
            result_hrc_topmed <- list(zstat = NA, pval = NA)
        }

        if(file.exists(hrc_topmed_exome_fname)){
            hrc_topmed_exome <- fread(hrc_topmed_exome_fname)
            hrc_topmed_exome_X <- format_traw(hrc_topmed_exome, tg_id)
            result_hrc_topmed_exome <- BRV(hrc_topmed_exome_X, tg_y, tg_cov, "gaussian")
        } else {
            hrc_topmed_exome_X <- matrix(nrow = 0, ncol = 0)
            result_hrc_topmed_exome <- list(zstat = NA, pval = NA)
        }

        # One result row per data source for this Rsq / MAF combination.
        data <- c("hrc", "topmed", "hrc_topmed", "hrc_topmed_exome")
        rsqs <- rep(rsq/10, 4)
        mafs <- rep(paste0("0.0", maf), 4)
        pvals <- c(result_hrc$pval, result_topmed$pval, result_hrc_topmed$pval, result_hrc_topmed_exome$pval)
        num_var <- c(ncol(hrc_X), ncol(topmed_X), ncol(hrc_topmed_X), ncol(hrc_topmed_exome_X))
        sub_df <- data.frame(data, rsqs, mafs, num_var, pvals)
        result_df <- rbind(result_df, sub_df)
    }
    # The exome row has no Rsq filter; it is stored as the string "NA" and put in front.
    result_df <- rbind(data.frame(data = "exome", rsqs = "NA", mafs = paste0("0.0", maf), num_var = ncol(exome_X), pvals = result_exome$pval), result_df)
}

Reading: APOC3_exome_maf001_cadd_extracted.bim

Reading: APOC3_exome_maf001_cadd_extracted.fam

Reading: APOC3_exome_maf001_cadd_extracted.bed

Reading: APOC3_exome_maf0005_cadd_extracted.bim

Reading: APOC3_exome_maf0005_cadd_extracted.fam

Reading: APOC3_exome_maf0005_cadd_extracted.bed

Reading: APOC3_exome_maf0001_cadd_extracted.bim

Reading: APOC3_exome_maf0001_cadd_extracted.fam

Reading: APOC3_exome_maf0001_cadd_extracted.bed



In [45]:
# Show the CADD-filtered APOC3 results ordered by MAF threshold (largest
# first) and, within each threshold, by data source in a fixed display order.
arrange(
    mutate(
        result_df,
        data = factor(data, levels = c("exome", "hrc", "topmed", "hrc_topmed", "hrc_topmed_exome"))
    ),
    desc(as.numeric(mafs)),
    data
)

data,rsqs,mafs,num_var,pvals
<fct>,<chr>,<chr>,<int>,<dbl>
exome,,0.01,26,1.0503770000000001e-262
hrc,0.3,0.01,3,3.376964e-182
hrc,0.8,0.01,2,9.967896e-170
topmed,0.3,0.01,9,1.220032e-190
topmed,0.8,0.01,4,1.304211e-179
hrc_topmed,0.3,0.01,9,7.033698e-194
hrc_topmed,0.8,0.01,5,4.544786e-189
hrc_topmed_exome,0.3,0.01,26,1.0503770000000001e-262
hrc_topmed_exome,0.8,0.01,26,1.0503770000000001e-262
exome,,0.005,26,1.0503770000000001e-262


#### APOC3: chr11:116830638

We looked into this variant specifically, for the same reason as for the PCSK9 variant above.

In [57]:
# Single-variant test of chr11:116830638:G:A against TG using the exome calls.
exome_bim <- read_bim("APOC3_exome_maf001_extracted.bim")
exome_fam <- read_fam("APOC3_exome_maf001_extracted.fam")
exome <- read_bed("APOC3_exome_maf001_extracted.bed", exome_bim$id, exome_fam$id)

# Take the variant's row, pair each value with its sample ID, sort by numeric
# ID and keep the samples in tg_id; the result is a one-column matrix.
exome_X <- data.frame(ID = names(exome["chr11:116830638:G:A",]), value=exome["chr11:116830638:G:A",], row.names=NULL)
exome_X <- exome_X %>% arrange(as.numeric(ID)) %>% filter(ID %in% tg_id) %>% select(-ID) %>% as.matrix()

# Linear model with the same covariates as BRV (age, sex, PC1, PC2).
exome.fit <- glm(tg_y ~ exome_X + tg_cov$age + tg_cov$sex + tg_cov$PC1 + tg_cov$PC2, family = "gaussian")
# p-value from the second row (the genotype term) of the coefficient table
coef(summary(exome.fit))[2, 4]

Reading: APOC3_exome_maf001_extracted.bim

Reading: APOC3_exome_maf001_extracted.fam

Reading: APOC3_exome_maf001_extracted.bed



In [54]:
# Print the annotation record of the variant, looked up by its hg19 ID.
fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/hrc_168206ids_chr11_rsq03_maf001_annot.csv.gz") %>% 
    filter(ID_hg19 == "chr11:116701354:G:A")

# Keep only this variant from the HRC file and format it as a one-column
# sample matrix restricted to tg_id (raw-traw branch of format_traw).
hrc <- fread("APOC3_hrc_rsq03_maf001_extracted.traw") %>% filter(SNP == "chr11:116701354:G:A")
hrc_X <- format_traw(hrc, tg_id, "hrc")

# Linear model with the same covariates as BRV (age, sex, PC1, PC2).
hrc.fit <- glm(tg_y ~ hrc_X + tg_cov$age + tg_cov$sex + tg_cov$PC1 + tg_cov$PC2, family = "gaussian")
# p-value from the second row (the genotype term) of the coefficient table
coef(summary(hrc.fit))[2, 4]

Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,ExonicFunc.refGene,MAF_nfe_exome,REVEL_score,ID_hg38,ID_hg19,ID,Function,RawScore,PHRED,R2
<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
11,116830638,116830638,G,A,splicing,APOC3,.,0.0023,.,chr11:116830638:G:A,chr11:116701354:G:A,chr11:116830638:G:A,splicing,5.166115,33,0.836202


In [56]:
# Print the annotation record of the variant, looked up by its hg38 ID.
fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed/topmed_168206ids_chr11_rsq03_maf001_annot.csv.gz") %>% 
    filter(ID_hg38 == "chr11:116830638:G:A")

# Keep only this variant from the TOPMed file and format it as a one-column
# sample matrix restricted to tg_id (raw-traw branch of format_traw).
topmed <- fread("APOC3_topmed_rsq03_maf001_extracted.traw") %>% filter(SNP == "chr11:116830638:G:A")
topmed_X <- format_traw(topmed, tg_id, "topmed")

# Linear model with the same covariates as BRV (age, sex, PC1, PC2).
topmed.fit <- glm(tg_y ~ topmed_X + tg_cov$age + tg_cov$sex + tg_cov$PC1 + tg_cov$PC2, family = "gaussian")
# p-value from the second row (the genotype term) of the coefficient table
coef(summary(topmed.fit))[2,4]

Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,ExonicFunc.refGene,MAF_nfe_exome,REVEL_score,ID_hg38,ID_hg19,ID,Function,RawScore,PHRED,R2
<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
11,116830638,116830638,G,A,splicing,APOC3,.,0.0023,.,chr11:116830638:G:A,chr11:116830638:G:A,chr11:116830638:G:A,splicing,4.947405,33,0.859247
