# Additional Queries

This notebook records some additional queries we made in addition to the simulation

## R2 v.s. r2

We are interested in comparing the correlation between exome and imputed data (r2) versus the imputation quality of imputed data (R2). Since correlation can only be computed between the overlapped variant between exome sequence data and imputed data, we generated another set of files for each gene for overlapped variants only.

### Extract variants

In [1]:
library(dplyr)
library(data.table)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




In [3]:
df <- data.table::fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed_exome/hrc_topmed_exome_168206ids_rsq03_maf001_annot.csv.gz")
head(df)

Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,ExonicFunc.refGene,Function,MAF_nfe_exome,⋯,R2,R2_hrc,R2_topmed,R2_exome,ID,ID_hg38,ID_hg19,source,RawScore,PHRED
<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
1,930165,930165,G,A,exonic;splicing,SAMD11,nonsynonymous SNV,splicing,9.599e-05,⋯,0.767397,0.425596,0.767397,999,chr1:930165:G:A,chr1:930165:G:A,chr1:865545:G:A,exome,4.225333,28.9
1,930204,930204,G,A,exonic,SAMD11,nonsynonymous SNV,missense,6.548e-05,⋯,0.885662,0.584718,0.885662,999,chr1:930204:G:A,chr1:930204:G:A,chr1:865584:G:A,exome,2.879554,23.2
1,930245,930245,G,A,exonic,SAMD11,nonsynonymous SNV,missense,1.937e-05,⋯,0.806128,0.806128,0.324635,999,chr1:930245:G:A,chr1:930245:G:A,chr1:865625:G:A,exome,3.267428,24.0
1,930248,930248,G,A,exonic,SAMD11,nonsynonymous SNV,missense,0.0053,⋯,0.995155,0.995155,0.993817,999,chr1:930248:G:A,chr1:930248:G:A,chr1:865628:G:A,exome,2.36338,21.9
1,930285,930285,G,A,exonic,SAMD11,nonsynonymous SNV,missense,0.0004,⋯,0.724876,0.724876,0.615333,999,chr1:930285:G:A,chr1:930285:G:A,chr1:865665:G:A,exome,0.091394,2.015
1,930314,930314,C,T,exonic,SAMD11,nonsynonymous SNV,missense,0.0008,⋯,0.798359,0.798359,0.782097,999,chr1:930314:C:T,chr1:930314:C:T,chr1:865694:C:T,exome,2.481464,22.4


In [5]:
setwd("~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/overlap/scripts")
for(c in c(1,2)){
    for(maf in c(0.001, 0.005, 0.01)){
        if(maf == 0.001) {
            df1 <- df %>% filter(MAF_nfe_exome < 0.001 | is.na(MAF_nfe_exome))
        } else if (maf == 0.005) {
            df1 <- df %>% filter(MAF_nfe_exome >= 0.001 & MAF_nfe_exome < 0.005)
        } else {
            df1 <- df %>% filter(MAF_nfe_exome >= 0.005 & MAF_nfe_exome < 0.01)
        }
        
        topmed <- df1 %>% filter(Chr == c, R2_topmed != 0, R2_exome == 999)
        hrc <- df1 %>% filter(Chr == c, R2_hrc != 0, R2_exome == 999)
        
        counter = 0
        for(i in seq(1, nrow(topmed), 5000)){
            counter = counter + 1
            if(i+4999 > nrow(topmed)) end = nrow(topmed) else end = i+4999
            
            topmed[c(i:end),] %>% 
                select(ID_hg38) %>% 
                fwrite(sprintf("exome_topmed_ES_chr%i_maf%s_batch%i.txt", c, gsub('\\.', '', as.character(maf)), counter), col.names = FALSE)

            topmed[c(i:end),] %>% 
                select(ID_hg38) %>% 
                fwrite(sprintf("exome_topmed_TP_chr%i_maf%s_batch%i.txt", c, gsub('\\.', '', as.character(maf)), counter), col.names = FALSE)
        }
        
        counter = 0
        for(i in seq(1, nrow(hrc), 5000)){
            counter = counter + 1
            if(i+4999 > nrow(hrc)) end = nrow(hrc) else end = i+4999
                
            hrc[c(i:end),] %>% 
                select(ID_hg38) %>% 
                fwrite(sprintf("exome_hrc_ES_chr%i_maf%s_batch%i.txt", c, gsub('\\.', '', as.character(maf)), counter), col.names = FALSE)
            hrc[c(i:end),] %>% 
                select(ID_hg19) %>% 
                fwrite(sprintf("exome_hrc_HRC_chr%i_maf%s_batch%i.txt", c, gsub('\\.', '', as.character(maf)), counter), col.names = FALSE)
        }        
    }
}

In [8]:
for chr in 1 2; do
    for maf in 0001 0005 001; do
        for dt in ES HRC; do
            extract_prefix='exome_hrc_'$dt'_chr'$chr'_maf'$maf'_batch'
            num_batch=$(ls $extract_prefix*'.txt'|wc -l)
            
            for ((i=1; i<=$num_batch; i++)); do
                script_name='/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/'$extract_prefix$i'.sh'
                out_name='/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/output/'$extract_prefix$i'.out'
                
                if [ "${dt}" = "ES" ]; then
                  bfile='/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/ukb23156_c'$chr'_maf001_LOF_missense_extracted'
                  plink_module='module load Plink/1.9.10'
                  plink_command='plink --bfile '$bfile' --extract '$extract_prefix$i'.txt --make-bed --out '$extract_prefix$i
                else
                  bfile='/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc/hrc_chr'$chr'_rsq03_maf001_LOF_missense_extracted'
                  plink_module='module load Plink/2.00a'
                  plink_command='plink2 --bpfile '$bfile' --extract '$extract_prefix$i'.txt --make-bpgen --export A-transpose --out '$extract_prefix$i >> $script_name
                fi

                echo '#!/bin/bash' > $script_name
                echo '#$ -N' $extract_prefix >> $script_name
                echo '#$ -l h_vmem=10G' >> $script_name
                echo '#$ -l h_rt=600:00:00' >> $script_name
                echo '#$ -o /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/'$extract_prefix$i'_'$JOB_ID'.out' >> $script_name
                echo '#$ -e /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/'$extract_prefix$i'_'$JOB_ID'.err' >> $script_name
                echo '#$ -q csg.q' >> $script_name
                echo '#$ -j y' >> $script_name
                echo '#$ -S /bin/bash' >> $script_name
                echo 'export PATH=$HOME/miniconda3/bin:$PATH' >> $script_name
                echo $plink_module >> $script_name
                echo 'cd /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation' >> $script_name
                echo $plink_command >> $script_name
                echo 'echo "Number of variants in bim file:" >> '$out_name >> $script_name
                echo 'wc -l' $extract_prefix$i'.bim >> '$out_name >> $script_name
                echo 'echo "Number of variants in extract file:" >> '$out_name >> $script_name
                echo 'wc -l '$extract_prefix$i'.txt >> '$out_name >> $script_name
            done
        done
    done
done

In [None]:
for chr in 1 2; do
    for maf in 0001 0005 001; do
        for dt in ES TP; do
            extract_prefix='exome_topmed_'$dt'_chr'$chr'_maf'$maf'_batch'
            num_batch=$(ls $extract_prefix*'.txt'|wc -l)
            
            for ((i=1; i<=$num_batch; i++)); do
                script_name='/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/'$extract_prefix$i'.sh'
                out_name='/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/output/'$extract_prefix$i'.out'
                
                if [ "${dt}" = "ES" ]; then
                  bfile='/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/ukb23156_c'$chr'_maf001_LOF_missense_extracted'
                  plink_module='module load Plink/1.9.10'
                  plink_command='plink --bfile '$bfile' --extract '$extract_prefix$i'.txt --make-bed --out '$extract_prefix$i
                else
                  bfile='/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/topmed/topmed_chr'$chr'_rsq03_maf001_LOF_missense_extracted'
                  plink_module='module load Plink/2.00a'
                  plink_command='plink2 --bpfile '$bfile' --extract '$extract_prefix$i'.txt --make-bpgen --export A-transpose --out '$extract_prefix$i >> $script_name
                fi

                echo '#!/bin/bash' > $script_name
                echo '#$ -N' $extract_prefix >> $script_name
                echo '#$ -l h_vmem=10G' >> $script_name
                echo '#$ -l h_rt=600:00:00' >> $script_name
                echo '#$ -o /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/'$extract_prefix$i'_'$JOB_ID'.out' >> $script_name
                echo '#$ -e /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/'$extract_prefix$i'_'$JOB_ID'.err' >> $script_name
                echo '#$ -q csg.q' >> $script_name
                echo '#$ -j y' >> $script_name
                echo '#$ -S /bin/bash' >> $script_name
                echo 'export PATH=$HOME/miniconda3/bin:$PATH' >> $script_name
                echo $plink_module >> $script_name
                echo 'cd /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation' >> $script_name
                echo $plink_command >> $script_name
                echo 'echo "Number of variants in bim file:" >> '$out_name >> $script_name
                echo 'wc -l' $extract_prefix$i'.bim >> '$out_name >> $script_name
                echo 'echo "Number of variants in extract file:" >> '$out_name >> $script_name
                echo 'wc -l '$extract_prefix$i'.txt >> '$out_name >> $script_name
            done
        done
    done
done

### Calculate correlation

In [10]:
pwd

/home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/overlap/scripts


In [11]:
script_dir="/mnt/vast/hpc/csg/tl3031/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/"
rm $script_dir*'corr'*'.sh'
rm $script_dir*'corr'*'.out'

for chr in 1 2; do
    for maf in 0001 0005 001; doscript_dir="/mnt/vast/hpc/csg/tl3031/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/"
rm $script_dir*'corr'*'.sh'
rm $script_dir*'corr'*'.out'

for chr in 1 2; do
    for maf in 0001 0005 001; do
        for dt in HRC TP; do
            if [ "${dt}" = "HRC" ]; then
                prefix='exome_hrc'
            else
                prefix='exome_topmed'
            fi
            
            exome_prefix=$prefix'_ES_chr'$chr'_maf'$maf'_batch'
            imputed_prefix=$prefix'_'$dt'_chr'$chr'_maf'$maf'_batch'
            num_batch=$(ls $imputed_prefix*'.txt' | wc -l)
            
            for ((i=1; i<=$num_batch; i++)); do
                script_name=$script_dir$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'.sh'
                out_name=$script_dir'output/'$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'.out'
                command='Rscript '$script_dir'calc_corr.R '$exome_prefix$i' '$imputed_prefix$i' '$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'.csv '$dt
                echo '#!/bin/bash' >> $script_name
                echo '#$ -N' $prefix'_chr'$chr'_maf'$maf'_corr_batch'$i >> $script_name
                echo '#$ -l h_vmem=80G' >> $script_name
                echo '#$ -l h_rt=600:00:00' >> $script_name
                echo '#$ -o /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/'$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'_'$JOB_ID'.out' >> $script_name
                echo '#$ -e /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/'$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'_'$JOB_ID'.err' >> $script_name
                echo '#$ -q csg.q' >> $script_name
                echo '#$ -j y' >> $script_name
                echo 'export PATH=$HOME/miniconda3/bin:$PATH' >> $script_name
                echo 'module load R/4.2.2.10' >> $script_name
                echo 'cd /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation' >> $script_name
                echo $command >> $script_name
                echo 'echo "Number of variants in bim file:" >> '$out_name >> $script_name
                echo 'wc -l' $exome_prefix$i'.bim >> '$out_name >> $script_name
                echo 'echo "Number of variants in correlation file:" >> '$out_name >> $script_name
                echo 'wc -l '$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'.csv >> '$out_name >> $script_name
            done
        done
    done
done
        for dt in HRC TP; do
            if [ "${dt}" = "HRC" ]; then
                prefix='exome_hrc'
            else
                prefix='exome_topmed'
            fi
            
            exome_prefix=$prefix'_ES_chr'$chr'_maf'$maf'_batch'
            imputed_prefix=$prefix'_'$dt'_chr'$chr'_maf'$maf'_batch'
            num_batch=$(ls $imputed_prefix*'.txt' | wc -l)
            
            for ((i=1; i<=$num_batch; i++)); do
                script_name=$script_dir$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'.sh'
                out_name=$script_dir'output/'$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'.out'
                command='Rscript '$script_dir'calc_corr.R '$exome_prefix$i' '$imputed_prefix$i' '$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'.csv '$dt
                echo '#!/bin/bash' >> $script_name
                echo '#$ -N' $prefix'_chr'$chr'_maf'$maf'_corr_batch'$i >> $script_name
                echo '#$ -l h_vmem=80G' >> $script_name
                echo '#$ -l h_rt=600:00:00' >> $script_name
                echo '#$ -o /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/'$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'_'$JOB_ID'.out' >> $script_name
                echo '#$ -e /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation/script/'$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'_'$JOB_ID'.err' >> $script_name
                echo '#$ -q csg.q' >> $script_name
                echo '#$ -j y' >> $script_name
                echo 'export PATH=$HOME/miniconda3/bin:$PATH' >> $script_name
                echo 'module load R/4.2.2.10' >> $script_name
                echo 'cd /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation' >> $script_name
                echo $command >> $script_name
                echo 'echo "Number of variants in bim file:" >> '$out_name >> $script_name
                echo 'wc -l' $exome_prefix$i'.bim >> '$out_name >> $script_name
                echo 'echo "Number of variants in correlation file:" >> '$out_name >> $script_name
                echo 'wc -l '$prefix'_chr'$chr'_maf'$maf'_corr_batch'$i'.csv >> '$out_name >> $script_name
            done
        done
    done
done

## Check correlation and Rsq

In [12]:
fname_lst <- list.files("/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/correlation", pattern = ".csv", full.names = TRUE)

corr_df <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(corr_df) <- c("snp_id", "corr", "rsq", "dataset")
for(i in fname_lst){
    dataset <- case_when(
        grepl("hrc", i, fixed = TRUE) & grepl("maf001", i, fixed = TRUE) ~ "hrc_maf001",
        grepl("hrc", i, fixed = TRUE) & grepl("maf0005", i, fixed = TRUE) ~ "hrc_maf0005",
        grepl("hrc", i, fixed = TRUE) & grepl("maf0001", i, fixed = TRUE) ~ "hrc_maf0001",
        grepl("topmed", i, fixed = TRUE) & grepl("maf001", i, fixed = TRUE) ~ "topmed_maf001",
        grepl("topmed", i, fixed = TRUE) & grepl("maf0005", i, fixed = TRUE) ~ "topmed_maf0005",
        grepl("topmed", i, fixed = TRUE) & grepl("maf0001", i, fixed = TRUE) ~ "topmed_maf0001",
    )
    
    dataset_corr_df <- fread(i) %>% mutate(dataset = dataset)
    corr_df <- rbind(corr_df, dataset_corr_df)
}

dim(corr_df)

In [13]:
hrc_corr_df <- corr_df %>% filter(stringr::str_detect(dataset, "hrc_maf")) %>% select(snpid, corr) %>% rename(corr_hrc = corr)
topmed_corr_df <- corr_df %>% filter(stringr::str_detect(dataset, "topmed_maf")) %>% select(snpid, corr) %>% rename(corr_topmed = corr)

In [14]:
hrc_topmed_df <- left_join(df, hrc_corr_df, by = c("ID_hg38" = "snpid"))
hrc_topmed_df <- left_join(hrc_topmed_df, topmed_corr_df, by = c("ID_hg38" = "snpid"))
hrc_topmed_df <- hrc_topmed_df %>% 
    filter(R2 != 0) %>%
    mutate(maf_cat = case_when(is.na(MAF_nfe_exome) | MAF_nfe_exome < 0.001 ~ "MAF < 0.001",
                               MAF_nfe_exome >= 0.001 & MAF_nfe_exome < 0.005 ~ "0.001 <= MAF < 0.005",
                               MAF_nfe_exome >= 0.005 & MAF_nfe_exome < 0.01 ~ "0.005 <= MAF < 0.01")) %>%
    group_by(maf_cat)

In [15]:
hrc_topmed_df <- hrc_topmed_df %>% 
    mutate(corr_hrc = ifelse(is.na(corr_hrc), 0, corr_hrc),
           corr_topmed = ifelse(is.na(corr_topmed), 0, corr_topmed)) %>%
    mutate(corr = ifelse(corr_hrc > corr_topmed, corr_hrc, corr_topmed),
           corr_source = ifelse(corr_hrc > corr_topmed, "HRC", "TOPMed")) 

In [16]:
hrc_summary <- df %>% 
    filter(R2_hrc != 0) %>% 
    mutate(maf_cat = case_when(is.na(MAF_nfe_exome) | MAF_nfe_exome < 0.001 ~ "MAF < 0.001",
                               MAF_nfe_exome >= 0.001 & MAF_nfe_exome < 0.005 ~ "0.001 <= MAF < 0.005",
                               MAF_nfe_exome >= 0.005 & MAF_nfe_exome < 0.01 ~ "0.005 <= MAF < 0.01")) %>%
    group_by(maf_cat) %>%
    summarise(
        R2_mean = mean(R2_hrc),
        R2_std = sd(R2_hrc),
        R2_count = n()
    ) %>%
    mutate(maf_cat = factor(maf_cat, levels = c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01")),
          data = "HRC") %>% 
    arrange(maf_cat)

In [17]:
topmed_summary <- df %>% 
    filter(R2_topmed != 0) %>% 
    mutate(maf_cat = case_when(is.na(MAF_nfe_exome) | MAF_nfe_exome < 0.001 ~ "MAF < 0.001",
                               MAF_nfe_exome >= 0.001 & MAF_nfe_exome < 0.005 ~ "0.001 <= MAF < 0.005",
                               MAF_nfe_exome >= 0.005 & MAF_nfe_exome < 0.01 ~ "0.005 <= MAF < 0.01")) %>%
    group_by(maf_cat) %>%
    summarise(
        R2_mean = mean(R2_topmed),
        R2_std = sd(R2_topmed),
        R2_count = n()
    ) %>%
    mutate(maf_cat = factor(maf_cat, levels = c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01")),
          data = "TOPMed") %>% 
    arrange(maf_cat)

In [24]:
## HRC, TOPMed R2
rbind(hrc_summary, topmed_summary) %>%
    mutate(R2_se = R2_std/sqrt(R2_count))

maf_cat,R2_mean,R2_std,R2_count,data,R2_se
<fct>,<dbl>,<dbl>,<int>,<chr>,<dbl>
MAF < 0.001,0.68907,0.200122,28538,HRC,0.00118463
0.001 <= MAF < 0.005,0.88178,0.150094,5354,HRC,0.00205128
0.005 <= MAF < 0.01,0.93032,0.107334,1342,HRC,0.00292994
MAF < 0.001,0.68673,0.197036,203645,TOPMed,0.00043663
0.001 <= MAF < 0.005,0.90304,0.091267,5597,TOPMed,0.00121993
0.005 <= MAF < 0.01,0.96321,0.066508,1411,TOPMed,0.00177056


In [25]:
## HRC, TOPMed r2
hrc_topmed_corr_summary <-
    corr_df %>% 
    group_by(dataset) %>% 
    summarise(corr_mean = mean(corr), corr_std = sd(corr), rsq_mean = mean(rsq), rsq_std = sd(rsq), corr_count = n()) %>%
    mutate(maf_cat = case_when(grepl("0001", dataset) ~ "MAF < 0.001",
                               grepl("0005", dataset) ~ "0.001 <= MAF < 0.005",
                               grepl("001", dataset)  ~ "0.005 <= MAF < 0.01",
                               TRUE ~ "dataset"),
          data = ifelse(grepl("hrc", dataset), "HRC", "TOPMed")) %>%
    select("maf_cat", "corr_mean", "corr_std", "corr_count", "data")
    
rbind(hrc_summary, topmed_summary) %>% 
    left_join(hrc_topmed_corr_summary)  %>% 
    select(data, maf_cat, R2_mean, R2_std, R2_count, corr_mean, corr_std, corr_count)

[1m[22mJoining with `by = join_by(maf_cat, data)`


data,maf_cat,R2_mean,R2_std,R2_count,corr_mean,corr_std,corr_count
<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>
HRC,MAF < 0.001,0.68907,0.200122,28538,0.66954,0.28018,26178
HRC,0.001 <= MAF < 0.005,0.88178,0.150094,5354,0.90534,0.14729,5239
HRC,0.005 <= MAF < 0.01,0.93032,0.107334,1342,0.93787,0.20053,1304
TOPMed,MAF < 0.001,0.68673,0.197036,203645,0.66342,0.32862,165244
TOPMed,0.001 <= MAF < 0.005,0.90304,0.091267,5597,0.93094,0.11896,5503
TOPMed,0.005 <= MAF < 0.01,0.96321,0.066508,1411,0.95648,0.1953,1374


In [26]:
## hrc_topmed_r2
hrc_topmed_df %>% 
    filter(corr != 0) %>%
    summarise(mean_corr = mean(corr), se_corr = sd(corr) / sqrt(n()), count_corr = n()) %>% 
    mutate(maf_cat = factor(maf_cat, levels = c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01"))) %>% 
    arrange(maf_cat)

maf_cat,mean_corr,se_corr,count_corr
<fct>,<dbl>,<dbl>,<int>
MAF < 0.001,0.74645,0.00063846,148289
0.001 <= MAF < 0.005,0.94635,0.00160662,5583
0.005 <= MAF < 0.01,0.95616,0.00518559,1384


In [27]:
## hrc_topmed_R2
hrc_topmed_df %>% 
    summarise(mean_R2 = mean(R2), se_R2 = sd(R2) / sqrt(n()), count_R2 = n()) %>% 
    mutate(maf_cat = factor(maf_cat, levels = c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01"))) %>% 
    arrange(maf_cat)

maf_cat,mean_R2,se_R2,count_R2
<fct>,<dbl>,<dbl>,<int>
MAF < 0.001,0.68856,0.00043696,206209
0.001 <= MAF < 0.005,0.92985,0.00136855,5733
0.005 <= MAF < 0.01,0.95635,0.00232959,1440


### T-test for R2 vs r2

#### Table S2

In [28]:
hrc_corr_df <- corr_df %>% 
    mutate(maf_cat = case_when(grepl("0001", dataset) ~ "MAF < 0.001",
                               grepl("0005", dataset) ~ "0.001 <= MAF < 0.005",
                               grepl("001", dataset)  ~ "0.005 <= MAF < 0.01",
                               TRUE ~ "dataset")) %>%
    filter(grepl("hrc_", dataset)) %>%
    select(snpid, corr, maf_cat)

topmed_corr_df <- corr_df %>% 
    mutate(maf_cat = case_when(grepl("0001", dataset) ~ "MAF < 0.001",
                               grepl("0005", dataset) ~ "0.001 <= MAF < 0.005",
                               grepl("001", dataset)  ~ "0.005 <= MAF < 0.01",
                               TRUE ~ "dataset")) %>%
    filter(grepl("topmed_", dataset)) %>%
    select(snpid, corr, maf_cat)

In [29]:
hrc_r2_df <- df %>% 
    filter(R2_hrc != 0) %>% 
    mutate(maf_cat = case_when(is.na(MAF_nfe_exome) | MAF_nfe_exome < 0.001 ~ "MAF < 0.001",
                               MAF_nfe_exome >= 0.001 & MAF_nfe_exome < 0.005 ~ "0.001 <= MAF < 0.005",
                               MAF_nfe_exome >= 0.005 & MAF_nfe_exome < 0.01 ~ "0.005 <= MAF < 0.01")) %>%
    select(ID_hg38, R2_hrc, maf_cat)

topmed_r2_df <- df %>% 
    filter(R2_topmed != 0) %>% 
    mutate(maf_cat = case_when(is.na(MAF_nfe_exome) | MAF_nfe_exome < 0.001 ~ "MAF < 0.001",
                               MAF_nfe_exome >= 0.001 & MAF_nfe_exome < 0.005 ~ "0.001 <= MAF < 0.005",
                               MAF_nfe_exome >= 0.005 & MAF_nfe_exome < 0.01 ~ "0.005 <= MAF < 0.01")) %>%
    select(ID_hg38, R2_topmed, maf_cat)

In [30]:
## check if the R2 is statistically different for HRC v.s. TOPMed

r2_df <- data.frame(matrix(ncol=9, nrow = 0))
colnames(r2_df) <- c("r2_mean_hrc", "r2_se_hrc", "r2_n_hrc", "r2_mean_topmed", "r2_se_topmed", "r2_n_topmed", "test_stat", "p_value", "maf_cat")

for(maf in c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01")){
    hrc <- hrc_r2_df %>% filter(maf_cat == maf)
    topmed <- topmed_r2_df %>% filter(maf_cat == maf)
    
    test_result <- t.test(hrc$R2_hrc, topmed$R2_topmed)
    
    sub_r2_df <- data.frame("r2_mean_hrc" = mean(hrc$R2_hrc),
                              "r2_se_hrc" = sd(hrc$R2_hrc) / sqrt(length(hrc$R2_hrc)),
                              "r2_n_hrc" = length(hrc$R2_hrc), 
                              "r2_mean_topmed" = mean(topmed$R2_topmed),
                              "r2_se_topmed" = sd(topmed$R2_topmed) / sqrt(length(topmed$R2_topmed)),
                              "r2_n_topmed" = length(topmed$R2_topmed), 
                              "test_stat" = test_result$statistic, 
                              "p_value" = test_result$p.value,
                              "maf_cat" = maf)
    r2_df = rbind(r2_df, sub_r2_df)
}

r2_df

Unnamed: 0_level_0,r2_mean_hrc,r2_se_hrc,r2_n_hrc,r2_mean_topmed,r2_se_topmed,r2_n_topmed,test_stat,p_value,maf_cat
Unnamed: 0_level_1,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<chr>
t,0.68907,0.0011846,28538,0.68673,0.00043663,203645,1.8555,0.063531,MAF < 0.001
t1,0.88178,0.0020513,5354,0.90304,0.00121993,5597,-8.9073,6.277400000000001e-19,0.001 <= MAF < 0.005
t2,0.93032,0.0029299,1342,0.96321,0.00177056,1411,-9.609,1.9038999999999998e-21,0.005 <= MAF < 0.01


In [31]:
## check if the RSQ is statistically different for HRC v.s. HRC_TOPMed

r2_df <- data.frame(matrix(ncol=9, nrow = 0))
colnames(r2_df) <- c("r2_mean_hrc", "r2_se_hrc", "r2_n_hrc", "r2_mean_hrc_topmed", "r2_se_hrc_topmed", "r2_n_hrc_topmed", "test_stat", "p_value", "maf_cat")

for(maf in c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01")){
    hrc <- hrc_r2_df %>% filter(maf_cat == maf)
    hrc_topmed <- hrc_topmed_df %>% filter(maf_cat == maf)
    
    test_result <- t.test(hrc$R2_hrc, hrc_topmed$R2)
    
    sub_r2_df <- data.frame("r2_mean_hrc" = mean(hrc$R2_hrc),
                              "r2_se_hrc" = sd(hrc$R2_hrc) / sqrt(length(hrc$R2_hrc)),
                              "r2_n_hrc" = length(hrc$R2_hrc), 
                              "r2_mean_hrc_topmed" = mean(hrc_topmed$R2),
                              "r2_se_hrc_topmed" = sd(hrc_topmed$R2) / sqrt(length(hrc_topmed$R2)),
                              "r2_n_hrc_topmed" = length(hrc_topmed$R2), 
                              "test_stat" = test_result$statistic, 
                              "p_value" = test_result$p.value,
                              "maf_cat" = maf)
    r2_df = rbind(r2_df, sub_r2_df)
}

r2_df

Unnamed: 0_level_0,r2_mean_hrc,r2_se_hrc,r2_n_hrc,r2_mean_hrc_topmed,r2_se_hrc_topmed,r2_n_hrc_topmed,test_stat,p_value,maf_cat
Unnamed: 0_level_1,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<chr>
t,0.68907,0.0011846,28538,0.68856,0.00043696,206209,0.40004,0.68913,MAF < 0.001
t1,0.88178,0.0020513,5354,0.92985,0.00136855,5733,-19.49355,5.2782000000000004e-83,0.001 <= MAF < 0.005
t2,0.93032,0.0029299,1342,0.95635,0.00232959,1440,-6.95453,4.4568e-12,0.005 <= MAF < 0.01


In [32]:
## check if the RSQ is statistically different for TOPMed v.s. HRC_TOPMed

r2_df <- data.frame(matrix(ncol=9, nrow = 0))
colnames(r2_df) <- c("r2_mean_topmed", "r2_se_topmed", "r2_n_topmed", "r2_mean_hrc_topmed", "r2_se_hrc_topmed", "r2_n_hrc_topmed", "test_stat", "p_value", "maf_cat")

for(maf in c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01")){
    topmed <- topmed_r2_df %>% filter(maf_cat == maf)
    hrc_topmed <- hrc_topmed_df %>% filter(maf_cat == maf)
    
    test_result <- t.test(hrc$R2_hrc, hrc_topmed$R2)
    
    sub_r2_df <- data.frame("r2_mean_topmed" = mean(topmed$R2_topmed),
                              "r2_se_topmed" = sd(topmed$R2_topmed) / sqrt(length(topmed$R2_topmed)),
                              "r2_n_topmed" = length(topmed$R2_topmed), 
                              "r2_mean_hrc_topmed" = mean(hrc_topmed$R2),
                              "r2_se_hrc_topmed" = sd(hrc_topmed$R2) / sqrt(length(hrc_topmed$R2)),
                              "r2_n_hrc_topmed" = length(hrc_topmed$R2), 
                              "test_stat" = test_result$statistic, 
                              "p_value" = test_result$p.value,
                              "maf_cat" = maf)
    r2_df = rbind(r2_df, sub_r2_df)
}

r2_df

Unnamed: 0_level_0,r2_mean_topmed,r2_se_topmed,r2_n_topmed,r2_mean_hrc_topmed,r2_se_hrc_topmed,r2_n_hrc_topmed,test_stat,p_value,maf_cat
Unnamed: 0_level_1,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<chr>
t,0.68673,0.00043663,203645,0.68856,0.00043696,206209,81.60905,0.0,MAF < 0.001
t1,0.90304,0.00121993,5597,0.92985,0.00136855,5733,0.14574,0.88414,0.001 <= MAF < 0.005
t2,0.96321,0.00177056,1411,0.95635,0.00232959,1440,-6.95453,4.4568e-12,0.005 <= MAF < 0.01


#### Table S3

In [33]:
## check if the correlation and R2 is statistically different from each other 

corr_rsq_df <- data.frame(matrix(ncol=10, nrow = 0))
colnames(corr_rsq_df) <- c("dataset", "maf_cat", "corr_mean", "corr_se", "corr_n", "rsq_mean", "rsq_se", "rsq_n", "test_stat", "p_value")

for(maf in c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01")){
    hrc_corr <- hrc_corr_df %>% filter(maf_cat == maf)
    hrc_rsq <- hrc_r2_df %>% filter(maf_cat == maf)
    
    topmed_corr <- topmed_corr_df %>% filter(maf_cat == maf)
    topmed_rsq <- topmed_r2_df %>% filter(maf_cat == maf)
    
    hrc_test_result <- t.test(hrc_corr$corr, hrc_rsq$R2_hrc)
    topmed_test_result <- t.test(topmed_corr$corr, topmed_rsq$R2_topmed)
    
    sub_df_hrc <- data.frame("dataset" = "HRC", 
                             "maf_cat" = maf, 
                             "corr_mean" = mean(hrc_corr$corr), 
                             "corr_se" = sd(hrc_corr$corr) / sqrt(length(hrc_corr$corr)),
                             "corr_n" = length(hrc_corr$corr), 
                             "rsq_mean" = mean(hrc_rsq$R2_hrc), 
                             "rsq_se" = sd(hrc_rsq$R2_hrc) / sqrt(length(hrc_rsq$R2_hrc)),
                             "rsq_n" = length(hrc_rsq$R2_hrc), 
                             "test_stat" = hrc_test_result$statistic, 
                             "p_value" = hrc_test_result$p.value)
    
    sub_df_topmed <- data.frame("dataset" = "TOPMed", 
                                "maf_cat" = maf, 
                                "corr_mean" = mean(topmed_corr$corr), 
                                "corr_se" = sd(topmed_corr$corr) / sqrt(length(topmed_corr$corr)),
                                "corr_n" = length(topmed_corr$corr), 
                                "rsq_mean" = mean(topmed_rsq$R2_topmed), 
                                "rsq_se" = sd(topmed_rsq$R2_topmed) / sqrt(length(topmed_rsq$R2_topmed)),
                                "rsq_n" = length(topmed_rsq$R2_topmed), 
                                "test_stat" = topmed_test_result$statistic, 
                                "p_value" = topmed_test_result$p.value)
    
    corr_rsq_df = rbind(corr_rsq_df, sub_df_hrc, sub_df_topmed)
}

corr_rsq_df %>% arrange(dataset)

Unnamed: 0_level_0,dataset,maf_cat,corr_mean,corr_se,corr_n,rsq_mean,rsq_se,rsq_n,test_stat,p_value
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
t,HRC,MAF < 0.001,0.66954,0.00173168,26178,0.68907,0.00118463,28538,-9.3064,1.3773e-20
t2,HRC,0.001 <= MAF < 0.005,0.90534,0.00203497,5239,0.88178,0.00205128,5354,8.1545,3.9029e-16
t4,HRC,0.005 <= MAF < 0.01,0.93787,0.00555317,1304,0.93032,0.00292994,1342,1.2033,0.22902
t1,TOPMed,MAF < 0.001,0.66342,0.00080841,165244,0.68673,0.00043663,203645,-25.3676,8.5881e-142
t3,TOPMed,0.001 <= MAF < 0.005,0.93094,0.00160361,5503,0.90304,0.00121993,5597,13.8488,3.1507e-43
t5,TOPMed,0.005 <= MAF < 0.01,0.95648,0.0052687,1374,0.96321,0.00177056,1411,-1.2107,0.22619


In [35]:
hrc_topmed_full_df <- hrc_topmed_df %>% 
    filter(R2 != 0) %>%
    mutate(maf_cat = case_when(is.na(MAF_nfe_exome) | MAF_nfe_exome < 0.001 ~ "MAF < 0.001",
                               MAF_nfe_exome >= 0.001 & MAF_nfe_exome < 0.005 ~ "0.001 <= MAF < 0.005",
                               MAF_nfe_exome >= 0.005 & MAF_nfe_exome < 0.01 ~ "0.005 <= MAF < 0.01"))

corr_rsq_hrc_topmed_df <- data.frame(matrix(ncol=10, nrow = 0))
colnames(corr_rsq_hrc_topmed_df) <- c("dataset", "maf_cat", "corr_mean", "corr_se", "corr_n", "rsq_mean", "rsq_se", "rsq_n", "test_stat", "p_value")

for(maf in c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01")){
    subdf <- hrc_topmed_full_df %>% filter(maf_cat == maf)
    corr <- subdf %>% filter(corr != 0) %>% pull(corr)
    test_result <- t.test(corr, subdf$R2)
    
    sub_df <- data.frame("dataset" = "HRC_TOPMed", 
                             "maf_cat" = maf, 
                             "corr_mean" = mean(corr), 
                             "corr_se" = sd(corr) / sqrt(length(corr)),
                             "corr_n" = length(corr), 
                             "rsq_mean" = mean(subdf$R2), 
                             "rsq_se" = sd(subdf$R2) / sqrt(length(subdf$R2)),
                             "rsq_n" = length(subdf$R2), 
                             "test_stat" = test_result$statistic, 
                             "p_value" = test_result$p.value)
    
    corr_rsq_hrc_topmed_df = rbind(sub_df, corr_rsq_hrc_topmed_df)
}
corr_rsq_hrc_topmed_df

Unnamed: 0_level_0,dataset,maf_cat,corr_mean,corr_se,corr_n,rsq_mean,rsq_se,rsq_n,test_stat,p_value
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
t,HRC_TOPMed,0.005 <= MAF < 0.01,0.95616,0.00518559,1384,0.95635,0.00232959,1440,-0.032878,0.97378
t2,HRC_TOPMed,0.001 <= MAF < 0.005,0.94635,0.00160662,5583,0.92985,0.00136855,5733,7.819454,5.7904e-15
t1,HRC_TOPMed,MAF < 0.001,0.74645,0.00063846,148289,0.68856,0.00043696,206209,74.817929,0.0


In [36]:
hrc_topmed_df %>% 
    mutate(maf_cat = case_when(is.na(MAF_nfe_exome) | MAF_nfe_exome < 0.001 ~ "MAF < 0.001",
                               MAF_nfe_exome >= 0.001 & MAF_nfe_exome < 0.005 ~ "0.001 <= MAF < 0.005",
                               MAF_nfe_exome >= 0.005 & MAF_nfe_exome < 0.01 ~ "0.005 <= MAF < 0.01")) %>%
    group_by(maf_cat) %>%
    summarise(mean_R2 = mean(R2), se_corr = sd(R2) / sqrt(n()), count_R2 = n()) %>% 
    mutate(maf_cat = factor(maf_cat, levels = c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01"))) %>% 
    arrange(maf_cat)

maf_cat,mean_R2,se_corr,count_R2
<fct>,<dbl>,<dbl>,<int>
MAF < 0.001,0.68856,0.00043696,206209
0.001 <= MAF < 0.005,0.92985,0.00136855,5733
0.005 <= MAF < 0.01,0.95635,0.00232959,1440


#### Overlapped variants - Table S2

In [37]:
df_overlap <- df %>% 
    filter(R2_topmed != 0, R2_hrc != 0) %>% 
    left_join(corr_df %>% filter(grepl("hrc", dataset)) %>% rename("corr_hrc" = "corr") %>% select(-dataset, -rsq), by = c("ID_hg38" = "snpid")) %>%
    left_join(corr_df %>% filter(grepl("topmed", dataset)) %>% rename("corr_topmed" = "corr") %>% select(-dataset, -rsq), by = c("ID_hg38" = "snpid")) %>%
    mutate(maf_cat = case_when(is.na(MAF_nfe_exome) | MAF_nfe_exome < 0.001 ~ "MAF < 0.001",
                               MAF_nfe_exome >= 0.001 & MAF_nfe_exome < 0.005 ~ "0.001 <= MAF < 0.005",
                               MAF_nfe_exome >= 0.005 & MAF_nfe_exome < 0.01 ~ "0.005 <= MAF < 0.01"))

In [38]:
corr_rsq_overlap_df <- data.frame(matrix(ncol=10, nrow = 0))
colnames(corr_rsq_overlap_df) <- c("dataset", "maf_cat", "corr_mean", "corr_se", "corr_n", "rsq_mean", "rsq_se", "rsq_n", "test_stat", "p_value")

for(maf in c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01")){
    hrc_corr <- df_overlap %>% filter(maf_cat == maf) %>% pull(corr_hrc)
    hrc_corr <- hrc_corr[!is.na(hrc_corr)]
    hrc_rsq <- df_overlap %>% filter(maf_cat == maf) %>% pull(R2_hrc)
    
    topmed_corr <- df_overlap %>% filter(maf_cat == maf) %>% pull(corr_topmed)
    topmed_corr <- topmed_corr[!is.na(topmed_corr)]
    topmed_rsq <- df_overlap %>% filter(maf_cat == maf) %>% pull(R2_topmed)
    
    hrc_test_result <- t.test(hrc_corr, hrc_rsq)
    topmed_test_result <- t.test(topmed_corr, topmed_rsq)
    
    sub_df_hrc <- data.frame("dataset" = "HRC", 
                             "maf_cat" = maf, 
                             "corr_mean" = mean(hrc_corr), 
                             "corr_se" = sd(hrc_corr) / sqrt(length(hrc_corr)),
                             "corr_n" = length(hrc_corr), 
                             "rsq_mean" = mean(hrc_rsq), 
                             "rsq_se" = sd(hrc_rsq) / sqrt(length(hrc_rsq)),
                             "rsq_n" = length(hrc_rsq), 
                             "test_stat" = hrc_test_result$statistic, 
                             "p_value" = hrc_test_result$p.value)
    
    sub_df_topmed <- data.frame("dataset" = "TOPMed", 
                                "maf_cat" = maf, 
                                "corr_mean" = mean(topmed_corr), 
                                "corr_se" = sd(topmed_corr) / sqrt(length(topmed_corr)),
                                "corr_n" = length(topmed_corr), 
                                "rsq_mean" = mean(topmed_rsq), 
                                "rsq_se" = sd(topmed_rsq) / sqrt(length(topmed_rsq)),
                                "rsq_n" = length(topmed_rsq), 
                                "test_stat" = topmed_test_result$statistic, 
                                "p_value" = topmed_test_result$p.value)
    
    corr_rsq_overlap_df = rbind(corr_rsq_overlap_df, sub_df_hrc, sub_df_topmed)
}

corr_rsq_overlap_df %>% arrange(dataset)

Unnamed: 0_level_0,dataset,maf_cat,corr_mean,corr_se,corr_n,rsq_mean,rsq_se,rsq_n,test_stat,p_value
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
t,HRC,MAF < 0.001,0.68895,0.001691,24507,0.70523,0.0012099,25974,-7.83014,4.9797e-15
t2,HRC,0.001 <= MAF < 0.005,0.90995,0.0019327,5154,0.88861,0.0019642,5218,7.74472,1.0474e-14
t4,HRC,0.005 <= MAF < 0.01,0.94143,0.0054522,1290,0.93787,0.0025202,1313,0.59253,0.55357
t1,TOPMed,MAF < 0.001,0.83922,0.0011727,24507,0.79348,0.0009873,25974,29.83925,6.8946e-194
t3,TOPMed,0.001 <= MAF < 0.005,0.93821,0.0013255,5154,0.9072,0.0011628,5218,17.58826,3.0784000000000002e-68
t5,TOPMed,0.005 <= MAF < 0.01,0.96236,0.0053148,1290,0.96866,0.0014622,1313,-1.14197,0.25365


#### Overlapped variants - Table S3

In [39]:
rsq_overlap_df <- data.frame(matrix(ncol=10, nrow = 0))
colnames(rsq_overlap_df) <- c("maf_cat", "rsq_mean", "rsq_se", "rsq_n", "test_stat", "p_value")

for(maf in c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01")){
    hrc_rsq <- df_overlap %>% filter(maf_cat == maf) %>% pull(R2_hrc)
    topmed_rsq <- df_overlap %>% filter(maf_cat == maf) %>% pull(R2_topmed)
    
    test_result <- t.test(hrc_rsq, topmed_rsq)
#     t_statistic <- (mean(hrc_rsq) - mean(topmed_rsq)) / sqrt((var(hrc_rsq)/length(hrc_rsq)) + (var(topmed_rsq)/length(topmed_rsq)))
    
#     t.value = unname(test_result$statistic)
#     df = unname(test_result$parameter[1])
#     p = as.character(2*pt(-abs(t.value), df))
#     print(.N(test_result$p.value))
    
    sub_df <- data.frame("maf_cat" = maf, 
                         "rsq_hrc_mean" = mean(hrc_rsq), 
                         "rsq_hrc_se" = sd(hrc_rsq) / sqrt(length(hrc_rsq)),
                         # "rsq_hrc_se" = sd(hrc_rsq),
                         "rsq_hrc_n" = length(hrc_rsq), 
                         "rsq_topmed_mean" = mean(topmed_rsq), 
                         "rsq_topmed_se" = sd(topmed_rsq) / sqrt(length(topmed_rsq)),
                         # "rsq_topmed_se" = sd(topmed_rsq),
                         "rsq_topmed_n" = length(topmed_rsq), 
                         "test_stat" = test_result$statistic, 
                         "p_value" = test_result$p.value)
    
    rsq_overlap_df = rbind(rsq_overlap_df, sub_df)
}

rsq_overlap_df %>% arrange(dataset)

Unnamed: 0_level_0,maf_cat,rsq_hrc_mean,rsq_hrc_se,rsq_hrc_n,rsq_topmed_mean,rsq_topmed_se,rsq_topmed_n,test_stat,p_value
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>
t,MAF < 0.001,0.70523,0.0012099,25974,0.79348,0.0009873,25974,-56.5129,0.0
t1,0.001 <= MAF < 0.005,0.88861,0.0019642,5218,0.9072,0.0011628,5218,-8.1453,4.321e-16
t2,0.005 <= MAF < 0.01,0.93787,0.0025202,1313,0.96866,0.0014622,1313,-10.5646,1.8625000000000002e-25


In [40]:
## check if the RSQ is statistically different v.s. HRC_TOPMed

r2_df <- data.frame(matrix(ncol=10, nrow = 0))
colnames(r2_df) <- c("data", "r2_mean", "r2_se", "r2_n", "r2_mean_hrc_topmed", "r2_se_hrc_topmed", "r2_n_hrc_topmed", "test_stat", "p_value", "maf_cat")

for(maf in c("MAF < 0.001", "0.001 <= MAF < 0.005", "0.005 <= MAF < 0.01")){
    hrc_r2 <- df_overlap %>% filter(maf_cat == maf) %>% pull(R2_hrc)
    topmed_r2 <- df_overlap %>% filter(maf_cat == maf) %>% pull(R2_topmed)
    hrc_topmed <- hrc_topmed_df %>% filter(maf_cat == maf) %>% pull(R2)
    
    hrc_test_result <- t.test(hrc_r2, hrc_topmed)
    topmed_test_result <- t.test(topmed_r2, hrc_topmed)
    
    hrc_t_statistic <- (mean(hrc_r2) - mean(hrc_topmed)) / sqrt((var(hrc_r2)/length(hrc_r2)) + (var(hrc_topmed)/length(hrc_topmed)))
    topmed_t_statistic <- (mean(topmed_r2) - mean(hrc_topmed)) / sqrt((var(topmed_r2)/length(topmed_r2)) + (var(hrc_topmed)/length(hrc_topmed)))


    hrc_r2_df <- data.frame("data" = "HRC", 
                            "r2_mean" = mean(hrc_r2),
                            "r2_se" = sd(hrc_r2) / sqrt(length(hrc_r2)),
                            "r2_n" = length(hrc_r2), 
                            "r2_mean_hrc_topmed" = mean(hrc_topmed),
                            "r2_se_hrc_topmed" = sd(hrc_topmed) / sqrt(length(hrc_topmed)),
                            "r2_n_hrc_topmed" = length(hrc_topmed), 
                            "test_stat" = hrc_test_result$statistic, 
                            "p_value" = hrc_test_result$p.value,
                            "maf_cat" = maf)
    
    
    topmed_r2_df <- data.frame("data" = "TOPMed",
                               "r2_mean" = mean(topmed_r2),
                               "r2_se" = sd(topmed_r2) / sqrt(length(topmed_r2)),
                               "r2_n" = length(topmed_r2), 
                               "r2_mean_hrc_topmed" = mean(hrc_topmed),
                               "r2_se_hrc_topmed" = sd(hrc_topmed) / sqrt(length(hrc_topmed)),
                               "r2_n_hrc_topmed" = length(hrc_topmed), 
                               "test_stat" = topmed_test_result$statistic, 
                               "p_value" = topmed_test_result$p.value,
                                "maf_cat" = maf)
    
    r2_df = rbind(r2_df, hrc_r2_df, topmed_r2_df)
}

r2_df

Unnamed: 0_level_0,data,r2_mean,r2_se,r2_n,r2_mean_hrc_topmed,r2_se_hrc_topmed,r2_n_hrc_topmed,test_stat,p_value,maf_cat
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<chr>
t,HRC,0.70523,0.0012099,25974,0.68856,0.00043696,206209,12.9545,2.7447999999999996e-38,MAF < 0.001
t1,TOPMed,0.79348,0.0009873,25974,0.68856,0.00043696,206209,97.1745,0.0,MAF < 0.001
t2,HRC,0.88861,0.0019642,5218,0.92985,0.00136855,5733,-17.2253,1.6940999999999999e-65,0.001 <= MAF < 0.005
t3,TOPMed,0.9072,0.0011628,5218,0.92985,0.00136855,5733,-12.6091,3.3811e-36,0.001 <= MAF < 0.005
t4,HRC,0.93787,0.0025202,1313,0.95635,0.00232959,1440,-5.3836,7.9271e-08,0.005 <= MAF < 0.01
t5,TOPMed,0.96866,0.0014622,1313,0.95635,0.00232959,1440,4.4739,8.0417e-06,0.005 <= MAF < 0.01


In [41]:
head(df_overlap) %>% select(starts_with("R2"))

R2,R2_hrc,R2_topmed,R2_exome
<dbl>,<dbl>,<dbl>,<int>
0.7674,0.4256,0.7674,999
0.88566,0.58472,0.88566,999
0.80613,0.80613,0.32464,999
0.99516,0.99516,0.99382,999
0.72488,0.72488,0.61533,999
0.79836,0.79836,0.7821,999


In [42]:
head(hrc_topmed_df) %>% select(starts_with("R2"))

[1m[22mAdding missing grouping variables: `maf_cat`


maf_cat,R2,R2_hrc,R2_topmed,R2_exome
<chr>,<dbl>,<dbl>,<dbl>,<int>
MAF < 0.001,0.7674,0.4256,0.7674,999
MAF < 0.001,0.88566,0.58472,0.88566,999
MAF < 0.001,0.80613,0.80613,0.32464,999
0.005 <= MAF < 0.01,0.99516,0.99516,0.99382,999
MAF < 0.001,0.72488,0.72488,0.61533,999
MAF < 0.001,0.79836,0.79836,0.7821,999


In [43]:
.Machine$double.xmin

## Power for overlapped variants of TOPMed and Exome

We are going to perform the simulation with an overlapped set of variants from Exome and TOPMed using the already simulated phenotype from previous simulations. 

**Parameters**: all variants causal, OR = 1.5, disease prevalence = 0.1, $R^2$ = 0.3

### Create per gene genotype file

In [48]:
library(dplyr)
library(data.table)

annot <- fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed_exome/hrc_topmed_exome_168206ids_rsq03_maf001_annot.csv.gz")

In [49]:
setwd("~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp")

annot_overlap <- annot %>% filter(!is.na(R2_topmed), R2_topmed != 0, R2_exome == 999)

# for(maf in c(0.01, 0.005, 0.001)){
for(maf in c(0.01)){
    maf_c <- gsub("\\.", "", as.character(maf))
    command_fname <- sprintf("./maf%s/extract_variants.sh", maf_c)
    
    if (file.exists(command_fname)) file.remove(command_fname)
    fname <- file(command_fname, "w")
    writeLines("module load Plink/2.00a", fname)
    close(fname)

    for(g in unique(annot_overlap$Gene.refGene)){
        annot_gene <- annot_overlap %>% 
            filter(is.na(MAF_nfe_exome) | MAF_nfe_exome < maf) %>% 
            filter(Gene.refGene == g) 
        
        annot_gene %>% 
            select(ID_hg38) %>% 
            fwrite(sprintf("./maf%s/%s", maf_c, g))
        
        chr <- annot_gene[1,]$Chr
        
        command <- sprintf("plink2 --bpfile ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/topmed/topmed_chr%i_rsq03_maf%s_LOF_missense_extracted \\
                                --extract %s --export A-transpose --make-bpgen --out %s;", 
                                chr, maf_c, g, g)
        write(command, command_fname, append=TRUE)
    }
}

In [50]:
annot_overlap <- annot %>% filter(!is.na(R2_topmed), R2_topmed != 0, R2_exome == 999)

# for(maf in c(0.01, 0.005, 0.001)){
for(maf in c(0.01)){
    maf_c <- gsub("\\.", "", as.character(maf))
    command_fname <- sprintf("./maf%s/extract_variants_exome.sh", maf_c)
    
    if (file.exists(command_fname)) file.remove(command_fname)
    fname <- file(command_fname, "w")
    writeLines("module load Plink/1.9.10", fname)
    close(fname)

    for(g in unique(annot_overlap$Gene.refGene)){
        annot_gene <- annot_overlap %>% 
            filter(is.na(MAF_nfe_exome) | MAF_nfe_exome < maf) %>% 
            filter(Gene.refGene == g)
        chr <- annot_gene[1,]$Chr

        command <- sprintf("plink --bfile ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/ukb23156_c%d_maf%s_LOF_missense_extracted \\
                            --extract %s --make-bed --out %s;", chr, maf_c, g, sprintf("%s_exome", g))
        write(command, command_fname, append=TRUE)
    }
}

### Writing bash script for submission

In [51]:
cd /home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/scripts/

# for maf in 0001 0005 001; do
for maf in 001; do
    rscript_name='simulation_maf'$maf'.R'
    num_gene=3194

    for ((i = 1; i <= 3194; i += 100)); do
        start=$i
        end=$((start+100-1))
        script_name='simulation_maf'$maf'_'$start'_'$end'.sh'

        if (( end > 3194 )); then
            end=3194
        fi

        echo '#!/bin/bash' >> $script_name
        echo '#$ -N' 'simulation_maf'$maf'_'$start'_'$end >> $script_name
        echo '#$ -l h_vmem=50G' >> $script_name
        echo '#$ -l h_rt=600:00:00' >> $script_name
        echo '#$ -o /home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/scripts/'$script_name'_'$JOB_ID'.out' >> $script_name
        echo '#$ -e /home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/scripts/'$script_name'_'$JOB_ID'.err' >> $script_name
        echo '#$ -q csg.q' >> $script_name
        echo '#$ -j y' >> $script_name
        echo '#$ -S /bin/bash' >> $script_name
        echo 'export PATH=$HOME/miniconda3/bin:$PATH' >> $script_name
        echo 'module load R/4.2.2.10' >> $script_name
        echo 'cd /home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/scripts/' >> $script_name
        echo 'Rscript '$rscript_name' '$start' '$end >> $script_name
    done
done

In [53]:
cd /home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/scripts/

# for maf in 0001 0005 001; do
for maf in 001; do
    rscript_name='simulation_maf'$maf'_prev02.R'
    num_gene=3194

    for ((i = 1; i <= 3194; i += 100)); do
        start=$i
        end=$((start+100-1))
        script_name='simulation_maf'$maf'_'$start'_'$end'_prev02.sh'

        if (( end > 3194 )); then
            end=3194
        fi

        echo '#!/bin/bash' >> $script_name
        echo '#$ -N' 'simulation_maf'$maf'_'$start'_'$end >> $script_name
        echo '#$ -l h_vmem=50G' >> $script_name
        echo '#$ -l h_rt=600:00:00' >> $script_name
        echo '#$ -o /home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/scripts/'$script_name'_'$JOB_ID'.out' >> $script_name
        echo '#$ -e /home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/scripts/'$script_name'_'$JOB_ID'.err' >> $script_name
        echo '#$ -q csg.q' >> $script_name
        echo '#$ -j y' >> $script_name
        echo '#$ -S /bin/bash' >> $script_name
        echo 'export PATH=$HOME/miniconda3/bin:$PATH' >> $script_name
        echo 'module load R/4.2.2.10' >> $script_name
        echo 'cd /home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/scripts/' >> $script_name
        echo 'Rscript '$rscript_name' '$start' '$end >> $script_name
    done
done

### Summarizing result

In [54]:
library(dscrutils)

dsc_dir = '/home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/dsc_rsq03_maf001/bin_pheno_168206id_brv'
setwd('/home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/dsc_rsq03_maf001/bin_pheno_168206id_brv')

result <- dscrutils::dscquery(dsc_dir, targets = c("parse_input",
                                                   "parse_input.idx",
                                                   "fixed_effect_prop.deletrious_effect",
                                                   "fixed_effect_prop.causal_del",
                                                   "bin_phenotype",
                                                   "bin_phenotype.prevalence"),
                             module.output.files = c("parse_input", "bin_phenotype"))

result_filtered <- result %>% filter(fixed_effect_prop.deletrious_effect == 1.5, fixed_effect_prop.causal_del == "c(1,1)", bin_phenotype.prevalence == 0.1)

Calling: dsc-query /home/tl3031/project/imputation-rvtest/workflows/imputation_aggregated_analysis/dsc_rsq03_maf001/bin_pheno_168206id_brv -o /tmp/8231881.1.csg.q/RtmpjPwtcr/fileea3b329de53c.csv --target "parse_input parse_input.idx fixed_effect_prop.deletrious_effect fixed_effect_prop.causal_del bin_phenotype bin_phenotype.prevalence" --force 
Loaded dscquery output table with 76656 rows and 7 columns.


In [70]:
annot <- data.table::fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed_exome/hrc_topmed_exome_168206ids_rsq03_maf001_annot.csv.gz")
gene_lst <- fread("~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/gene_list.csv", header = FALSE)$V1
head(gene_lst)

In [71]:
## prevalence = 0.1

result <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(result) <- c("maf", "dataset", "power", "mean_r2")

for(maf in c(0.01, 0.005, 0.001)){
    maf_c <- gsub("\\.", "", as.character(maf))
    flst <- list.files(sprintf("~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/maf%s", maf_c), 
                       pattern = "csv", full.names = TRUE) 
    flst <- gtools::mixedsort(flst[!grepl("prev02", flst)])
    result_df <- data.frame(matrix(ncol=2, nrow=0))
    for(f in flst){
        df <- fread(f)
        result_df <- rbind(result_df, df)
    }

    power_df <- result_df %>% 
        mutate(pval = ifelse(pval == "ERROR", 1, pval)) %>%
        mutate(pval = ifelse(is.na(pval), 1, pval)) %>%
        group_by(dataset) %>%
        summarise(power = sum(as.numeric(pval) < 2.5e-6) / 3197)
    
    topmed_gene_lst <- result_df %>% 
        filter(dataset == "topmed") %>% 
        mutate(gene = gene_lst) %>% 
        mutate(pval = ifelse(pval == "ERROR", 1, pval)) %>%
        mutate(pval = ifelse(is.na(pval), 1, pval)) %>%
        filter(as.numeric(pval) < 2.5e-6) %>% 
        pull(gene)
    
    mean_r2 <- annot %>% 
        filter(MAF_nfe_exome < maf, Gene.refGene %in% topmed_gene_lst, R2_topmed != 0, R2_exome == 999) %>% 
        group_by(Gene.refGene) %>% 
        summarise(mean_r2 = mean(R2_topmed)) %>%
        pull(mean_r2) %>%
        mean()
    
    power_df <- power_df %>% mutate(mean_r2 = c("NA", mean_r2), maf = maf)
    result <- rbind(result, power_df)
}

result %>% select(maf, dataset, power, mean_r2)

maf,dataset,power,mean_r2
<dbl>,<chr>,<dbl>,<chr>
0.01,exome,0.60682,
0.01,topmed,0.46544,0.72305592544771
0.005,exome,0.54739,
0.005,topmed,0.38755,0.720760063083636
0.001,exome,0.33688,
0.001,topmed,0.19769,0.7123860461728


In [72]:
## prevalence = 0.2

result <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(result) <- c("maf", "dataset", "power", "mean_r2")

for(maf in c(0.01, 0.005, 0.001)){
    maf_c <- gsub("\\.", "", as.character(maf))
    flst <- list.files(sprintf("~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/rsq_missing_variant_comp/maf%s", maf_c), 
                       pattern = "prev02.csv", full.names = TRUE) 
    flst <- gtools::mixedsort(flst)
    result_df <- data.frame(matrix(ncol=2, nrow=0))
    for(f in flst){
        df <- fread(f)
        result_df <- rbind(result_df, df)
    }

    power_df <- result_df %>% 
        mutate(pval = ifelse(pval == "ERROR", 1, pval)) %>%
        mutate(pval = ifelse(is.na(pval), 1, pval)) %>%
        group_by(dataset) %>%
        summarise(power = sum(as.numeric(pval) < 2.5e-6) / 3197)
    
    topmed_gene_lst <- result_df %>% 
        filter(dataset == "topmed") %>% 
        mutate(gene = gene_lst) %>% 
        mutate(pval = ifelse(pval == "ERROR", 1, pval)) %>%
        mutate(pval = ifelse(is.na(pval), 1, pval)) %>%
        filter(as.numeric(pval) < 2.5e-6) %>% 
        pull(gene)
    
    mean_r2 <- annot %>% 
        filter(MAF_nfe_exome < maf, Gene.refGene %in% topmed_gene_lst, R2_topmed != 0, R2_exome == 999) %>% 
        group_by(Gene.refGene) %>% 
        summarise(mean_r2 = mean(R2_topmed)) %>%
        pull(mean_r2) %>%
        mean()
    
    power_df <- power_df %>% mutate(mean_r2 = c("NA", mean_r2), maf = maf)
    result <- rbind(result, power_df)
}

result %>% select(maf, dataset, power, mean_r2)

maf,dataset,power,mean_r2
<dbl>,<chr>,<dbl>,<chr>
0.01,exome,0.604,
0.01,topmed,0.45824,0.723051738835145
0.005,exome,0.5477,
0.005,topmed,0.38223,0.720717254375534
0.001,exome,0.33688,
0.001,topmed,0.19769,0.7123860461728


## Average R2 for all genes and significant genes

We are interested in whether the average R2 will be different for all genes vs the genes reaching exome wide significant level.

In [56]:
library(dplyr)
library(data.table)

param_idx_match_brv_rsq03 <- function(num_gene, causal_del_es, prev, sample, dsc_result_dir, dsc_gene_dir){
    gene_idx <- c(1:num_gene)
    
    idx <- switch(conditions[which(conditions == causal_del_es)],
                  '0,0,1.2' = 1,
                  '0,0,1.5' = 2, 
                  '0,0,1.8' = 3, 
                  '1,1,1.2' = 4, 
                  '1,1,1.5' = 5, 
                  '1,1,1.8' = 6,
                  '0.75,1,1.2' = 7, 
                  '0.75,1,1.5' = 8, 
                  '0.75,1,1.8' = 9,
                  '0.5,1,1.2' = 10, 
                  '0.5,1,1.5' = 11, 
                  '0.5,1,1.8' = 12)
    fixed_eff_idx <- seq(idx, num_gene, 12)
    
    pheno_idx <- ifelse(prev == 0.1, 1, 2)
    
    fnames <- lapply(gene_idx, function(x) {
        if(x <= 3172){
            fname <- paste0(dsc_result_dir, "parse_input_", x, "_fixed_effect_prop_", idx, "_bin_phenotype_", pheno_idx, "_brv_preprocess_", sample, "_1_brv_1.rds")
        } else {
            # fname <- paste0(dsc_result_dir, "parse_input_", x, "_fixed_effect_prop_", idx+12, "_bin_phenotype_", pheno_idx + 2, "_brv_pre_", sample, "_2_brv_2.rds")
            fname <- paste0(dsc_result_dir, "parse_input_", x, "_fixed_effect_prop_", idx, "_bin_phenotype_", pheno_idx, "_brv_preprocess_", sample, "_1_brv_1.rds")
        }
    })
    
    gene_fnames <- lapply(gene_idx, function(x) {
            gene_fname <- paste0(dsc_gene_dir, "parse_input_", x, ".rds")
    })
    
    return(list(fnames = unlist(fnames), gene_fnames = unlist(gene_fnames)))
}

In [57]:
get_sig_genes_rsq03 <- function(dsc_result_dir, dsc_gene_dir, num_gene, data){
    result_df <- data.frame(condition = c(), prevalence = c(), gene = c(), pvals = c())
    for(i in conditions){
        for(prev in c(0.1, 0.2)){
            print(sprintf("conditions %s, prev %f, data %s", i, prev, data))
            result_fname_lst <- param_idx_match_brv_rsq03(num_gene, i, prev, data, dsc_result_dir, dsc_gene_dir)
            result_fname_lst$fnames
            result_fname_lst$gene_fnames
            
            gene_lst <- list()
            pval_lst <- list()
            # for(f in c(1:5)){
            for(f in c(1:length(result_fname_lst$fnames))){
                if(f %% 500 == 0) print(f)
                gene_lst <- c(gene_lst, readRDS(result_fname_lst$gene_fnames[f])$gene)
                pval_lst <- c(pval_lst, readRDS(result_fname_lst$fnames[f])$pval)
            }
            
            genes <- lapply(unlist(gene_lst), function(x) unlist(strsplit(x, "_"))[[2]])
            pvals <- lapply(unlist(pval_lst), function(x) ifelse(x == "ERROR", 1, x))

            subdf <- data.frame(condition = i, prevalence = prev, gene = unlist(genes), pvals = unlist(pvals))
            result_df <- rbind(result_df, subdf)
        }
    }
    return(result_df)
}

### HRC result

In [59]:
annot <- fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc_topmed_exome/hrc_topmed_exome_168206ids_rsq03_maf001_annot.csv.gz")

In [61]:
hrc_chr1 <- fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc/hrc_chr1_rsq03_hg19_hg38_maf001_LOF_missense_annot.csv.gz")
hrc_chr2 <- fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/hrc/hrc_chr2_rsq03_hg19_hg38_maf001_LOF_missense_annot.csv.gz")
hrc_gene_lst <- rbind(hrc_chr1, hrc_chr2) %>% select(Gene.refGene, MAF_nfe_exome)

In [62]:
setwd("~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/analysis/20230911/")

hrc_maf001 <- fread("hrc_sig_genes.csv") %>% mutate(MAF = 0.01)
hrc_maf0005 <- rbind(fread("hrc_sig_genes_es15_maf0005.csv"), fread("hrc_sig_genes_es12_18_maf0005.csv")) %>% mutate(MAF = 0.005)
hrc_maf0001 <- rbind(fread("hrc_sig_genes_es15_maf0001.csv"), fread("hrc_sig_genes_es12_18_maf0001.csv")) %>% mutate(MAF = 0.001)

hrc_df_gp <- rbind(hrc_maf001, hrc_maf0005, hrc_maf0001) %>% group_by(MAF, condition, prevalence) %>% group_split()

In [63]:
result_df <- data.frame(matrix(nrow = 0, ncol = 7))
colnames(result_df) <- c("maf", "prevalence", "condition", "mean_r2_overall_all", "mean_r2_overall_sig", "mean_r2_all", "mean_r2_sig")
for(df in hrc_df_gp){
    maf <- df[1,]$MAF
    
    all_gene_lst <- hrc_gene_lst %>% filter(MAF_nfe_exome < maf) %>% pull(Gene.refGene) %>% unique()
    sig_gene_lst <- df %>% filter(pvals < 2.5e-6) %>% pull(gene)
    
    hrc_annot <- annot %>% filter(R2_hrc != 0) %>% filter(MAF_nfe_exome < maf)
    # print(dim(hrc_annot))
    all_mean_r2 <- hrc_annot %>% filter(Gene.refGene %in% all_gene_lst) %>% pull(R2_hrc) %>% mean()
    sig_mean_r2 <- hrc_annot %>% filter(Gene.refGene %in% sig_gene_lst) %>% pull(R2_hrc) %>% mean()
    
    all_gene_mean_r2 <- hrc_annot %>% filter(Gene.refGene %in% all_gene_lst) %>% 
                    group_by(Gene.refGene) %>% summarise(mean_r2_hrc = mean(R2_hrc)) %>%
                    pull(mean_r2_hrc) %>% mean()
    sig_gene_mean_r2 <- hrc_annot %>% filter(Gene.refGene %in% sig_gene_lst) %>% 
                    group_by(Gene.refGene) %>% summarise(mean_r2_hrc = mean(R2_hrc)) %>%
                    pull(mean_r2_hrc) %>% mean()
    
    result_sub_df <- data.frame(maf = maf, prevalence = df[1,]$prevalence, condition = df[1,]$condition, 
                                mean_r2_overall_all = all_mean_r2, mean_r2_overall_sig = sig_mean_r2,
                                mean_r2_all = all_gene_mean_r2, mean_r2_sig = sig_gene_mean_r2)
    result_df <- rbind(result_df, result_sub_df)
    
}

In [64]:
result_df %>% select(-mean_r2_overall_all, -mean_r2_overall_sig) %>% arrange(desc(maf), prevalence, condition) %>% filter(condition == "1,1,1.5")

maf,prevalence,condition,mean_r2_all,mean_r2_sig
<dbl>,<dbl>,<chr>,<dbl>,<dbl>
0.01,0.1,111.5,0.72273,0.74783
0.01,0.2,111.5,0.72273,0.74755
0.005,0.1,111.5,0.71416,0.73867
0.005,0.2,111.5,0.71416,0.73878
0.001,0.1,111.5,0.68203,0.71206
0.001,0.2,111.5,0.68203,0.71095


### TOPMed result

In [65]:
topmed_chr1 <- fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/topmed/topmed_chr1_rsq03_hg19_hg38_maf001_LOF_missense_annot.csv.gz")
topmed_chr2 <- fread("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/topmed/topmed_chr1_rsq03_hg19_hg38_maf001_LOF_missense_annot.csv.gz")
topmed_gene_lst <- rbind(topmed_chr1, topmed_chr2) %>% select(Gene.refGene, MAF_nfe_exome)

In [66]:
setwd("~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/analysis/20230911/")

topmed_maf001 <- rbind(fread("topmed_sig_genes_es15_maf001.csv"), fread("topmed_sig_genes_es12_18_maf001.csv")) %>% mutate(MAF = 0.01)
topmed_maf0005 <- rbind(fread("topmed_sig_genes_es15_maf0005.csv"), fread("topmed_sig_genes_es12_18_maf0005.csv")) %>% mutate(MAF = 0.005)
topmed_maf0001 <- rbind(fread("topmed_sig_genes_es15_maf0001.csv"), fread("topmed_sig_genes_es12_18_maf0001.csv")) %>% mutate(MAF = 0.001)

topmed_df_gp <- rbind(topmed_maf001, topmed_maf0005, topmed_maf0001) %>% group_by(MAF, condition, prevalence) %>% group_split()

In [67]:
result_df <- data.frame(matrix(nrow = 0, ncol = 7))
colnames(result_df) <- c("maf", "prevalence", "condition", "mean_r2_overall_all", "mean_r2_overall_sig", "mean_r2_all", "mean_r2_sig")
for(df in hrc_df_gp){
    maf <- df[1,]$MAF
    
    all_gene_lst <- topmed_gene_lst %>% filter(MAF_nfe_exome < maf) %>% pull(Gene.refGene) %>% unique()
    sig_gene_lst <- df %>% filter(pvals < 2.5e-6) %>% pull(gene)
    
    topmed_annot <- annot %>% filter(R2_topmed != 0) %>% filter(MAF_nfe_exome < maf)
    # print(dim(topmed_annot))
    all_mean_r2 <- topmed_annot %>% filter(Gene.refGene %in% all_gene_lst) %>% pull(R2_topmed) %>% mean()
    sig_mean_r2 <- topmed_annot %>% filter(Gene.refGene %in% sig_gene_lst) %>% pull(R2_topmed) %>% mean()
    
    all_gene_mean_r2 <- topmed_annot %>% filter(Gene.refGene %in% all_gene_lst) %>% 
                    group_by(Gene.refGene) %>% summarise(mean_r2_topmed = mean(R2_topmed)) %>%
                    pull(mean_r2_topmed) %>% mean()
    sig_gene_mean_r2 <- topmed_annot %>% filter(Gene.refGene %in% sig_gene_lst) %>% 
                    group_by(Gene.refGene) %>% summarise(mean_r2_topmed = mean(R2_topmed)) %>%
                    pull(mean_r2_topmed) %>% mean()
    
    result_sub_df <- data.frame(maf = maf, prevalence = df[1,]$prevalence, condition = df[1,]$condition, 
                                mean_r2_overall_all = all_mean_r2, mean_r2_overall_sig = sig_mean_r2,
                                mean_r2_all = all_gene_mean_r2, mean_r2_sig = sig_gene_mean_r2)
    result_df <- rbind(result_df, result_sub_df)
}

In [69]:
result_df %>% select(-mean_r2_overall_all, -mean_r2_overall_sig) %>% arrange(desc(maf), prevalence, condition) %>% filter(condition == "1,1,1.5")

maf,prevalence,condition,mean_r2_all,mean_r2_sig
<dbl>,<dbl>,<chr>,<dbl>,<dbl>
0.01,0.1,111.5,0.69632,0.70714
0.01,0.2,111.5,0.69632,0.70802
0.005,0.1,111.5,0.69422,0.70588
0.005,0.2,111.5,0.69422,0.70604
0.001,0.1,111.5,0.68788,0.69925
0.001,0.2,111.5,0.68788,0.69915
