In [2]:
library(dplyr)
library(data.table)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




In [3]:
# Notebook parameters.
# gene_idx and bin_phenotype_idx are the numeric suffixes of the
# parse_input_<i>.rds / ..._bin_phenotype_<j>.rds files read further down;
# rsq fills the trailing digit of the "..._rsq0<rsq>" result folder name.
gene_idx <- 1
bin_phenotype_idx <- 2
rsq <- 3
maf <- 0.005
phenotype_folder <- "~/project/git/imputation_brv/workflow/dsc_pipeline_rsq03/bin_pheno_168206id_brv_rsq03"

# MAF tag used inside folder and file names: the threshold with its decimal
# point removed and "maf" prepended (0.005 -> "maf0005").
maf_c <- paste0("maf", gsub("\\.", "", as.character(maf)))

In [7]:
# Build the path of the parsed-input record for the selected gene and load it.
phenotype_fname <- sprintf(
    "%s/parse_input/parse_input_%i.rds",
    phenotype_folder,
    gene_idx
)
rds <- readRDS(file = phenotype_fname)

In [8]:
# Show the structure of the loaded record (gene name plus one genotype file
# path per source, as printed below).
str(rds)

List of 7
 $ gene                  : chr "chr1_A3GALT2"
 $ exome_fname           : chr "~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/exome_maf001/chr1_A3GALT2_aggregate.txt"
 $ hrc_fname             : chr "~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/hrc_rsq03_maf001/chr1_A3GALT2_aggregate.txt"
 $ topmed_fname          : chr "~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/topmed_v3_rsq03_maf001/chr1_A3GALT2_aggregate.txt"
 $ hrc_topmed_fname      : chr "~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/hrc_topmed_v3_rsq03_maf001/chr1_A3GAL"| __truncated__
 $ hrc_topmed_exome_fname: chr "~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/hrc_topmed_v3_exome_rsq03_maf001/chr1_A3GALT2.raw"
 $ DSC_DEBUG             :List of 5
  ..$ time     :List of 5
  .. ..$ user.self : num 0.042
  .. ..$ sys.self  : num 0.008
  .. ..$ elapsed   : num 0.179
  .. ..$ user.chi

In [28]:
# Load an aggregated genotype file and align it to the phenotype samples.
#
# Arguments:
#   fname             path of the aggregated genotype table, or NA when the
#                     file is not available.
#   case_control_ref  table of the samples to keep; it is the left side of the
#                     join, so the result has one row per row of this table.
#                     The FID, IID and assigned_id columns are dropped after
#                     the join.
#
# Returns a numeric matrix of the remaining column(s), or a character string
# describing why no matrix could be produced:
#   "FILE MISSING"                when fname is NA,
#   "LESS THAN TWO VAR PER GENE"  when fewer than two entries are non-zero.
prepare_input <- function(fname, case_control_ref){
    # Nothing to read for this genotype source.
    if (is.na(fname)) {
        return("FILE MISSING")
    }

    geno_tbl <- data.table::fread(fname, header = TRUE)

    # Join keys are left implicit (all shared column names), as before.
    joined <- left_join(case_control_ref, geno_tbl)
    geno_mat <- as.matrix(select(joined, -FID, -IID, -assigned_id))
    class(geno_mat) <- "numeric"

    # Count non-zero entries; which() skips NA, so samples that found no match
    # in the genotype table do not contribute.
    n_nonzero <- length(which(geno_mat != 0))
    if (n_nonzero < 2) {
        return("LESS THAN TWO VAR PER GENE")
    }

    return(geno_mat)
}

# Fit a logistic regression of y on the aggregated genotype of one gene from
# one genotype source, and write the outcome to a file in out_folder.
#
# Arguments:
#   fname             path of the aggregated genotype file, or NA when missing.
#   case_control_ref  sample table handed to prepare_input().
#   y                 response passed to glm(y ~ X, family = "binomial").
#   sample            label of the genotype source (e.g. "exome", "hrc",
#                     "hrc_topmed_exome"); part of the output file name and
#                     echoed back in the return value.
#   gene, es, causal_prop, prev
#                     recorded in the result row and in the output file name.
#   out_folder        directory that receives the output file.
#   maf               MAF threshold; recorded in the result row and encoded in
#                     the output file name (0.005 -> "maf0005").
#
# Output file: <out_folder>/<gene>_prev<..>_es<..>_causal<..>_<sample>_<maf tag>.csv
# It holds the one-row result table on success, or the marker text
# "X is not matrix" / "X is NA" when the test could not be run. All three
# outcomes use the same file name, so a failure marker is filed under the MAF
# tag of the run that produced it.
#
# Returns list(result_df = <one-row data.frame or marker string>,
#              sample_name = sample).
BRV <- function(fname, case_control_ref, y, sample, gene, es, causal_prop, prev, out_folder, maf){
    maf_c <- paste0("maf", gsub("\\.", "", as.character(maf)))
    out_fname <- sprintf("%s/%s_prev%.1f_es%.1f_causal%.1f_%s_%s.csv", out_folder, gene, prev, es, causal_prop, sample, maf_c)

    # For this source the stored path ends in ".raw" but the aggregated file is
    # "<name>_aggregate.raw". Only the trailing extension is rewritten: the dot
    # is escaped and the pattern anchored so other parts of the path that
    # happen to contain "raw" are left untouched.
    if(sample == "hrc_topmed_exome") fname <- sub("\\.raw$", "_aggregate.raw", fname)
    X <- prepare_input(fname, case_control_ref)

    # prepare_input() returns a character string instead of a matrix when the
    # file is missing or the gene has fewer than two non-zero entries.
    if(!is.matrix(X)){
        cat("X is not matrix", file = out_fname)
        return(list(result_df = "X is not matrix", sample_name = sample))
    }

    # NOTE(review): only the first entry of X is inspected here, so a single
    # missing genotype in the first row skips the test for the whole gene.
    # Confirm this is intended.
    if(is.na(X)[1]){
        result_df <- "X is NA"
        cat("X is NA", file = out_fname)
    } else {
        X.fit <- glm(y ~ X, family = "binomial")
        coef_table <- coef(summary(X.fit))

        # Row 1 is the intercept; row 2, when present, is the genotype term.
        # Columns 3 and 4 of the coefficient table are the z value and p-value.
        if(nrow(coef_table) >= 2){
            zstat <- coef_table[2, 3]
            pval <- coef_table[2, 4]
        } else {
            # The genotype term was not estimated: report a null result.
            zstat <- NA
            pval <- 1
        }

        result_df <- data.frame(gene = gene, sample = sample, es = es, causal_prop = causal_prop, prev = prev, zstat = zstat, pval = pval, maf = maf)
        data.table::fwrite(result_df, file = out_fname)
    }

    return(list(result_df = result_df, sample_name = sample))
}

In [29]:
# Locate and load the parsed-input record of the selected gene.
result_dir <- sprintf(
    "~/project/git/imputation_brv/workflow/dsc_pipeline_rsq03/bin_pheno_168206id_brv_rsq0%i",
    rsq
)
parse_input_fname <- sprintf("%s/parse_input/parse_input_%i.rds", result_dir, gene_idx)
parse_input <- readRDS(parse_input_fname)

# Re-point a stored genotype path (written for the "maf001" folders) at the
# folder of the current MAF tag; yields NA when that file does not exist.
maf_path_or_na <- function(stored_path) {
    candidate <- gsub("maf001", maf_c, stored_path)
    if (file.exists(candidate)) candidate else NA
}

exome_fname <- maf_path_or_na(parse_input$exome_fname)
hrc_fname <- maf_path_or_na(parse_input$hrc_fname)
topmed_fname <- maf_path_or_na(parse_input$topmed_fname)
hrc_topmed_fname <- maf_path_or_na(parse_input$hrc_topmed_fname)
hrc_topmed_exome_fname <- maf_path_or_na(parse_input$hrc_topmed_exome_fname)

# Echo the resolved paths, one per genotype source.
invisible(lapply(
    list(exome_fname, hrc_fname, topmed_fname, hrc_topmed_fname, hrc_topmed_exome_fname),
    print
))

# Load the simulated binary phenotype and unpack the pieces BRV() needs.
bin_phenotype_fname <- sprintf(
    "%s/bin_phenotype/parse_input_%i_bin_phenotype_%i.rds",
    result_dir, gene_idx, bin_phenotype_idx
)
bin_phenotype <- readRDS(bin_phenotype_fname)

y <- bin_phenotype$y
prev <- bin_phenotype$prev
es <- bin_phenotype$es
causal_prop <- bin_phenotype$causal_prop
case_control_ref <- bin_phenotype$case_control_ref

# One test per genotype source. The calls stay at top level so the notebook
# displays each returned value.
# NOTE(review): out_folder is not assigned in the cells shown here; it is
# presumably defined elsewhere in the session.
BRV(fname = exome_fname, case_control_ref = case_control_ref, y = y,
    sample = "exome", gene = parse_input$gene, es = es,
    causal_prop = causal_prop, prev = prev, out_folder = out_folder, maf = maf)
BRV(fname = hrc_fname, case_control_ref = case_control_ref, y = y,
    sample = "hrc", gene = parse_input$gene, es = es,
    causal_prop = causal_prop, prev = prev, out_folder = out_folder, maf = maf)
BRV(fname = topmed_fname, case_control_ref = case_control_ref, y = y,
    sample = "topmed", gene = parse_input$gene, es = es,
    causal_prop = causal_prop, prev = prev, out_folder = out_folder, maf = maf)
BRV(fname = hrc_topmed_fname, case_control_ref = case_control_ref, y = y,
    sample = "hrc_topmed", gene = parse_input$gene, es = es,
    causal_prop = causal_prop, prev = prev, out_folder = out_folder, maf = maf)
BRV(fname = hrc_topmed_exome_fname, case_control_ref = case_control_ref, y = y,
    sample = "hrc_topmed_exome", gene = parse_input$gene, es = es,
    causal_prop = causal_prop, prev = prev, out_folder = out_folder, maf = maf)

[1] "~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/exome_maf0005/chr1_A3GALT2_aggregate.txt"
[1] "~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/hrc_rsq03_maf0005/chr1_A3GALT2_aggregate.txt"
[1] "~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/topmed_v3_rsq03_maf0005/chr1_A3GALT2_aggregate.txt"
[1] "~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/hrc_topmed_v3_rsq03_maf0005/chr1_A3GALT2_aggregate.raw"
[1] "~/project/imputation-rvtest/workflows/imputation_aggregated_analysis/gene/hrc_topmed_v3_exome_rsq03_maf0005/chr1_A3GALT2.raw"


[1m[22mJoining with `by = join_by(FID, IID)`


gene,sample,es,causal_prop,prev,zstat,pval,maf
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1_A3GALT2,exome,1.2,1,0.1,6.150101,7.743347e-10,0.005


[1m[22mJoining with `by = join_by(FID, IID)`


gene,sample,es,causal_prop,prev,zstat,pval,maf
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1_A3GALT2,hrc,1.2,1,0.1,6.150101,7.743347e-10,0.005


[1m[22mJoining with `by = join_by(FID, IID)`


gene,sample,es,causal_prop,prev,zstat,pval,maf
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1_A3GALT2,topmed,1.2,1,0.1,6.150101,7.743347e-10,0.005


[1m[22mJoining with `by = join_by(FID, IID)`


gene,sample,es,causal_prop,prev,zstat,pval,maf
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1_A3GALT2,hrc_topmed,1.2,1,0.1,6.150101,7.743347e-10,0.005


[1m[22mJoining with `by = join_by(FID, IID)`


gene,sample,es,causal_prop,prev,zstat,pval,maf
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1_A3GALT2,hrc_topmed_exome,1.2,1,0.1,6.150101,7.743347e-10,0.005


In [25]:
# Display a previously stored result table.
# NOTE(review): brv_result is not assigned in any cell shown here; it
# presumably comes from an earlier run in the same session. Confirm before
# relying on it.
brv_result

gene,sample,es,causal_prop,prev,zstat,pval,maf
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1_A3GALT2,exome,1.2,1,0.1,6.150101,7.743347e-10,0.005
