# Exome Data Preparation

This notebook records the steps to prepare exome sequence data for rare variant aggregate analysis and simulation.

## Annotate Exome

In [6]:
## Start from the qc'ed exome data
ls ~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c*.merged.filtered.bed

[0m[30;43m/home/tl3031/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c10.merged.filtered.bed[0m[K
[30;43m/home/tl3031/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c11.merged.filtered.bed[0m[K
[30;43m/home/tl3031/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c12.merged.filtered.bed[0m[K
[30;43m/home/tl3031/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c13.merged.filtered.bed[0m[K
[30;43m/home/tl3031/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c14.merged.filtered.bed[0m[K
[30;43m/home/tl3031/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c15.merged.filtered.bed[0m[K
[30;43m/home/tl3031/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c16.merged.filtered.bed[0m[K
[30;43m/home/tl3031/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c17.merged.filtered.bed[0m[K
[30;43m/home/tl3031/UKBiobank/data/exome_files/project_VCF/

In [7]:
## Annotate exome - write script
for i in list((1,2,11)):
    script='''#!/bin/sh
#$ -l h_rt=24:00:00
#$ -l h_vmem=10G
#$ -N annotate_exome_chr%i
#$ -o /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/annotate_exome_chr%i_$JOB_ID.out
#$ -e /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/annotate_exome_chr%i_$JOB_ID.err
#$ -q csg.q
 
export PATH=$HOME/miniconda3/bin:$PATH
module load Singularity

sos run /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/notebooks/annovar.ipynb annovar \
    --build 'hg38' \
    --cwd /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome \ 
    --bim_name /mnt/vast/hpc/csg/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c%i.merged.filtered.bim \
    --humandb /mnt/mfs/statgen/isabelle/REF/humandb  \
    --job_size 1 \
    --name_prefix exome_168206_chr%i_hg38 \
    --container_annovar /mnt/mfs/statgen/containers/gatk4-annovar.sif
'''%(i,i,i,i,i)
    f=open("/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/annotate_exome_chr"+str(i)+".sh", 'w')
    f.write(script)
    f.close()

In [8]:
cd /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/
for i in annotate_exome_chr*sh; do qsub $i; done

Your job 8658496 ("annotate_exome_chr11") has been submitted
Your job 8658497 ("annotate_exome_chr1") has been submitted
Your job 8658498 ("annotate_exome_chr2") has been submitted


In [9]:
## Format exome annotations - rename column names, remove unused columns
library(dplyr)
library(data.table)

setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome")

for(chr in c(1,2,11)){
    annot <- data.table::fread(sprintf("ukb23156_c%i.merged.filtered.hg38.hg38_multianno.csv", chr))
    colnames(annot)[29:41] <- c("AF_genome",
                                "AF_raw_genome",
                                "AF_male_genome",
                                "AF_female_genome",
                                "AF_afr_genome",
                                "AF_ami_genome",
                                "AF_amr_genome",
                                "AF_asj_genome",
                                "AF_eas_genome",
                                "AF_fin_genome",
                                "AF_nfe_genome",
                                "AF_oth_genome",
                                "AF_sas_genome")
    colnames(annot)[42:54] <- c("AF_exome",
                                "AF_popmax_exome",
                                "AF_male_exome",
                                "AF_female_exome",
                                "AF_raw_exome",
                                "AF_afr_exome",
                                "AF_sas_exome",
                                "AF_amr_exome",
                                "AF_eas_exome",
                                "AF_nfe_exome",
                                "AF_fin_exome",
                                "AF_asj_exome",
                                "AF_oth_exome")
    annot <- annot %>% 
        mutate(AF_nfe_exome = as.numeric(AF_nfe_exome)) %>% 
        mutate(MAF_nfe_exome = ifelse(AF_nfe_exome > 0.5, 1 - AF_nfe_exome, AF_nfe_exome)) %>% 
        rename("ID_hg38" = Otherinfo1) %>% 
        mutate(ID = paste(Chr, Start, Ref, Alt, sep = ":")) %>% 
        mutate(ID = paste0("chr", ID)) %>%
        select(Chr, Start, End, Ref, Alt, Func.refGene, Gene.refGene, ExonicFunc.refGene, MAF_nfe_exome, REVEL_score, ID_hg38, ID, CADD_phred)
    
    data.table::fwrite(annot, sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col.csv.gz", chr))
}

[1m[22m[36mℹ[39m In argument: `AF_nfe_exome = as.numeric(AF_nfe_exome)`.
[33m![39m NAs introduced by coercion”
[1m[22m[36mℹ[39m In argument: `AF_nfe_exome = as.numeric(AF_nfe_exome)`.
[33m![39m NAs introduced by coercion”
[1m[22m[36mℹ[39m In argument: `AF_nfe_exome = as.numeric(AF_nfe_exome)`.
[33m![39m NAs introduced by coercion”


## Extract monomorphic alleles

In [10]:
for i in list((1,2,11)):
    script='''#!/bin/sh
#$ -l h_rt=24:00:00
#$ -l h_vmem=10G
#$ -N mono_exome_chr%i
#$ -o /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/mono_exome_chr%i-$JOB_ID.out
#$ -e /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/mono_exome_chr%i-$JOB_ID.err
#$ -j y
#$ -q csg.q
#$ -S /bin/bash
export PATH=$HOME/miniconda3/bin:$PATH
module load Plink/1.9.10

plink \
    --bfile ~/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c%i.merged.filtered \
    --keep-fam /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/168206ind.sample.txt \
    --freq counts \
    --out ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/ukb23156_c%i.merged.filtered

cd ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/
awk 'BEGIN {FS=" "; OFS=" "} {if(NR==1 || $5==0 || $6==0)print $2}' ukb23156_c%i.merged.filtered.frq.counts > monomorphic_chr%i_SNPs

'''%(i,i,i,i,i,i,i)
    f=open("/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/mono_exome_chr"+str(i)+".sh", 'w')
    f.write(script)
    f.close()

In [11]:
cd /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/
for i in mono_exome_chr*sh; do qsub $i; done

Your job 8658499 ("mono_exome_chr11") has been submitted
Your job 8658500 ("mono_exome_chr1") has been submitted
Your job 8658501 ("mono_exome_chr2") has been submitted


## Filter Exome

In [12]:
library(dplyr)
library(data.table)

setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome")

In [13]:
filter_df <- data.frame(data.frame(matrix(ncol = 6, nrow = 0)))

for(i in c(1, 2, 11)){
    for(maf in c(0.01, 0.005, 0.001)){
        annot <- data.table::fread(sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col.csv.gz", i)) %>% select(-CADD_phred)
        mono <- data.table::fread(sprintf("monomorphic_chr%d_SNPs", i))
        maf_c <- gsub("\\.", "", as.character(maf))
        
        annot_maf <- annot %>% 
            filter(!ID_hg38 %in% mono$SNP) %>% 
            filter(is.na(MAF_nfe_exome) | MAF_nfe_exome < maf) 
        
        annot_func <- annot_maf %>% 
            filter(Func.refGene %in% c("exonic", "splicing", "exonic;splicing")) %>%
            filter(ExonicFunc.refGene != 'unknown') %>% 
            filter(ExonicFunc.refGene != 'synonymous SNV' & ExonicFunc.refGene != 'nonframeshift substitution') %>%
            mutate(Function = ifelse(ExonicFunc.refGene == "nonsynonymous SNV", "missense", "")) %>%
            mutate(Function = ifelse(grepl("splicing", Func.refGene), "splicing", Function)) %>%
            mutate(Function = ifelse(ExonicFunc.refGene %in% c("stopgain", "stoploss", "startloss", "frameshift substitution"), "LoF", Function))
        
        annot_func <- annot_func %>% 
            tidyr::separate(Gene.refGene, c("Gene.refGene", "discard_1", "discard_2"), sep = ";") %>% 
            select(-discard_1, -discard_2)
        
        gene_list <- annot_func %>% pull(Gene.refGene) %>% table() %>% as.data.frame() %>% filter(Freq >= 2) %>% pull(1)
        annot_final <- annot_func %>% filter(Gene.refGene %in% gene_list)
        
        ## output all variants
        data.table::fwrite(annot_func, 
                           sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense_all.csv.gz", i, maf_c), 
                           quote = FALSE)
        
        ## output all variants snplist
        write.table(annot_func$ID_hg38, 
                    sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense_all.extractlist.snplist", i, maf_c), 
                    col.names = FALSE, row.name = FALSE, quote = FALSE)
        
        ## output genes with 2 or more variants
        data.table::fwrite(annot_final, 
                           sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense.csv.gz", i, maf_c), 
                           quote = FALSE)
        
        ## output genes with 2 or more variants snplist
        write.table(annot_final$ID_hg38, 
            sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense.extractlist.snplist", i, maf_c), 
            col.names = FALSE, row.name = FALSE, quote = FALSE)
        
        sub_df <- data.frame(data = "exome", chromosome = i, maf = maf, 
                             total_num_var = nrow(annot), 
                             maf_filtering_var = nrow(annot_maf),
                             function_filtering_var = sprintf("%d (%d)", nrow(annot_func), length(unique(annot_func$Gene.refGene))),
                             gene_filtering_var = sprintf("%d (%d)", nrow(annot_final), length(gene_list)))
        filter_df <- rbind(filter_df, sub_df)
    }
}

“[1m[22mExpected 3 pieces. Additional pieces discarded in 278 rows [38948, 38949,
38964, 38965, 38966, 38967, 38968, 38969, 38970, 38980, 38983, 38989, 38992,
38993, 39033, 39034, 39035, 39036, 51737, 51738, ...].”
“[1m[22mExpected 3 pieces. Missing pieces filled with `NA` in 477795 rows [1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].”
“[1m[22mExpected 3 pieces. Additional pieces discarded in 278 rows [38870, 38871,
38886, 38887, 38888, 38889, 38890, 38891, 38892, 38902, 38905, 38911, 38914,
38915, 38955, 38956, 38957, 38958, 51617, 51618, ...].”
“[1m[22mExpected 3 pieces. Missing pieces filled with `NA` in 476890 rows [1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].”
“[1m[22mExpected 3 pieces. Additional pieces discarded in 274 rows [38531, 38532,
38547, 38548, 38549, 38550, 38551, 38552, 38553, 38563, 38566, 38572, 38575,
38576, 38615, 38616, 38617, 38618, 51151, 51152, ...].”
“[1m[22mExpected 3 pieces. Missing pi

In [14]:
filter_df

data,chromosome,maf,total_num_var,maf_filtering_var,function_filtering_var,gene_filtering_var
<chr>,<dbl>,<dbl>,<int>,<int>,<chr>,<chr>
exome,1,0.01,1587094,1269959,478357 (1953),478357 (1953)
exome,1,0.005,1587094,1267652,477452 (1953),477452 (1953)
exome,1,0.001,1587094,1259467,473986 (1953),473986 (1953)
exome,2,0.01,1141868,910875,345527 (1193),345527 (1193)
exome,2,0.005,1141868,909318,344918 (1193),344918 (1193)
exome,2,0.001,1141868,903385,342413 (1193),342413 (1193)
exome,11,0.01,958927,766172,293784 (1248),293784 (1248)
exome,11,0.005,958927,764880,293221 (1248),293221 (1248)
exome,11,0.001,958927,759851,291093 (1248),291093 (1248)


## Subsetting for CADD_c-score

We are not using the CADD_c-score provided by the annovar pipeline since the database included in the pipeline do not annotate all variants. There is an annovar CADD database for all variants, but we are not using it for this project. 

In order to retrieve the CADD_c-score for variants of interest, we manually upload them to the [CADD website](https://cadd.gs.washington.edu/score) for queries. The website accepts 5-column vcf file for up to 2MB.

In [15]:
for i in list((1,2,11)):
        script='''#!/bin/sh
#$ -l h_rt=48:00:00
#$ -l h_vmem=64G
#$ -N extract_filtered_maf001_chr%i
#$ -o /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/make_5col_vcf_maf001_chr%i_$JOB_ID.out
#$ -e /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/make_5col_vcf_maf001_chr%i_$JOB_ID.out
#$ -q csg.q
#$ -S /bin/bash

export PATH=$HOME/miniconda3/bin:$PATH
module load Plink/1.9.10
module load HTSLIB/1.17
cd ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome

plink \
    --bfile /home/tl3031/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c%i.merged.filtered \
    --keep-fam /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/168206ind.sample.txt \
    --extract ukb23156_c%i.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf001_LOF_missense_all.extractlist.snplist \
    --make-bed \
    --export vcf bgz \
    --out ukb23156_c%i_maf001_LOF_missense_all_extracted

zcat ukb23156_c%i_maf001_LOF_missense_all_extracted.vcf.gz | cut -f-5 > ukb23156_c%i_maf001_LOF_missense_all_extracted_5col.vcf

'''%(i,i,i,i,i,i,i,i)
        f=open("/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/make_5col_vcf_maf001_chr"+str(i)+".sh", 'w')
        f.write(script)
        f.close()

Since the maximum file size for CADD server is 2MB, need to split VCF for chromosome 1 and 2 into smaller files

In [16]:
cd ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome

split -l 100000 ukb23156_c1_maf001_LOF_missense_all_extracted_5col.vcf ukb23156_c1_maf001_LOF_missense_all_extracted_5col.
split -l 100000 ukb23156_c2_maf001_LOF_missense_all_extracted_5col.vcf ukb23156_c2_maf001_LOF_missense_all_extracted_5col.
split -l 100000 ukb23156_c11_maf001_LOF_missense_all_extracted_5col.vcf ukb23156_c11_maf001_LOF_missense_all_extracted_5col.

for i in ukb23156_c*_maf001_LOF_missense_all_extracted_5col.a*; do bgzip ${i}; done

After downloading all the files and gunzipped and renamed, need some post processing steps.

In [17]:
for i in 1 2 11; do
    for j in GRCh38-v1.6_chr${i}_*.tsv; do
        sed -i '1,2d' ${j};
    done
done

cat GRCh38-v1.6_chr1_*.tsv > GRCh38-v1.6_chr1_all.tsv
cat GRCh38-v1.6_chr2_*.tsv > GRCh38-v1.6_chr2_all.tsv
cat GRCh38-v1.6_chr11_*.tsv > GRCh38-v1.6_chr11_all.tsv

Fill in the missing variants by requerying

In [18]:
setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome")

for(i in c(1,2,11)){
    annot <- data.table::fread(sprintf("ukb23156_c%i.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf001_LOF_missense_all.csv.gz", i))

    cadd <- data.table::fread(sprintf("GRCh38-v1.6_chr%i_all.tsv", i), header = FALSE) %>% arrange(V2)
    colnames(cadd) <- c("Chr", "Start", "Ref", "Alt", "RawScore", "PHRED")

    annot_cadd <- left_join(annot, cadd)
    annot_cadd_miss <- annot_cadd[which(is.na(annot_cadd$PHRED)),] %>% 
            mutate(ID = paste(Chr, Start, Ref, Alt, sep = ":")) %>%
            select(Chr, Start, ID, Ref, Alt)
    data.table::fwrite(annot_cadd_miss, sprintf("ukb23156_c%i_maf001_LOF_missense_all_extracted_5col.miss", i), sep = "\t")
}

Then, merge the CADD score to the annotation and filter.

In [19]:
library(data.table)
library(dplyr)

setwd("~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome")

for(i in c(1,2,11)){
    for(maf in c(0.01, 0.001, 0.005)){
        maf_c <- gsub("\\.", "", as.character(maf))
        annot <- fread(sprintf("ukb23156_c%i.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf001_LOF_missense_all.csv.gz", i))

        cadd <- fread(sprintf("GRCh38-v1.6_chr%i_all.tsv", i), header = FALSE) %>% arrange(V2)
        cadd_miss <- fread(sprintf("GRCh38-v1.6_chr%i_miss.tsv", i), header = FALSE) %>% arrange(V2)
        cadd_all <- rbind(cadd, cadd_miss)
        colnames(cadd_all) <- c("Chr", "Start", "Ref", "Alt", "RawScore", "PHRED")

        annot_all <- left_join(annot, cadd_all) %>% filter(is.na(MAF_nfe_exome) | MAF_nfe_exome < maf)
        annot_all_lof <- annot_all %>% filter(Function == "LoF")
        annot_all_cadd <- annot_all %>% filter(Function != "LoF") %>% filter(as.numeric(PHRED) >= 20)
        
        gene_list <- annot_all %>% pull(Gene.refGene) %>% table() %>% as.data.frame() %>% filter(Freq > 1) %>% pull(1)
        annot_final <- annot_all %>% filter(Gene.refGene %in% gene_list)
        annot_final_lof <- annot_final %>% filter(Function == "LoF")
        annot_final_cadd <- annot_final %>% filter(Function != "LoF") %>% filter(as.numeric(PHRED) >= 20)
        
        ## >= 1 variant
        fwrite(annot_all, 
               sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense_all.csv.gz", i, maf_c), 
               quote = FALSE)
        
        write.table(annot_all$ID_hg38, 
                    sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense_all.extractlist.snplist", i, maf_c), 
                    col.names = FALSE, row.name = FALSE, quote = FALSE)
        
        ## >= 1 variant + CADD filtering
        fwrite(rbind(annot_all_lof, annot_all_cadd), 
               sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense_all_cadd.csv.gz", i, maf_c), 
               quote = FALSE)
        
        write.table(rbind(annot_all_lof, annot_all_cadd) %>% pull(ID_hg38), 
                    sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense_all_cadd.extractlist.snplist", i, maf_c), 
                    col.names = FALSE, row.name = FALSE, quote = FALSE)
        
        ## >= 2 variant
        data.table::fwrite(annot_final, 
                           sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense.csv.gz", i, maf_c), 
                           quote = FALSE)
        
        write.table(annot_final$ID_hg38, 
            sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense.extractlist.snplist", i, maf_c), 
            col.names = FALSE, row.name = FALSE, quote = FALSE)
        
        ## >= 2 variant + CADD filtering
        fwrite(rbind(annot_final_lof, annot_final_cadd), 
               sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense_cadd.csv.gz", i, maf_c), 
               quote = FALSE)
        
        write.table(rbind(annot_all_lof, annot_all_cadd) %>% pull(ID_hg38), 
                    sprintf("ukb23156_c%d.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf%s_LOF_missense_cadd.extractlist.snplist", i, maf_c), 
                    col.names = FALSE, row.name = FALSE, quote = FALSE)
    }   
}


[1m[22mJoining with `by = join_by(Chr, Start, Ref, Alt)`
[1m[22mJoining with `by = join_by(Chr, Start, Ref, Alt, RawScore, PHRED)`
[1m[22mJoining with `by = join_by(Chr, Start, Ref, Alt, RawScore, PHRED)`
[1m[22mJoining with `by = join_by(Chr, Start, Ref, Alt)`
[1m[22mJoining with `by = join_by(Chr, Start, Ref, Alt, RawScore, PHRED)`
[1m[22mJoining with `by = join_by(Chr, Start, Ref, Alt, RawScore, PHRED)`
[1m[22mJoining with `by = join_by(Chr, Start, Ref, Alt)`
[1m[22mJoining with `by = join_by(Chr, Start, Ref, Alt, RawScore, PHRED)`
[1m[22mJoining with `by = join_by(Chr, Start, Ref, Alt, RawScore, PHRED)`


In [20]:
# for i in list((1,2,11)):
#     for j in list(("1", "05", "01")):
#         for k in list(("", "_cadd")):
#             script='''#!/bin/sh
# #$ -l h_rt=48:00:00
# #$ -l h_vmem=64G
# #$ -N extract_filtered_maf00%s_chr%i%s
# #$ -o /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/extract_filtered_maf00%s_chr%i%s_$JOB_ID.out
# #$ -e /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/extract_filtered_maf00%s_chr%i%s_$JOB_ID.err
# #$ -q csg.q
# #$ -S /bin/bash
# export PATH=$HOME/miniconda3/bin:$PATH
# module load Plink/1.9.10

# cd ~/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome
# plink \
#     --bfile /home/tl3031/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c%i.merged.filtered \
#     --keep-fam /home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/168206ind.sample.txt \
#     --extract ukb23156_c%i.merged.filtered.hg38.hg38_multianno_formatted_sel_col_maf00%s_LOF_missense%s.extractlist.snplist \
#     --make-bed \
#     --out ukb23156_c%i_maf00%s_LOF_missense%s_extracted

# '''%(j,i,k,j,i,k,j,i,k,i,i,j,k,i,j,k)
#             f=open("/home/tl3031/project/imputation-rvtest/analysis/imputation_aggregated_analysis/exome/scripts/extract_filtered_maf00"+j+"_chr"+str(i)+k+".sh", 'w')
#             f.write(script)
#             f.close()