## Imports

In [11]:
import os
import subprocess
import pandas as pd
import pybedtools

import warnings
warnings.filterwarnings('ignore')

## Get ADPD SNPs

In [2]:
adpd_snps = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/snps_final/191121_ld_buddies_table_stage3.tsv', sep='\t')
display(adpd_snps.columns)

Index(['chr', 'pos', 'r2_with_ld_tag', 'ld_tag_chr', 'ld_tag_pos',
       'source_gwas', 'source', 'snp_id', 'locus_num', 'number_ld_buddies',
       'ld_tag_locus', 'rsid', 'chrom_hg19', 'snp_pos_hg19', 'file',
       'effect_allele', 'noneffect_allele', 'direction', 'pvalue', 'has_coloc',
       'direct_atac_overlap_narrow_tissue_regions',
       'containing_atac_tissues_narrow_tissue_regions',
       'nearest_atac_tissue_narrow_tissue_regions',
       'start_narrow_tissue_regions', 'end_narrow_tissue_regions',
       'dist_narrow_tissue_regions',
       'direct_atac_overlap_broad_tissue_regions',
       'containing_atac_tissues_broad_tissue_regions',
       'nearest_atac_tissue_broad_tissue_regions',
       'start_broad_tissue_regions', 'end_broad_tissue_regions',
       'dist_broad_tissue_regions', 'direct_atac_overlap_single_cell',
       'containing_atac_tissues_single_cell',
       'nearest_atac_tissue_single_cell', 'start_single_cell',
       'end_single_cell', 'dist_single_cel

## Make hg19 SNP BED files

In [3]:
snps_bed_hg19 = adpd_snps[['chrom_hg19', 'snp_pos_hg19']]
snps_bed_hg19 = snps_bed_hg19.loc[snps_bed_hg19['chrom_hg19'].notnull()]
snps_bed_hg19['chrom_hg19'] = snps_bed_hg19['chrom_hg19'].astype(int).astype(str)
snps_bed_hg19['snp_pos_hg19'] = snps_bed_hg19['snp_pos_hg19'].astype(int)
snps_bed_hg19['chrom_hg19'] = 'chr' + snps_bed_hg19['chrom_hg19']
snps_bed_hg19['start'] = snps_bed_hg19['snp_pos_hg19'] - 1
snps_bed_hg19 = snps_bed_hg19[['chrom_hg19', 'start', 'snp_pos_hg19']]
snps_bed_hg19.sort_values(by=['chrom_hg19', 'start', 'snp_pos_hg19'], inplace=True)
snps_bed_hg19.drop_duplicates(inplace=True)
display(snps_bed_hg19)
snps_bed_hg19.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/hg19_snps.bed', sep='\t', index=False, header=False)

nochr_snps_bed_hg19 = adpd_snps[['chrom_hg19', 'snp_pos_hg19']]
nochr_snps_bed_hg19 = nochr_snps_bed_hg19.loc[nochr_snps_bed_hg19['chrom_hg19'].notnull()]
nochr_snps_bed_hg19['chrom_hg19'] = nochr_snps_bed_hg19['chrom_hg19'].astype(int).astype(str)
nochr_snps_bed_hg19['snp_pos_hg19'] = nochr_snps_bed_hg19['snp_pos_hg19'].astype(int)
nochr_snps_bed_hg19['start'] = nochr_snps_bed_hg19['snp_pos_hg19'] - 1
nochr_snps_bed_hg19 = nochr_snps_bed_hg19[['chrom_hg19', 'start', 'snp_pos_hg19']]
nochr_snps_bed_hg19.sort_values(by=['chrom_hg19', 'start', 'snp_pos_hg19'], inplace=True)
nochr_snps_bed_hg19.drop_duplicates(inplace=True)
nochr_snps_bed_hg19.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/nochr_hg19_snps.bed', sep='\t', index=False, header=False)

Unnamed: 0,chrom_hg19,start,snp_pos_hg19
2334,chr1,40287822,40287823
2335,chr1,40287916,40287917
2336,chr1,40287931,40287932
2337,chr1,40288463,40288464
2338,chr1,40290896,40290897
...,...,...,...
13768,chr9,34036528,34036529
13769,chr9,34039704,34039705
13770,chr9,34040773,34040774
13771,chr9,34046390,34046391


## Get hg38 Coordinates for 1KG intersected SNPs

In [4]:
hg19_vcf = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/1KG_ADPD_hg19_snps.vcf', header=None, sep='\t')
hg19_vcf = hg19_vcf[[2, 3, 4, 5, 6, 7]]
hg19_vcf.columns = ['rsid', 'ref', 'alt', 'score', 'qc', 'info']
hg19_vcf.sort_values(by=['rsid', 'ref'], inplace=True)
hg19_vcf.drop_duplicates(inplace=True)
hg19_vcf

Unnamed: 0,rsid,ref,alt,score,qc,info
1230,.;esv3626682,T,"<CN0>,<CN2>",100,PASS,"AC=0,1;AF=0,0.000199681;AN=5008;CS=DUP_gs;END=..."
9680,DUP_uwash_chr5_60140563_60179087,A,<CN0>,100,PASS,AC=0;AF=0;AN=5008;CS=DUP_uwash;END=60179087;NS...
0,esv3585792,G,<CN0>,100,PASS,"AC=1;AF=0.000199681;AN=5008;CIEND=0,500;CIPOS=..."
45,esv3585794,C,<CN0>,100,PASS,"AC=1;AF=0.000199681;AN=5008;CIEND=-23,24;CIPOS..."
64,esv3587615,T,<CN2>,100,PASS,"AC=1;AF=0.000199681;AN=5008;CIEND=-150,150;CIP..."
...,...,...,...,...,...,...
8127,rs9984928,G,A,100,PASS,AC=1660;AF=0.33147;AN=5008;NS=2504;DP=16963;EA...
7588,rs9989786,A,G,100,PASS,AC=3719;AF=0.742612;AN=5008;NS=2504;DP=19926;E...
8852,rs9994159,C,T,100,PASS,AC=599;AF=0.119609;AN=5008;NS=2504;DP=20517;EA...
9357,rs9995651,C,G,100,PASS,AC=310;AF=0.061901;AN=5008;NS=2504;DP=13879;EA...


In [5]:
adpd_snps = adpd_snps[['chr', 'pos', 'rsid']]
adpd_snps.sort_values(by=['chr', 'pos'], inplace=True)
adpd_snps.drop_duplicates(inplace=True)
adpd_snps

Unnamed: 0,chr,pos,rsid
2334,1,39822151,rs34640847
2335,1,39822245,rs36015266
2336,1,39822260,rs61779808
2337,1,39822792,rs61779809
2338,1,39825225,rs72666941
...,...,...,...
10156,21,37517450,rs2236688
10157,21,37517672,rs2835776
10158,21,37519947,rs11701836
10159,21,37527140,rs7280075


In [6]:
merged = adpd_snps.merge(hg19_vcf, on='rsid')
merged['chr'] = 'chr' + merged['chr'].astype('str')
merged['start'] = merged['pos'] - 1
merged = merged.loc[merged['alt'].apply(lambda x : len(x) < 2)]
merged = merged.loc[merged['ref'].apply(lambda x : len(x) < 2)]
# hg38_bed = merged[['chr', 'start', 'pos', 'rsid', 'ref', 'alt', 'info']]
# hg38_bed['info'] = hg38_bed['info'].apply(lambda x : x.split('EUR_AF=')[1].split(';')[0])
# hg38_bed.sort_values(by=['chr', 'pos'], inplace=True)
# hg38_bed.drop_duplicates(inplace=True)
# hg38_bed.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/hg38_snps_1KG_ADPD.bed', header=False, index=False, sep='\t')

## Make New VCF file from merged file

In [7]:
hg38_vcf = merged[['chr', 'pos', 'rsid', 'ref', 'alt', 'score', 'qc', 'info']]
hg38_vcf['format'] = ['GT' for i in range(len(hg38_vcf))]
hg38_vcf = hg38_vcf.sort_values(by=['chr', 'pos'])
hg38_vcf.drop_duplicates(inplace=True)
display(hg38_vcf)
hg38_vcf.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/hg38_snps_1KG_ADPD.vcf', header=False, index=False, sep='\t')

Unnamed: 0,chr,pos,rsid,ref,alt,score,qc,info,format
0,chr1,39822151,rs34640847,G,A,100,PASS,AC=595;AF=0.11881;AN=5008;NS=2504;DP=18110;EAS...,GT
1,chr1,39822245,rs36015266,C,T,100,PASS,AC=595;AF=0.11881;AN=5008;NS=2504;DP=19296;EAS...,GT
2,chr1,39822260,rs61779808,T,A,100,PASS,AC=493;AF=0.0984425;AN=5008;NS=2504;DP=19241;E...,GT
3,chr1,39822792,rs61779809,C,T,100,PASS,AC=595;AF=0.11881;AN=5008;NS=2504;DP=19613;EAS...,GT
4,chr1,39825225,rs72666941,C,A,100,PASS,AC=477;AF=0.0952476;AN=5008;NS=2504;DP=20561;E...,GT
...,...,...,...,...,...,...,...,...,...
3305,chr9,34036531,rs1629188,G,C,100,PASS,AC=3379;AF=0.67472;AN=5008;NS=2504;DP=18771;EA...,GT
3306,chr9,34039707,rs9696412,A,G,100,PASS,AC=3379;AF=0.67472;AN=5008;NS=2504;DP=13159;EA...,GT
3307,chr9,34040776,rs1543605,C,G,100,PASS,AC=3379;AF=0.67472;AN=5008;NS=2504;DP=21998;EA...,GT
3308,chr9,34046393,rs6476434,C,T,100,PASS,AC=3379;AF=0.67472;AN=5008;NS=2504;DP=15262;EA...,GT


## Get ADPD Metadata

In [8]:
metadata = pd.read_excel('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/190215_Brain-ControlsOnly_Metadata_Merged.xlsx')
display(metadata)

Unnamed: 0,OldName,NewName,HarmonizedName,Bam,Contrast,Type,Group,Region,RegionMod,PatientID,...,xxx.CONTRIBUTING_NP_DX,xxx.Interval between death and last MMSE,xxx.ch_lastCasiScore,xxx.ch_lastCasiDate,xxx.micro_AmyloidAngiopathyOccipitalLobe_ID,xxx.GE_atherosclerosis_ID,xxx.calc_A,xxx.calc_B,xxx.C,xxx.CognitiveStatus
0,PD_00_38_CTRL_CAUD_X014_S01_L001_B1_T1_P025,PD_00_38_CTRL_CAUD_X014_S01_L001_B1_T1_P025,CTRL_CAUD_PD_00x38xx_X014_S01_L001_B1_T1_P025,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_CAUD,CTRL,CTRL,CAUD,CAUD,00_38,...,,,,,,,,,,
1,PD_00_38_CTRL_CAUD_X014_S01_L002_B1_T2_P028,PD_00_38_CTRL_CAUD_X014_S01_L002_B1_T2_P028,CTRL_CAUD_PD_00x38xx_X014_S01_L002_B1_T2_P028,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_CAUD,CTRL,CTRL,CAUD,CAUD,00_38,...,,,,,,,,,,
2,PD_00_38_CTRL_HIPP_X002_S11_L045_B1_T1_P002,PD_00_38_CTRL_HIPP_X002_S11_L045_B1_T1_P002,CTRL_HIPP_PD_00x38xx_X002_S11_L045_B1_T1_P002,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_HIPP,CTRL,CTRL,HIPP,HIPP,00_38,...,,,,,,,,,,
3,PD_00_38_CTRL_HIPP_X002_S11_L046_B1_T2_P003,PD_00_38_CTRL_HIPP_X002_S11_L046_B1_T2_P003,CTRL_HIPP_PD_00x38xx_X002_S11_L046_B1_T2_P003,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_HIPP,CTRL,CTRL,HIPP,HIPP,00_38,...,,,,,,,,,,
4,PD_00_38_CTRL_MDFG_X007_S04_L055_B1_T1_P014,PD_00_38_CTRL_MDFG_X007_S04_L055_B1_T1_P014,CTRL_MDFG_PD_00x38xx_X007_S04_L055_B1_T1_P014,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_MDFG,CTRL,CTRL,MDFG,MDFG,00_38,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,PD_16_32_CTRL_MDTG_X003_S04_L008_B1_T2_P003,PD_16_32_CTRL_MDTG_X003_S04_L008_B1_T2_P003,CTRL_SMTG_PD_16x32xx_X003_S04_L008_B1_T2_P003,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_MDTG,CTRL,CTRL,MDTG,SMTG,16_32,...,,,,,,,,,,
264,PD_16_32_CTRL_PTMN_X019_S08_L039_B1_T1_P033,PD_16_32_CTRL_PTMN_X019_S08_L039_B1_T1_P033,CTRL_PTMN_PD_16x32xx_X019_S08_L039_B1_T1_P033,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_PTMN,CTRL,CTRL,PTMN,PTMN,16_32,...,,,,,,,,,,
265,PD_16_32_CTRL_PTMN_X019_S08_L040_B1_T2_P030,PD_16_32_CTRL_PTMN_X019_S08_L040_B1_T2_P030,CTRL_PTMN_PD_16x32xx_X019_S08_L040_B1_T2_P030,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_PTMN,CTRL,CTRL,PTMN,PTMN,16_32,...,,,,,,,,,,
266,PD_16_32_CTRL_SUNI_X007_S12_L071_B1_T1_P014,PD_16_32_CTRL_SUNI_X007_S12_L071_B1_T1_P014,CTRL_SUNI_PD_16x32xx_X007_S12_L071_B1_T1_P014,/oak/stanford/groups/howchang/users/mcorces/PD...,CTRL_SUNI,CTRL,CTRL,SUNI,SUNI,16_32,...,,,,,,,,,,


## Get Bams for each brain region

In [9]:
display(metadata['Region'].unique())
regions = list(metadata['Region'].unique())

array(['CAUD', 'HIPP', 'MDFG', 'MDTG', 'PTMN', 'SUNI', 'SMTG', 'PARL'],
      dtype=object)

In [21]:
with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/ctrl_bams.txt') as infile:
    ctrl_bams = infile.readlines()
multi_bam_regs = []
for reg in regions:
    print(reg)
    region_bams = []
    for bam in ctrl_bams:
        if reg in bam:
            region_bams.append(bam)
    if len(region_bams) > 0:
        multi_bam_regs.append(reg)
        region_bams.sort()
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/' + reg):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/' + reg)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/' + reg):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/' + reg)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/logs/' + reg):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/logs/' + reg)
        with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/bam_lists/' + reg + '_ctrl_bams.txt', 'w') as outfile:
            for bam in region_bams:
                outfile.write(bam)

CAUD
HIPP
MDFG
MDTG
PTMN
SUNI
SMTG
PARL


## Make Input VCF files for each region

In [None]:
for reg in multi_bam_regs:
    print(reg)
    with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/bam_lists/' + reg + '_ctrl_bams.txt') as infile:
        reg_bams = [i.strip() for i in infile.readlines()]
    reg_vcf = hg38_vcf.copy(deep=True)
    for bam in reg_bams:
        bam_name = bam.split('/')[10]
        #print(bam_name)
        reg_vcf[bam_name] = ['./.' for i in range(len(reg_vcf))]
    #reg_vcf.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/initial.vcf', header=False, index=False, sep='\t')
    bgzip_cmd = 'bgzip /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/initial.vcf'
    #! {bgzip_cmd}
    tabix_cmd = 'tabix -p vcf /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/initial.vcf.gz'
    #! {tabix_cmd}
    asvcf_cmd_1 = 'echo sbatch --export=ALL -n 1 -t 1-0 -p akundaje --mail-type=ALL -J ' \
                + reg + ' -o /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/logs/' \
                + reg + '/asvcf.o -e /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/logs/' \
                + reg + '/asvcf.e /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/' \
                + reg + '/asvcf.sh >> /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/run_asvcf.sh'
    asvcf_cmd_2 = 'echo \'#!/bin/bash\' > /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/' \
                + reg + '/asvcf.sh'
    asvcf_cmd_3 = 'echo /home/users/soumyak/rasqual/src/ASVCF/createASVCF.sh paired_end ' \
                + '/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/bam_lists/' \
                + reg + '_ctrl_bams.txt /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                + reg + '/initial.vcf.gz /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                + reg + '/asvcf.gz atac >> /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/' \
                + reg + '/asvcf.sh'
    print(asvcf_cmd_1)
    print(asvcf_cmd_2)
    print(asvcf_cmd_3)
    ! {asvcf_cmd_1}
    ! {asvcf_cmd_2}
    ! {asvcf_cmd_3}

## Load IDR Counts Matrix

In [24]:
idr_counts = pd.read_csv('/mnt/lab_data2/annashch/alzheimers_parkinsons/adpd.atac.idr.counts.txt.gz', sep='\t')
counts_header = idr_counts.columns
display(idr_counts)

Unnamed: 0,chrom,start,end,ADAD_CAUD_00_0281,ADAD_CAUD_00_0387,ADAD_CAUD_01_0164,ADAD_CAUD_01_1400,ADAD_CAUD_06_0194,ADAD_CAUD_06_1486,ADAD_CAUD_07_0787,...,LRRK_MDTG_01_39,LRRK_MDTG_04_10,LRRK_MDTG_10_37,LRRK_MDTG_13_60,LRRK_PTMN_01_39,LRRK_PTMN_04_10,LRRK_PTMN_10_37,LRRK_PTMN_13_60,LRRK_SUNI_04_10,LRRK_SUNI_10_37
0,chr1,10015,10231,12,16,22,12,14,20,12,...,3,2,14,18,0,4,12,9,22,26
1,chr1,181363,181563,1,6,4,2,10,6,1,...,1,5,2,14,0,4,7,4,8,18
2,chr1,183716,183916,7,4,4,12,2,10,19,...,1,19,10,21,3,16,8,6,20,13
3,chr1,184083,184283,11,8,4,11,2,19,20,...,1,20,26,25,3,22,18,11,26,24
4,chr1,184370,184570,7,6,6,0,6,9,6,...,5,16,11,18,1,13,7,4,23,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385720,chrY,56747987,56748187,42,1,3,3,8,5,0,...,0,2,0,7,2,2,0,6,7,0
385721,chrY,56754063,56754271,10,0,10,9,15,5,12,...,8,2,0,5,9,5,0,1,4,0
385722,chrY,56763412,56763626,231,6,116,196,376,187,94,...,97,45,4,58,191,51,0,24,128,6
385723,chrY,56763699,56763918,27,0,10,9,16,13,7,...,1,4,0,4,2,9,0,1,1,0


## Intersect Counts Matrix with SNPs

In [14]:
counts_bed = pybedtools.BedTool.from_dataframe(idr_counts)
counts_bed

<BedTool(/tmp/pybedtools.yk6w1b6i.tmp)>

In [15]:
snps = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/snps/hg38_snps_1KG_ADPD.bed', header=None, sep='\t')
snps_bed = pybedtools.BedTool.from_dataframe(snps)
snps_bed

<BedTool(/tmp/pybedtools.lin6tc81.tmp)>

In [16]:
intersect_bed = counts_bed.intersect(snps_bed, u=True, wa=True)

In [25]:
idr_counts = pybedtools.BedTool.to_dataframe(intersect_bed, header=None)
idr_counts.columns = counts_header 
print(idr_counts)

    chrom      start        end  ADAD_CAUD_00_0281  ADAD_CAUD_00_0387  \
0    chr1   39878006   39878581                 51                 44   
1    chr1   39878599   39879244                 57                 69   
2    chr1   39882784   39884105                302                302   
3    chr1   39887608   39887837                 11                 12   
4    chr1  154865411  154865652                 45                 19   
..    ...        ...        ...                ...                ...   
607  chr9   17699014   17699637                 52                 27   
608  chr9   17730813   17731948                 34                 57   
609  chr9   33816144   33818416               1039               1523   
610  chr9   33957284   33957573                 18                  6   
611  chr9   33971219   33972644                 86                 23   

     ADAD_CAUD_01_0164  ADAD_CAUD_01_1400  ADAD_CAUD_06_0194  \
0                   62                 94                 3

## Create Region-Specific Counts Matrices

In [26]:
for reg in multi_bam_regs:
    print(reg)
    with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/bam_lists/' + reg + '_ctrl_bams.txt') as infile:
        reg_bams = [i.strip() for i in infile.readlines()]
    bam_names = [bam.split('/')[10] for bam in reg_bams]
    print(bam_names)
    reg_counts = idr_counts.copy(deep=True)
    reg_counts['index'] = reg_counts['chrom'] + '_' + reg_counts['start'].astype(str) + '_' + reg_counts['end'].astype(str)
    reg_counts = reg_counts[['index'] + bam_names]
    reg_counts.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                        + reg + '/idr_counts.txt', index=False, header=False, sep='\t')

CAUD
['CTRH_CAUD_07_1144', 'CTRH_CAUD_07_1287', 'CTRH_CAUD_08_0712', 'CTRH_CAUD_10_0867', 'CTRH_CAUD_13_0962', 'CTRH_CAUD_14_0380', 'CTRH_CAUD_14_0513', 'CTRH_CAUD_14_1383', 'CTRH_CAUD_15_1023', 'CTRH_CAUD_15_1025', 'CTRL_CAUD_06_0615', 'CTRL_CAUD_06_1516', 'CTRL_CAUD_09_1589', 'CTRL_CAUD_11_0393', 'CTRL_CAUD_13_0038', 'CTRL_CAUD_13_1226', 'CTRL_CAUD_14_0586', 'CTRL_CAUD_14_0941', 'CTRL_CAUD_14_1018', 'CTRL_CAUD_00_38', 'CTRL_CAUD_01_31', 'CTRL_CAUD_03_15', 'CTRL_CAUD_03_39', 'CTRL_CAUD_03_41', 'CTRL_CAUD_03_66', 'CTRL_CAUD_04_38', 'CTRL_CAUD_05_16', 'CTRL_CAUD_08_90', 'CTRL_CAUD_09_35', 'CTRL_CAUD_15_78', 'CTRL_CAUD_16_10', 'CTRL_CAUD_16_32']
HIPP
['CTRH_HIPP_07_1058', 'CTRH_HIPP_07_1144', 'CTRH_HIPP_07_1287', 'CTRH_HIPP_08_0298', 'CTRH_HIPP_10_0867', 'CTRH_HIPP_13_0962', 'CTRH_HIPP_14_0380', 'CTRH_HIPP_14_0513', 'CTRH_HIPP_14_1383', 'CTRH_HIPP_15_1023', 'CTRH_HIPP_15_1025', 'CTRL_HIPP_06_1516', 'CTRL_HIPP_11_0393', 'CTRL_HIPP_13_0038', 'CTRL_HIPP_13_1226', 'CTRL_HIPP_14_0586', 'CTRL_

## Create Chromosome-Specific Inputs

In [27]:
chroms = ['chr' + str(i) for i in range(1, 23)]
chroms.append('chrX')
chroms.append('chrY')
print(chroms)
for reg in multi_bam_regs:
    print(reg)
    reg_counts = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                        + reg + '/idr_counts.txt', header=None, sep='\t')
    reg_offset = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                        + reg + '/offset.txt',header=None, sep='\t')
    for chrom in chroms:
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                             + reg + '/' + chrom):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                     + reg + '/' + chrom)
        print(chrom)
        reg_chrom_counts = reg_counts.loc[reg_counts[0].apply(lambda x : x.startswith(chrom))]
        reg_chrom_offset = reg_offset.loc[reg_offset[0].apply(lambda x : x.startswith(chrom))]
        reg_chrom_counts.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                                 + reg + '/' + chrom + '/Y.txt', index=False, header=False, sep='\t')
        reg_chrom_offset.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                                 + reg + '/' + chrom + '/K.txt', index=False, header=False, sep='\t')
    
    

['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY']
CAUD
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
HIPP
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
MDFG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
MDTG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
PTMN
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
SUNI
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18


## Get Genotyped ASVCFs

In [114]:
for reg in multi_bam_regs:
    print(reg)
    with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/bam_lists/' + reg + '_ctrl_bams.txt') as infile:
        reg_bams = [i.strip() for i in infile.readlines()]
    asvcf = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/asvcf.gz',
                        sep='\t', skiprows=1, header=None)
    bam_names = [bam.split('/')[10] for bam in reg_bams]
    bam_patients = [name.split('_')[2] + '_' + name.split('_')[3] for name in bam_names]
    header = ['chrom', 'pos', 'rsid', 'ref', 'alt', 'score', 'qc', 'info', 'format'] + bam_patients
    asvcf.columns = header
    asvcf.set_index('rsid', inplace=True)
    #print(asvcf.head())
    for patient in bam_patients:
        if os.path.isfile('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/quasar/output/'
                            + patient + '/genotypes.txt'):
            genotype = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/quasar/output/'
                                   + patient + '/genotypes.txt', sep='\t')
        genotype = genotype[['rsID', 'map.g0', 'map.g1', 'map.g2']]
        for index,row in genotype.iterrows():
            if row['map.g1'] >= row['map.g0']:
                if row['map.g1'] >= row['map.g2']:
                    asvcf.at[row['rsID'], patient] = asvcf.at[row['rsID'], patient].replace('./.', '0/1')
                else:
                    asvcf.at[row['rsID'], patient] = asvcf.at[row['rsID'], patient].replace('./.', '1/1')
            else:
                if row['map.g0'] >= row['map.g2']:
                    asvcf.at[row['rsID'], patient] = asvcf.at[row['rsID'], patient].replace('./.', '0/0')
                else:
                    asvcf.at[row['rsID'], patient] = asvcf.at[row['rsID'], patient].replace('./.', '1/1')
    asvcf.reset_index(inplace=True)
    asvcf = asvcf[header]
    #print(asvcf.head())
    asvcf.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/genotyped_asvcf.txt',
                    sep='\t', index=False)
    for chrom in chroms:
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                             + reg + '/' + chrom):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                     + reg + '/' + chrom)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/'
                             + reg + '/' + chrom):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/output/'
                     + reg + '/' + chrom)
        print(chrom)
        chrom_asvcf = asvcf.loc[asvcf['chrom'] == chrom]
        chrom_asvcf.to_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                                 + reg + '/' + chrom + '/asvcf', index=False, header=False, sep='\t')
        bgzip_cmd = 'bgzip /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/' + chrom + '/asvcf'
        ! {bgzip_cmd}
        tabix_cmd = 'tabix -p vcf /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' + reg + '/' + chrom + '/asvcf.gz'
        ! {tabix_cmd}

CAUD
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
HIPP
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
MDFG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
MDTG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
PTMN
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
SUNI
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
SMTG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
PARL
chr1
chr2
chr3
chr4
chr5
chr6

In [121]:
for reg in multi_bam_regs:
    print(reg)
    for chrom in chroms:
        print(chrom)
        if not os.path.isdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/'
                             + reg + '/' + chrom):
            os.mkdir('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/'
                     + reg + '/' + chrom)

CAUD
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
HIPP
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
MDFG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
MDTG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
PTMN
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
SUNI
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
SMTG
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
chrX
chrY
PARL
chr1
chr2
chr3
chr4
chr5
chr6

## Generate RASQUAL commands

In [29]:
chroms_noXY = ['chr' + str(i) for i in range(1, 23)]
for reg in multi_bam_regs:
    print(reg)
    for chrom in chroms_noXY:
        print(chrom)
        asvcf = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                            + reg + '/' + chrom + '/asvcf.gz', header=None, sep='\t')
        counts = pd.read_csv('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/'
                            + reg + '/' + chrom + '/Y.txt', header=None, sep='\t')
        print(asvcf.head())
        print(counts.head())
        samples_n = counts.shape[1]
        print("Samples (n): ", samples_n)
        testing_snps_l = len(asvcf)
        print("Testing SNPs (l): ", testing_snps_l)
        feature_snps_m = len(counts)
        print("Feature SNPs (m): ", feature_snps_m)
        starts = ','.join([i.split('_')[1] for i in counts[0]])
        #print("Starts: ", starts)
        ends = ','.join([i.split('_')[2] for i in counts[0]])
        #print("Ends: ", ends)
        with open('/oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/scripts/'
                     + reg + '/' + chrom + '/run_rasqual.sh', 'w') as outfile:
            for feat in range(1, feature_snps_m + 1):
                rasqual_cmd = 'zcat /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                                + reg + '/' + chrom + '/asvcf.gz | /home/users/soumyak/rasqual/bin/rasqual -y /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                                + reg + '/' + chrom + '/Y.bin -k /oak/stanford/groups/akundaje/projects/alzheimers_parkinsons/allelic_imbalance/rasqual/input/' \
                                + reg + '/' + chrom + '/K.bin -n ' + str(samples_n) + ' -j ' + str(feat) + ' -l ' + str(testing_snps_l) + ' -m ' \
                                + str(feature_snps_m) + ' -s ' + starts + ' -e ' + ends
                outfile.write(rasqual_cmd + '\n')

CAUD
chr1
     0         1           2  3  4    5     6   \
0  chr1  39822151  rs34640847  G  A  100  PASS   
1  chr1  39822245  rs36015266  C  T  100  PASS   
2  chr1  39822260  rs61779808  T  A  100  PASS   
3  chr1  39822792  rs61779809  C  T  100  PASS   
4  chr1  39825225  rs72666941  C  A  100  PASS   

                                                  7      8        9   ...  \
0  AC=595;AF=0.11881;AN=5008;NS=2504;DP=18110;EAS...  GT:AS  ./.:0,0  ...   
1  AC=595;AF=0.11881;AN=5008;NS=2504;DP=19296;EAS...  GT:AS  ./.:0,0  ...   
2  AC=493;AF=0.0984425;AN=5008;NS=2504;DP=19241;E...  GT:AS  ./.:1,0  ...   
3  AC=595;AF=0.11881;AN=5008;NS=2504;DP=19613;EAS...  GT:AS  ./.:1,0  ...   
4  AC=477;AF=0.0952476;AN=5008;NS=2504;DP=20561;E...  GT:AS  ./.:1,0  ...   

        31       32       33       34       35       36       37       38  \
0  ./.:0,0  ./.:0,0  ./.:0,0  ./.:1,0  ./.:0,0  ./.:0,0  ./.:0,0  ./.:0,0   
1  ./.:0,1  ./.:1,0  ./.:0,0  ./.:0,0  ./.:0,0  ./.:0,0  ./.:1,0  ./.:0,

EmptyDataError: No columns to parse from file