In [31]:
import boto3
import pysam
import os
import pandas as pd

## Setup

In [51]:
# NOTE: machine-specific absolute path; every relative path below is resolved
# against this working directory.
os.chdir('/data/jake/sv-gmm/python')
sv_tbl='../data/sv-modes.tsv'
dirout='../data/variants'
idx_1kg_phase3='../data/20130502.phase3.low_coverage.alignment.index' # from: aws s3 ls s3://1000genomes/alignment_indices/
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the mkdir, and the cell is safe to re-run.
os.makedirs(dirout, exist_ok=True)

In [34]:
# Read the tab-delimited SV table and preview its first rows.
df = pd.read_csv(
    sv_tbl,
    delimiter='\t',
)
df.head()

Unnamed: 0,SV ID,# modes predicted,chr,start,stop,allele frequency,length,Total # samples,# reference samples,Mode 1,Mode 2,Mode 3
0,UW_VH_9038,1,19,54887338,54888354,0.014,1016,51,0,"['HG01125', 'HG01896', 'HG01956', 'HG02009', '...",,
1,SI_BD_10797,1,10,86801825,86802449,0.033,624,283,139,"['HG00154', 'HG00159', 'HG00250', 'HG00264', '...",,
2,UW_VH_19141,1,3,177294474,177297489,0.032,3015,133,1,"['HG00180', 'HG00185', 'HG00189', 'HG00237', '...",,
3,DEL_pindel_47187,2,18,45379612,45379612,0.55,195,847,66,"['HG00114', 'HG00132', 'HG00142', 'HG00150', '...","['HG00108', 'HG00111', 'HG00121', 'HG00125', '...",
4,DEL_pindel_24042,2,7,136996507,136996739,0.071,232,184,48,"['HG01241', 'HG01259', 'HG01488', 'HG01890', '...","['HG01125', 'HG01392', 'HG01403', 'HG01556', '...",


In [35]:
# Replace the original headers with short snake_case names (same column order).
df.columns = [
    'id',
    'n_modes',
    'chr',
    'start',
    'stop',
    'allele_freq',
    'length',
    'n_samples',
    'n_ref_samples',
    'mode_1',
    'mode_2',
    'mode_3',
]
df.head()

Unnamed: 0,id,n_modes,chr,start,stop,allele_freq,length,n_samples,n_ref_samples,mode_1,mode_2,mode_3
0,UW_VH_9038,1,19,54887338,54888354,0.014,1016,51,0,"['HG01125', 'HG01896', 'HG01956', 'HG02009', '...",,
1,SI_BD_10797,1,10,86801825,86802449,0.033,624,283,139,"['HG00154', 'HG00159', 'HG00250', 'HG00264', '...",,
2,UW_VH_19141,1,3,177294474,177297489,0.032,3015,133,1,"['HG00180', 'HG00185', 'HG00189', 'HG00237', '...",,
3,DEL_pindel_47187,2,18,45379612,45379612,0.55,195,847,66,"['HG00114', 'HG00132', 'HG00142', 'HG00150', '...","['HG00108', 'HG00111', 'HG00121', 'HG00125', '...",
4,DEL_pindel_24042,2,7,136996507,136996739,0.071,232,184,48,"['HG01241', 'HG01259', 'HG01488', 'HG01890', '...","['HG01125', 'HG01392', 'HG01403', 'HG01556', '...",


In [36]:
# Variant identifiers, in table order, as a plain Python list.
variants = df['id'].tolist()
variants

['UW_VH_9038',
 'SI_BD_10797',
 'UW_VH_19141',
 'DEL_pindel_47187',
 'DEL_pindel_24042',
 'BI_GS_DEL1_B2_P0106_507',
 'BI_GS_DEL1_B4_P2674_173',
 'UW_VH_10394',
 'BI_GS_DEL1_B2_P0114_484']

In [43]:
# Output layout created under `dirout`, one tree per variant:
# variant_name/
# |
# l_ samplot/
# l_ bam/
for v in variants:
    d = os.path.join(dirout, v)
    d_bam = os.path.join(d, 'bam')
    d_samplot = os.path.join(d, 'samplot')
    # makedirs also creates the parent `d`; exist_ok=True makes the loop
    # re-runnable without the separate os.path.exists checks.
    os.makedirs(d_bam, exist_ok=True)
    os.makedirs(d_samplot, exist_ok=True)

In [81]:
# filter index file for mapped reads only
# The first column of the index holds the BAM paths.
paths = pd.read_csv(idx_1kg_phase3,sep='\t').iloc[:,0]
# na=False: rows with a missing path are treated as non-matching instead of
# producing NaN entries in the boolean mask.
mask = paths.str.contains(r'\.mapped\.ILLUMINA.*bam$',regex=True,na=False)
# `mapped` was previously never assigned in this cell (it only existed as
# leftover kernel state), so the cell failed on a fresh kernel.
mapped = paths[mask]
print(mapped.shape)
mapped.head()

(2535,)


0    data/HG00096/alignment/HG00096.mapped.ILLUMINA...
1    data/HG00097/alignment/HG00097.mapped.ILLUMINA...
2    data/HG00099/alignment/HG00099.mapped.ILLUMINA...
3    data/HG00100/alignment/HG00100.mapped.ILLUMINA...
4    data/HG00101/alignment/HG00101.mapped.ILLUMINA...
Name: BAM FILE, dtype: object

In [98]:
def download_1kg_bam_region(
    index_series,
    sample,
    chrom,
    left,
    right,
    outfile,
    bucket='1000genomes',
    phase='phase3'
):
    """Extract one region of a sample's BAM from S3 into a local, indexed BAM.

    The sample's BAM key is looked up in ``index_series``, the matching
    ``.bai`` index is downloaded to a temporary directory, and the reads
    returned by ``fetch(chrom, left, right)`` on the remote BAM are written to
    ``outfile``. ``outfile`` is then indexed with ``pysam.index``.

    Parameters
    ----------
    index_series : pandas.Series of str
        BAM keys relative to ``phase``, e.g.
        ``data/HG01125/alignment/HG01125.mapped.ILLUMINA.bwa.CLM.low_coverage.20120522.bam``.
    sample : str
        Sample name; matched as a literal substring of the keys.
    chrom : str or int
        Contig name; converted with ``str``.
    left, right : int
        Region bounds passed to ``fetch``; converted with ``int``.
    outfile : str
        Path of the region BAM to write.
    bucket : str
        S3 bucket name.
    phase : str
        Key prefix prepended to the entry taken from ``index_series``.

    Returns
    -------
    str
        The ``s3://`` URL of the source BAM.

    Raises
    ------
    ValueError
        If ``right < left``, or if ``sample`` matches no entry or more than
        one entry of ``index_series``.
    """
    import tempfile

    chrom = str(chrom)
    left = int(left)
    right = int(right)
    # Reject an inverted region before doing any network I/O.
    if right < left:
        raise ValueError(f"invalid region: right ({right}) < left ({left})")

    # filter index for sample
    # regex=False: the sample name is matched literally, not as a pattern;
    # na=False: missing entries never match.
    mask = index_series.str.contains(sample, regex=False, na=False)
    n_hits = int(mask.sum())
    if n_hits == 0:
        raise ValueError(f"sample name {sample!r} not found in index")
    if n_hits > 1:
        raise ValueError("sample name matched more than one file in index")

    # S3 keys and URLs always use '/', so build them explicitly rather than
    # with os.path.join.
    # e.g., phase3/data/HG01125/alignment/HG01125.mapped.ILLUMINA.bwa.CLM.low_coverage.20120522.bam
    key_bam = f"{phase}/{index_series[mask].iloc[0].lstrip('/')}"
    key_bai = key_bam + '.bai'
    # e.g., s3://1000genomes/phase3/data/HG01125/alignment/HG01125.mapped.ILLUMINA.bwa.CLM.low_coverage.20120522.bam
    url_bam = f"s3://{bucket}/{key_bam}"

    s3 = boto3.client('s3')
    # The index of the full remote BAM is only needed while fetching, so it
    # lives in a temporary directory and never sits next to `outfile`, where
    # it could be mistaken for the region BAM's own index.
    with tempfile.TemporaryDirectory() as tmpdir:
        local_bai = os.path.join(tmpdir, os.path.basename(key_bai))
        s3.download_file(bucket, key_bai, local_bai)

        with pysam.AlignmentFile(url_bam, "rb", index_filename=local_bai) as f_in:
            with pysam.AlignmentFile(outfile, "wb", header=f_in.header) as f_out:
                for read in f_in.fetch(chrom, left, right):
                    f_out.write(read)
    # index region bam
    pysam.index(outfile)
    return url_bam
# Smoke-test call on a single sample and region.
download_1kg_bam_region(
    mapped,
    sample="HG01125",
    chrom=19,
    left=54886338,
    right=54889354,
    outfile='test.py.bam',
)
        

's3://1000genomes/phase3/data/HG01125/alignment/HG01125.mapped.ILLUMINA.bwa.CLM.low_coverage.20120522.bam'

In [44]:
# Preview the first five rows of the renamed table.
df.head(n=5)

Unnamed: 0,id,n_modes,chr,start,stop,allele_freq,length,n_samples,n_ref_samples,mode_1,mode_2,mode_3
0,UW_VH_9038,1,19,54887338,54888354,0.014,1016,51,0,"['HG01125', 'HG01896', 'HG01956', 'HG02009', '...",,
1,SI_BD_10797,1,10,86801825,86802449,0.033,624,283,139,"['HG00154', 'HG00159', 'HG00250', 'HG00264', '...",,
2,UW_VH_19141,1,3,177294474,177297489,0.032,3015,133,1,"['HG00180', 'HG00185', 'HG00189', 'HG00237', '...",,
3,DEL_pindel_47187,2,18,45379612,45379612,0.55,195,847,66,"['HG00114', 'HG00132', 'HG00142', 'HG00150', '...","['HG00108', 'HG00111', 'HG00121', 'HG00125', '...",
4,DEL_pindel_24042,2,7,136996507,136996739,0.071,232,184,48,"['HG01241', 'HG01259', 'HG01488', 'HG01890', '...","['HG01125', 'HG01392', 'HG01403', 'HG01556', '...",


In [None]:
# Show the index entries for one sample. `str.contains` alone yields only a
# boolean mask, so it is used to select the matching rows of `mapped`.
mapped[mapped.str.contains('NA21143')]

2533    data/NA21143/alignment/NA21143.mapped.ILLUMINA...
Name: BAM FILE, dtype: object

In [28]:
import boto3
import pysam
import os

import tempfile

# Standalone prototype: pull one region of one sample's BAM from S3.
base = 'test'
bucket='1000genomes'
key_idx='phase3/data/HG01125/alignment/HG01125.mapped.ILLUMINA.bwa.CLM.low_coverage.20120522.bam.bai'
# `scratch` was not defined anywhere in this notebook, so the cell raised a
# NameError on a fresh kernel; default to the system temp directory.
scratch = tempfile.gettempdir()
# The downloaded index describes the FULL remote BAM. It gets a '.full' name so
# it cannot be mistaken for the index of the region BAM written below.
out_idx=os.path.join(scratch, f'{base}.full.bam.bai')
#url_idx="s3://1000genomes/phase3/data/HG01125/alignment/HG01125.mapped.ILLUMINA.bwa.CLM.low_coverage.20120522.bam.bai"
url_bam="s3://1000genomes/phase3/data/HG01125/alignment/HG01125.mapped.ILLUMINA.bwa.CLM.low_coverage.20120522.bam"
chrom = "19"
region_l = 54886338
region_r= 54889354
out_bam=os.path.join(scratch, f'{base}.bam')

# download index
s3 = boto3.client('s3')
s3.download_file(bucket, key_idx, out_idx)

## Use pysam's remote access capability
# The remote BAM is opened by URL with the local copy of its index, and only
# the reads returned by fetch() for the region are written out.
with pysam.AlignmentFile(url_bam, "rb", index_filename=out_idx) as f_in:
    with pysam.AlignmentFile(out_bam, "wb", header=f_in.header) as f_out:
        for read in f_in.fetch(chrom, region_l, region_r):
            f_out.write(read)

# index region bam
pysam.index(out_bam)


''

In [29]:
# NOTE(review): presumably a back-of-envelope estimate (per-item cost times
# item count, divided by 60) — the meaning of 2.2 and 4562 is not recorded
# here; confirm and name these constants.
(2.2 * 4562) / 60

167.27333333333337