In this notebook I will trial a working concept of the core functionality.

Main design:
- Indexed bam file is read by multiple workers in paralllel
- Each worker fetches a region and applies a user-specified function over the region.
- Regions can be specified manually by user, or defined automatically according to different schemes. First scheme to implement is `byfeature` which bins genome into N non-overlapping regions that each contain features of coordinates `(start, end)` such that each feature is contained in only one region.

In [38]:
import multiprocessing as mp
import pysam
import pandas as pd
from datetime import datetime

In [20]:
bamfilepath = "../data/SLX-18505_N701_A03_r2.umiAppend.Aligned.out.featureCounts.sorted.bam"

In [21]:
samfile = pysam.AlignmentFile(bamfilepath, "rb")

In [19]:
samfile.lengths[1:10]

(133797422,
 135086622,
 133275309,
 114364328,
 107043718,
 101991189,
 90338345,
 83257441,
 80373285)

In [4]:
samfile.nreferences

352

In [50]:
def fetch(bamfilepath, contig=None, start=None, end=None):
    samfile = pysam.AlignmentFile(bamfilepath, "rb")
    reads = []
    for read in samfile.fetch(contig, start, end):
        reads.append((read.query_name, read.get_tag("XS:Z")))
    samfile.close()
    return reads

def apply(func, args_df, processes=None, timeout=None):
    results = []
    with mp.Pool(processes=processes) as pool:
        for i in range(len(args_df.index)):
            res = pool.apply_async(func, args_df.iloc[i,])
            results.append(res.get(timeout=timeout))
    return results

def read_bam(bamfilepath, contigs=None, starts=None, ends=None, processes=None, timeout=None, chunks=None):
    kwargs_df = make_fetch_args(bamfilepath, contigs, starts, ends, chunks)
    res = apply(fetch, kwargs_df, processes=processes, timeout=timeout)
    return res

def make_fetch_args(bamfilepath, contigs, starts, ends, chunks):
    with pysam.AlignmentFile(bamfilepath, "rb") as samfile:
        contig_names = samfile.references
        contig_lens = samfile.lengths
    if chunks is None or chunks < len(contig_names):
        kwargs_df = pd.DataFrame({
            "bamfilepath": bamfilepath,
            "contig": contig_names
        })
    else:
        # split contigs by len in order to get right number of chunks
        raise NotImplementedError
    return kwargs_df

In [51]:
print(datetime.now())
bam = read_bam(bamfilepath, processes=1)
print(datetime.now())

2020-11-26 00:17:04.838269
2020-11-26 00:17:10.987579


In [52]:
print(datetime.now())
bam = read_bam(bamfilepath, processes=2)
print(datetime.now())

2020-11-26 00:17:10.994089
2020-11-26 00:17:17.174397


In [53]:
print(datetime.now())
bam = read_bam(bamfilepath, processes=4)
print(datetime.now())

2020-11-26 00:17:17.179578
2020-11-26 00:17:23.461858
