In [None]:
import os
import json
import numpy as np
from pandas import DataFrame
import matplotlib.pyplot as plt
import matplotlib.patches as patches

%matplotlib inline
# Notebook-wide matplotlib defaults: wide figures and a slightly larger font.
plt.rcParams.update({
    'figure.figsize': [16.5, 5],
    'font.size': 12,
})

In [None]:
import pysam
import itertools

In [None]:
# Directory holding the TRGT output BAMs. The default is a machine-specific
# mount point; set the TRGT_DIR environment variable to run the notebook
# against a copy of the data stored elsewhere.
trgt_dir = os.environ.get(
    "TRGT_DIR",
    "/Volumes/winchester:u6026198/quinlan-shared/data-shared/datasets/Palladium/TRGT/",
)

In [None]:
def get_reads_by_allele(bam, chrom, start):
    """Yield per-allele read sequences for each run of reads sharing a coordinate.

    Reads are pulled from ``bam.fetch(chrom, start, until_eof=True)`` and
    consecutive reads with the same ``(reference_name, pos)`` are grouped.

    Yields
    ------
    (coord, reads_by_allele)
        ``coord`` is the ``(reference_name, pos)`` tuple of the group and
        ``reads_by_allele[i]`` is the list of query sequences of the reads
        whose ``AL`` tag equals ``i``. The outer list has ``max(AL) + 1``
        entries, so alleles without reads appear as empty lists.
    """
    def locus_key(read):
        return (read.reference_name, read.pos)

    alignments = bam.fetch(chrom, start, until_eof=True)
    for locus, group in itertools.groupby(alignments, key=locus_key):
        # Read the allele tag and the sequence once per read.
        tagged = [(read.get_tag("AL"), read.query_sequence) for read in group]
        n_alleles = max(allele for allele, _ in tagged) + 1
        buckets = [[] for _ in range(n_alleles)]
        for allele, sequence in tagged:
            buckets[allele].append(sequence)
        yield locus, buckets


def load_repeat_reads(bam, chrom, start, verbose=False):
    """Return the per-allele read sequences for the locus at ``(chrom, start)``.

    Scans the groups produced by ``get_reads_by_allele(bam, chrom, start)`` and
    returns the first one whose coordinate equals ``(chrom, start)`` exactly.

    Parameters
    ----------
    bam
        Open alignment file, passed straight through to ``get_reads_by_allele``.
    chrom : str
        Reference name of the locus.
    start : int
        Position of the locus; it must match the position reported by the
        reads exactly.
    verbose : bool, default False
        When true, print every coordinate that is scanned (debugging aid).

    Returns
    -------
    list of list
        Entry ``i`` holds the query sequences of the reads assigned to allele ``i``.

    Raises
    ------
    AssertionError
        If no group of reads sits exactly at ``(chrom, start)``.
    """
    target = (chrom, start)
    for coord, reads_by_allele in get_reads_by_allele(bam, chrom, start):
        if verbose:
            print(coord)
        if coord == target:
            return reads_by_allele
    # Raised explicitly (not `assert False`) so the check survives `python -O`
    # and the failure names the locus that was not found.
    raise AssertionError("no reads found at {}:{}".format(chrom, start))




In [None]:
def calc_mad(values):
    """Return the median absolute deviation (MAD) of ``values``.

    The MAD is the median of the absolute differences between each value and
    the median of all values.
    """
    center = np.median(values)
    deviations = []
    for value in values:
        deviations.append(abs(value - center))
    return np.median(deviations)

In [None]:
# Per-allele read-length statistics at chr5:10694357 in sample 2187_D.

# Start from an empty list so the cell runs on a fresh kernel (the list was
# otherwise only created by a later cell) and re-running it does not
# accumulate duplicate entries.
lens_and_mads = []

bam_path = os.path.join(trgt_dir, "2187_D_adotto_v02.spanning.sorted.bam")
# The context manager closes the BAM handle once the reads have been extracted.
with pysam.AlignmentFile(bam_path, "r") as bam:
    reads_by_allele = load_repeat_reads(bam, "chr5", 10694357)

# 100 is subtracted from every read length -- presumably the flanking sequence
# carried by each spanning read; TODO confirm against the TRGT settings.
lens_by_allele = [[len(r) - 100 for r in reads] for reads in reads_by_allele]
for lens in lens_by_allele:
    allele_len = np.median(lens)
    allele_mad = calc_mad(lens)
    lens_and_mads.append((allele_len, allele_mad))

In [None]:
# Per-allele read-length statistics at chrX:147912050 in sample 2187_D.
lens_and_mads = []

bam_path = os.path.join(trgt_dir, "2187_D_adotto_v02.spanning.sorted.bam")
with pysam.AlignmentFile(bam_path, "r") as bam:
    # bam.fetch() yields individual alignments, not per-allele lists, and is a
    # one-shot iterator; load_repeat_reads groups the reads at the locus by
    # their allele tag, which is the structure the code below expects.
    reads_by_allele = load_repeat_reads(bam, "chrX", 147912050)

# 100 is subtracted from every read length -- presumably the flanking sequence
# carried by each spanning read; TODO confirm against the TRGT settings.
lens_by_allele = [[len(r) - 100 for r in reads] for reads in reads_by_allele]
for lens in lens_by_allele:
    allele_len = np.median(lens)
    allele_mad = calc_mad(lens)
    lens_and_mads.append((allele_len, allele_mad))


In [None]:
# Collect the per-allele length statistics used to plot an example locus
# (chrX:147912050) for the BAM files whose name starts with 200102_S_adotto_v02.

lens_and_mads = []

for f in os.listdir(trgt_dir):
    if not f.startswith('200102_S_adotto_v02'):
        continue
    # Keep BAM files only, and skip the ".spanning.bam" ones.
    if not f.endswith(".bam") or f.endswith(".spanning.bam"):
        continue
    bam_path = os.path.join(trgt_dir, f)
    print(bam_path)
    with pysam.AlignmentFile(bam_path, "r") as bam:
        # bam.fetch() yields individual alignments; load_repeat_reads returns
        # the reads at the locus grouped by allele, as the code below expects.
        reads_by_allele = load_repeat_reads(bam, "chrX", 147912050)
    # 100 is subtracted from every read length -- presumably the flanking
    # sequence carried by each spanning read; TODO confirm.
    lens_by_allele = [[len(r) - 100 for r in reads] for reads in reads_by_allele]
    for lens in lens_by_allele:
        allele_len = np.median(lens)
        allele_mad = calc_mad(lens)
        lens_and_mads.append((allele_len, allele_mad))
        

In [None]:
# Per-allele read-length statistics at chrX:147912050 across every BAM in trgt_dir.
lens_and_mads = []

for file in os.listdir(trgt_dir):
    # Keep BAM files only, and skip the ".spanning.bam" ones.
    if not file.endswith(".bam") or file.endswith(".spanning.bam"):
        continue
    # load_repeat_reads takes (bam, chrom, start) with an open alignment file,
    # so open the BAM here instead of passing its path.
    with pysam.AlignmentFile(os.path.join(trgt_dir, file), "r") as bam:
        reads_by_allele = load_repeat_reads(bam, "chrX", 147912050)
    # 100 is subtracted from every read length -- presumably the flanking
    # sequence carried by each spanning read; TODO confirm.
    lens_by_allele = [[len(r) - 100 for r in reads] for reads in reads_by_allele]
    for lens in lens_by_allele:
        allele_len = np.median(lens)
        allele_mad = calc_mad(lens)
        lens_and_mads.append((allele_len, allele_mad))


In [None]:
# Scatter each allele's median length against the MAD of its read lengths.
fig, ax = plt.subplots(figsize=(5, 5))

# Split the (length, MAD) pairs into the x and y series.
lens = [pair[0] for pair in lens_and_mads]
mads = [pair[1] for pair in lens_and_mads]

ax.scatter(lens, mads)
ax.set(xlabel="Allele length (bps)", ylabel="MAD");