In [1]:
def passes_initial_filters(read, min_mapq=30):
    """Decide whether an alignment record is reliable enough to keep.

    A record is kept only when none of the rejecting flags is set (unmapped,
    mate unmapped, duplicate, QC-fail, secondary, supplementary) and its
    mapping quality reaches ``min_mapq``.

    Parameters
    ----------
    read
        Any object exposing ``mapping_quality``, ``is_unmapped``,
        ``mate_is_unmapped``, ``is_duplicate``, ``is_qcfail``,
        ``is_secondary`` and ``is_supplementary``.
    min_mapq : int, optional
        Smallest mapping quality that is accepted (inclusive). Defaults to
        30, the cutoff that used to be hard-coded here.

    Returns
    -------
    bool
        True when the record passes every check, False otherwise.
    """
    # Both the read and its mate have to be placed on the reference.
    if read.is_unmapped or read.mate_is_unmapped:
        return False
    # Records flagged as duplicates or as failing QC are rejected.
    if read.is_duplicate or read.is_qcfail:
        return False
    # Keep only the primary alignment line of each read.
    if read.is_secondary or read.is_supplementary:
        return False
    return read.mapping_quality >= min_mapq

In [2]:
# Reference names treated as nuclear: "1" .. "22" followed by "X" and "Y".
nuclear_chromosomes = [str(number) for number in range(1, 23)] + ["X", "Y"]
# Reference names treated as the mitochondrial genome.
mt_chromosome = ["MT", "NC_012920.1"]


def is_nuclear_mt_pair(read):
    """Tell whether a read and its mate sit on opposite sides of the nuclear/MT divide.

    Returns True when one of ``read.reference_name`` /
    ``read.next_reference_name`` is listed in ``nuclear_chromosomes`` and the
    other one is listed in ``mt_chromosome``. Returns False in every other
    case, including when the mate has no reference name.
    """
    mate_contig = read.next_reference_name
    if mate_contig is None:
        # No mate reference recorded, so there is no pair to classify.
        return False

    own_contig = read.reference_name
    nuclear_then_mt = own_contig in nuclear_chromosomes and mate_contig in mt_chromosome
    mt_then_nuclear = own_contig in mt_chromosome and mate_contig in nuclear_chromosomes
    return nuclear_then_mt or mt_then_nuclear

In [3]:
import pysam
import pandas as pd

# Input alignment file; the path is relative to the kernel's working directory.
bam_path = "../results/sample_alignment_sorted.bam"

# Column order of the exported table. Declared explicitly so that the CSV
# still gets a header row when no candidate read is found.
candidate_columns = [
    "query_name",
    "ref_name",
    "ref_start",
    "mate_ref_name",
    "mate_start",
    "mapq",
    "cigar",
    "is_reverse",
    "mate_is_reverse",
]

# One dict per retained read (not per pair), so a pair whose two mates both
# pass the filters can contribute two rows.
numt_candidates = []

with pysam.AlignmentFile(bam_path, "rb") as bamfile:
    # until_eof=True streams the records in file order and does not require a
    # BAM index. The unmapped records it additionally yields are rejected by
    # passes_initial_filters, so the retained reads are unchanged.
    for read in bamfile.fetch(until_eof=True):
        if not (passes_initial_filters(read) and is_nuclear_mt_pair(read)):
            continue

        numt_candidates.append({
            "query_name": read.query_name,
            "ref_name": read.reference_name,
            "ref_start": read.reference_start,
            "mate_ref_name": read.next_reference_name,
            "mate_start": read.next_reference_start,
            "mapq": read.mapping_quality,
            "cigar": read.cigarstring,
            "is_reverse": read.is_reverse,
            "mate_is_reverse": read.mate_is_reverse
        })

print(f"Found {len(numt_candidates)} NUMT candidate reads")
# Passing `columns` keeps the schema stable even for an empty candidate list.
df = pd.DataFrame(numt_candidates, columns=candidate_columns)
df.to_csv("../results/NUMT_candidates.csv", index=False)

Found 143 NUMT candidate reads
