In [1]:
# Function to define initial quality filters

MAPQ_THRESHOLD = 30

def passes_initial_filters(read):
    return (
        read.mapping_quality >= MAPQ_THRESHOLD and
        not read.is_unmapped and
        not read.mate_is_unmapped and
        not read.is_duplicate and
        not read.is_qcfail and
        not read.is_secondary and
        not read.is_supplementary
    )

In [2]:
# Functions to identify reads that align with nuclear and mitochondrial sequences

CHR_SYNONYMS = {
    "MT": {"MT", "chrM", "NC_012920.1", "M", "mt"},
    "NUC": set([*(map(str, range(1,23))), "X","Y", *[f"chr{i}" for i in range(1,23)], "chrX","chrY"])
}

def is_mt(ref):
    return ref is not None and ref.upper() in CHR_SYNONYMS["MT"]

def is_nuc(ref):
    return  ref is not None and ref.upper() in CHR_SYNONYMS["NUC"]

In [3]:
# Function to identify paired reads that contain both nuclear and mtDNA

def is_nuclear_mt_pair(read):
    r1, r2 = read.reference_name, read.next_reference_name
    if r1 is None or r2 is None:
        return False
    return (is_nuc(r1) and is_mt(r2)) or (is_mt(r1) and is_nuc(r2))

In [4]:
# Loops through BAM file to identify NUMT candidates

import pysam
import csv

BAM_PATH = "../results/sample_alignment_sorted.bam"
OUTPUT_CSV = "../results/NUMT_candidates.csv"

count = 0

with pysam.AlignmentFile(BAM_PATH, "rb") as bamfile, open(OUTPUT_CSV, "w", newline='') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=[
        "query_name", "ref_name", "ref_start", "mate_ref_name", "mate_start",
        "read_length", "mapq", "cigar", "is_reverse", "mate_is_reverse"
    ])
    writer.writeheader()
    seen_pairs = set()

    for read in bamfile.fetch():
        if not passes_initial_filters(read):
            continue
        if not is_nuclear_mt_pair(read):
            continue

        pair_key = tuple(sorted([read.query_name, read.next_reference_name]))
        if pair_key in seen_pairs:
            continue
        seen_pairs.add(pair_key)

        insert_size = abs(read.template_length)

        writer.writerow({
            "query_name": read.query_name,
            "ref_name": read.reference_name,
            "ref_start": read.reference_start,
            "mate_ref_name": read.next_reference_name,
            "mate_start": read.next_reference_start,
            "read_length": read.query_length,
            "mapq": read.mapping_quality,
            "cigar": read.cigarstring,
            "is_reverse": read.is_reverse,
            "mate_is_reverse": read.mate_is_reverse
        })

        count += 1

print(f"Found {count} NUMT candidate reads")

Found 1917 NUMT candidate reads
