Helper functions for identifying and filtering clipped reads

In [None]:
# Function to identify clips in reads

import pysam

# This function identifies a soft clip within the cigar string of a read
def identify_clip(read):
    """Return the soft-clipped bases at one end of a read.

    The first and last CIGAR operations are inspected. A leading soft clip
    takes priority; the trailing soft clip is returned only when the read
    has no leading one.

    Args:
        read: An object exposing ``cigartuples`` (list of ``(op, length)``
            pairs, or None) and ``query_sequence`` (str, or None).

    Returns:
        The clipped bases as a string, or None when the read has no CIGAR,
        no stored sequence, or no soft clip at either end.
    """
    cigar = read.cigartuples
    seq = read.query_sequence

    # Records without a CIGAR or without a stored sequence cannot be sliced.
    if not cigar or seq is None:
        return None

    soft_clip_op = 4  # 4 -> "S" which equates to a soft clip
    first_op, first_len = cigar[0]
    last_op, last_len = cigar[-1]

    # The length checks keep a zero-length op from being treated as a clip;
    # without them seq[-0:] would return the entire read.
    if first_op == soft_clip_op and first_len > 0:
        return seq[:first_len]
    if last_op == soft_clip_op and last_len > 0:
        return seq[-last_len:]

    return None

# This function returns the Phred+33 quality string for the soft-clipped portion of a read
def get_clipped_qual(read, clipped_seq):
    """Return the Phred+33 quality string for a read's soft-clipped bases.

    The end of the read is chosen the same way as in ``identify_clip``: a
    leading soft clip takes priority over a trailing one.

    Args:
        read: An object exposing ``cigartuples`` (list of ``(op, length)``
            pairs, or None) and ``query_qualities`` (sequence of integer
            quality scores, or None).
        clipped_seq: Unused; kept so existing callers do not break.

    Returns:
        The qualities of the clipped bases, each encoded as ``chr(q + 33)``,
        or None when the read has no CIGAR, no stored qualities, or no soft
        clip at either end.
    """
    cigar = read.cigartuples
    qual_scores = read.query_qualities

    # Qualities may be absent from a record; slicing None would raise.
    if not cigar or qual_scores is None:
        return None

    soft_clip_op = 4  # CIGAR code for "S"
    first_op, first_len = cigar[0]
    last_op, last_len = cigar[-1]

    # The length checks keep a zero-length op from being treated as a clip;
    # without them qual_scores[-0:] would return every quality value.
    if first_op == soft_clip_op and first_len > 0:
        clipped_qual = qual_scores[:first_len]
    elif last_op == soft_clip_op and last_len > 0:
        clipped_qual = qual_scores[-last_len:]
    else:
        return None

    return "".join(chr(q + 33) for q in clipped_qual)

Loops through the clusters and extracts soft-clipped sequences: the portions of reads that do not match the reference at the aligned position but may align elsewhere in the reference genome.

In [None]:
import pandas as pd
import pysam

# Input and output locations, relative to the notebook directory.
CSV_PATH = "../results/filtered_NUMT_candidates.csv"
BAM_PATH = "../results/sample_alignment_sorted.bam"
CLIPPED_FASTQ = "../results/clipped_reads.fq"
CLIPPED_CSV = "../results/clipped_reads_with_cluster.csv"

BIN_SIZE = 50 # Width of each cluster bin, in bases
BUFFER = 1000 # Bases added on each side of the bin when fetching reads
MIN_CLIP_LEN = 10 # Minimum clip length for a read to be kept

df_clusters = pd.read_csv(CSV_PATH)

# One (read name, clipped bases, clipped qualities, cluster row index) tuple
# per retained read.
clipped_reads = []

# Open the BAM in a context manager so the handle is closed once the scan
# finishes, even if fetch() raises part-way through.
# NOTE(review): fetch() by region presumably needs the BAM index next to
# BAM_PATH — confirm it is generated before this cell runs.
with pysam.AlignmentFile(BAM_PATH, "rb") as bam:
    for index, cluster in df_clusters.iterrows():
        chrom = cluster["ref_name"]
        # Cast to int in case pandas parsed bin_start as a float column.
        bin_start = int(cluster["bin_start"])
        start = max(0, bin_start - BUFFER)
        end = bin_start + BIN_SIZE + BUFFER

        for read in bam.fetch(chrom, start, end):
            # Skip reads with no CIGAR or no soft-clip operation at all.
            if read.cigarstring is None:
                continue
            if "S" not in read.cigarstring:
                continue

            clipped_seq = identify_clip(read)
            if clipped_seq is None or len(clipped_seq) < MIN_CLIP_LEN:
                continue

            qual_str = get_clipped_qual(read, clipped_seq)
            if qual_str is None or len(qual_str) < MIN_CLIP_LEN:
                continue

            clipped_reads.append((read.query_name, clipped_seq, qual_str, index))

Wrote 1342 clipped reads to ../results/clipped_reads.fq and ../results/clipped_reads_with_cluster.csv


Writes the clipped reads to a FASTQ file and a new CSV file

In [None]:
# Each clipped read becomes one four-line FASTQ record; the cluster index is
# not part of the FASTQ output and only goes into the CSV below.
fastq_records = (
    f"@{qname}\n{seq}\n+\n{qual}\n" for qname, seq, qual, _ in clipped_reads
)
with open(CLIPPED_FASTQ, "w") as fq_out:
    fq_out.writelines(fastq_records)

# The same tuples, with the cluster index retained, are saved as a table.
column_names = ["read_name", "seq", "qual", "cluster_idx"]
clipped_df = pd.DataFrame(data=clipped_reads, columns=column_names)
clipped_df.to_csv(CLIPPED_CSV, index=False)

print(f"Wrote {len(clipped_reads)} clipped reads to {CLIPPED_FASTQ} and {CLIPPED_CSV}")

Runs bwa and samtools to align the clipped sequences to the mitochondrial (mt) reference genome and produce a sorted, indexed BAM file

In [None]:
import subprocess

# Aligner executable and mitochondrial reference.
# NOTE(review): bwa mem presumably needs an existing `bwa index` for
# MTDNA_REF — confirm the index is built before this cell runs.
BWA_PATH = "bwa"
MTDNA_REF = "../data/human_mtDNA.fa"

# Intermediate and final alignment files.
SAM_FILE = "../results/clipped_vs_mtDNA.sam"
BAM_FILE = "../results/clipped_vs_mtDNA.bam"
SORTED_BAM = "../results/clipped_vs_mtDNA_sorted.bam"

# Options are placed before the positional arguments. Passing "-o" after the
# inputs only works where getopt reorders argv; elsewhere it would be read
# as another input file.
# Align clipped reads -> SAM, convert to BAM, coordinate-sort, then index.
# check=True stops the cell at the first command that fails.
subprocess.run([BWA_PATH, "mem", "-o", SAM_FILE, MTDNA_REF, CLIPPED_FASTQ], check=True)
subprocess.run(["samtools", "view", "-bS", "-o", BAM_FILE, SAM_FILE], check=True)
subprocess.run(["samtools", "sort", "-o", SORTED_BAM, BAM_FILE], check=True)
subprocess.run(["samtools", "index", SORTED_BAM], check=True)

print(f"Sorted BAM and index ready: {SORTED_BAM}")