Reads the NUMT candidate CSV file into a pandas DataFrame

In [None]:
import pandas as pd

# Relative path of the NUMT candidate table.
CSV_PATH = "../results/NUMT_candidates.csv"

# Parse the CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

Groups reads that lie close to each other on the reference into clusters

In [None]:
# Clustering parameters.
BIN_SIZE = 500  # Width of one positional bin, in bases
MIN_NUM_READS = 2  # Minimum number of reads a cluster needs to be kept

# Snap each read's reference start down to the start of its bin.
df["bin_start"] = df["ref_start"].floordiv(BIN_SIZE).mul(BIN_SIZE)

# One cluster per combination of reference name, bin, and `is_reverse` flag.
cluster_keys = ["ref_name", "bin_start", "is_reverse"]
grouped = df.groupby(cluster_keys)

# Per cluster: number of non-null read names plus mean read / mate start positions.
cluster_stats = grouped.agg(
    num_reads=pd.NamedAgg(column="query_name", aggfunc="count"),
    avg_ref_start=pd.NamedAgg(column="ref_start", aggfunc="mean"),
    avg_mate_start=pd.NamedAgg(column="mate_start", aggfunc="mean"),
)
clustered = cluster_stats.reset_index()

# Keep only clusters that reach the minimum read count.
has_enough_reads = clustered["num_reads"].ge(MIN_NUM_READS)
filtered_clusters = clustered.loc[has_enough_reads]

for stage, table in (("before", clustered), ("after", filtered_clusters)):
    print(f"Clusters {stage} filtering: {len(table)}")

Clusters before filtering: 1716
Clusters after filtering: 184


Views the top 10 clusters and visualizes all filtered clusters

In [None]:
# View top 10 clusters (the ones with the highest read counts)

top_clusters = filtered_clusters.sort_values("num_reads", ascending=False).head(10)
print(top_clusters)

import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of mean cluster start positions, stacked and coloured by reference name.
# An explicit Axes is used so the plot and its legend can be addressed directly.
fig, ax = plt.subplots(figsize=(12, 5))

sns.histplot(
    data=filtered_clusters,
    x="avg_ref_start",
    hue="ref_name",
    multiple="stack",
    bins=50,
    palette="tab20",
    ax=ax,
)

ax.set_xlabel("Position on chromosome")
ax.set_ylabel("Number of clusters")
ax.set_title("NUMT Candidate Clusters per Chromosome")

# seaborn builds the hue legend from its own handles and the plotted bars carry
# no labels, so calling plt.legend() here would replace that legend with an
# empty one. Reposition the existing legend instead; skip this when no legend
# was created (e.g. nothing was plotted).
if ax.get_legend() is not None:
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1.05, 1), title="Chromosome")

fig.tight_layout()
plt.show()

Saves the filtered clusters to a new CSV file

In [5]:
# Write the filtered clusters to disk as CSV, leaving out the DataFrame index.

NEW_CSV_PATH = "../results/filtered_NUMT_candidates.csv"

filtered_clusters.to_csv(path_or_buf=NEW_CSV_PATH, index=False)