In [2]:
import os 
os.chdir("/Users/jazminvaleriano/Library/Mobile Documents/com~apple~CloudDocs/03 UNIFR MS/00. SP25/00.MASTER_THESIS/FINAL_CHAPTERS/13_variants_analyisis")

In [21]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from upsetplot import UpSet, from_memberships

# === CONFIG ===
mapping_file = "01_minimap/barcode42_all_parvoviridae.read_to_ref.tsv"
max_accessions = 10
title = "Read Mapping Overlaps by Accession"

# === Accession → Virus name map ===
accession_map = {
    "MW051666.1": "Bocavirus",
    "NC_038546.1": "Hokovirus",
    "MW853950.1": "Parvovirus 3",
    "PP993478.1": "Parvoviridae"
}

# === LOAD ===
map_df = pd.read_csv(
    mapping_file, sep="\t", header=None, usecols=[0, 1],
    names=["read_id", "accession"], dtype={"read_id": str, "accession": str}
).dropna(subset=["read_id", "accession"])

# --- PRE-CLEAN ---
map_df = map_df.drop_duplicates(subset=["read_id", "accession"])

# --- Filter: remove accession '*' and keep only those in accession_map ---
map_df = map_df[map_df["accession"].isin(accession_map.keys())]

if map_df.empty:
    raise ValueError("No valid mappings left after filtering (check accession map and input).")

# --- Replace accession IDs with human-friendly names ---
map_df["virus"] = map_df["accession"].map(accession_map)

# --- Keep top-N viruses ---
top_viruses = (
    map_df["virus"].value_counts()
    .head(max_accessions)
    .index
    .tolist()
)
filtered = map_df[map_df["virus"].isin(top_viruses)]

# --- Build memberships per read ---
memberships = (
    filtered.groupby("read_id")["virus"]
    .apply(lambda s: tuple(sorted(s.unique())))
    .tolist()
)

memberships = [m for m in memberships if m]

# --- Build data for UpSet ---
data = from_memberships(memberships)

data

Parvoviridae
True    1
True    1
True    1
True    1
True    1
True    1
True    1
True    1
True    1
Name: ones, dtype: int64

In [22]:

# --- Plot ---
plt.figure(figsize=(14.5, 6))
UpSet(
    data,
    sort_by="cardinality",
    sort_categories_by="cardinality",
    show_counts=True,
    subset_size="count"
).plot()

sample_name = os.path.basename(mapping_file).split("_")[0]
plt.suptitle(f"{title} — {sample_name}", fontsize=16, fontweight="bold")
plt.tight_layout()
# plt.savefig(f"{sample_name}_upset.pdf")
plt.show()


AttributeError: 'Index' object has no attribute 'levels'

<Figure size 1450x600 with 0 Axes>

In [None]:

# === PLOT UpSet ===
plt.figure(figsize=(14.5, 6))
up = UpSet(
    data,
    sort_by="cardinality",            # sort intersections by size
    sort_categories_by="cardinality", # sort individual accessions by total size
    show_counts=True
)
up.plot()

plt.suptitle(f"{title} — {sample_name}", fontsize=16, fontweight="bold")
plt.tight_layout()
# plt.savefig(f"{sample_name}_upset.pdf")
plt.show()