In [1]:
from pathlib import Path
import os, sys, json
import pandas as pd
from tqdm import tqdm

In [2]:
# --- project root & local modules ---
# Append the project's `src/` directory to sys.path before importing from `otu_taxa`
# (presumably that is where the package lives — confirm against the repo layout).
# NOTE(review): PROJ_ROOT is a machine-specific absolute path; adjust it when
# running this notebook on another machine.
PROJ_ROOT = "/home/hernan_melmoth/Documents/phd_work/otu-taxa-foundation"
sys.path.append(os.path.join(PROJ_ROOT, "src"))

# NOTE(review): `load_sintax_table` and `RANKS` are defined again further down in
# this notebook; once that cell runs, the local definitions shadow these imports.
from otu_taxa.taxonomy_parsing import (
    load_sintax_table,
    split_tax_path,
    last_contiguous_valid_token,
    token_depth,
    RANKS,
)


In [4]:
# ============================================================
# Genus-corruption affected OTU list (FULL, repo-ready cell)
#   Definition:
#     An OTU is "affected" iff:
#       (1) Baseline (non-corrupted) taxonomy has a non-missing genus, and
#       (2) Corrupted taxonomy genus is missing OR different from baseline genus.
#
#   Output:
#     <dataset_dir>/affected_otu_ids_from_removed_genus.txt
# ============================================================

In [5]:

# -----------------------------
# Machine-specific roots
#   The defaults reproduce the original layout. To run on another machine,
#   set the environment variables below instead of editing this cell.
# -----------------------------
DATASET_ROOT = os.environ.get(
    "OTU_TAXA_DATASET_ROOT",
    "/home/hernan_melmoth/Documents/phd_work/Microbeatlas_preprocess_training",
)
TAXONOMY_REF_ROOT = Path(
    os.environ.get(
        "OTU_TAXA_TAXONOMY_REF_ROOT",
        "/home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/"
        "taxonomy_reference/silva-138.2",
    )
)

# -----------------------------
# Dataset output directory
# -----------------------------
dataset_folder_name = "dataset_full_top999"

dataset_dir = os.path.join(
    DATASET_ROOT,
    "level_97",
    "silva-138.2",
    "incomplete_genus_silva_sintax",
    dataset_folder_name,
)
DATASET_DIR = Path(dataset_dir)
assert DATASET_DIR.exists(), f"Missing dataset_dir: {DATASET_DIR}"

# The list of affected OTU ids is written next to the dataset it describes.
OUT_AFFECTED_PATH = DATASET_DIR / "affected_otu_ids_from_removed_genus.txt"

# -----------------------------
# SINTAX inputs (baseline vs genus-corrupted)
# -----------------------------
BASE_SINTAX_PATH = TAXONOMY_REF_ROOT / "vsearch_all_OTUs" / "repseqs_sintax_v123.txt"
assert BASE_SINTAX_PATH.exists(), f"Missing baseline sintax: {BASE_SINTAX_PATH}"

GENUS_CORRUPT_DIR = TAXONOMY_REF_ROOT / "vsearch_incomplete_genus_fromOTUS_predictions"

# Adjust if your filename differs
# (an alternative name used for this file is "repseqs_sintax_incomplete.txt").
CORRUPT_SINTAX_PATH = GENUS_CORRUPT_DIR / "repseqs_sintax_incomplete_genus.txt"

assert CORRUPT_SINTAX_PATH.exists(), f"Missing genus-corrupted sintax: {CORRUPT_SINTAX_PATH}"

print("BASE_SINTAX_PATH    =", BASE_SINTAX_PATH)
print("CORRUPT_SINTAX_PATH =", CORRUPT_SINTAX_PATH)
print("OUT_AFFECTED_PATH   =", OUT_AFFECTED_PATH)

# ============================================================
# IO + parsing utilities
# ============================================================

# Single-letter rank prefixes as they appear in SINTAX tokens ("g:<name>", ...);
# presumably kingdom, phylum, class, order, family, genus, species — confirm.
# NOTE(review): this rebinds the `RANKS` name imported from
# otu_taxa.taxonomy_parsing at the top of the notebook, and nothing in the
# visible code reads it (parse_tax_to_cols keeps its own local list).
RANKS = ["k", "p", "c", "o", "f", "g", "s"]

def load_sintax_table(path: str) -> pd.DataFrame:
    """
    Load SINTAX output into a 2-column dataframe:
      otu_id | taxonomy

    Supports common SINTAX tab layouts:
      - (0,3): otu_id, ..., taxonomy
      - (0,1): otu_id, raw_sintax (taxonomy embedded with confidences)

    The (0,3) layout is tried first. If parsing it fails, the file is re-read
    using the (0,1) layout and the "(confidence)" suffixes are stripped from
    every token.

    Parameters
    ----------
    path : str
        Path of the tab-separated SINTAX file (no header row).

    Returns
    -------
    pd.DataFrame
        Independent frame with string columns ``otu_id`` and ``taxonomy``;
        missing taxonomy values are returned as "".

    Raises
    ------
    OSError
        If the file cannot be opened. It is re-raised directly because the
        fallback layout cannot help with a missing or unreadable file.
    """

    def drop_conf(s: str) -> str:
        """Strip "(conf)" suffixes from a raw SINTAX string; keep only "rank:name" tokens."""
        if pd.isna(s):
            return ""
        parts = []
        for p in s.strip().rstrip(";").split(","):
            if ":" not in p:
                continue
            parts.append(p.split("(", 1)[0].strip())
        return ",".join(parts)

    try:
        df = pd.read_csv(
            path,
            sep="\t",
            header=None,
            engine="python",
            usecols=[0, 3],
            names=["otu_id", "taxonomy"],
            dtype=str,
        )
    except OSError:
        # Missing/unreadable file: do not mask it by retrying another layout.
        raise
    except Exception:
        # Best-effort fallback: the file does not have a 4th column, so read the
        # raw prediction from column 1 instead.
        df_raw = pd.read_csv(
            path,
            sep="\t",
            header=None,
            engine="python",
            usecols=[0, 1],
            names=["otu_id", "raw_sintax"],
            dtype=str,
        )
        # Build a fresh frame rather than slicing df_raw, so the column
        # assignments below never write into a view (SettingWithCopyWarning).
        df = pd.DataFrame(
            {
                "otu_id": df_raw["otu_id"],
                "taxonomy": df_raw["raw_sintax"].apply(drop_conf),
            }
        )

    df["otu_id"] = df["otu_id"].astype(str)
    df["taxonomy"] = df["taxonomy"].fillna("").astype(str)
    return df

def _is_missing_name(x: str) -> bool:
    if not isinstance(x, str):
        return True
    s = x.strip()
    return (s == "") or (s.lower() in {"unknown", "__unknown"})

def parse_tax_to_cols(series: pd.Series) -> pd.DataFrame:
    """
    Parse SINTAX taxonomy strings into per-rank name columns (no rank prefix).
    Output columns: k,p,c,o,f,g,s (strings; missing -> "").

    Each value is split on commas into "rank:name" tokens. The rank prefix is
    lower-cased, any "(confidence)" suffix is dropped from the name, and tokens
    whose rank is not one of the output columns are ignored. If a rank appears
    more than once, the last occurrence wins.

    Parameters
    ----------
    series : pd.Series
        Taxonomy strings; None / NaN / pd.NA entries yield an all-"" row.

    Returns
    -------
    pd.DataFrame
        One row per input element, sharing the index of `series`. The frame is
        always returned with the seven rank columns, even for empty input.
    """
    ranks = ["k", "p", "c", "o", "f", "g", "s"]

    def to_record(tax) -> dict:
        """Parse one taxonomy value into a {rank: name} dict."""
        out = dict.fromkeys(ranks, "")

        # Treat every flavour of missing value as an empty string.
        if tax is None or tax is pd.NA or (isinstance(tax, float) and tax != tax):
            return out
        s = str(tax).strip().rstrip(";")
        if not s:
            return out

        # SINTAX typically uses commas between tokens
        for part in s.split(","):
            part = part.strip()
            if ":" not in part:
                continue
            r, name = part.split(":", 1)
            r = r.strip().lower()
            name = name.split("(", 1)[0].strip()  # drop confidence if present
            if r in out:
                out[r] = name
        return out

    # Build the frame once from plain dicts: creating a pd.Series per row via
    # Series.apply is far slower, and returns a Series (not a DataFrame) when
    # the input is empty.
    records = [to_record(tax) for tax in series]
    return pd.DataFrame(records, index=series.index, columns=ranks)

# ============================================================
# Build genus-affected OTU list
# ============================================================

# Load both SINTAX tables, keeping the first row seen for each OTU id.
base_df = load_sintax_table(str(BASE_SINTAX_PATH)).drop_duplicates("otu_id")
corr_df = load_sintax_table(str(CORRUPT_SINTAX_PATH)).drop_duplicates("otu_id")

# Expand each taxonomy string into one column per rank.
base_cols = parse_tax_to_cols(base_df["taxonomy"])
corr_cols = parse_tax_to_cols(corr_df["taxonomy"])

base = pd.concat([base_df[["otu_id"]], base_cols], axis=1)
corr = pd.concat([corr_df[["otu_id"]], corr_cols], axis=1)

# Only OTUs present in both tables are compared (inner join).
m = base.merge(corr, on="otu_id", how="inner", suffixes=("_base", "_corr"))
print("[INFO] OTUs in common:", len(m))

# (1) baseline has a genus; (2) corrupted genus is missing or differs from it.
mask_base_has_g = ~m["g_base"].apply(_is_missing_name)
corr_genus_missing = m["g_corr"].apply(_is_missing_name)
corr_genus_differs = m["g_corr"].astype(str) != m["g_base"].astype(str)
mask_corr_diff_or_missing = corr_genus_missing | corr_genus_differs

# Select the affected rows once and reuse them for the id list, stats and preview.
affected_mask = mask_base_has_g & mask_corr_diff_or_missing
affected_rows = m.loc[affected_mask]
affected_otus = affected_rows["otu_id"].astype(str).tolist()

# Split the affected rows by cause: genus dropped vs. genus renamed.
n_base_with_genus = int(mask_base_has_g.sum())
affected_genus_missing = affected_rows["g_corr"].apply(_is_missing_name)
affected_genus_changed = (~affected_genus_missing) & (
    affected_rows["g_corr"].astype(str) != affected_rows["g_base"].astype(str)
)

stats = {
    "common_otus": int(len(m)),
    "baseline_has_genus": n_base_with_genus,
    "affected_genus": int(len(affected_otus)),
    # max(1, ...) guards against division by zero when no baseline genus exists.
    "affected_genus_rate_among_baseline_genus": float(len(affected_otus) / max(1, n_base_with_genus)),
    "affected_due_to_missing_genus_in_corrupted": int(affected_genus_missing.sum()),
    "affected_due_to_changed_genus_name": int(affected_genus_changed.sum()),
}

# ============================================================
# Save
# ============================================================

# One OTU id per line.
with open(OUT_AFFECTED_PATH, "w") as f:
    f.writelines(f"{otu}\n" for otu in affected_otus)

print("[SAVE] affected OTUs:", OUT_AFFECTED_PATH)
print("[INFO] stats:", stats)

# Optional: preview a few affected OTUs with baseline vs corrupted genus/family
preview = affected_rows[["otu_id", "f_base", "g_base", "f_corr", "g_corr"]].head(20)
display(preview)


BASE_SINTAX_PATH    = /home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/taxonomy_reference/silva-138.2/vsearch_all_OTUs/repseqs_sintax_v123.txt
CORRUPT_SINTAX_PATH = /home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/taxonomy_reference/silva-138.2/vsearch_incomplete_genus_fromOTUS_predictions/repseqs_sintax_incomplete_genus.txt
OUT_AFFECTED_PATH   = /home/hernan_melmoth/Documents/phd_work/Microbeatlas_preprocess_training/level_97/silva-138.2/incomplete_genus_silva_sintax/dataset_full_top999/affected_otu_ids_from_removed_genus.txt
[INFO] OTUs in common: 111870
[SAVE] affected OTUs: /home/hernan_melmoth/Documents/phd_work/Microbeatlas_preprocess_training/level_97/silva-138.2/incomplete_genus_silva_sintax/dataset_full_top999/affected_otu_ids_from_removed_genus.txt
[INFO] stats: {'common_otus': 111870, 'baseline_has_genus': 52669, 'affected_genus': 11393, 'affected_genus_rate_among_baseline_genus': 0.21631320131386583, 'affected_due_to_missi

Unnamed: 0,otu_id,f_base,g_base,f_corr,g_corr
15,90_7210;96_31636;97_40024,Rhodocyclaceae,Azospira,,
20,90_2541;96_51254;97_66013,Family_XII,Fusibacter,,
27,90_3972;96_64159;97_83210,Porphyromonadaceae,Candidatus_Armantifilum,Porphyromonadaceae,
34,90_278;96_4367;97_39663,Syntrophaceae,Smithella,Syntrophaceae,
36,90_16778;96_700;97_818,Planctomycetaceae,Planctomyces,Planctomycetaceae,Planctopirus
44,90_1;96_62565;97_81083,Burkholderiales_Incertae_Sedis,Thiomonas,,
55,90_17;96_1095;97_10583,Xanthomonadaceae,Lysobacter,Xanthomonadaceae,
61,90_2424;96_29098;97_36694,Lachnospiraceae,Lachnoclostridium,Lachnospiraceae,
64,90_494;96_12680;97_15390,Planctomycetaceae,Planctomyces,Planctomycetaceae,
73,90_273;96_11755;97_14224,Erysipelotrichaceae,Turicibacter,,
