In [1]:
from pathlib import Path
import os, sys, json
import pandas as pd
from tqdm import tqdm

In [2]:
# --- project root & local modules ---
# Absolute path to the local checkout of the project; adjust for your machine.
PROJ_ROOT = "/home/hernan_melmoth/Documents/phd_work/otu-taxa-foundation"

# Make the project's `src/` directory importable. The membership check keeps
# the cell idempotent: re-running it does not pile up duplicate sys.path entries.
_SRC_DIR = os.path.join(PROJ_ROOT, "src")
if _SRC_DIR not in sys.path:
    sys.path.append(_SRC_DIR)

from otu_taxa.taxonomy_parsing import (
    load_sintax_table,
    split_tax_path,
    last_contiguous_valid_token,
    token_depth,
    RANKS,
)


In [3]:

# ============================================================
# Configuration: dataset output directory
# ============================================================
DATASET_ROOT = "/home/hernan_melmoth/Documents/phd_work/Microbeatlas_preprocess_training"
dataset_folder_name = "dataset_full_top999"

# <root>/level_97/silva-138.2/incomplete_genus_silva_sintax/<dataset folder>
DATASET_DIR = (
    Path(DATASET_ROOT)
    / "level_97"
    / "silva-138.2"
    / "incomplete_genus_silva_sintax"
    / dataset_folder_name
)
dataset_dir = str(DATASET_DIR)  # plain-string form of the same directory
assert DATASET_DIR.exists(), f"Missing dataset_dir: {DATASET_DIR}"

# The list of affected OTU ids is written inside the dataset directory.
OUT_AFFECTED_PATH = DATASET_DIR.joinpath("affected_otu_ids_from_removed_genus.txt")

# ============================================================
# Configuration: SINTAX inputs (genus corruption)
# ============================================================

# Baseline SINTAX table (no genus removed).
BASE_SINTAX_PATH = Path(
    "/home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/",
    "taxonomy_reference/silva-138.2/vsearch_all_OTUs/repseqs_sintax_v123.txt",
)
assert BASE_SINTAX_PATH.exists(), f"Missing baseline sintax: {BASE_SINTAX_PATH}"

# Directory holding the genus-removed ("corrupted") SINTAX run built from OTU predictions.
GENUS_CORRUPT_DIR = Path(
    "/home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/",
    "taxonomy_reference/silva-138.2/vsearch_incomplete_genus_fromOTUS_predictions",
)

# Expected file name of the corrupted table. If the file on disk is named
# differently (e.g. "repseqs_sintax_incomplete.txt"), change it here.
CORRUPT_SINTAX_PATH = GENUS_CORRUPT_DIR.joinpath("repseqs_sintax_incomplete_genus.txt")
assert CORRUPT_SINTAX_PATH.exists(), f"Missing genus-corrupted sintax: {CORRUPT_SINTAX_PATH}"

# Echo the resolved paths so the run log records which files were used.
print(f"BASE_SINTAX_PATH     = {BASE_SINTAX_PATH}")
print(f"CORRUPT_SINTAX_PATH  = {CORRUPT_SINTAX_PATH}")
print(f"OUT_AFFECTED_PATH    = {OUT_AFFECTED_PATH}")

# ============================================================
# Effective taxonomy extraction (contiguous-valid-prefix policy)
# ============================================================

def effective_label_and_depth(taxonomy_str: str):
    """
    Resolve the effective taxonomy label of one SINTAX taxonomy string
    (contiguous-valid-prefix policy).

    The string is split with ``split_tax_path``; the label is whatever
    ``last_contiguous_valid_token`` selects from the resulting tokens.

    Parameters
    ----------
    taxonomy_str : str
        Raw taxonomy path of a single OTU.

    Returns
    -------
    label_token : str or None
        Token chosen by ``last_contiguous_valid_token``; None when the path
        has no tokens or no token is selected.
    depth : int
        ``token_depth(label_token)`` cast to int, or -1 when there is no
        label or the helper reports no depth.
        (Presumably a rank index, kingdom=0 .. species=6 — confirm against
        ``otu_taxa.taxonomy_parsing``.)
    """
    path_tokens = split_tax_path(taxonomy_str)

    # An empty path short-circuits: the token selector is not called at all.
    chosen = last_contiguous_valid_token(path_tokens) if path_tokens else None
    if chosen is None:
        return None, -1

    rank_idx = token_depth(chosen)
    if rank_idx is None:
        return chosen, -1
    return chosen, int(rank_idx)


def build_otu_effective_map(df: pd.DataFrame):
    """
    Index a SINTAX table by OTU id.

    Parameters
    ----------
    df : pd.DataFrame
        Table with at least the columns ``otu_id`` and ``taxonomy``.

    Returns
    -------
    dict
        ``str(otu_id) -> (label_token, depth, raw_taxonomy_string)``.
        For repeated ``otu_id`` values only the first row is used; a missing
        taxonomy is treated as the empty string.
    """
    unique_rows = df.drop_duplicates("otu_id")
    otu_ids = unique_rows["otu_id"].astype(str)
    raw_paths = unique_rows["taxonomy"].fillna("").astype(str)

    return {
        otu_id: (*effective_label_and_depth(raw_path), raw_path)
        for otu_id, raw_path in zip(otu_ids, raw_paths)
    }


# ============================================================
# Load SINTAX annotations
# ============================================================
# Read each table and immediately index it by OTU id.
base_df = load_sintax_table(str(BASE_SINTAX_PATH))
base_map = build_otu_effective_map(base_df)

corr_df = load_sintax_table(str(CORRUPT_SINTAX_PATH))
corr_map = build_otu_effective_map(corr_df)

# Only OTUs annotated in BOTH tables can be compared; sorted for a stable order.
common_otus = sorted(base_map.keys() & corr_map.keys())
print("[INFO] OTUs in common:", len(common_otus))

# ============================================================
# Identify affected OTUs (genus corruption)
# ============================================================
# An OTU counts as "affected" when its effective label (or its depth) differs
# between the baseline and the corrupted table. `stats` records, per reason,
# how many OTUs fell into each bucket.
stats = dict.fromkeys(
    (
        "baseline_no_label_skipped",
        "corrupted_no_label_affected",
        "label_changed_affected",
        "depth_changed_affected",
    ),
    0,
)
affected_otus = []

for otu in tqdm(common_otus, desc="Detecting affected OTUs (genus corruption)"):
    base_label, base_depth, _ = base_map[otu]
    corr_label, corr_depth, _ = corr_map[otu]

    # Without a usable baseline label there is nothing to compare against.
    if base_label is None:
        stats["baseline_no_label_skipped"] += 1
        continue

    # First matching reason wins; the order mirrors the checks' priority.
    if corr_label is None:
        reason = "corrupted_no_label_affected"   # usable label lost entirely
    elif corr_label != base_label:
        reason = "label_changed_affected"        # effective label differs
    elif corr_depth != base_depth:
        reason = "depth_changed_affected"        # same label, different depth
    else:
        continue                                 # unchanged -> not affected

    stats[reason] += 1
    affected_otus.append(otu)

# ============================================================
# Save
# ============================================================
# One OTU id per line, each terminated by a newline, in detection order.
OUT_AFFECTED_PATH.write_text("".join(f"{otu}\n" for otu in affected_otus))

print("[SAVE] affected OTUs:", OUT_AFFECTED_PATH)
print("[INFO] total affected OTUs:", len(affected_otus))
print("[INFO] stats:", stats)


BASE_SINTAX_PATH     = /home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/taxonomy_reference/silva-138.2/vsearch_all_OTUs/repseqs_sintax_v123.txt
CORRUPT_SINTAX_PATH  = /home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/taxonomy_reference/silva-138.2/vsearch_incomplete_genus_fromOTUS_predictions/repseqs_sintax_incomplete_genus.txt
OUT_AFFECTED_PATH    = /home/hernan_melmoth/Documents/phd_work/Microbeatlas_preprocess_training/level_97/silva-138.2/incomplete_genus_silva_sintax/dataset_full_top999/affected_otu_ids_from_removed_genus.txt
[INFO] OTUs in common: 111870


Detecting affected OTUs (genus corruption): 100%|██████████| 111870/111870 [00:00<00:00, 977422.98it/s]

[SAVE] affected OTUs: /home/hernan_melmoth/Documents/phd_work/Microbeatlas_preprocess_training/level_97/silva-138.2/incomplete_genus_silva_sintax/dataset_full_top999/affected_otu_ids_from_removed_genus.txt
[INFO] total affected OTUs: 12883
[INFO] stats: {'baseline_no_label_skipped': 11042, 'corrupted_no_label_affected': 534, 'label_changed_affected': 12349, 'depth_changed_affected': 0}



