# setup

In [1]:
import os
import json
import re
import pandas as pd

# Get the OTUs affected by the corruption of SILVA

In [2]:

# -------------------------
# Paths (edit if needed)
# -------------------------
# Tab-separated SINTAX result table for all OTUs; read further down by
# load_true_sintax_table().
true_path = "/home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/taxonomy_reference/silva-138.2/vsearch_all_OTUs/repseqs_sintax_v123.txt"

# JSON file holding the list of removed species names; read further down by
# load_removed_species_list(). The affected-OTU id file is written next to it.
removed_species_path = "/home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/taxonomy_reference/silva-138.2/vsearch_incomplete_species_fromOTUS_predictions/removed_species_20pct_seed123.json"


# -------------------------
# Helpers
# -------------------------
# Single-letter rank prefixes as they appear in SINTAX strings
# (e.g. "k:Bacteria(1.00),p:Firmicutes(0.99),...,s:Name(0.97)").
# NOTE(review): RANKS is not referenced anywhere in the visible part of this
# file -- confirm whether a later cell uses it.
RANKS = ["k", "p", "c", "o", "f", "g", "s"]

def _clean_name(x):
    """Return a normalized taxonomy name.

    The value is stringified and trimmed, everything from the first "("
    onwards (the confidence suffix) is discarded, and one pair of matching
    surrounding single or double quotes is removed. ``None`` and blank input
    both yield the empty string.
    """
    if x is None:
        return ""
    # Keep only the text that precedes the first "(" (if any).
    label = str(x).strip().partition("(")[0].strip()
    # Unwrap one layer of matching quotes; a lone quote character is left as is.
    if len(label) >= 2 and label[0] == label[-1] and label[0] in ("'", '"'):
        label = label[1:-1].strip()
    return label

def is_unknown(name):
    """Tell whether a taxonomy name should be treated as unknown.

    After normalization with ``_clean_name`` a name counts as unknown when it
    is empty, equals "unknown" (case-insensitive), contains "__unknown"
    (case-insensitive), or starts with the exact prefix "Unknown_".
    """
    label = _clean_name(name)
    if not label:
        return True
    lowered = label.lower()
    if lowered == "unknown" or "__unknown" in lowered:
        return True
    # The prefix test is deliberately done on the original casing.
    return label.startswith("Unknown_")

def load_removed_species_list(json_path):
    """Load the removed-species JSON file and return the names as a set.

    Each entry is normalized with ``_clean_name``; entries that normalize to
    the empty string are dropped.

    Raises:
        ValueError: if the top-level JSON value is not a list.
    """
    with open(json_path) as handle:
        payload = json.load(handle)
    if not isinstance(payload, list):
        raise ValueError("removed_species JSON must be a list of strings.")
    normalized = (_clean_name(entry) for entry in payload)
    return {name for name in normalized if name}

def drop_conf_and_extract_taxonomy(raw):
    """Turn a raw SINTAX string into a canonical "rank:name" list.

    Example input:
    "k:Bacteria(1.00),p:Firmicutes(0.99),...,s:Acholeplasma_laidlawii(0.97)"

    Comma-separated fields without a ":" are ignored, as are fields whose rank
    part is empty. Names are normalized with ``_clean_name``; a field whose
    name normalizes to nothing is kept as "rank:". ``None`` or blank input
    yields the empty string.
    """
    if raw is None:
        return ""
    text = str(raw).strip()
    if not text:
        return ""
    kept = []
    for field in text.rstrip(";").split(","):
        rank, colon, label = field.strip().partition(":")
        if not colon:
            # No rank separator in this field: nothing to keep.
            continue
        rank = rank.strip()
        if rank:
            # An empty cleaned name naturally produces the bare "rank:" form.
            kept.append(rank + ":" + _clean_name(label))
    return ",".join(kept)

def load_true_sintax_table(path):
    """Read a tab-separated SINTAX output file into a two-column DataFrame.

    The first attempt takes the OTU id from column 0 and the taxonomy from
    column 3. If that read fails for any reason, or the taxonomy column turns
    out to be completely empty, the file is re-read using columns 0 and 1 and
    the raw SINTAX string in column 1 is converted with
    ``drop_conf_and_extract_taxonomy``.

    Returns:
        DataFrame with string columns ``otu_id`` and ``taxonomy``.
    """
    try:
        table = pd.read_csv(
            path,
            sep="\t",
            header=None,
            engine="python",
            usecols=[0, 3],
            names=["otu_id", "taxonomy"],
            dtype=str,
        )
        table["taxonomy"] = table["taxonomy"].fillna("").astype(str)
        # An all-empty taxonomy column is treated like a failed read.
        if table["taxonomy"].str.len().sum() == 0:
            raise ValueError("taxonomy column empty; fallback to raw parsing")
    except Exception:
        # Any failure above falls through to the raw-column parse below.
        pass
    else:
        return table

    raw_table = pd.read_csv(
        path,
        sep="\t",
        header=None,
        engine="python",
        usecols=[0, 1],
        names=["otu_id", "raw_sintax"],
        dtype=str,
    )
    raw_table["raw_sintax"] = raw_table["raw_sintax"].fillna("").astype(str)
    raw_table["taxonomy"] = raw_table["raw_sintax"].apply(drop_conf_and_extract_taxonomy)
    return raw_table[["otu_id", "taxonomy"]]

def extract_species_from_taxonomy(tax_str):
    """Return the species name from a canonical "k:...,p:...,s:Name" string.

    The first comma-separated field that starts with "s:" is used and its
    value is normalized with ``_clean_name``. Returns "" when the input is
    ``None``, blank, or carries no "s:" field.
    """
    if tax_str is None:
        return ""
    text = str(tax_str).strip()
    if not text:
        return ""
    fields = (chunk.strip() for chunk in text.rstrip(";").split(","))
    species_field = next((f for f in fields if f.startswith("s:")), None)
    if species_field is None:
        return ""
    # Drop the two-character "s:" prefix before cleaning.
    return _clean_name(species_field[2:])


# ---------------------------------------------------------------------------
# Driver: find the OTUs whose species label is in the removed-species list,
# print a short report, and write their ids to a text file.
# The module-level names assigned here (true_df, affected_df,
# affected_otu_ids, ...) are left in scope for later steps.
# ---------------------------------------------------------------------------

# Set of normalized species names loaded from the JSON list.
removed_species = load_removed_species_list(removed_species_path)
print("Removed species list loaded:")
print("  Path:", removed_species_path)
print("  |removed_species| = {:,}".format(len(removed_species)))

# Two-column table (otu_id, taxonomy); both columns are forced to clean strings
# so the per-row helpers below never see NaN.
true_df = load_true_sintax_table(true_path)
true_df["otu_id"] = true_df["otu_id"].astype(str).str.strip()
true_df["taxonomy"] = true_df["taxonomy"].fillna("").astype(str)

# Extract species for each OTU
true_df["species_true"] = true_df["taxonomy"].apply(extract_species_from_taxonomy)

# Valid species = non-empty AND not unknown-like
# (is_unknown returns True for empty names, so one negated mask covers both).
valid_species_mask = (~true_df["species_true"].apply(is_unknown))
true_df_valid = true_df[valid_species_mask].copy()

# Affected OTUs = valid species AND species in removed list
affected_mask = true_df_valid["species_true"].isin(removed_species)
affected_df = true_df_valid[affected_mask].copy()

# Report
# These are row counts of the respective frames.
n_total = len(true_df)
n_valid_species = len(true_df_valid)
n_affected = len(affected_df)

print("\nTrue SINTAX table loaded:")
print("  Path:", true_path)
print("  Total OTUs in table: {:,}".format(n_total))
print("  OTUs with valid species label: {:,}".format(n_valid_species))
print("  Affected OTUs (true species in removed list): {:,}".format(n_affected))

# Guard against division by zero when no OTU carries a usable species label.
if n_valid_species > 0:
    print("  Affected fraction among OTUs with valid species: {:.2%}".format(n_affected / float(n_valid_species)))
else:
    print("  Affected fraction: n/a (no valid species labels found)")

# Optional: keep the affected OTU id set for the next steps
# (a set, so repeated ids would collapse and its size could be below n_affected).
affected_otu_ids = set(affected_df["otu_id"].astype(str).str.strip().tolist())

# Optional: save for later use
# The file goes into the same directory as the removed-species JSON,
# one id per line in sorted order; an existing file is overwritten.
out_path = os.path.join(os.path.dirname(removed_species_path), "affected_otu_ids_from_removed_species.txt")
with open(out_path, "w") as f:
    for oid in sorted(affected_otu_ids):
        f.write(str(oid) + "\n")

print("\nSaved affected OTU ids to:")
print("  {}".format(out_path))
print("  |affected_otu_ids| = {:,}".format(len(affected_otu_ids)))


Removed species list loaded:
  Path: /home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/taxonomy_reference/silva-138.2/vsearch_incomplete_species_fromOTUS_predictions/removed_species_20pct_seed123.json
  |removed_species| = 2,058

True SINTAX table loaded:
  Path: /home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/taxonomy_reference/silva-138.2/vsearch_all_OTUs/repseqs_sintax_v123.txt
  Total OTUs in table: 111,870
  OTUs with valid species label: 7,359
  Affected OTUs (true species in removed list): 1,518
  Affected fraction among OTUs with valid species: 20.63%

Saved affected OTU ids to:
  /home/hernan_melmoth/Documents/phd_work/Bio_ontology/MicrobeAtlas/level_97/taxonomy_reference/silva-138.2/vsearch_incomplete_species_fromOTUS_predictions/affected_otu_ids_from_removed_species.txt
  |affected_otu_ids| = 1,518
