In [None]:
import json
import os
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd

# File paths
INPUT_FILE = "outputs/Vaxjo_PMIDs_mechanism_summary_raw_outputs_llama3.2.jsonl"
OUTPUT_IMAGE_FILE = "outputs/Vaxjo_PMIDs_mechanism_subtype_frequency.png"
OUTPUT_TEXT_FILE = "outputs/Vaxjo_PMIDs_mechanism_subtypes_alphabetical.txt"
OUTPUT_CSV_FILE = "outputs/Vaxjo_PMIDs_mechanism_subtypes_frequency.csv"

def clean_raw_string(raw_str):
    """
    Strip surrounding whitespace and a Markdown code fence from an LLM output string.

    A leading ``` (optionally followed by the tag "json" in any letter case) and a
    trailing ``` are removed, and the remaining text is stripped again. Text
    without fences is returned stripped but otherwise unchanged.
    """
    fence = "```"
    text = raw_str.strip()

    if text.startswith(fence):
        text = text[len(fence):]
        # Match the language tag case-insensitively so that "```JSON" is handled
        # the same way as "```json" instead of leaving "JSON" in front of the payload.
        if text[:4].lower() == "json":
            text = text[4:]
        text = text.strip()
    if text.endswith(fence):
        text = text[:-len(fence)].strip()

    return text

def parse_inner_json(raw_str, line_num):
    """
    Clean an LLM output string with clean_raw_string() and decode it as JSON.

    Args:
        raw_str: The value to decode; expected to be a string.
        line_num: Line number reported in the skip message.

    Returns:
        The decoded JSON value, or None (after printing a skip message) when
        the value is not a string or does not contain valid JSON.
    """
    try:
        raw_clean = clean_raw_string(raw_str)
        return json.loads(raw_clean)
    # AttributeError / TypeError cover a value that is not a string (for example
    # an already-decoded object); without them one odd record aborts the whole run.
    except (json.JSONDecodeError, AttributeError, TypeError):
        print(f"‚ö†Ô∏è  Skipping line {line_num}: failed to parse inner JSON.")
        return None

def analyze_subtype_frequency(filepath):
    """
    Count "mechanism subtype" values in a JSONL file and export the results.

    Each line of *filepath* is decoded as JSON; its "raw" field is parsed with
    parse_inner_json(), and every non-empty string found under
    "mechanism_subtypes" -> "mechanism subtype" is counted. Records whose
    structure does not match (non-object line, missing or non-list
    "mechanism_subtypes", non-object entries, non-string subtype) are skipped.

    Side effects (paths come from the module-level OUTPUT_* constants):
      - OUTPUT_TEXT_FILE: alphabetical (case-insensitive) list with frequencies
      - OUTPUT_CSV_FILE: the same table as CSV
      - OUTPUT_IMAGE_FILE: horizontal bar chart of the 25 most frequent subtypes

    Returns None. Prints a message and returns early when the input file is
    missing or when no subtypes are found.
    """
    all_subtypes = []

    # Ensure every output directory exists (not only the image directory).
    for out_path in (OUTPUT_IMAGE_FILE, OUTPUT_TEXT_FILE, OUTPUT_CSV_FILE):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    print(f"üìÇ Reading data from {filepath}...\n")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, start=1):
                try:
                    outer = json.loads(line)
                except json.JSONDecodeError:
                    print(f"‚ö†Ô∏è  Skipping line {line_num}: invalid outer JSON.")
                    continue

                # A syntactically valid line can still be a list, number or string.
                if not isinstance(outer, dict):
                    continue

                raw = outer.get("raw")
                if not raw:
                    continue

                inner_data = parse_inner_json(raw, line_num)
                if not isinstance(inner_data, dict):
                    continue

                # The key may be absent, null, or not a list in malformed output.
                entries = inner_data.get("mechanism_subtypes")
                if not isinstance(entries, list):
                    continue

                for entry in entries:
                    if not isinstance(entry, dict):
                        continue
                    subtype = entry.get("mechanism subtype")
                    # Only strings are counted: other values cannot be sorted
                    # with the .str accessor below and may be unhashable.
                    if isinstance(subtype, str) and subtype:
                        all_subtypes.append(subtype)

    except FileNotFoundError:
        print(f"‚ùå File not found: {filepath}")
        return

    if not all_subtypes:
        print("‚ö†Ô∏è  No mechanism subtypes found.")
        return

    print("\n‚úÖ --- Analysis Complete ---")

    # Frequency count -> DataFrame
    counts = Counter(all_subtypes)
    df = pd.DataFrame(counts.items(), columns=['Subtype', 'Frequency'])

    print(f"\nüî¢ Unique mechanism subtypes: {len(df)}")

    # Top 30 preview
    print("\nüîù Top 30 Most Common Mechanism Subtypes:")
    for subtype, count in counts.most_common(30):
        print(f"- {subtype}: {count}")

    # Alphabetically sorted DataFrame (case-insensitive)
    df_sorted = df.sort_values(by="Subtype", key=lambda col: col.str.casefold())

    # Save to TXT (pretty aligned)
    with open(OUTPUT_TEXT_FILE, "w", encoding="utf-8") as f:
        f.write(f"Unique mechanism subtypes: {len(df_sorted)}\n\n")
        f.write(f"{'Subtype':50s} | Frequency\n")
        f.write("-" * 65 + "\n")
        for name, freq in zip(df_sorted['Subtype'], df_sorted['Frequency']):
            f.write(f"{name:<50} | {freq}\n")
    print(f"\nüìù Alphabetical list with frequencies saved to '{OUTPUT_TEXT_FILE}'")

    # Save to CSV (for Excel/analysis)
    df_sorted.to_csv(OUTPUT_CSV_FILE, index=False)
    print(f"üßæ CSV saved to '{OUTPUT_CSV_FILE}'")

    # Plotting (top 25 by frequency)
    df_plot = df.sort_values(by="Frequency", ascending=False).head(25)

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(12, 10))

    ax.barh(df_plot['Subtype'], df_plot['Frequency'], color='skyblue')
    # barh draws the first row at the bottom; invert so the largest bar is on top.
    ax.invert_yaxis()
    ax.set_xlabel('Frequency Count', fontsize=12)
    ax.set_title('Top Immune Mechanism Subtypes', fontsize=16, pad=20)
    ax.tick_params(axis='y', labelsize=10)

    # Value label just to the right of each bar
    for bar_index, value in enumerate(df_plot['Frequency']):
        ax.text(value + 0.5, bar_index, str(value), color='gray', va='center', fontweight='medium')

    plt.tight_layout()
    plt.savefig(OUTPUT_IMAGE_FILE, dpi=300)
    plt.close(fig)

    print(f"\nüìä Bar chart saved as '{OUTPUT_IMAGE_FILE}'")

# Entry point: run the subtype-frequency analysis on the configured INPUT_FILE.
if __name__ == "__main__":
    analyze_subtype_frequency(INPUT_FILE)


In [None]:
import pandas as pd
import plotly.express as px
import os
import re

# === File paths ===
INPUT_CSV = "outputs/Vaxjo_PMIDs_mechanism_subtypes_frequency.csv"
OUTPUT_GROUPED_CSV = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_grouped.csv"
OUTPUT_SUNBURST_HTML = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_sunburst.html"
OUTPUT_TREEMAP_HTML = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_treemap.html"

# === Keyword-based grouping rules ===
# Family name -> keywords. assign_family() walks this dict in insertion order
# and returns the first family with a matching keyword, so earlier families take
# precedence (e.g. "IFN" is claimed by "Cytokine signaling / production" before
# "Interferon / antiviral signaling" is reached). Keywords are used as
# unanchored, case-insensitive regular expressions.
FAMILY_RULES = {
    "Dendritic cell activation": ["dendritic"],
    "TLR signaling": ["TLR"],
    "Pattern recognition / PRR sensing": ["PRR", "pattern recognition"],
    "NLRP3 inflammasome activation": ["NLRP3", "inflammasome"],
    "T cell activation / polarization": [
        "T cell", "Th1", "Th2", "Th17", "T helper", "T-cell"
    ],
    "B cell / antibody production": [
        "B cell", "antibody", "humoral", "IgG", "plasma cell"
    ],
    "Cytokine signaling / production": [
        "cytokine", "interleukin", "IFN", "TNF", "IL-"
    ],
    "Macrophage / innate immune activation": [
        "macrophage", "innate", "monocyte", "neutrophil", "NK cell", "natural killer"
    ],
    "Complement / depot / formulation": [
        "complement", "depot", "emulsion", "formulation", "alum"
    ],
    "Interferon / antiviral signaling": ["interferon", "IFN"],
    "Inflammatory response": ["inflamm"],
    "Adjuvant synergy / immune modulation": [
        "modulat", "enhanc", "promotion", "stimul", "activation of immune"
    ],
    "Antigen presentation / APCs": [
        "antigen", "APC", "presentation", "cross-presentation"
    ],
    "STING / TRIF / MyD88 / RIG-I signaling": [
        "STING", "TRIF", "MyD88", "RIG", "NOD"
    ],
}

# === Grouping function ===
def assign_family(subtype: str) -> str:
    """
    Return the first family in FAMILY_RULES with a keyword found in *subtype*.

    Matching is a case-insensitive regex search. Returns "Other / Unclassified"
    when nothing matches or when *subtype* is not a string.
    """
    # pandas.read_csv turns empty cells and tokens such as "NA" / "N/A" into NaN
    # (a float); re.search would raise TypeError on such values.
    if not isinstance(subtype, str):
        return "Other / Unclassified"
    for family, keywords in FAMILY_RULES.items():
        if any(re.search(kw, subtype, flags=re.IGNORECASE) for kw in keywords):
            return family
    return "Other / Unclassified"

# === Load and process data ===
df = pd.read_csv(INPUT_CSV)
if "Subtype" not in df.columns or "Frequency" not in df.columns:
    raise ValueError("CSV must contain columns: 'Subtype' and 'Frequency'")

df["Family"] = df["Subtype"].apply(assign_family)

# === Summarize grouped frequencies ===
# Per-family totals, largest first. Note that the CSV written below is the
# per-subtype frame `df` (with the added Family column), not this summary.
grouped = df.groupby("Family", as_index=False)["Frequency"].sum().sort_values(
    "Frequency", ascending=False
)

# === Save grouped data ===
# Every output of this cell goes to a sub-directory of outputs/, so create the
# directory of each target path rather than only "outputs".
for out_path in (OUTPUT_GROUPED_CSV, OUTPUT_SUNBURST_HTML, OUTPUT_TREEMAP_HTML):
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
df.to_csv(OUTPUT_GROUPED_CSV, index=False)
print(f"‚úÖ Grouped data saved to: {OUTPUT_GROUPED_CSV}")

# === Plot: Sunburst ===
fig_sunburst = px.sunburst(
    df,
    path=["Family", "Subtype"],
    values="Frequency",
    color="Family",
    color_discrete_sequence=px.colors.qualitative.Set3,
    title="Immune Mechanism Hierarchy (Sunburst)",
)
fig_sunburst.update_traces(textinfo="label+value+percent parent")
fig_sunburst.write_html(OUTPUT_SUNBURST_HTML)
print(f"üåû Sunburst plot saved to: {OUTPUT_SUNBURST_HTML}")

# === Plot: Treemap ===
fig_treemap = px.treemap(
    df,
    path=["Family", "Subtype"],
    values="Frequency",
    color="Family",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
    title="Immune Mechanism Hierarchy (Treemap)",
)
fig_treemap.update_traces(textinfo="label+value+percent parent")
fig_treemap.write_html(OUTPUT_TREEMAP_HTML)
print(f"üå≥ Treemap plot saved to: {OUTPUT_TREEMAP_HTML}")

# === Optional display ===
fig_sunburst.show()
fig_treemap.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import os

# === File paths ===
INPUT_GROUPED_CSV = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_grouped.csv"
OUTPUT_BAR_PNG = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_families_bar1.png"
OUTPUT_TREEMAP_PNG = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_families_treemap1.png"

# === Load grouped data ===
df = pd.read_csv(INPUT_GROUPED_CSV)
if "Family" not in df.columns or "Frequency" not in df.columns:
    raise ValueError("Input file must include columns: 'Family' and 'Frequency'")

# === Summarize by top-level family ===
family_summary = (
    df.groupby("Family", as_index=False)["Frequency"]
    .sum()
    .sort_values("Frequency", ascending=True)
)

# The bar chart is written under outputs/grouped_tree/ and the treemap HTML
# directly under outputs/; make sure both directories exist.
os.makedirs("outputs", exist_ok=True)
os.makedirs(os.path.dirname(OUTPUT_BAR_PNG) or ".", exist_ok=True)

# === 1. Horizontal Bar Chart ===
plt.style.use("seaborn-v0_8-whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))

bars = ax.barh(family_summary["Family"], family_summary["Frequency"], color="steelblue", alpha=0.8)
ax.set_xlabel("Frequency Count", fontsize=12)
ax.set_title("Immune Mechanism Families", fontsize=15, pad=15)
ax.tick_params(axis="y", labelsize=10)

# Add value labels just to the right of each bar
for bar in bars:
    width = bar.get_width()
    ax.text(width + 2, bar.get_y() + bar.get_height()/2,
            f"{int(width)}", va='center', fontsize=9, color='gray')

plt.tight_layout()
plt.savefig(OUTPUT_BAR_PNG, dpi=400)
plt.close(fig)
print(f"‚úÖ Horizontal bar chart saved to: {OUTPUT_BAR_PNG}")

# === 2. Simplified Treemap (Top Families Only) ===
fig_treemap = px.treemap(
    family_summary,
    path=["Family"],
    values="Frequency",
    color="Frequency",
    color_continuous_scale="Blues",
    title="Immune Mechanism Families (Simplified Treemap)",
)

# The treemap is exported as interactive HTML; OUTPUT_TREEMAP_PNG is not used here.
fig_treemap.write_html("outputs/Vaxjo_PMIDs_mechanism_families_treemap.html")
print("‚úÖ Saved interactive treemap as HTML instead of PNG")


# === Optional interactive preview ===
fig_treemap.show()


In [None]:
import pandas as pd
import os
from collections import defaultdict

# === Input and output ===
INPUT_GROUPED_CSV = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_grouped.csv"
OUTPUT_TREE_TXT = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_tree.txt"

# === Load data ===
df = pd.read_csv(INPUT_GROUPED_CSV)
if not {"Family", "Subtype", "Frequency"}.issubset(df.columns):
    raise ValueError("CSV must have columns: Family, Subtype, Frequency")

# === Compute total frequency per family ===
family_totals = df.groupby("Family")["Frequency"].sum().to_dict()

# === Group subtypes under each family ===
tree = defaultdict(list)
for fam_name, sub_name, sub_freq in zip(df["Family"], df["Subtype"], df["Frequency"]):
    tree[fam_name].append((sub_name, int(sub_freq)))

# === Sort families by total frequency (descending) ===
sorted_families = sorted(family_totals.items(), key=lambda x: x[1], reverse=True)

# === Build tree-like text ===
lines = []
for fam, fam_total in sorted_families:
    lines.append(f"{fam} ({fam_total})")
    # Subtypes within a family are listed by frequency, largest first.
    subs = sorted(tree[fam], key=lambda x: x[1], reverse=True)
    for i, (sub, freq) in enumerate(subs):
        # The last child of a family gets the closing connector.
        connector = "‚îî‚îÄ" if i == len(subs) - 1 else "‚îú‚îÄ"
        lines.append(f"   {connector} {sub} ({freq})")
    lines.append("")  # blank line between families

# === Save to file ===
# Create the directory the file is actually written to, not just "outputs".
os.makedirs(os.path.dirname(OUTPUT_TREE_TXT) or ".", exist_ok=True)
with open(OUTPUT_TREE_TXT, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print(f"‚úÖ Hierarchical text tree saved to: {OUTPUT_TREE_TXT}")


In [None]:
import pandas as pd
import os
import re
from collections import defaultdict

# === File paths ===
INPUT_GROUPED_CSV = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_grouped.csv"
OUTPUT_TREE_TXT = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_tree_multilevel.txt"

# === Load data ===
df = pd.read_csv(INPUT_GROUPED_CSV)
if not {"Family", "Subtype", "Frequency"}.issubset(df.columns):
    raise ValueError("CSV must contain columns: Family, Subtype, Frequency")

# --- Utility: simple plural/singular normalization ---
def normalize_text(s):
    """Strip *s* (after str()) and rewrite a fixed set of plural nouns to singular.

    The replacements are whole-word and case-insensitive; the replacement text
    is always lowercase.
    """
    plural_rules = (
        (r"\bresponses\b", "response"),
        (r"\bactivations\b", "activation"),
        (r"\bcells\b", "cell"),
        (r"\bcytokines\b", "cytokine"),
        (r"\bantibodies\b", "antibody"),
        (r"\bmechanisms\b", "mechanism"),
        (r"\bpathways\b", "pathway"),
    )
    text = str(s).strip()
    for pattern, singular in plural_rules:
        text = re.sub(pattern, singular, text, flags=re.IGNORECASE)
    return text

df["Subtype"] = df["Subtype"].apply(normalize_text)

# === Define sub-branch keyword rules ===
# Family -> {sub-branch -> keyword regexes}. The assignment loop below takes the
# FIRST sub-branch (in insertion order) with a matching keyword; keywords are
# searched case-insensitively and are not anchored.
SUBBRANCH_RULES = {
    "TLR signaling": {
        "TLR2 branch": ["TLR2"],
        "TLR3 branch": ["TLR3"],
        "TLR4 branch": ["TLR4"],
        "TLR5 branch": ["TLR5"],
        "TLR7/8 branch": ["TLR7", "TLR8"],
        "TLR9 branch": ["TLR9"],
        "Other TLR-related": ["toll-like receptor"]
    },
    "T cell activation / polarization": {
        # "(?!\d)" keeps "Th1" from also matching "Th17"; a plain "Th1" listed
        # first would send every Th17 subtype to the Th1 branch.
        "Th1 related": [r"Th1(?!\d)"],
        "Th2 related": ["Th2"],
        "Th17 related": ["Th17"],
        "CD4/CD8 related": ["CD4", "CD8"],
        "Tfh related": ["Tfh"],
    },
    "Dendritic cell activation": {
        "DC maturation": ["maturation"],
        "DC polarization": ["polarization"],
        "Plasmacytoid DC": ["plasmacytoid"],
        "Other DC activation": ["dendritic"],
    },
    "Cytokine signaling / production": {
        "Interleukins": ["IL-"],
        "Interferons": ["IFN"],
        "TNF": ["TNF"],
        "Other cytokines": ["cytokine"],
    },
    "Pattern recognition / PRR sensing": {
        "PRR family": ["PRR"],
        "RIG-I-like": ["RIG"],
        "NOD-like": ["NOD"],
        "Pattern recognition": ["pattern recognition"],
    },
    "Macrophage / innate immune activation": {
        "Macrophage": ["macrophage"],
        "NK / Monocyte": ["NK", "monocyte"],
        "Innate immune cells": ["innate"],
    },
}

# === Build multilevel tree ===
# tree[family][sub-branch] -> list of (subtype, frequency)
tree = defaultdict(lambda: defaultdict(list))
family_totals = df.groupby("Family")["Frequency"].sum().to_dict()

for fam, subtype, raw_freq in zip(df["Family"], df["Subtype"], df["Frequency"]):
    freq = int(raw_freq)
    # First sub-branch with a matching keyword wins; families without rules and
    # subtypes without any match are collected under "Other".
    target_branch = "Other"
    for subbranch, kws in SUBBRANCH_RULES.get(fam, {}).items():
        if any(re.search(kw, subtype, re.IGNORECASE) for kw in kws):
            target_branch = subbranch
            break
    tree[fam][target_branch].append((subtype, freq))


def branch_sort(item):
    """Sort key: sub-branches by total frequency (descending), 'Other' last."""
    name, entries = item
    if name.lower() == "other":
        return (1e6, 0)  # push to bottom
    return (-sum(freq for _, freq in entries), 0)


# === Build nicely formatted tree ===
lines = []
for fam, fam_total in sorted(family_totals.items(), key=lambda x: x[1], reverse=True):
    lines.append(f"{fam} ({fam_total})")
    subbranches = tree[fam]

    for subbranch, entries in sorted(subbranches.items(), key=branch_sort):
        branch_total = sum(freq for _, freq in entries)
        lines.append(f"   ‚îú‚îÄ {subbranch} ({branch_total})")

        entries_sorted = sorted(entries, key=lambda x: x[1], reverse=True)
        for i, (subtype, freq) in enumerate(entries_sorted):
            connector = "   ‚îÇ    ‚îî‚îÄ" if i == len(entries_sorted) - 1 else "   ‚îÇ    ‚îú‚îÄ"
            lines.append(f"{connector} {subtype} ({freq})")
    lines.append("")

# === Save output ===
# Create the directory the file is actually written to, not just "outputs".
os.makedirs(os.path.dirname(OUTPUT_TREE_TXT) or ".", exist_ok=True)
with open(OUTPUT_TREE_TXT, "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print(f"‚úÖ Clean multi-level hierarchy saved to: {OUTPUT_TREE_TXT}")


In [None]:
919 810 7910

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Build a multi-level immune mechanism hierarchy (Family -> Sub-branch -> Subtype)
from outputs/grouped_tree/Vaxjo_PMIDs_mechanism_grouped.csv and write a clean text tree
for manuscript/supplementary material.

Features:
- Sub-branch mapping for the families listed in FAMILY_TO_SUBBRANCH; subtypes of any
  other family are placed in a single "Other" branch.
- Case-insensitive regex matching for sub-branch assignment.
- Singular/plural normalization to reduce duplicates.
- Deterministic sorting by frequency (desc) with "Other" always printed last.
- Optional per-family text files and a summary CSV.

Output:
- outputs/grouped_tree/Vaxjo_PMIDs_mechanism_tree_master1.txt
- outputs/grouped_tree/Vaxjo_PMIDs_mechanism_tree_summary1.csv
- outputs/trees/<Family>.txt  (optional; toggle PER_FAMILY_FILES)
"""

import os
import re
import pandas as pd
from collections import defaultdict

# =========== CONFIG ===========
INPUT_GROUPED_CSV = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_grouped.csv"
OUTPUT_TREE_TXT = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_tree_master1.txt"
OUTPUT_SUMMARY_CSV = "outputs/grouped_tree/Vaxjo_PMIDs_mechanism_tree_summary1.csv"
PER_FAMILY_FILES = True  # set to False if you don't want per-family text files

# Create the directories the outputs are actually written to; creating only
# "outputs" would leave outputs/grouped_tree missing on a fresh run.
for _out_path in (OUTPUT_TREE_TXT, OUTPUT_SUMMARY_CSV):
    os.makedirs(os.path.dirname(_out_path) or ".", exist_ok=True)
if PER_FAMILY_FILES:
    os.makedirs("outputs/trees", exist_ok=True)

# ========= NORMALIZATION HELPERS =========
def normalize_text(s: str) -> str:
    """Return *s* as a stripped string with light plural and typography clean-up.

    Steps, in order: whole-word, case-insensitive plural -> singular rewrites
    for a fixed word list; collapse of whitespace runs to one space; replacement
    of the listed dash/dot character sequences with "-".
    """
    plural_rules = (
        (r"\bresponses\b", "response"),
        (r"\bactivations\b", "activation"),
        (r"\bcells\b", "cell"),
        (r"\bcytokines\b", "cytokine"),
        (r"\bantibodies\b", "antibody"),
        (r"\bpathways\b", "pathway"),
        (r"\bmechanisms\b", "mechanism"),
    )
    text = str(s).strip()
    # common plurals -> singular
    for pattern, singular in plural_rules:
        text = re.sub(pattern, singular, text, flags=re.IGNORECASE)
    # tidy spaces, then hyphen-like / dot-like sequences
    text = re.sub(r"\s+", " ", text)
    for dash_like in ("‚Äì", "‚Äî", "¬∑"):
        text = text.replace(dash_like, "-")
    return text

# Collapsing near-duplicates (case-insensitive). Add more rules if needed.
CANONICAL_REPLACEMENTS = [
    (r"\bT[- ]?cell\b", "T cell"),
    (r"\bT[- ]?cells\b", "T cell"),
    (r"\bT[- ]?helper\b", "T helper"),
    (r"\bTh[- ]?1\b", "Th1"),
    (r"\bTh[- ]?2\b", "Th2"),
    (r"\bTh[- ]?17\b", "Th17"),
    (r"\bIFN ?- ?Œ≥\b", "IFN-Œ≥"),
    (r"\bNF.?Œ∫B\b", "NF-Œ∫B"),
]

def canonicalize(s: str) -> str:
    """Normalize *s* with normalize_text(), then apply every
    CANONICAL_REPLACEMENTS rule in list order (case-insensitive)."""
    result = normalize_text(s)
    for rule in CANONICAL_REPLACEMENTS:
        pattern, replacement = rule
        result = re.compile(pattern, flags=re.IGNORECASE).sub(replacement, result)
    return result

# ========= MAPPING: FAMILY -> SUB-BRANCH -> KEYWORDS =========
# The assignment loop further down takes the FIRST sub-branch (in insertion
# order) whose any pattern matches, case-insensitively. Order inside each
# family therefore decides which branch wins when several patterns match.
FAMILY_TO_SUBBRANCH = {
    "T cell activation / polarization": {
        "T cell branch": [r"T cell", r"T-cell", r"T lymphocyte"],
        # "(?!\d)" keeps "Th1" from also matching "Th17"; a plain "Th1" listed
        # first would send every Th17 subtype to the Th1 branch.
        "Th1 branch": [r"Th1(?!\d)"],
        "Th2 branch": [r"Th2"],
        "Th17 branch": [r"Th17"],
        "CD4/CD8 branch": [r"CD4", r"CD8"],
        "Tfh branch": [r"Tfh"],
        "Regulatory T cell branch": [r"Treg", r"regulatory T"],
    },
    "Dendritic cell activation": {
        "DC maturation": [r"maturation"],
        "DC polarization": [r"polarization"],
        "Plasmacytoid DC": [r"plasmacytoid"],
        "Antigen presentation-related DC": [r"antigen", r"\bAPC\b", r"presentation"],
        "TLR-related DC": [r"\bTLR"],
        "Other DC activation": [r"dendritic"],
    },
    "TLR signaling": {
        "TLR2 branch": [r"\bTLR2\b"],
        "TLR3 branch": [r"\bTLR3\b"],
        "TLR4 branch": [r"\bTLR4\b"],
        "TLR5 branch": [r"\bTLR5\b"],
        "TLR7/8 branch": [r"\bTLR7\b", r"\bTLR8\b"],
        "TLR9 branch": [r"\bTLR9\b"],
        "MyD88/TRIF-related": [r"MyD88", r"TRIF"],
        "Other TLR-related": [r"toll-?like receptor", r"\bTLR\b"],
    },
    "Cytokine signaling / production": {
        "Interleukins": [r"\bIL[- ]?\d", r"interleukin"],
        "Interferons": [r"\bIFN"],
        "TNF": [r"\bTNF"],
        "Chemokines": [r"chemokine", r"\bCCL", r"\bCXCL"],
        # NOTE(review): anything matching r"\bIL-?1" already matches
        # r"\bIL[- ]?\d" in "Interleukins" above, so only "inflammasome" can
        # route a subtype to this branch. Confirm the intended precedence.
        "Inflammasome / IL-1 family": [r"\bIL-?1", r"inflammasome"],
        "Other cytokines": [r"cytokine"],
    },
    "Macrophage / innate immune activation": {
        "Macrophage": [r"macrophage"],
        "NK / Monocyte": [r"\bNK\b", r"monocyte"],
        "Innate immune cells": [r"innate"],
        "Neutrophils / Granulocytes": [r"neutrophil", r"granulocyte"],
        "Other innate activation": [r"activation"],
    },
    "Pattern recognition / PRR sensing": {
        "PRR family": [r"\bPRR\b"],
        "RIG-I-like": [r"\bRIG"],
        "NOD-like": [r"\bNOD"],
        "Pattern recognition": [r"pattern recognition"],
        "C-type lectin receptors": [r"Dectin", r"Mincle", r"\bMCL\b"],
        "Other pattern sensors": [r"recognition", r"sensing"],
    },
    "NLRP3 inflammasome activation": {
        "NLRP3 core branch": [r"\bNLRP3\b"],
        "MAPK/JNK pathway": [r"\bMAPK\b", r"\bJNK\b"],
        "Caspase / pyroptosis": [r"caspase", r"pyroptosis"],
        "Other inflammasome activity": [r"inflammasome"],
    },
    "Antigen presentation / APCs": {
        "APC activation": [r"activation", r"\bAPC\b"],
        "Cross-presentation": [r"cross-?presentation", r"\bcross\b"],
        "MHC / Co-stimulation": [r"\bMHC\b", r"\bCD40\b", r"\bCD80\b", r"\bCD86\b", r"co-?stimul"],
        "Migration / trafficking": [r"migration", r"traffick"],
        "Antigen processing / uptake": [r"antigen", r"uptake", r"processing"],
        "Other APC function": [r"presentation"],
    },
    "B cell / antibody production": {
        "B cell activation": [r"\bB cell\b", r"\bB-cell\b"],
        "Antibody production": [r"antibody", r"\bIgG\b", r"\bIgA\b", r"\bIgM\b", r"\bIgE\b"],
        "Humoral immunity": [r"humoral"],
        "Plasma cell / differentiation": [r"\bplasma\b", r"plasmablast"],
        "Germinal center / memory": [r"germinal", r"memory"],
        # "antibody" is already claimed by "Antibody production" above; only
        # r"\bB\b" can route a subtype here.
        "Other B cell mechanisms": [r"\bB\b", r"antibody"],
    },
    "Complement / depot / formulation": {
        "Complement activation": [r"complement"],
        "Depot / release mechanisms": [r"depot", r"release"],
        "Adjuvant formulation / emulsions": [r"\balum\b", r"emulsion", r"formulation"],
        "Other": [r"activation"],
    },
    "STING / TRIF / MyD88 / RIG-I signaling": {
        "STING": [r"\bSTING\b"],
        "TRIF": [r"\bTRIF\b"],
        "MyD88": [r"\bMyD88\b"],
        "RIG-I-like": [r"\bRIG"],
        "NOD-like": [r"\bNOD"],
        "Other signaling adaptors": [r"adaptor", r"signaling"],
    },
    "Inflammatory response": {
        "Pro-inflammatory genes": [r"inflamm", r"NF[- ]?Œ∫B", r"NF[- ]?kB", r"NFkB"],
        "Cytokine-mediated inflammation": [r"cytokine"],
        "Chemokine signaling": [r"chemokine", r"\bCCL", r"\bCXCL"],
        "Immune suppression / regulation": [r"regulation", r"inhibition"],
        "Other": [r"response", r"activation"],
    },
    "Adjuvant synergy / immune modulation": {
        "Immune enhancement": [r"enhanc", r"promotion"],
        "Costimulation": [r"co-?stimul", r"\bCD40\b", r"\bCD86\b"],
        "Immune modulation": [r"modulat"],
        "Synergy": [r"synerg", r"combination", r"co-?activation"],
        "Other": [r"activation"],
    },
}

# ========= DATA LOAD =========
df = pd.read_csv(INPUT_GROUPED_CSV)
required_cols = {"Family", "Subtype", "Frequency"}
if not required_cols.issubset(df.columns):
    raise ValueError(f"CSV must contain columns: {required_cols}")

# Normalize subtypes
df["Subtype"] = df["Subtype"].apply(canonicalize)

# Canonicalization can make several rows of one family textually identical
# (e.g. "T-cells ..." and "T cell ..."). Merge them so each subtype appears once
# with the summed frequency; family and branch totals are unaffected.
df = df.groupby(["Family", "Subtype"], as_index=False, sort=False)["Frequency"].sum()

# ========= BUILD TREE =========
# tree[Family][Sub-branch]["entries"] = list of (Subtype, Frequency)
tree = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

# Precompute family totals
family_totals = df.groupby("Family")["Frequency"].sum().to_dict()

# Assign each row to a sub-branch (first match wins). Otherwise goes to "Other".
for fam, raw_subtype, raw_freq in zip(df["Family"], df["Subtype"], df["Frequency"]):
    subtype = str(raw_subtype)
    freq = int(raw_freq)

    target_branch = "Other"
    # Families without an entry in the mapping fall straight through to "Other".
    for subbranch_name, kw_list in FAMILY_TO_SUBBRANCH.get(fam, {}).items():
        if any(re.search(kw, subtype, flags=re.IGNORECASE) for kw in kw_list):
            target_branch = subbranch_name
            break

    tree[fam][target_branch]["entries"].append((subtype, freq))

# ========= OUTPUT BUILDERS =========
def sort_branches_for_print(branch_items):
    """
    Order branches for printing: largest total frequency first, with a branch
    named 'Other' (any letter case) placed after all the others.

    branch_items: iterable of (branch_name, branch_dict), where branch_dict may
    hold an "entries" list of (subtype, frequency) pairs.
    Returns a new sorted list of the same pairs.
    """
    def _rank(pair):
        branch_name, branch_dict = pair
        is_other = branch_name.lower() == "other"
        if is_other:
            # Constant second component keeps the original order among such branches.
            return (True, 0)
        return (False, -sum(freq for _, freq in branch_dict.get("entries", [])))

    ordered = list(branch_items)
    ordered.sort(key=_rank)
    return ordered

def render_family_block(family_name: str) -> str:
    """
    Render the text block for one family.

    Layout: a header line "Family (total)", then one line per sub-branch with
    its total, each followed by its subtypes with their frequencies, and a
    trailing empty line. Branch order comes from sort_branches_for_print();
    subtypes are ordered by frequency (descending), then alphabetically
    (case-insensitive). Reads the module-level `family_totals` and `tree`.
    """
    out = [f"{family_name} ({family_totals.get(family_name, 0)})"]

    ordered_branches = sort_branches_for_print(tree.get(family_name, {}).items())
    for branch_name, bdict in ordered_branches:
        entries = bdict.get("entries", [])
        out.append(f"   ‚îú‚îÄ {branch_name} ({sum(freq for _, freq in entries)})")

        ranked = sorted(entries, key=lambda pair: (-pair[1], pair[0].lower()))
        last_index = len(ranked) - 1
        for idx, (sub, freq) in enumerate(ranked):
            # The final subtype of a branch gets the closing connector.
            if idx == last_index:
                connector = "   ‚îÇ    ‚îî‚îÄ"
            else:
                connector = "   ‚îÇ    ‚îú‚îÄ"
            out.append(f"{connector} {sub} ({freq})")

    out.append("")  # spacer line
    return "\n".join(out)

# ========= WRITE MASTER TEXT TREE =========
families_sorted = sorted(family_totals.items(), key=lambda x: x[1], reverse=True)

# Render each family once; the same text is reused for the master file and for
# the optional per-family files.
rendered_blocks = {fam: render_family_block(fam) for fam, _ in families_sorted}
master_lines = [rendered_blocks[fam] for fam, _ in families_sorted]

# One summary row per (family, branch) for the CSV
summary_rows = []
for fam, total in families_sorted:
    for branch_name, bdict in tree.get(fam, {}).items():
        branch_total = sum(freq for _, freq in bdict.get("entries", []))
        summary_rows.append({
            "Family": fam,
            "Branch": branch_name,
            "Branch_Total": branch_total,
            "Family_Total": total
        })

with open(OUTPUT_TREE_TXT, "w", encoding="utf-8") as f:
    f.write("\n".join(master_lines))

# ========= OPTIONAL: PER-FAMILY FILES =========
if PER_FAMILY_FILES:
    for fam, _ in families_sorted:
        # "/" in a family name would be read as a path separator.
        fam_text_path = os.path.join("outputs", "trees", f"{fam.replace('/', '_')}.txt")
        with open(fam_text_path, "w", encoding="utf-8") as ff:
            ff.write(rendered_blocks[fam])

# ========= SUMMARY CSV =========
# Explicit columns keep sort_values from raising KeyError when there are no rows.
summary_df = pd.DataFrame(
    summary_rows, columns=["Family", "Branch", "Branch_Total", "Family_Total"]
)
summary_df = summary_df.sort_values(["Family", "Branch_Total"], ascending=[True, False])
summary_df.to_csv(OUTPUT_SUMMARY_CSV, index=False)

print(f"‚úÖ Master tree written to: {OUTPUT_TREE_TXT}")
print(f"‚úÖ Summary CSV written to: {OUTPUT_SUMMARY_CSV}")
if PER_FAMILY_FILES:
    print(f"‚úÖ Per-family trees in: outputs/trees/")


In [None]:
%pip install anytree

In [None]:
%pip install plotly

In [None]:
%pip install --upgrade kaleido

In [None]:
def _join_evidence_refs(evidence):
    """Return evidence references as one comma-separated string."""
    if evidence is None:
        return ""
    # A bare string must not be joined: ", ".join("abc") yields "a, b, c".
    if isinstance(evidence, str):
        return evidence
    if isinstance(evidence, (list, tuple)):
        # str() so that numeric references do not raise TypeError in join().
        return ", ".join(str(ref) for ref in evidence)
    return str(evidence)


def extract_nlrp3_subset(filepath, output_csv="outputs/Vaxjo_PMIDs_NLRP3_subset.csv"):
    """
    Extract the records whose mechanism subtype is an NLRP3 inflammasome phrase.

    Each line of *filepath* is decoded as JSON and its "raw" field is parsed with
    parse_inner_json(). For every entry of "mechanism_subtypes" whose
    "mechanism subtype" equals one of the two target phrases (compared after
    lower-casing and stripping), one row with the record's "adjuvant" and
    "summary", the subtype, and the joined "evidence_refs" is collected.
    Lines that fail to decode or do not have the expected structure are skipped.

    Args:
        filepath: Path of the JSONL input file.
        output_csv: Path of the CSV to write; its directory is created if needed.

    Returns:
        pandas.DataFrame with columns adjuvant, summary, mechanism_subtype,
        evidence_refs (also written to *output_csv*).
    """
    import os  # local import: this cell does not import os itself

    target_phrases = {
        "activation of nlrp3 inflammasome",
        "nlrp3 inflammasome activation"
    }

    rows = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, start=1):
            try:
                outer = json.loads(line)
            except json.JSONDecodeError:
                continue

            if not isinstance(outer, dict):
                continue

            raw = outer.get("raw")
            if not raw:
                continue

            inner_data = parse_inner_json(raw, line_num)
            if not isinstance(inner_data, dict):
                continue

            adjuvant = inner_data.get("adjuvant")
            summary = inner_data.get("summary")

            entries = inner_data.get("mechanism_subtypes")
            if not isinstance(entries, list):
                continue

            for entry in entries:
                if not isinstance(entry, dict):
                    continue
                subtype = entry.get("mechanism subtype", "")
                if not isinstance(subtype, str) or not subtype:
                    continue

                if subtype.lower().strip() in target_phrases:
                    rows.append({
                        "adjuvant": adjuvant,
                        "summary": summary,
                        "mechanism_subtype": subtype,
                        "evidence_refs": _join_evidence_refs(entry.get("evidence_refs", []))
                    })

    df = pd.DataFrame(rows, columns=["adjuvant", "summary", "mechanism_subtype", "evidence_refs"])
    print(f"‚úÖ Extracted {len(df)} rows (expected ~23)")

    out_dir = os.path.dirname(output_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output_csv, index=False)
    print(f"üßæ Saved to {output_csv}")
    return df

# Entry point for this cell. INPUT_FILE (like json, pd and parse_inner_json used
# by extract_nlrp3_subset) is not defined in this cell and must already exist
# in the kernel namespace.
if __name__ == "__main__":
    #analyze_subtype_frequency(INPUT_FILE)   # your full analysis
    extract_nlrp3_subset(INPUT_FILE)        # narrow NLRP3-only extraction

