In [None]:
import pandas as pd
import json
import os
import re

# --- Configuration ---
# LLM output: a JSON-lines file; LLM_JSON_KEY is the field read from each object.
LLM_FILE_PATH = 'outputs/Vaxjo_PMIDs_mechanism_summary_raw_outputs_llama3.2.jsonl'
LLM_JSON_KEY = 'adjuvant'

# Gold Standard Configuration
# Excel workbook, sheet, and the two columns holding the primary label and the
# separator-delimited alternative labels.
GOLD_FILE_PATH = 'Dataset/Josh/vaccine_adjuvant_Master.xlsx'
# NOTE(review): spelling "temmplate" kept as-is; presumably it matches the
# actual sheet name in the workbook -- confirm before "fixing" it.
GOLD_SHEET_NAME = 'vaccine_adjuvant_VO_temmplate'
GOLD_COL_LABEL = 'LABEL'
GOLD_COL_ALT = 'alternative label'
GOLD_ALT_SEPARATOR = '|'

# --- Rule 1: Synonym Map ---
# Lower-case long form -> canonical short form. Entry order is significant:
# _normalize_base applies the replacements longest key first using a stable
# sort, so keys of equal length are applied in the order written here.
SYNONYM_MAP = {
    # Acronyms
    'lipopolysaccharide': 'lps',
    'oligodeoxynucleotide': 'odn',
    'muramyl dipeptide': 'mdp',
    'e. coli heat-labile toxin': 'lt',
    'heat-labile toxin': 'lt',
    'cholera toxin': 'ct',
    'polyinosinic-polycytidylic acid': 'piclc',
    'poly(i:c)': 'piclc',
    'polyethyleneimine': 'pei',
    'poly-ϵ-caprolactone': 'pcl',
    'poly(dl-lactide-co-glycolide)': 'plga',
    'chimpanzee adenovirus': 'chad',
    # Common Names
    'aluminum hydroxide': 'alum',
    'aluminium hydroxide': 'alum',
    'aluminum salts': 'alum',
    'aluminium salts': 'alum',
    # Domain Synonyms
    'agonist': 'ligand',
    'curdlan': 'beta-glucan',
    'trehalose-6,6-dibehenate': 'tdb',
    'wtlt': 'lt',
    'mlt': 'lt',
    'raspi': 'aspi',
}

# --- Rule 2: Prefixes/Suffixes ---
# Regex patterns (anchored at the start) removed when building "potential" keys.
PREFIXES_TO_STRIP = [
    r'^m-',
    r'^r-',
    r'^h-',
    r'^p-',
    r'^wt-',
    r'^ov-',
    r'^recombinant ',
    r'^murine ',
    r'^human ',
    r'^wild-type ',
]
# Literal endings removed from names, tested in this order.
SUFFIXES_TO_STRIP = [
    'vaccine adjuvant',
    'vaccine',
    'adjuvants',
    'adjuvant',
]
# --- End Configuration ---


def load_llm_names(file_path, key):
    """Collect the unique, stripped names stored under *key* in a JSON-lines file.

    Each non-blank line is parsed as JSON. When the parsed value is an object,
    the value under *key* may be a string or a list; every non-empty string
    found (after stripping whitespace) is added to the result. Non-string list
    items are ignored.

    Lines that are not valid JSON, or that are valid JSON but not an object,
    are skipped and counted; they no longer abort reading the rest of the file.

    Args:
        file_path: Path to the JSON-lines file.
        key: Field name to read from each JSON object.

    Returns:
        A set of names. Empty when the file is missing or unreadable.
    """
    unique_names = set()
    if not os.path.exists(file_path):
        print(f"LLM file not found: {file_path}")
        return unique_names

    skipped_lines = 0
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    skipped_lines += 1
                    continue
                # A bare list/string/number is valid JSON but has no .get();
                # skip it instead of letting AttributeError end the whole read.
                if not isinstance(data, dict):
                    skipped_lines += 1
                    continue

                value = data.get(key)
                if isinstance(value, str):
                    candidates = [value]
                elif isinstance(value, list):
                    candidates = value
                else:
                    candidates = []
                for item in candidates:
                    if isinstance(item, str):
                        cleaned_item = item.strip()
                        if cleaned_item:
                            unique_names.add(cleaned_item)
    except (OSError, UnicodeError) as e:
        # Best effort: keep whatever was collected before the read failed.
        print(f"Error reading LLM file: {e}")

    if skipped_lines:
        print(f"Skipped {skipped_lines} line(s) that were not JSON objects in {file_path}")
    return unique_names


def load_gold_standard_entities(file_path, sheet_name, label_col, alt_label_col, separator):
    """Load gold-standard entities from one sheet of an Excel workbook.

    Each row yields one entity built from the primary label in *label_col* and
    the *separator*-delimited alternatives in *alt_label_col*. Rows with no
    usable name, or whose names normalize to no key at all, are dropped.

    Args:
        file_path: Path to the Excel workbook.
        sheet_name: Sheet to read.
        label_col: Column holding the primary label.
        alt_label_col: Column holding alternative labels (may be absent/empty).
        separator: Delimiter between alternative labels within a cell.

    Returns:
        A list of dicts with keys ``raw_names`` (set of str),
        ``norm_keys_strict`` / ``norm_keys_potential`` (sets of normalized keys)
        and ``primary_label`` (str). Empty when the file or *label_col* is
        missing, or when reading fails.
    """
    gold_entities = []
    if not os.path.exists(file_path):
        print(f"Gold standard file not found: {file_path}")
        return gold_entities
    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        if label_col not in df.columns:
            print(f"Column '{label_col}' not found in sheet '{sheet_name}'.")
            return gold_entities
        print(f"Loading gold standard entities from {len(df)} rows...")
        for _, row in df.iterrows():
            raw_names = set()
            primary_label = None

            label = row.get(label_col)
            if pd.notna(label):
                cleaned = str(label).strip()
                if cleaned:
                    raw_names.add(cleaned)
                    primary_label = cleaned

            alt_labels = row.get(alt_label_col)
            if pd.notna(alt_labels):
                for part in str(alt_labels).split(separator):
                    cleaned = part.strip()
                    if cleaned:
                        raw_names.add(cleaned)

            if not raw_names:
                continue

            strict_keys = set().union(*(get_normalized_keys_strict(name) for name in raw_names))
            potential_keys = set().union(*(get_normalized_keys_potential(name) for name in raw_names))
            if strict_keys or potential_keys:
                gold_entities.append({
                    'raw_names': raw_names,
                    'norm_keys_strict': strict_keys,
                    'norm_keys_potential': potential_keys,
                    # Without a primary label, fall back to the alphabetically
                    # first name: set iteration order is not stable across runs.
                    'primary_label': primary_label if primary_label else min(raw_names)
                })
        print(f"Successfully created {len(gold_entities)} gold standard entities.")
        return gold_entities
    except Exception as e:
        # Deliberate best-effort boundary: report and return nothing usable.
        print(f"Error reading Excel file/sheet: {e}")
        return []


def _normalize_base(raw_name):
    """Turn one raw name into its set of normalized comparison keys.

    Steps, in order:
      1. Lower-case the name and replace every SYNONYM_MAP long form with its
         short form (plain substring replacement, longest long form first).
      2. Expand the name: the content of each ``(...)`` / ``[...]`` group is a
         candidate on its own; the text outside the groups is split on the
         word "and" and on ``/`` and ``+``.
      3. Remove each SUFFIXES_TO_STRIP ending the candidate ends with, testing
         the suffixes in list order.
      4. Emit two forms per candidate: one with runs of whitespace/hyphens
         collapsed to a single space, and one with everything except
         ``a-z0-9`` removed (only when it differs from the first form).

    Args:
        raw_name: The name to normalize.

    Returns:
        A set of key strings; empty when *raw_name* is not a non-blank string.
    """
    if not isinstance(raw_name, str) or not raw_name.strip():
        return set()

    # Synonyms
    name = raw_name.lower()
    for long_form, short_form in sorted(SYNONYM_MAP.items(), key=lambda item: len(item[0]), reverse=True):
        name = name.replace(long_form, short_form)
    if not name.strip():
        return set()

    # Expansions ((), [], /, +, and)
    bracket_pattern = r'[\(\[](.*?)[\)\]]'
    bracketed = re.findall(bracket_pattern, name)
    expanded = {inner.strip() for inner in bracketed}
    if bracketed:
        outside = re.sub(bracket_pattern, '', name).strip()
        # Decide "bracket groups only" from what is actually left over. The
        # earlier whole-string fullmatch could stretch across several groups
        # and so discarded real text sitting between them ("(a) b (c)" lost
        # "b"). Leftovers made solely of connectors are not a name.
        if not re.sub(r'\band\b|[/\+\s]', '', outside):
            outside = ''
    else:
        outside = name
    for chunk in re.split(r'\s+and\s+', outside, flags=re.IGNORECASE):
        expanded.update(piece.strip() for piece in re.split(r'[/\+]', chunk))
    expanded.discard('')

    # Suffix stripping
    suffix_stripped = set()
    for candidate in expanded:
        for suffix in SUFFIXES_TO_STRIP:
            if candidate.endswith(suffix):
                candidate = candidate[:-len(suffix)]
        candidate = candidate.strip()
        if candidate:
            suffix_stripped.add(candidate)

    # Hyphen/space and alphanumeric forms
    final_keys = set()
    for candidate in suffix_stripped:
        spaced = re.sub(r'[\s\-]+', ' ', candidate).strip()
        compact = re.sub(r'[^a-z0-9]', '', spaced)
        if spaced:
            final_keys.add(spaced)
        if compact and compact != spaced:
            final_keys.add(compact)
    return final_keys


def get_normalized_keys_strict(raw_name):
    """Return the strict key set for *raw_name*.

    The strict keys are exactly what ``_normalize_base`` produces; no further
    processing is applied here.
    """
    strict_keys = _normalize_base(raw_name)
    return strict_keys


def get_normalized_keys_potential(raw_name):
    """Return looser keys for *raw_name*, obtained by stripping known prefixes.

    Every key from ``_normalize_base`` has each PREFIXES_TO_STRIP pattern
    removed in turn. A key that changed (and is still non-empty) is returned,
    together with its alphanumeric-only form when that differs.

    Args:
        raw_name: The name to normalize.

    Returns:
        A set of prefix-stripped keys; empty when no prefix applied.
    """
    base_keys = _normalize_base(raw_name)

    # _normalize_base collapses every hyphen into a space, so a pattern such as
    # '^m-' or '^wild-type ' can never match its output as written. Let each
    # '-' in a pattern match either a hyphen or a space.
    # Assumes the configured patterns contain only plain, unescaped hyphens
    # outside character classes -- TODO confirm if the list is extended.
    prefix_patterns = [prefix.replace('-', '[ -]') for prefix in PREFIXES_TO_STRIP]

    potential_keys = set()
    for key in base_keys:
        clean_key = key
        for pattern in prefix_patterns:
            clean_key = re.sub(pattern, '', clean_key, flags=re.IGNORECASE)
        clean_key = clean_key.strip()
        if clean_key and clean_key != key:
            potential_keys.add(clean_key)
            compact_key = re.sub(r'[^a-z0-9]', '', clean_key)
            if compact_key and compact_key != clean_key:
                potential_keys.add(compact_key)
    return potential_keys


def _build_result_row(llm_name, match_type, gold_entity=None):
    """Build one output row for *llm_name*; gold columns are None without a match."""
    if gold_entity is None:
        return {'LLM predicted': llm_name, 'Gold Label': None,
                'gold Alternative': None, 'Match': match_type}
    primary = gold_entity['primary_label']
    alts = '; '.join(sorted(gold_entity['raw_names'] - {primary}))
    return {'LLM predicted': llm_name, 'Gold Label': primary,
            'gold Alternative': alts or None, 'Match': match_type}


def _classify_llm_name(strict_llm_keys, potential_llm_keys,
                       gold_strict_map, gold_potential_map, ordered_gold_strict_keys):
    """Return ``(match_type, gold_entity_index_or_None)`` for one LLM name.

    Tiers are tried in order: Exact (shared strict key), Partial (one strict
    key contains the other), Potential (shared prefix-stripped key). Keys are
    visited in sorted order and the lowest entity index wins, so the outcome
    does not depend on set iteration order.
    """
    ordered_llm_keys = sorted(strict_llm_keys)

    # --- Exact match (strict) ---
    for llm_key in ordered_llm_keys:
        if llm_key in gold_strict_map:
            return 'Exact', min(gold_strict_map[llm_key])

    # --- Partial match (substring containment) ---
    for llm_key in ordered_llm_keys:
        if not llm_key:
            continue
        for gold_key in ordered_gold_strict_keys:
            if gold_key and (llm_key in gold_key or gold_key in llm_key):
                return 'Partial', min(gold_strict_map[gold_key])

    # --- Potential match (prefix-stripped keys) ---
    for llm_key in sorted(potential_llm_keys - strict_llm_keys):
        if llm_key in gold_potential_map:
            return 'Potential', min(gold_potential_map[llm_key])

    return 'No Match', None


def main():
    """Match every LLM-predicted name against the gold standard and save the table.

    Loads the LLM names and gold entities from the configured paths, assigns
    each name one of 'Exact', 'Partial', 'Potential' or 'No Match', prints the
    sorted table and a per-category count, and writes the table to both a CSV
    and an Excel file under ``outputs/``.
    """
    raw_llm_names = load_llm_names(LLM_FILE_PATH, LLM_JSON_KEY)
    gold_entities = load_gold_standard_entities(
        GOLD_FILE_PATH, GOLD_SHEET_NAME, GOLD_COL_LABEL, GOLD_COL_ALT, GOLD_ALT_SEPARATOR
    )
    if not raw_llm_names or not gold_entities:
        print("Nothing to match: no LLM names or no gold standard entities were loaded.")
        return

    print(f"\n--- Starting Categorized Matching for {len(raw_llm_names)} LLM names ---")

    # Build maps: normalized key -> indices of the gold entities that own it
    all_gold_strict_keys_map = {}
    all_gold_potential_keys_map = {}
    for i, entity in enumerate(gold_entities):
        for key in entity['norm_keys_strict']:
            all_gold_strict_keys_map.setdefault(key, set()).add(i)
        for key in entity['norm_keys_potential']:
            all_gold_potential_keys_map.setdefault(key, set()).add(i)

    # Sorted once so the partial-match scan is reproducible from run to run.
    ordered_gold_strict_keys = sorted(all_gold_strict_keys_map)

    results = []
    # raw_llm_names is a set, so each name is visited exactly once.
    for llm_name in raw_llm_names:
        match_type, entity_index = _classify_llm_name(
            get_normalized_keys_strict(llm_name),
            get_normalized_keys_potential(llm_name),
            all_gold_strict_keys_map,
            all_gold_potential_keys_map,
            ordered_gold_strict_keys,
        )
        gold_entity = gold_entities[entity_index] if entity_index is not None else None
        results.append(_build_result_row(llm_name, match_type, gold_entity))

    # --- Results ---
    final_df = pd.DataFrame(results)
    order = {'Exact': 1, 'Partial': 2, 'Potential': 3, 'No Match': 4, 'Normalization Failed': 5}
    final_df['Sort_Order'] = final_df['Match'].map(order)
    final_df = final_df.sort_values(by=['Sort_Order', 'LLM predicted']).reset_index(drop=True)
    final_df = final_df.drop(columns=['Sort_Order'])

    print(f"\n--- Final Categorized Matching DataFrame ({len(final_df)} LLM names analyzed) ---")
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
        print(final_df)

    # The table is saved twice: as CSV and, additionally, as an Excel workbook.
    final_df.to_csv("outputs/llm_match_analysis_categorized_final.csv", index=False)
    output_excel_path = "outputs/llm_match_analysis_categorized_final.xlsx"
    final_df.to_excel(output_excel_path, index=False, sheet_name="Match Results")

    print(f"\nResults saved to Excel: {output_excel_path}")

    print("\n--- Match Category Summary ---")
    print(final_df['Match'].value_counts())


if __name__ == "__main__":
    main()


Josh went through this list and reported: "I reviewed the terms in the 'No Match' category and found that 27/44 should be added as new adjuvants that are not already in Vaxjo! The others were mostly duplicates. This should complete the final list of vaccine adjuvants. I added two columns. The first is 'new potential adjuvant', in which I describe whether the term should be added to Vaxjo and why. The second is 'pmid', which holds the PMID source to be listed in VO if the term is a new adjuvant that should be added."
I have uploaded this file as "outputs/llm_match_analysis_categorized_final.xlsx"


In [None]:
import pandas as pd
import json

# --- File paths ---
# Raw LLM output text that the JSON objects are extracted from.
TXT_PATH = "outputs/Vaxjo_PMIDs_mechanism_summary_raw_outputs_llama3.2.txt"
# Reviewed match table (input) and the enriched copy written by this cell.
EXCEL_PATH = "outputs/llm_match_analysis_categorized_final_Josh.xlsx"
OUTPUT_PATH = "outputs/llm_match_analysis_categorized_final_Josh_with_summary_mechanisms.xlsx"

# --- Helper: Robust JSON extraction (brace-balancing) ---
def extract_json_objects(text):
    """Extract every top-level JSON object embedded in free-form *text*.

    The text is scanned once while tracking brace depth. Each balanced
    ``{...}`` span that starts at depth 0 is handed to ``json.loads``; spans
    that do not parse are skipped. Braces inside JSON string literals are
    ignored, with backslash escapes honoured.

    Text outside an object cannot disturb the scan: a stray ``}`` is ignored
    instead of pushing the depth negative, and a ``"`` seen outside any object
    does not start a string.

    Args:
        text: Arbitrary text that may contain JSON objects.

    Returns:
        A list of the parsed objects, in order of appearance.
    """
    objs = []
    depth = 0
    start = None
    in_string = False
    esc = False

    for i, ch in enumerate(text):
        if in_string:
            if esc:
                esc = False
            elif ch == '\\':
                esc = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            # Quotes only delimit strings inside an object; a lone quote in the
            # surrounding prose must not hide the braces that follow it.
            if depth > 0:
                in_string = True
            continue

        if ch == '{':
            if depth == 0:
                start = i
            depth += 1
        elif ch == '}':
            if depth == 0:
                # Unmatched closing brace: ignore it so later objects are
                # still found.
                continue
            depth -= 1
            if depth == 0:
                candidate = text[start:i + 1]
                try:
                    objs.append(json.loads(candidate))
                except json.JSONDecodeError:
                    # Balanced braces but not valid JSON; best-effort skip.
                    pass
                start = None
    return objs


# --- Read the whole TXT file, then extract every JSON object found in it ---
with open(TXT_PATH, mode="r", encoding="utf-8") as f:
    txt = f.read()
records = extract_json_objects(txt)

# --- Build lookup maps ---
def norm(s):
    """Return *s* stripped and lower-cased, or None when *s* is not a string."""
    if not isinstance(s, str):
        return None
    return s.strip().lower()

# Lookup tables keyed by normalized adjuvant name.
summary_map = {}
mech_map = {}

for rec in records:
    adj = rec.get("adjuvant")
    if not adj:
        continue
    # "adjuvant" may hold one name or a list of names (the loader for the
    # companion .jsonl output accepts both). Index the record under every
    # string name so list-valued records can still be joined to a row.
    names = adj if isinstance(adj, list) else [adj]
    mech_json = json.dumps(rec.get("mechanism_subtypes", []), ensure_ascii=False)
    for name in names:
        key = norm(name)
        if key is None:
            continue
        # A later record with the same name overwrites an earlier one.
        summary_map[key] = rec.get("summary")
        mech_map[key] = mech_json

# --- Load Josh's Excel and enrich it ---
df = pd.read_excel(EXCEL_PATH)
df["__key__"] = df["LLM predicted"].astype(str).str.strip().str.lower()

df["LLM summary"] = df["__key__"].map(summary_map)
df["LLM mechanism_subtypes"] = df["__key__"].map(mech_map)

# --- Sorting: descending order of the review column ---
df_sorted = df.sort_values(by="New Potential Adjuvant?", ascending=False)

# --- Diagnostics ---
total = len(df)
matched = df["LLM summary"].notna().sum()
print(f"Parsed {len(records)} JSON blocks from TXT.")
print(f"Matched {matched}/{total} adjuvants from Josh’s Excel.")
if matched < total:
    print("Unmatched examples:")
    print(df.loc[df['LLM summary'].isna(), 'LLM predicted'].head(10).tolist())

# --- Save final output (without the helper join column) ---
df_sorted = df_sorted.drop(columns=["__key__"])
df_sorted.to_excel(OUTPUT_PATH, index=False)

print(f"\n✅ Final enriched file saved to:\n{OUTPUT_PATH}")


In [None]:
import pandas as pd
import json

# --- File paths ---
# Raw LLM output text that the JSON objects are extracted from.
TXT_PATH = "outputs/Vaxjo_PMIDs_mechanism_summary_raw_outputs_llama3.2.txt"
# Reviewed match table (input) and the multi-line variant written by this cell.
EXCEL_PATH = "outputs/llm_match_analysis_categorized_final_Josh.xlsx"
OUTPUT_PATH = "outputs/llm_match_analysis_categorized_final_Josh_with_summary_mechanisms_raw_multiline.xlsx"

# --- Helper: Robust JSON extraction ---
def extract_json_objects(text):
    """Return all top-level JSON objects found inside arbitrary *text*.

    A single pass tracks brace depth; every balanced ``{...}`` span opened at
    depth 0 is parsed with ``json.loads`` and kept when it is valid JSON.
    Braces that sit inside a JSON string literal do not count, and backslash
    escapes within strings are respected.

    Noise between objects is tolerated: an unmatched ``}`` is skipped rather
    than driving the depth below zero, and a ``"`` that appears outside any
    object is not treated as the start of a string.

    Args:
        text: Arbitrary text that may contain JSON objects.

    Returns:
        A list of the parsed objects, in order of appearance.
    """
    objs = []
    depth = 0
    start = None
    in_string = False
    esc = False

    for i, ch in enumerate(text):
        if in_string:
            if esc:
                esc = False
            elif ch == '\\':
                esc = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            # Only track string state inside an object, so a stray quote in
            # the surrounding prose cannot mask later braces.
            if depth > 0:
                in_string = True
            continue

        if ch == '{':
            if depth == 0:
                start = i
            depth += 1
        elif ch == '}':
            if depth == 0:
                # Closing brace with nothing open: skip it and keep scanning.
                continue
            depth -= 1
            if depth == 0:
                candidate = text[start:i + 1]
                try:
                    objs.append(json.loads(candidate))
                except json.JSONDecodeError:
                    # Balanced braces but not valid JSON; best-effort skip.
                    pass
                start = None
    return objs


# --- Read the whole TXT file, then extract every JSON object found in it ---
with open(TXT_PATH, mode="r", encoding="utf-8") as f:
    txt = f.read()
records = extract_json_objects(txt)

# --- Build lookup maps ---
def norm(s):
    """Normalize a name for lookup: strip and lower-case it; non-strings give None."""
    if isinstance(s, str):
        return s.strip().lower()
    return None

# Lookup tables keyed by normalized adjuvant name.
summary_map = {}
mech_map = {}

for rec in records:
    adj = rec.get("adjuvant")
    if not adj:
        continue
    # One JSON document per mechanism subtype, one per line; None when the
    # record has no subtypes.
    mechanisms = rec.get("mechanism_subtypes", [])
    if mechanisms:
        mech_text = "\n".join(json.dumps(m, ensure_ascii=False) for m in mechanisms)
    else:
        mech_text = None
    # "adjuvant" may hold one name or a list of names (the loader for the
    # companion .jsonl output accepts both). Index the record under every
    # string name so list-valued records can still be joined to a row.
    names = adj if isinstance(adj, list) else [adj]
    for name in names:
        key = norm(name)
        if key is None:
            continue
        # A later record with the same name overwrites an earlier one.
        summary_map[key] = rec.get("summary")
        mech_map[key] = mech_text

# --- Load Josh's Excel ---
df = pd.read_excel(EXCEL_PATH)
df["__key__"] = df["LLM predicted"].astype(str).str.strip().str.lower()

df["LLM summary"] = df["__key__"].map(summary_map)
df["LLM mechanism_subtypes"] = df["__key__"].map(mech_map)

# --- Sort by the review column, descending ---
df_sorted = df.sort_values(by="New Potential Adjuvant?", ascending=False)

# --- Save (without the helper join column) ---
df_sorted = df_sorted.drop(columns=["__key__"])
df_sorted.to_excel(OUTPUT_PATH, index=False)

print(f"✅ Saved Excel with raw JSON subtypes separated by newlines:\n{OUTPUT_PATH}")
