# Phase I: Llama Inferencing

In [None]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub.
# The access token is taken from the HF_TOKEN environment variable so that no
# secret is ever typed into (or committed with) the notebook. When the variable
# is unset, ``token`` is None and ``login`` falls back to asking for the token
# interactively.
login(token=os.environ.get("HF_TOKEN"))

print("Successfully logged in!")


In [None]:

n_gpus = torch.cuda.device_count()
print("N GPUS: ", n_gpus)

# Per-GPU memory budget handed to the automatic device placement.
model_vram_limit_mib = 8192  #12000
max_memory = f'{model_vram_limit_mib}MiB'
max_memory_dict = {i: max_memory for i in range(n_gpus)}

model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Load the model in bfloat16 and let device_map="auto" place its layers.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    # With no visible GPU the dict above is empty, i.e. a budget that lists no
    # device at all. Pass None in that case so the library's default placement
    # is used instead of an unusable empty budget.
    # NOTE(review): confirm against the installed accelerate version.
    max_memory=max_memory_dict or None,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Build the text-generation pipeline from the already-loaded model/tokenizer
# so the weights are not loaded a second time.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)


In [None]:

# System prompt for Phase I: extract adjuvants and their immune mechanisms
# from an article and return them as a JSON list.
# The sentinel sentence 'No adjuvants mentioned.' must stay in sync with the
# check in the intermediate-processing step, which skips any response that
# contains it. (The closing quote around the sentinel was missing.)
system_prompt = """You are a biomedical assistant with expertise in immunology. Extract all substances explicitly described as vaccine adjuvants from the input article.

For each adjuvant, extract the following:
- "adjuvant": The name of the adjuvant (e.g., Alum, MPLA, QS-21)
- "immune_response_mechanism": A brief description of how the adjuvant works to stimulate or enhance the immune response, as described in the text.

Guidelines:
- Include only substances that are explicitly described as adjuvants in the input.
- Do not include delivery systems (e.g., liposomes, virosomes, VLPs) unless they are clearly described as adjuvants.
- Do not infer or guess mechanisms that are not mentioned; leave them empty if not described.
- If no adjuvants are found, say 'No adjuvants mentioned.'
- Return valid JSON in the following format:


[
  {
    "adjuvant": "Alum",
    "immune_response_mechanism": "Activates NLRP3 inflammasome and forms a depot for slow antigen release."
  },
  {
    "adjuvant": "MPLA",
    "immune_response_mechanism": "Engages TLR4 pathway to promote Th1 responses."
  }
]"""

# Smoke test: run the Phase I prompt on a single plain-text article.
# Point the path at your own paper; a cleaned-XML alternative is
# "Dataset/Dataset_PMC_CleanedXML/PMC8707864.xml".
with open(
    "Dataset/PMC_Filtered_Reviews_plaintext_gemini/PMC8483762.txt",
    encoding="utf-8",
) as f:
    paper_text = f.read()

# Chat-format conversation: the extraction instructions as the system turn,
# the whole article as the user turn.
messages = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=paper_text),
]

# Generate; raise max_new_tokens if the answer gets cut off.
outputs = pipe(messages, max_new_tokens=256)

# Print only the "content" of the last message in the returned conversation.
print(outputs[0]["generated_text"][-1]["content"])


In [None]:
import json
import os
import csv
import torch
import gc
from datetime import datetime

def extract_and_save_adjuvant_response_text_from_string(
    pmc_id: str,
    text: str,
    system_prompt: str,
    pipe,
    output_file: str,
    max_new_tokens: int = 512
):
    """
    Run the extraction prompt on one text and append the reply to output_file.

    The reply is appended as a block of the form::

        === <pmc_id> ===
        <stripped reply>
        <blank line>

    Args:
        pmc_id (str): Identifier written into the block header.
        text (str): Article/abstract text sent as the user message.
        system_prompt (str): System prompt sent as the system message.
        pipe: Callable invoked as ``pipe(messages, max_new_tokens=...)``; its
            result is read as ``outputs[0]["generated_text"][-1]["content"]``.
        output_file (str): Path of the text file to append to. Missing parent
            directories are created.
        max_new_tokens (int): Generation token limit forwarded to ``pipe``.

    Returns:
        str: The stripped reply text that was written.
    """
    # Create the destination folder *before* generating: otherwise a missing
    # folder is only discovered after the expensive model call, and the
    # generated reply is lost.
    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    outputs = pipe(messages, max_new_tokens=max_new_tokens)
    response_text = outputs[0]["generated_text"][-1]["content"].strip()

    with open(output_file, "a", encoding="utf-8") as out_f:
        out_f.write(f"=== {pmc_id} ===\n")
        out_f.write(response_text + "\n\n")
    print(f"Saved response for {pmc_id} to {output_file}")
    return response_text

def process_all_papers(
    input_json_file: str,
    system_prompt: str,
    pipe,
    output_txt_file: str,
    log_csv_file: str,
    max_new_tokens: int = 512
):
    """
    Run the extraction prompt over every abstract in a JSON file and log each outcome.

    One CSV row (ID, status, message, ISO timestamp) is appended per entry and
    flushed immediately, so an interrupted run keeps the log written so far.
    A failure on one entry is logged and does not stop the run.

    Args:
        input_json_file (str): Path to a JSON file mapping an ID to a record;
            only the record's "abstract" field is read.
        system_prompt (str): System prompt for the LLM.
        pipe: Hugging Face pipeline object (any callable accepted by
            extract_and_save_adjuvant_response_text_from_string).
        output_txt_file (str): Path to save model responses (plain text, appended).
        log_csv_file (str): Path to save logs (CSV, appended; the header row is
            written only when the file does not exist yet).
        max_new_tokens (int): Generation token limit.

    Log statuses: "success", "error" (message holds the first 500 characters of
    the exception text) and "skipped" (empty abstract, not sent to the model).
    """
    # Read all abstracts into a dict
    with open(input_json_file, encoding="utf-8") as f:
        papers_dict = json.load(f)

    total = len(papers_dict)
    print(f"Found {total} papers. Starting extraction...\n")

    # Make sure both destinations can be opened before any generation starts;
    # a missing folder would otherwise abort the run (log) or discard every
    # generated reply (responses).
    for path in (output_txt_file, log_csv_file):
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

    # Prepare log CSV (write headers if file doesn't exist)
    log_exists = os.path.exists(log_csv_file)
    with open(log_csv_file, "a", newline='', encoding="utf-8") as log_f:
        log_writer = csv.writer(log_f)
        if not log_exists:
            log_writer.writerow(["PMID", "Status", "Message", "Timestamp"])

        for i, (pmcid, paper_data) in enumerate(papers_dict.items(), 1):
            status = "success"
            message = "Processed successfully"
            try:
                # Read inside the try so that one malformed record is logged
                # as an error instead of aborting the whole run.
                abstract_text = paper_data.get("abstract", "") or ""
                if abstract_text.strip():
                    extract_and_save_adjuvant_response_text_from_string(
                        pmcid, abstract_text, system_prompt, pipe, output_txt_file, max_new_tokens
                    )
                else:
                    # Nothing to extract from; prompting the model with an
                    # empty article only invites made-up output.
                    status = "skipped"
                    message = "Empty abstract; not sent to the model"
            except Exception as e:
                status = "error"
                message = str(e)[:500]
                print(f"❌ Error processing PMID {pmcid}: {message}")
            else:
                if status == "skipped":
                    print(f"Skipped [{i}/{total}] PMID {pmcid}: {message}")
                else:
                    print(f"✅ [{i}/{total}] Processed PMID {pmcid}")
            finally:
                log_writer.writerow([pmcid, status, message, datetime.now().isoformat()])
                log_f.flush()
                # Collect unreachable objects first so the memory they held
                # can be handed back when the CUDA cache is emptied.
                gc.collect()
                torch.cuda.empty_cache()


In [None]:
import os
import gc
import csv
import torch
from datetime import datetime

# Phase I batch run over the Vaxjo abstract collection.
# The input keeps a .txt suffix but must hold JSON, not plain text.
input_json_file = "Dataset/Vaxjo/All PMID abstracts.txt"
output_txt_file = "Outputs/Vaxjo_PMIDs_adjuvant_responses.txt"
log_csv_file = "Outputs/Vaxjo_PMIDs_adjuvant_log.csv"

# Keyword arguments make the pairing of paths and parameters explicit.
process_all_papers(
    input_json_file=input_json_file,
    system_prompt=system_prompt,
    pipe=pipe,
    output_txt_file=output_txt_file,
    log_csv_file=log_csv_file,
    max_new_tokens=512,
)
print("Phase I LLM inferencing complete")

# Intermediate Processing

In [None]:
import re
import json
import pandas as pd

# Load the raw Phase I responses written as "=== <id> ===" blocks.
with open("Outputs/Vaxjo_PMIDs_adjuvant_responses.txt", "r", encoding="utf-8") as file:
    content = file.read()

# Split on the block headers. Because the pattern has a capture group, the
# result alternates id, text, id, text, ...; element 0 (anything before the
# first header) is dropped.
# NOTE(review): the header pattern only matches purely numeric ids.
entries = re.split(r'===\s+(\d+)\s+===', content)[1:]
pmid_ids = entries[::2]   # PMIDs
texts = entries[1::2]    # Corresponding text blocks

# Collect one record per (PMID, adjuvant) pair.
data = []
for pmid_id, text in zip(pmid_ids, texts):
    # Responses carrying the "nothing found" sentinel are skipped entirely.
    if "No adjuvants mentioned." in text:
        continue
    # Find JSON-like blocks: a bracketed list whose first element is an object.
    json_blocks = re.findall(r"\[\s*{.*?}\s*]", text, re.DOTALL)
    for block in json_blocks:
        try:
            adjuvant_list = json.loads(block)
        except json.JSONDecodeError:
            # Not valid JSON after all; ignore this candidate.
            continue
        for item in adjuvant_list:
            # The list may contain stray non-object entries; only objects
            # carry the two expected keys.
            if not isinstance(item, dict):
                continue
            data.append({
                "PMID": pmid_id,
                "adjuvant": item.get("adjuvant"),
                "immune_response_mechanism": item.get("immune_response_mechanism")
            })

# Save to CSV. The columns are named explicitly so that an empty result still
# yields a frame (and CSV header) with the columns later cells index into.
df = pd.DataFrame(data, columns=["PMID", "adjuvant", "immune_response_mechanism"])
df.to_csv("Outputs/Vaxjo_PMIDs_adjuvant_extracted.csv", index=False)

print("Extraction complete. Saved to 'Vaxjo_PMIDs_adjuvant_extracted.csv'.")
df

In [None]:
# Report how many distinct adjuvant names were extracted.
unique_adjuvants = df["adjuvant"].nunique()
print("Number of unique adjuvants: {}".format(unique_adjuvants))


In [None]:
# List the unique immune response mechanisms that contain "not " or "none "
# (case-insensitive). These are candidates for "no mechanism given" answers
# and are reviewed by eye to build the exclusion list.
mechanisms = df["immune_response_mechanism"].dropna().unique()

# Non-string values (possible when the model emitted a non-text JSON value)
# cannot contain either marker and are ignored.
mechanisms_with_not_none = [
    m for m in mechanisms
    if isinstance(m, str) and any(marker in m.lower() for marker in ("not ", "none "))
]

for mechanism in mechanisms_with_not_none:
    print(mechanism)

In [None]:
# Phrases that mark a mechanism as "not actually described". A row is dropped
# when its mechanism text contains any of them, compared case-insensitively.
not_valid_keywords = [
    "None mentioned",
    "Not explicitly described",
    "None mentioned.",
    "Not specified",
    "None described",
    "Not explicitly described in the text",
    "Mechanism of action not yet completely known, but it is comprised of Quillaja saponins, cholesterol and phospholipid, and induces recruitment of leukocytes to draining lymph nodes (dLNs) and spleen, activation of central immune cells, and elevation of cytokines and co-stimulatory molecules.",
    "None specified",
    "Not mentioned",
    "Mechanism not explicitly described",
    "Not explicitly stated, but based on the context, it appears to stimulate a robust immune response by enhancing the production of various cytokines and antibody-secreting cells.",
    "Not specified in the text.",
    "Not explicitly described, but it is known that aluminum salts activate NLRP3 inflammasome and form a depot for slow antigen release.",
    "Not explicitly described, but it is known that saponins like QS21 can stimulate immune responses by interacting with TLR4 and TLR8.",
    "Not described",
    "Mechanism not described",
    "Not explicitly described, left blank."
]


# Drop rows with NaN or empty strings (after stripping whitespace)
df_cleaned = df[df["immune_response_mechanism"].notna()].copy()
df_cleaned = df_cleaned[df_cleaned["immune_response_mechanism"].str.strip() != ""]

# Remove rows containing any invalid keyword (case-insensitive).
# Both sides must be lower-cased: comparing lower-cased text against the
# capitalised keywords above can never match, so nothing would be filtered.
lowered_keywords = tuple(keyword.lower() for keyword in not_valid_keywords)
df_cleaned = df_cleaned[
    ~df_cleaned["immune_response_mechanism"].str.lower().apply(
        # .str.lower() yields NaN for non-string cells; those cannot match.
        lambda x: isinstance(x, str) and any(keyword in x for keyword in lowered_keywords)
    )
].reset_index(drop=True)

# Save to CSV
df_cleaned.to_csv("Outputs/Vaxjo_PMIDs_adjuvant_extracted_clean.csv", index=False)

# Preview the result
df_cleaned

In [None]:
# Get and print the sorted list of unique adjuvant names.
# Missing names are dropped and the rest compared as text: sorted() raises
# TypeError when None or other non-string values are mixed with strings.
unique_adjuvant_names = sorted(df_cleaned["adjuvant"].dropna().astype(str).unique())
print("Unique adjuvants:")
for name in unique_adjuvant_names:
    print(name)


In [None]:
# Exact-match lookup from adjuvant name variants to one canonical label per
# adjuvant. Names without an entry are meant to stay unchanged. No canonical
# label appears as a key pointing elsewhere, so the result does not depend on
# whether a consumer re-applies the mapping to its own output.
mapping = {
    # ---- Adjuplex / ADJ ----
    "ADJ": "Adjuplex (ADJ)",

    # ---- Alum / Aluminum hydroxide family (exact naming variants only) ----
    "Alum": "Alum (aluminum hydroxide)",
    "Al(OH)3": "Alum (aluminum hydroxide)",
    "Alhydrogel": "Alum (aluminum hydroxide)", #commercial brand name for an aluminum hydroxide gel suspension
    "Aluminium Hydroxide (AH)": "Alum (aluminum hydroxide)",
    "Aluminium hydroxide": "Alum (aluminum hydroxide)",
    "Aluminum Hydroxide": "Alum (aluminum hydroxide)",
    "Aluminum Hydroxide (AH)": "Alum (aluminum hydroxide)",
    "Aluminum hydroxide": "Alum (aluminum hydroxide)",
    "Aluminum hydroxide (AH)": "Alum (aluminum hydroxide)",
    "Aluminum hydroxide (Alum)": "Alum (aluminum hydroxide)",
    "Aluminum oxyhydroxide": "Alum (aluminum hydroxide)", #Aluminum oxyhydroxide is the formal chemical name for the hydrated crystalline species often used in alum adjuvants (sometimes cited as boehmite-like AlO(OH)), which is the same active form as in common alum formulations. So mapping to “Alum (aluminum hydroxide)” is also correct.
    # ---- Advax / delta inulin (brand vs generic, diacritics, ™/®) ----
    "Advax": "Advax (delta inulin)",
    "Advax delta inulin": "Advax (delta inulin)",
    "Advax® delta inulin": "Advax (delta inulin)",
    "Advax™": "Advax (delta inulin)",
    "Delta Inulin (DI)": "Advax (delta inulin)",
    "Delta inulin": "Advax (delta inulin)",
    "delta inulin": "Advax (delta inulin)",
    "delta inulin (DI)": "Advax (delta inulin)",
    "delta inulin adjuvant (Advax™)": "Advax (delta inulin)",
    "δ-inulin": "Advax (delta inulin)",
    "delta-inulin": "Advax (delta inulin)",

    # Keep specific Advax *formulations* distinct:
    # Advax-2 / -M / -P / -SM / CpG / CpG55.2 etc. are NOT collapsed.

    # ---- BECC spelling/spacing ----
    "BECC 438": "BECC438",
    "BECC 470": "BECC470",

    # ---- Calcium phosphate shorthand ----
    "CAP": "Calcium Phosphate (CAP)",

    # ---- CAF01 encoding variant ----
    "CAF\u00b01": "CAF01",

    # ---- Cholera toxin naming ----
    "Cholera toxin": "Cholera toxin (CT)",

    # ---- Compound 48/80 naming ----
    "C48/80": "Compound 48/80 (C48/80)",

    # ---- CoVaccine HT trademark ----
    "CoVaccine HT™": "CoVaccine HT",

    # ---- CpG ODN (shorthands & plurals used as the same thing here) ----
    "CpG": "CpG ODN",
    "CpG ODNs": "CpG ODN",
    #"CpG motifs": "CpG ODN",
    "CpG oligodeoxynucleotide (CpG-ODN)": "CpG ODN",
    "CpG-ODN": "CpG ODN",

    # Keep specific sequences distinct (not collapsed):
    # "CpG ODN 1826", "CpG M362", "ODN2006" remain separate.

    # ---- Complete Freund's adjuvant naming ----
    "Complete Freund's adjuvant": "Complete Freund's Adjuvant (CFA)",

    # ---- EM-005 aka GLA-SE ----
    "EM-005 (GLA-SE)": "GLA-SE",

    # ---- Flagellin (case only) ----
    "flagellin": "Flagellin",

    # (FliC, FlaB kept distinct—specific flagellins.)

    # ---- GLA naming variants ----
    "Glucopyranosyl Lipid Adjuvant": "GLA",
    "Glucopyranosyl Lipid Adjuvant (GLA)": "GLA",
    "Glucopyranosyl lipid A": "GLA",
    "Glucopyranosyl lipid A (G100)": "GLA",
    "Glucopyranosyl lipid adjuvant (GLA)": "GLA",
    "ID93/glucopyranosyl lipid adjuvant (GLA)": "GLA",
    # GLA-AF / GLA-SE / GLA-LSQ are formulations—kept distinct.

    "GLA-SE (components included: glucopyranosyl lipid A, squalene emulsion)": "GLA-SE",
    "GLA-squalene emulsion": "GLA-SE",

    # ---- IFN naming (alpha/beta symbol) ----
    "IFN-alpha": "IFN-α",
    "IFNβ": "IFN-β",

    # ---- Imiquimod alias ----
    "imiquimod (R837)": "Imiquimod (R837)",
    "Imiquimod": "Imiquimod (R837)",

    # ---- Matrix-M trademark ----
    "Matrix-M™": "Matrix-M",

    # ---- MCT trademark ----
    "MCT®": "MCT",

    # ---- MPL / MPLA naming variants ----
    # Every variant points straight at "MPLA" rather than at an intermediate
    # label, so all of them end up under the same canonical name.
    # "MPLA" itself is already canonical and needs no entry.
    # (A former triple-quoted "comment" here was a real string literal that
    # merged with the next key; it has been removed.)
    "MPL": "MPLA",
    "MPL®": "MPLA",
    "Monophosphoryl lipid": "MPLA",
    "Monophosphoryl lipid A (MPLA)": "MPLA",
    "MPL (monophosphoryl lipid A)": "MPLA",

    # Keep MPL-SE / MPL+TDM / MPL/DDA distinct (formulations/combinations).

    # ---- Mastoparan naming ----
    "M7": "Mastoparan-7 (M7)",
    "Mastoparan 7": "Mastoparan-7 (M7)",

    # ---- Montanide ISA 51 spacing ----
    "Montanide ISA-51": "Montanide ISA 51",
    "Montanide ISA51": "Montanide ISA 51",

    # ---- PCL abbreviation expansion ----
    "PCL/chitosan NPs": "poly-ϵ-caprolactone/chitosan NPs",

    # ---- Poly(I:C) case ----
    "poly(I:C)": "Poly(I:C)",

    # ---- Polyclonal Antibody Stimulator naming ----
    "polyclonal antibody stimulator-PAS": "Polyclonal Antibody Stimulator (PAS)",

    # ---- QS-21 dash/spacing ----
    "QS21": "QS-21",

    # ---- R848 (resiquimod) alias ----
    "R848": "Resiquimod (R848)",
    "resiquimod": "Resiquimod (R848)",

    # ---- SE (squalene emulsion) naming (keep brands separate like MF59/AS03) ----
    "Squalene-based oil-in-water emulsion system (SE)": "SE",
    "squalene oil-in-water emulsion (SE)": "SE",

    # ---- SWE (Sepivac SWE) naming ----
    "Sepivac SWE": "SWE",
    "SEPIVAC SWETM": "SWE",

    # ---- Squalene case only ----
    "squalene": "Squalene",

    # ---- TDM (trehalose-6,6'-dimycolate) spelling variants ----
    "6,6'-trehalose dimycolate (TDM)": "Trehalose-6,6'-dimycolate (TDM)",

    # ---- α-GalCer naming variants ----
    "α-Galactosylceramide (α-GC)": "α-GalCer",
    "α-Galactosylceramide (αGalCer)": "α-GalCer",
}
# Derive the canonical-name column by substituting through `mapping`.
canonical_names = df_cleaned["adjuvant"].replace(mapping)
df_cleaned["adjuvant_canonical"] = canonical_names
df_cleaned

In [None]:
# Compare the number of distinct names before and after canonicalisation.
count_before = df_cleaned["adjuvant"].nunique()
count_after = df_cleaned["adjuvant_canonical"].nunique()
print("Before:", count_before)
print("After :", count_after)


In [None]:
# Get and print the sorted list of unique canonical adjuvant names.
# Missing names are dropped and the rest compared as text: sorted() raises
# TypeError when None or other non-string values are mixed with strings.
unique_adjuvant_names_canonical = sorted(
    df_cleaned["adjuvant_canonical"].dropna().astype(str).unique()
)
print("Unique adjuvants_canonical:")
for name in unique_adjuvant_names_canonical:
    print(name)


In [None]:
def _mechanism_with_pmid(frame):
    """Return each mechanism text followed by its source PMID in brackets."""
    return frame["immune_response_mechanism"] + " [" + frame["PMID"].astype(str) + "]"


def _join_distinct(values):
    """Join the distinct values of one group, sorted, with ' | ' separators."""
    return " | ".join(sorted(set(values)))


# Collapse to one row per canonical adjuvant: every mechanism is tagged with
# its PMID, duplicates within a group are removed, and the groups are ordered
# case-insensitively by adjuvant name.
df_grouped = (
    df_cleaned
    .assign(immune_response_mechanism=_mechanism_with_pmid)
    .groupby("adjuvant_canonical")["immune_response_mechanism"]
    .agg(_join_distinct)
    .reset_index()
    .sort_values(by="adjuvant_canonical", key=lambda names: names.str.lower())
)

df_grouped.to_csv("Outputs/Vaxjo_PMIDs_adjuvant_mechanism_collapsed.csv", index=False)
df_grouped


# Phase II: Llama Inferencing

In [None]:
import pandas as pd

# System prompt for Phase II: turn the collapsed, PMID-tagged mechanism text
# of one adjuvant into a single JSON object (summary + mechanism subtypes).
# NOTE: this rebinds `system_prompt`, the same name that holds the Phase I
# extraction prompt; re-running a Phase I cell after this one would pick up
# this prompt instead.
system_prompt = """You are an expert immunologist and biomedical research assistant.
TASK: Analyze the provided text on a vaccine adjuvant's immune response. Extract and structure the key mechanistic information according to the specified JSON schema.

## Instructions for the "summary" field:
- **Synthesize the information into a cohesive, mechanistic narrative of approximately 3-5 sentences.**
- This summary should not be a simple list of facts. Instead, it should describe the sequence of immunological events initiated by the adjuvant.
- For example, describe how the adjuvant is initially sensed (e.g., by PRRs like TLRs), how this leads to innate cell activation (e.g., dendritic cells), and how this subsequently shapes the adaptive response (e.g., T cell polarization and antibody production).
- Integrate the corresponding PMIDs directly into the text immediately following the claims they support.

## Instructions for the "mechanism_subtypes" field:
- Identify **every distinct** immunological mechanism.
- For each identified subtype, list all unique PMIDs cited as evidence for it in the source text.
- Do not merge related subtypes; for example, if both "dendritic cell" and "TLR4" are mentioned, create separate entries for each.

## General Rules:
- **Strict JSON Output:** The entire response MUST be a single, valid JSON object with no surrounding text or explanations.
- **Source Adherence:** Use ONLY the information and PMIDs present in the provided text. Do not infer or add external knowledge.

## JSON Schema:
{
  "adjuvant": "<string>",
  "summary": "<A cohesive, mechanistic narrative of 3-5 sentences describing the sequence of immune events, with inline PMIDs.>",
  "mechanism_subtypes": [
    {
      "mechanism subtype": "<mechanism subtype_1>",
      "evidence_refs": ["########", "..."]
    },
    {
      "mechanism subtype": "<mechanism subtype_2>",
      "evidence_refs": ["########", "..."]
    },...
  ]
}

"""

# Reload the collapsed per-adjuvant table written at the end of the
# intermediate processing and expose its canonical-name column as "adjuvant".
df = pd.read_csv("Outputs/Vaxjo_PMIDs_adjuvant_mechanism_collapsed.csv").rename(
    columns={"adjuvant_canonical": "adjuvant"}
)
df

In [None]:
# Phase II batch run: one generation per DataFrame row, raw replies saved to a
# human-readable .txt file and, in parallel, to a machine-friendly .jsonl file.
# Relies on `df`, `system_prompt` and `pipe` defined by earlier cells.

import json
import os

OUT_TXT = "Outputs/Vaxjo_PMIDs_mechanism_summary_raw_outputs_llama3.2.txt"   # plain text (human-readable)
OUT_JSONL = "Outputs/Vaxjo_PMIDs_mechanism_summary_raw_outputs_llama3.2.jsonl"  # one JSON object per row

os.makedirs(os.path.dirname(OUT_TXT) or ".", exist_ok=True)

# Token budget for each generation; tweak here.
GEN_MAX_NEW_TOKENS = 4028

# Both files are opened once in "w" mode (a re-run starts them afresh) and
# flushed after every row so progress survives an interrupted run.
with open(OUT_TXT, "w", encoding="utf-8") as f_txt, open(OUT_JSONL, "w", encoding="utf-8") as f_jsonl:
    for idx, row in df.iterrows():
        adjuvant = str(row.get("adjuvant", ""))
        mechanism = str(row.get("immune_response_mechanism", ""))

        user_content = f"Adjuvant: {adjuvant}\nImmune response mechanism:\n{mechanism}"
        messages = [
            dict(role="system", content=system_prompt),
            dict(role="user", content=user_content),
        ]

        # A failed generation is recorded in place of the reply, and the loop
        # moves on to the next row.
        try:
            outputs = pipe(messages, max_new_tokens=GEN_MAX_NEW_TOKENS)
            raw = outputs[0]["generated_text"][-1]["content"]
        except Exception as e:
            status = "error"
            raw = f"__ERROR__: {e}"
        else:
            status = "ok"

        # ---- Human-readable TXT: header line, stripped reply, blank line ----
        txt_block = f"===== ROW {idx} | {adjuvant} | {status} =====\n" + (raw or "").strip() + "\n\n"
        f_txt.write(txt_block)
        f_txt.flush()

        # ---- JSONL: the same row as one JSON object per line ----
        record = {
            "row_index": int(idx),
            "adjuvant": adjuvant,
            "status": status,
            "raw": raw,
        }
        f_jsonl.write(json.dumps(record, ensure_ascii=False) + "\n")
        f_jsonl.flush()

print(f"Saved outputs to:\n- {OUT_TXT}\n- {OUT_JSONL} (optional JSONL)")
