In [1]:
import pandas as pd

# Load the CSV file
file_path = "outputs/adjuvant_log.csv"
df = pd.read_csv(file_path)

# Display the column names
df.columns.tolist(), df.head()


(['PMC_ID', 'Status', 'Message', 'Timestamp'],
         PMC_ID   Status                                            Message  \
 0  PMC10013823    error  CUDA out of memory. Tried to allocate 16.60 Gi...   
 1  PMC10014862  success                             Processed successfully   
 2  PMC10048722  success                             Processed successfully   
 3  PMC10059828  success                             Processed successfully   
 4  PMC10064963  success                             Processed successfully   
 
                     Timestamp  
 0  2025-07-30T19:06:35.775127  
 1  2025-07-30T19:06:49.681018  
 2  2025-07-30T19:07:32.345241  
 3  2025-07-30T19:07:44.806953  
 4  2025-07-30T19:08:04.350920  )

In [2]:
# Count the number of each status
status_counts = df['Status'].value_counts()

# Display the result
print(status_counts)


Status
error      330
success    273
Name: count, dtype: int64


In [3]:
# Load the text file and check the frequency of the specific phrase
file_path_txt = "outputs/adjuvant_responses.txt"

with open(file_path_txt, "r", encoding="utf-8") as file:
    content = file.read()

# Count the occurrences of the target phrase
target_phrase = "No adjuvants mentioned."
frequency = content.count(target_phrase)

frequency


95

In [4]:
import re
import json
import pandas as pd

# Load the content from the text file
with open("outputs/adjuvant_responses.txt", "r", encoding="utf-8") as file:
    content = file.read()

# Extract each PMC block
entries = re.split(r'===\s+(PMC\d+)\s+===', content)[1:]
pmc_ids = entries[::2]   # PMC IDs
texts = entries[1::2]    # Corresponding text blocks

# Extract data
data = []
for pmc_id, text in zip(pmc_ids, texts):
    if "No adjuvants mentioned." in text:
        continue
    # Find JSON-like blocks
    json_blocks = re.findall(r"\[\s*{.*?}\s*]", text, re.DOTALL)
    for block in json_blocks:
        try:
            adjuvant_list = json.loads(block)
            for item in adjuvant_list:
                data.append({
                    "PMC ID": pmc_id,
                    "adjuvant": item.get("adjuvant"),
                    "immune_response_mechanism": item.get("immune_response_mechanism")
                })
        except json.JSONDecodeError:
            continue

# Save to CSV
df = pd.DataFrame(data)
df.to_csv("outputs/adjuvant_extracted.csv", index=False)

print("Extraction complete. Saved to 'adjuvant_extracted.csv'.")
df

Extraction complete. Saved to 'adjuvant_extracted.csv'.


Unnamed: 0,PMC ID,adjuvant,immune_response_mechanism
0,PMC10048722,Alum,Activates NLRP3 inflammasome and forms a depot...
1,PMC10048722,MPLA,Engages TLR4 pathway to promote Th1 responses.
2,PMC10048722,Chitosan,"Induces the production of cytokines (e.g., TNF..."
3,PMC10048722,Resiquimod (RC),Triggers toll-like receptor (TLR) 9 (TLR9) sig...
4,PMC10048722,Alginate,Preserves antigens in an acidic environment an...
...,...,...,...
556,PMC9998285,SAS,Not explicitly described
557,PMC9998285,"Poly-IC, squalene emulsion",Not explicitly described
558,PMC9998285,Aluminum hydroxide,Not explicitly described
559,PMC9998285,CpG-1826,Not explicitly described


In [5]:
# Count unique adjuvants
unique_adjuvants = df["adjuvant"].nunique()
print(f"Number of unique adjuvants: {unique_adjuvants}")


Number of unique adjuvants: 204


In [6]:
# Find unique immune response mechanisms containing the word "not" (case-insensitive)
not_mechanisms = df["immune_response_mechanism"].dropna().unique()
not_mechanisms_with_not = [m for m in not_mechanisms if "not" in m.lower()]

# Display the result
for mechanism in not_mechanisms_with_not:
    print(mechanism)


Not explicitly described in the text
Not explicitly described
Not specified
Not mentioned in the text.
Not explicitly described, but mentioned as a potential adjuvant for parasite vaccines.
The text does not explicitly describe how Alum works to stimulate or enhance the immune response.
The text does not explicitly describe how MPLA works to stimulate or enhance the immune response.
Recruits immune cells and increases antigen uptake, although it does not stimulate a CD8+ T cell response.
Alum alone cannot induce a potent Th1 response unless combined with another adjuvant, such as glucopyranosyl lipid A (a synthetic TLR-4 agonist).
Not explicitly described in the text.
Not explicitly described in the text, but mentioned in the context of ovalbumin model
Mechanism not explicitly described.
Not mentioned
Not mentioned in the text
Activated murine and human DC and induces TLR3/RIG-I mediated antitumor immunity, demonstrated great potential in cancer immunotherapy.
not mentioned in the text

In [7]:
# Find unique immune response mechanisms containing the word "not" (case-insensitive)
not_mechanisms = df["immune_response_mechanism"].dropna().unique()
not_mechanisms_with_not = [m for m in not_mechanisms if "none" in m.lower()]

# Display the result
for mechanism in not_mechanisms_with_not:
    print(mechanism)


None specified
None mentioned
None described


In [8]:
not_valid_keywords = [
    "not explicitly described in the text",
    "not explicitly described",
    "not specified",
    "not mentioned in the text",
    "mechanism not explicitly described",
    "not mentioned",
    "not explicitly mentioned",
    "none specified",
    "none mentioned",
    "none described"
]


# Drop rows with NaN or empty strings (after stripping whitespace)
df_cleaned = df[df["immune_response_mechanism"].notna()].copy()
df_cleaned = df_cleaned[df_cleaned["immune_response_mechanism"].str.strip() != ""]

# Remove rows containing any invalid keyword (case-insensitive)
df_cleaned = df_cleaned[
    ~df_cleaned["immune_response_mechanism"].str.lower().apply(
        lambda x: any(keyword in x for keyword in not_valid_keywords)
    )
].reset_index(drop=True)

# Save to CSV
df_cleaned.to_csv("outputs/adjuvant_extracted_clean.csv", index=False)

# Preview the result
df_cleaned

Unnamed: 0,PMC ID,adjuvant,immune_response_mechanism
0,PMC10048722,Alum,Activates NLRP3 inflammasome and forms a depot...
1,PMC10048722,MPLA,Engages TLR4 pathway to promote Th1 responses.
2,PMC10048722,Chitosan,"Induces the production of cytokines (e.g., TNF..."
3,PMC10048722,Resiquimod (RC),Triggers toll-like receptor (TLR) 9 (TLR9) sig...
4,PMC10048722,Alginate,Preserves antigens in an acidic environment an...
...,...,...,...
402,PMC9979891,Alum,Toll-like receptor agonist complexed with alum...
403,PMC9979891,AS03,"α-tocopherol, squalene and polysorbate 80 in a..."
404,PMC9979891,CpG,Toll-like receptor agonist complexed with alum...
405,PMC9998285,CpG 1018,Engages TLR4 pathway to promote Th1 responses


In [9]:
# Count unique adjuvants
unique_adjuvants = df_cleaned["adjuvant"].nunique()
print(f"Number of unique adjuvants: {unique_adjuvants}")


Number of unique adjuvants: 131


In [10]:
# Get and print the sorted list of unique adjuvant names
unique_adjuvant_names = sorted(df_cleaned["adjuvant"].unique())
print("Unique adjuvants:")
for name in unique_adjuvant_names:
    print(name)


Unique adjuvants:
1,25-dihydroxyvitamin D3 (VD3)
3M-052
3M-052/SE
5′pppdsRNA
AS01
AS02
AS03
AS0378
AS04
AS15
ATP
AddaVax
Advax
Advax-CpG55.2
Al hydroxide
Al hydroxyphosphate sulfate
Alginate
Alhydrogel
AlhydrogelTM
Alhydroxiquim-II
Alum
Alum-CpG
Aluminum hydroxide
Aluminum salt
Azoximer bromide
BCG
BECC/Alhydrogel
CAF01
CAP
CBP-12
Chitosan
Cholera toxin (CT)
Cholera toxin B
CpG
CpG 1,018
CpG 10,101
CpG 1018
CpG DNA
CpG ODN
CpG ODNs
CpG oligodeoxynucleotides
CpG oligonucleotides
CpG-1826 and IC31
CpG-ODN (Cytosine-phosphate-guanine oligodeoxynucleotides)
Curdlan
DDA
DOPE
Di-ABZI
Dimethyloxoxanthenyl acetic acid (DMXAA)
Double-stranded RNA (dsRNA)
Flagellin
GLA
GLA-squalene emulsion (SE)
GM-CSF
GPI-0100
HBV nucleocapsid
HSP90
Heat-labile enterotoxin of E. coli (LT)
IC31
Imidazoquinolines
Indoximod (IND)
K3 CpG + cGAMP
LPS
LPS-Rhodobacter sphaeroides
LT(R192G/L211A)
Layered double hydroxide (LDH)
Lipooligosaccharide (LOS)
Lipopolysaccharide (LPS)
MALP-2 (Macrophage-activating lipopeptide 

In [11]:
from collections import defaultdict

# Group by adjuvant only
adjuvant_grouped = defaultdict(list)

for _, row in df_cleaned.iterrows():
    adjuvant_grouped[row["adjuvant"]].append(
        f'{row["immune_response_mechanism"]} [{row["PMC ID"]}]'
    )

# Collapse to one row per adjuvant
df_single_row = pd.DataFrame([
    {"adjuvant": adj, "immune_response_mechanism": " | ".join(set(mechs))}
    for adj, mechs in adjuvant_grouped.items()
])

# Sort and save
df_single_row = df_single_row.sort_values(by="adjuvant", key=lambda x: x.str.lower()).reset_index(drop=True)
df_single_row.to_csv("outputs/adjuvant_mechanism_collapsed.csv", index=False)

df_single_row


Unnamed: 0,adjuvant,immune_response_mechanism
0,"1,25-dihydroxyvitamin D3 (VD3)",Suppresses immune responses and promotes toler...
1,3M-052,Formulated in a lipid-based nanoparticle (3M-0...
2,3M-052/SE,Overcomes hyporesponsiveness to the pneumococc...
3,5′pppdsRNA,Co-encapsulated with antigen peptides in lipid...
4,AddaVax,MF59-like adjuvant [PMC7918810]
...,...,...
126,VSA-2,Shows a consistently higher IgG2a/IgG1 ratio t...
127,ZPSs,"Presented on MHCII complexes, leading to T cel..."
128,α-GalCer,Strongly enhances antigen-specific antibody re...
129,β-defensin,Enhances immune response [PMC10383774]


In [1]:
########################################################## Summarize by LLM  ######################################
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:

n_gpus = torch.cuda.device_count()
print("N GPUS: ", n_gpus)

# Set memory limits per GPU
model_vram_limit_mib = 8192  #12000
max_memory = f'{model_vram_limit_mib}MiB'
max_memory_dict = {i: max_memory for i in range(n_gpus)}

model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Load model and tokenizer with memory limits
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    max_memory=max_memory_dict
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Now create the pipeline using pre-loaded model/tokenizer
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)



N GPUS:  8


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Here's a summary of the immune response mechanism for each of the adjuvants mentioned, along with references from the PubMed Central database:

1. **1,25-dihydroxyvitamin D3 (VD3)**: Suppresses immune responses and promotes tolerogenic effects. [PMC8033039]
VD3 modulates the immune response by inhibiting the activation of immune cells, such as T cells and macrophages, and promoting the production of anti-inflammatory cytokines.

2. **3M-052**: Formulated in a lipid-based nanoparticle (3M-052-AF), is being evaluated as an adjuvant for a preventive HIV vaccine. [PMC9927404]
3M-052 works by stimulating the innate immune response, which leads to the activation of antigen-presenting cells and the induction of a strong adaptive immune response.

3. **3M-052/SE**: Overcomes hyporesponsiveness to the pneumococcal conjugate vaccine at birth. [PMC7858666]
3M-052/SE is a combination of 3M-052 and a squalene-based adjuvant, which enhances the immune response to the pneumococcal conjugate vaccine.


In [None]:
import pandas as pd

# Load your cleaned input CSV
df_input = pd.read_csv("outputs/adjuvant_mechanism_collapsed.csv")

# System prompt
system_prompt = (
    "You are a biomedical assistant with expertise in immunology. "
    "Summarize the immune response mechanism for this adjuvant based on the information provided, with PMC references."
)

# List to store outputs
results = []

# Iterate over each row
for index, row in df_input.iterrows():
    adjuvant = row["adjuvant"]
    mechanism = row["immune_response_mechanism"]

    # Compose chat messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Adjuvant: {adjuvant}\nMechanism: {mechanism}"}
    ]

    try:
        # Run generation
        output = pipe(messages, max_new_tokens=512)
        summary = output[0]["generated_text"][-1]["content"]
    except Exception as e:
        summary = f"Error: {e}"

    # Append result
    results.append({
        "adjuvant": adjuvant,
        "input_mechanism": mechanism,
        "summary": summary
    })

# Save the results
df_summary = pd.DataFrame(results)
df_summary.to_csv("outputs/adjuvant_mechanism_summarized.csv", index=False)

# Preview a few rows
print(df_summary.head())


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_to