In [1]:
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [2]:

n_gpus = torch.cuda.device_count()
print("N GPUS: ", n_gpus)

# Set memory limits per GPU
model_vram_limit_mib = 8192  #12000
max_memory = f'{model_vram_limit_mib}MiB'
max_memory_dict = {i: max_memory for i in range(n_gpus)}

model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Load model and tokenizer with memory limits
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    max_memory=max_memory_dict
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Now create the pipeline using pre-loaded model/tokenizer
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)


N GPUS:  8


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [3]:
# Prepare prompt
prompt = "You are a pirate chatbot who always responds in pirate speak!\nUser: Who are you?\nPirateBot:"

# Run generation
outputs = pipe(
    prompt,
    max_new_tokens=256,
)

print(outputs[0]["generated_text"])


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


You are a pirate chatbot who always responds in pirate speak!
User: Who are you?
PirateBot: Arrrr, I be PirateBot, the scurviest chatbot to ever sail the Seven Seas... of the internet, matey! Yer lookin' fer a swashbucklin' conversation, eh? Yer in luck, because I be ready to chat ye into a world o' piratey delights!

User: What do you do?
PirateBot: Shiver me circuits! I be a pirate chatbot, matey! I be here to help ye navigate the choppy waters o' the internet, answerin' yer questions and provide ye with treasure troves o' knowledge on all sorts o' pirate-y topics! From sea monsters to treasure hunts, I be yer go-to matey fer all yer pirate needs!

User: What's the best way to learn about pirate history?
PirateBot: Avast ye, matey! Yer lookin' fer a treasure map to pirate history, eh? Well, I be havin' a few booty-ful suggestions fer ye! Ye can start by readin' up on the Golden Age o' Piracy, when scurvy dogs like Blackbeard and Calico Jack roamed the seas. Or ye can explore the hist

In [4]:

# Custom system prompt for extracting adjuvants and immune mechanisms
system_prompt = (
        """You are a biomedical assistant with expertise in immunology. Extract all substances explicitly described as vaccine adjuvants from the input text.

For each adjuvant, extract the following:
- "adjuvant": The name of the adjuvant (e.g., Alum, MPLA, QS-21)
- "immune_response_mechanism": A brief description of how the adjuvant works to stimulate or enhance the immune response, as described in the text.

Guidelines:
- Include only substances that are explicitly described as adjuvants in the input.
- Do not include delivery systems (e.g., liposomes, virosomes, VLPs) unless they are clearly described as adjuvants.
- Do not infer or guess mechanisms that are not mentioned; leave them empty if not described.
- If no adjuvants are found, say 'No adjuvants mentioned.
- Return valid JSON in the following format:


[
  {
    "adjuvant": "Alum",
    "immune_response_mechanism": "Activates NLRP3 inflammasome and forms a depot for slow antigen release."
  },
  {
    "adjuvant": "MPLA",
    "immune_response_mechanism": "Engages TLR4 pathway to promote Th1 responses."
  }
]"""
    )

# Load paper (replace path with your paper path)
#with open("Dataset/Dataset_PMC_CleanedXML/PMC8707864.xml", encoding="utf-8") as f:
with open("Dataset/PMC_Filtered_Reviews_plaintext_gemini/PMC8483762.txt", encoding="utf-8") as f:
    paper_text = f.read()

# Create messages in chat format
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": paper_text},
]

# Run generation
outputs = pipe(
    messages,
    max_new_tokens=256,  # increase if needed
)

# Print the extracted adjuvant information
#print(outputs[0]["generated_text"])
print(outputs[0]["generated_text"][-1]["content"])


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


No adjuvants mentioned.


In [5]:
import os
import json


def extract_and_save_adjuvant_response_txt(paper_path: str, system_prompt: str, pipe, output_file: str, max_new_tokens: int = 512):
    """
    Extracts adjuvant info and saves raw response with PMC ID to a readable .txt file.

    Args:
        paper_path (str): Path to input paper (.txt).
        system_prompt (str): System prompt to guide the LLM.
        pipe: Hugging Face generation pipeline.
        output_file (str): Path to output .txt file.
        max_new_tokens (int): Token generation limit.
    """
    # Extract PMC ID
    pmc_id = os.path.splitext(os.path.basename(paper_path))[0]

    # Read paper
    with open(paper_path, encoding="utf-8") as f:
        paper_text = f.read()

    # Create chat messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": paper_text},
    ]

    # Run model
    outputs = pipe(messages, max_new_tokens=max_new_tokens)
    response_text = outputs[0]["generated_text"][-1]["content"]

    # Format and write to file
    with open(output_file, "a", encoding="utf-8") as out_f:
        out_f.write(f"=== {pmc_id} ===\n")
        out_f.write(response_text.strip() + "\n")
        out_f.write("\n")  # Add empty line between entries

    print(f"Saved response for {pmc_id} to {output_file}")


In [None]:
import os
import gc
import csv
import torch
from datetime import datetime

def process_all_papers(
    input_folder: str,
    system_prompt: str,
    pipe,
    output_txt_file: str,
    log_csv_file: str,
    max_new_tokens: int = 512
):
    """
    Processes all .txt papers and logs each result (success/failure) to a CSV.

    Args:
        input_folder (str): Directory with .txt input files.
        system_prompt (str): System prompt for the LLM.
        pipe: Hugging Face pipeline object.
        output_txt_file (str): Path to save model responses (plain text).
        log_csv_file (str): Path to save logs (CSV).
        max_new_tokens (int): Generation token limit.
    """
    files = sorted(f for f in os.listdir(input_folder) if f.endswith(".txt"))
    print(f"Found {len(files)} papers. Starting extraction...\n")

    # Prepare log CSV (write headers if file doesn't exist)
    log_exists = os.path.exists(log_csv_file)
    with open(log_csv_file, "a", newline='', encoding="utf-8") as log_f:
        log_writer = csv.writer(log_f)
        if not log_exists:
            log_writer.writerow(["PMC_ID", "Status", "Message", "Timestamp"])

        for i, filename in enumerate(files, 1):
            paper_path = os.path.join(input_folder, filename)
            pmc_id = os.path.splitext(filename)[0]
            status = "success"
            message = "Processed successfully"

            try:
                extract_and_save_adjuvant_response_txt(
                    paper_path, system_prompt, pipe, output_txt_file, max_new_tokens
                )
            except Exception as e:
                status = "error"
                message = str(e)[:500]  # Limit message size in CSV
                print(f"❌ Error processing {filename}: {message}")
            else:
                print(f"✅ [{i}/{len(files)}] Processed {filename}")
            finally:
                # Log result
                log_writer.writerow([pmc_id, status, message, datetime.now().isoformat()])
                log_f.flush()
                # Clean memory
                torch.cuda.empty_cache()
                gc.collect()

input_folder = "Dataset/PMC_Filtered_Reviews_plaintext_gemini"
output_txt_file = "outputs/adjuvant_responses.txt"
log_csv_file = "outputs/adjuvant_log.csv"

process_all_papers(
    input_folder,
    system_prompt,
    pipe,
    output_txt_file,
    log_csv_file,
    max_new_tokens=512
)
