# BioLLM x Plants - Procedure Design - Q-As

Rachel K. Luu, Ming Dao, Subra Suresh, Markus J. Buehler (2025) ENHANCING SCIENTIFIC INNOVATION IN LLMS: A FRAMEWORK APPLIED TO PLANT MECHANICS RESEARCH [full reference to be added here once available]

## Load BioLLM

In [None]:
import pandas as pd
import json
import itertools
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from typing import List, Optional, Sequence

def completion_to_prompt(completion):
    """Wrap a plain completion request in the chat template used for BioLLM.

    The result is an empty system block, a user block containing
    ``completion``, and an open assistant header for the model to continue.

    NOTE(review): the end-of-turn marker is written as ``<eot_id>`` here;
    confirm that this spelling matches what the fine-tuned model expects.
    """
    segments = (
        "<|start_header_id|>system<|end_header_id|>\n<eot_id>\n",
        "<|start_header_id|>user<|end_header_id|>\n",
        f"{completion}<eot_id>\n",
        "<|start_header_id|>assistant<|end_header_id|>\n",
    )
    return "".join(segments)

def messages_to_prompt(messages):
    """Render a chat history as one prompt string in the BioLLM chat template.

    The prompt always opens with a system block and ends with an open
    assistant header so the model produces the next assistant turn.

    Args:
        messages: Iterable of objects exposing ``role`` and ``content``.
            Roles other than "system", "user" and "assistant" are ignored.

    Returns:
        The assembled prompt string.

    The text of every system message (joined by newlines when there are
    several) is placed inside the leading system block. Previously the system
    text was dropped and the literal words "system message" were appended
    outside any header. With no system message the output is unchanged: the
    system block is simply empty.
    """
    system_text = "\n".join(
        str(message.content)
        for message in messages
        if message.role == "system" and message.content
    )
    parts = [
        f"<|start_header_id|>system<|end_header_id|>\n{system_text}<eot_id>\n"
    ]

    # Roles are compared with == (not interpolated into the header) so that
    # string-valued enum roles render as plain "user" / "assistant".
    for message in messages:
        if message.role == "user":
            parts.append(
                f"<|start_header_id|>user<|end_header_id|>\n{message.content}<eot_id>\n"
            )
        elif message.role == "assistant":
            parts.append(
                f"<|start_header_id|>assistant<|end_header_id|>\n{message.content}<eot_id>\n"
            )

    # Leave the assistant header open for generation.
    parts.append("<|start_header_id|>assistant<|end_header_id|>\n")
    return "".join(parts)

# Download location of the Q8_0-quantized GGUF weights for the BioLLM model.
model_url = (
    "https://huggingface.co/rachelkluu/"
    "Llama3.1-8b-Instruct-CPT-SFT-DPO-09022024-Q8_0-GGUF/resolve/main/"
    "llama3.1-8b-instruct-cpt-sft-dpo-09022024-q8_0.gguf"
)

# Build the llama.cpp-backed LLM; prompts are formatted by the two template
# helpers defined above.
bioinspiredllm_q8 = LlamaCPP(
    # Weights come from the URL; no local path is supplied.
    model_url=model_url,
    model_path=None,
    # Sampling and length limits.
    temperature=0.1,
    max_new_tokens=2048,
    context_window=16000,
    # NOTE(review): -1 presumably offloads all layers to the GPU — confirm
    # against the llama.cpp bindings in use.
    model_kwargs={"n_gpu_layers": -1},
    completion_to_prompt=completion_to_prompt,
    messages_to_prompt=messages_to_prompt,
    verbose=False,
)

## Load RAG Index

In [None]:
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.response.notebook_utils import display_response

# Make BioLLM and a small Hugging Face embedding model the global defaults.
Settings.llm = bioinspiredllm_q8
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Load the source documents from the local ./PlantPapers/ folder.
documents = SimpleDirectoryReader("./PlantPapers/").load_data()

# Chunking settings, applied before the index is built.
Settings.chunk_size, Settings.chunk_overlap = 128, 50

# Index the documents and expose a query engine that retrieves the 10 most
# similar chunks per query.
vector_index = VectorStoreIndex.from_documents(documents)
query_engine = vector_index.as_query_engine(
    similarity_top_k=10,
    response_mode="compact",
)


# Generate Questions

In [None]:
import re

# A list marker at the start of an already-stripped line, followed by the item
# text: "- ", "* " or "• " bullets, "1. " / "12. " numbered items, or a
# bracketed index such as "[3]".
_LIST_ITEM_RE = re.compile(r"(?:[-*•]\s+|\d+\.\s+|\[\d+\]\s*)(.*)")


def extract_bullet_points(response_text):
    """Return the unique list items found in an LLM response.

    Each line of ``response_text`` is stripped and checked for a leading list
    marker (dash/asterisk/bullet, ``N.`` numbering, or ``[N]`` index). The text
    after the marker is collected.

    Args:
        response_text: Raw response text, possibly mixing prose and list items.

    Returns:
        A list of distinct, non-empty item strings in order of first
        appearance. Lines without a recognised marker are ignored.

    Compared with the previous implementation this:
      * no longer raises IndexError on short lines such as "[" or "[1";
      * recognises the real bullet character "•" (the old literal was a
        mis-encoded byte sequence);
      * accepts multi-digit numbering ("10. ...", "[12] ...");
      * returns items in a deterministic order and skips empty items.
    """
    # A dict keeps insertion order, giving de-duplication without the
    # arbitrary ordering of a set.
    items = {}
    for raw_line in response_text.split('\n'):
        match = _LIST_ITEM_RE.match(raw_line.strip())
        if match is None:
            continue
        item = match.group(1).strip()
        if item:
            items[item] = None
    return list(items)

def get_technical_qs(num_generations, prompt):
    """Ask the RAG query engine for background QUESTIONS about a procedure prompt.

    The same instruction is sent ``num_generations`` times and every response
    is parsed into list items with ``extract_bullet_points``.

    Returns:
        A tuple ``(df, flat_questions, qcount)``: a DataFrame with one row per
        question (columns "Prompt" and "Question"), the flat list of all
        questions across generations, and the number of questions.
    """
    txt = f"In order to '{prompt}', create a concise list of very basic and fundamental questions that explore the essential properties, definitions, and background relevant to this topic."

    flat_questions = []
    for _ in range(num_generations):
        reply = query_engine.query(txt)
        flat_questions.extend(extract_bullet_points(reply.response))

    records = [{"Prompt": prompt, "Question": item} for item in flat_questions]
    df = pd.DataFrame(records, columns=["Prompt", "Question"])

    return df, flat_questions, len(flat_questions)


In [None]:
# Procedure task handed to the question generator.
prompt = "Design a procedure that makes a composite out of pollen grains and rhapis excelsa leaves."

# Number of sampling generations.
num_gen = 1

# df            -> dataframe holding all the data (prompt, question)
# all_questions -> list of all questions
# qcount        -> number of generated questions
df, all_questions, qcount = get_technical_qs(num_gen, prompt)

print(f"{qcount} total questions were generated!")
print("Here are the generated questions:")
for question in all_questions:
    print(f"- {question}")

# Generate Answers

In [None]:
def ans_technicals(df, engine=None):
    """Generate ANSWERS for the QUESTIONS in a previously generated DataFrame.

    Args:
        df: DataFrame with a "Question" column. It is modified in place: an
            "Answer" column is added (or overwritten).
        engine: Optional object with a ``query(text)`` method whose result has
            a ``response`` attribute. Defaults to the notebook-level
            ``query_engine``.

    Returns:
        A tuple ``(df, answers)``: the same DataFrame object, now carrying the
        "Answer" column, and the list of answers in row order.

    Answers are collected first and assigned as a whole column afterwards.
    The frame is therefore never modified while it is being iterated, the
    result does not depend on index labels being unique, and the "Answer"
    column exists even when there are no questions.
    """
    # Fall back to the notebook-level query engine when none is injected.
    if engine is None:
        engine = query_engine

    answers = []
    for question in df["Question"]:
        txt = f"{question}. Answer concisely and accurately. If you don't know the answer or there isn't enough context from the provided information, state that this area needs further exploration. Do not use citations."
        answers.append(engine.query(txt).response)

    df["Answer"] = answers
    return df, answers


In [None]:
# df is overwritten here to contain (prompt, question, answer);
# answers is the list of answers.
df, answers = ans_technicals(df)

# Print each question/answer pair, separated by a blank line.
for row in df.itertuples(index=False):
    print(f"Question: {row.Question}")
    print(f"Answer: {row.Answer}")
    print()

# Save Final Data to JSON File to be used in Multi-Agent

In [None]:
# Base name of the output file; ".json" is appended when writing.
filename = "rhapispollenpaper"

# Serialise the dataframe as one JSON record per line, then write it to disk.
json_data = df.to_json(lines=True, orient="records")
with open(f"{filename}.json", "w") as json_file:
    json_file.write(json_data)