# BioLLM x Plants - Idea Mining, Divergent-Convergent Script

Rachel K. Luu, Ming Dao, Subra Suresh, Markus J. Buehler (2025). ENHANCING SCIENTIFIC INNOVATION IN LLMS: A FRAMEWORK APPLIED TO PLANT MECHANICS RESEARCH. [Full reference to be added here.]

## Load Model Functions

For the divergent generation phase, BioinspiredLLM quantized to 8-bit is used. For the convergent evaluation phase, Llama-3.1-8B-Instruct quantized to 8-bit is used. Depending on your system, you may need to load these two models separately rather than at the same time.

In [None]:
from llama_index.core import PromptTemplate
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from typing import List, Optional, Sequence
import pandas as pd

"""Llama 3.1 Prompt Template"""

def completion_to_prompt(completion):
    """Wrap a bare completion request in this notebook's chat template.

    The result is an empty system turn, a user turn holding ``completion``,
    and an opened assistant header for the model to continue from.
    """
    # NOTE(review): turns are closed with the literal "<eot_id>"; Meta's
    # published Llama 3.1 format spells that token "<|eot_id|>". Confirm which
    # form the GGUF models expect before changing it.
    segments = (
        "<|start_header_id|>system<|end_header_id|>\n<eot_id>\n",
        "<|start_header_id|>user<|end_header_id|>\n",
        f"{completion}<eot_id>\n",
        "<|start_header_id|>assistant<|end_header_id|>\n",
    )
    return "".join(segments)

def messages_to_prompt(messages):
    """Render a chat history into this notebook's chat template.

    Args:
        messages: Iterable of objects exposing ``role`` and ``content``.
            Roles other than "system", "user" and "assistant" are skipped.

    Returns:
        The prompt string: a single system turn (holding the content of every
        system message joined by newlines, or empty when there are none),
        followed by the user/assistant turns in their original order, and
        finally an opened assistant header for the model to complete.
    """
    # NOTE(review): turns are closed with the literal "<eot_id>"; Meta's
    # published Llama 3.1 format spells that token "<|eot_id|>". Confirm which
    # form the GGUF models expect before changing it.
    system_parts = []
    turns = []
    for message in messages:
        # Roles are compared with == (not hashed or formatted) so that both
        # plain strings and string-valued enum roles are recognised.
        if message.role == "system":
            # Previously the literal text "system message" was emitted here
            # and the actual system content was silently discarded.
            system_parts.append(f"{message.content}")
        elif message.role == "user":
            turns.append(
                f"<|start_header_id|>user<|end_header_id|>\n{message.content}<eot_id>\n"
            )
        elif message.role == "assistant":
            turns.append(
                f"<|start_header_id|>assistant<|end_header_id|>\n{message.content}<eot_id>\n"
            )

    system_block = (
        "<|start_header_id|>system<|end_header_id|>\n"
        + "\n".join(system_parts)
        + "<eot_id>\n"
    )
    return system_block + "".join(turns) + "<|start_header_id|>assistant<|end_header_id|>\n"


## Load BioLLM Q8

In [None]:
# 8-bit GGUF weights of BioinspiredLLM, referenced by their Hugging Face URL.
model_url = "https://huggingface.co/rachelkluu/Llama3.1-8b-Instruct-CPT-SFT-DPO-09022024-Q8_0-GGUF/resolve/main/llama3.1-8b-instruct-cpt-sft-dpo-09022024-q8_0.gguf"
bioinspiredllm_q8 = LlamaCPP(
    # Weights are identified by URL; no local model file is supplied.
    model_path=None,
    model_url=model_url,
    # Sampling and length settings for the divergent (idea generation) model.
    temperature=1,
    max_new_tokens=2048,
    context_window=16000,
    # NOTE(review): -1 presumably offloads every layer to the GPU; confirm
    # against the llama-cpp-python documentation for your build.
    model_kwargs={"n_gpu_layers": -1},
    # Prompt-template helpers defined earlier in this notebook.
    completion_to_prompt=completion_to_prompt,
    messages_to_prompt=messages_to_prompt,
    verbose=False,
)

## Load Llama3.1 Q8

In [None]:
# 8-bit GGUF weights of Llama-3.1-8B-Instruct, referenced by their Hugging Face URL.
model_url = "https://huggingface.co/rachelkluu/Meta-Llama-3.1-8B-Instruct-Q8_0-GGUF/resolve/main/meta-llama-3.1-8b-instruct-q8_0.gguf"
llama31_q8 = LlamaCPP(
    # Weights are identified by URL; no local model file is supplied.
    model_path=None,
    model_url=model_url,
    # Sampling and length settings for the convergent (evaluation) model.
    temperature=.1,
    max_new_tokens=5000,
    context_window=16000,
    # NOTE(review): -1 presumably offloads every layer to the GPU; confirm
    # against the llama-cpp-python documentation for your build.
    model_kwargs={"n_gpu_layers": -1},
    # Prompt-template helpers defined earlier in this notebook.
    completion_to_prompt=completion_to_prompt,
    messages_to_prompt=messages_to_prompt,
    verbose=True,
)

## Divergent Generation
For the divergent generation phase, BioinspiredLLM quantized to 8-bit is used.

In [None]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.core.response.notebook_utils import display_response

# Global LlamaIndex settings for the divergent phase: generate with
# BioinspiredLLM and embed text with the BGE small English model.
Settings.llm = bioinspiredllm_q8  # BIOLLM USED HERE
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# FOLDER TO PAPERS OF INTEREST
documents = SimpleDirectoryReader("./PlantPapers/").load_data()

# Chunking parameters, set before the index is built from the documents.
Settings.chunk_size = 128
Settings.chunk_overlap = 50

# Index the papers and expose a query engine that retrieves the 10 most
# similar chunks for each query.
vector_index = VectorStoreIndex.from_documents(documents)
query_engine = vector_index.as_query_engine(
    similarity_top_k=10,
    response_mode="compact",
)

In [None]:
from difflib import SequenceMatcher

def _strip_number_marker(line):
    """Return the text after a leading "N. " or "[N]" marker, or None if absent."""
    # Count leading ASCII digits so markers such as "10. " or "100. " are
    # recognised, not only single-digit ones.
    digit_count = len(line) - len(line.lstrip('0123456789'))
    if digit_count and line.startswith('. ', digit_count):
        return line[digit_count + 2:]
    if line.startswith('['):
        closing = line.find(']')
        # closing > 1 guarantees at least one character between the brackets,
        # so short lines like "[" or "[1" can no longer raise IndexError.
        if closing > 1 and line[1:closing].isdigit():
            return line[closing + 1:]
    return None


def extract_bullet_points(response_text):
    """Collect the list items found in an LLM response.

    A line counts as a list item when, after surrounding whitespace is
    stripped, it starts with a "- ", "• " or "* " bullet, with a number
    followed by ". " (any number of digits), or with a bracketed number such
    as "[3]" or "[12]".

    Args:
        response_text: Raw response text; it is split on newline characters.

    Returns:
        A list of the item texts with their markers removed, de-duplicated
        and in first-seen order. Items that are empty once the marker is
        removed are skipped.
    """
    # A dict is used as an ordered set so the result order is reproducible.
    unique_items = {}
    for raw_line in response_text.split('\n'):
        line = raw_line.strip()
        if line.startswith(('- ', '• ', '* ')):
            item = line[2:]
        else:
            item = _strip_number_marker(line)
        if item is None:
            continue
        item = item.strip()
        if item:
            unique_items[item] = None
    return list(unique_items)

def are_similar(a, b, threshold=0.8):
    """Return True when the difflib similarity ratio of ``a`` and ``b`` exceeds ``threshold``.

    Args:
        a: First sequence (the callers in this file pass idea strings).
        b: Second sequence.
        threshold: Ratio that must be strictly exceeded; 1.0 therefore never matches.

    Returns:
        ``SequenceMatcher(None, a, b).ratio() > threshold``.
    """
    matcher = SequenceMatcher(None, a, b)
    # real_quick_ratio() and quick_ratio() are documented upper bounds on
    # ratio(), so failing either one proves ratio() fails too. Checking them
    # first skips the expensive ratio() computation for clearly different
    # pairs without changing the result.
    return (
        matcher.real_quick_ratio() > threshold
        and matcher.quick_ratio() > threshold
        and matcher.ratio() > threshold
    )

def sample_bullets(num_generations, num_ideas, prompt):
    """Samples the LLM num_generations number of times, scraping the bullet points of ideas generated

    Args:
        num_generations: How many times to query the module-level ``query_engine``.
        num_ideas: Value interpolated into the request as the number of ideas
            to brainstorm per generation.
        prompt: The question to brainstorm about; also stored in the
            "Prompt" column of the returned frame.

    Returns:
        A tuple ``(df, all_bullet_points, bullcount)``: a DataFrame with
        columns "Prompt" and "Idea" holding one row per idea per generation,
        the set of distinct ideas over all generations, and the size of that
        set. With ``num_generations == 0`` these are an empty frame, an empty
        set and 0.
    """
    # The request text does not depend on the generation, so build it once.
    txt = f"{prompt} Be creative. Concisely brainstorm {num_ideas} different ideas into a bullet point list."
    all_bullet_points = set()
    data_for_df = []
    for _ in range(num_generations):
        response = query_engine.query(txt)
        bullet_points = extract_bullet_points(response.response)
        all_bullet_points.update(bullet_points)
        data_for_df.extend(
            {"Prompt": prompt, "Idea": bullet_point} for bullet_point in bullet_points
        )
    df = pd.DataFrame(data_for_df, columns=["Prompt", "Idea"])
    # Computed after the loop so it is defined even when no generation ran.
    bullcount = len(all_bullet_points)
    return df, all_bullet_points, bullcount


def filter_ideas(new_ideas, unique_ideas, similarity_threshold=0.8):
    """Filters ideas in the list by similarity, can adjust similarity threshold

    Walks ``new_ideas`` in order and keeps an idea only when ``are_similar``
    reports no match against any entry of ``unique_ideas`` nor against any
    idea already kept during this call. Returns the kept ideas as a list.
    """
    accepted = []
    for candidate in new_ideas:
        # First compare against the ideas that were known before this call.
        if any(
            are_similar(candidate, known, similarity_threshold)
            for known in unique_ideas
        ):
            continue
        # Then against the ideas accepted so far from this batch.
        if any(
            are_similar(candidate, kept, similarity_threshold)
            for kept in accepted
        ):
            continue
        accepted.append(candidate)
    return accepted

def filter_unique_ideas(df_existing, df_new, similarity_threshold=0.8):
    """Append to ``df_existing`` the rows of ``df_new`` whose idea is new.

    Args:
        df_existing: DataFrame with an "Idea" column holding the ideas kept so far.
        df_new: DataFrame with an "Idea" column holding candidate ideas.
        similarity_threshold: Threshold forwarded to ``filter_ideas``.

    Returns:
        A new DataFrame: ``df_existing`` followed by the accepted rows of
        ``df_new``, with a fresh 0..n-1 index.
    """
    unique_ideas = df_existing['Idea'].tolist()
    new_ideas = df_new['Idea'].tolist()
    filtered_ideas = filter_ideas(new_ideas, unique_ideas, similarity_threshold)
    # isin() selects every row whose text was accepted, so an idea that occurs
    # several times in df_new would come back several times even though
    # filter_ideas accepted it once. Keep only its first row.
    df_filtered = df_new[df_new['Idea'].isin(filtered_ideas)].drop_duplicates(subset='Idea')
    return pd.concat([df_existing, df_filtered], ignore_index=True)


### Divergent Inference

In [None]:
prompt = "Beyond hemorrhage control, where else could the high absorption properties of pollen-based cryogels be effectively applied?"
num_per_gen = "" #can optionally specify how many ideas per generation
sim_thres = 0.7 #adjusts the similarity threshold, with values closer to 1 being greater similarity 
num_ideas = 100 #number of ideas desired in total
max_stalled_rounds = 20 #stop after this many consecutive generations that add no new unique idea

##############################

finaldf = pd.DataFrame({
    "Prompt":[],
    "Idea":[],
})

# Keep sampling until enough unique ideas are collected. The stall counter
# bounds the loop: without it the loop never ends once the model stops
# producing ideas that pass the similarity filter.
stalled_rounds = 0
while len(finaldf) < num_ideas:
    gendf, bullets, bullcount = sample_bullets(1, num_per_gen, prompt) 
    print(bullets)
    print(f"{bullcount} Ideas were generated")
    
    rows_before = len(finaldf)
    finaldf = filter_unique_ideas(finaldf, gendf, similarity_threshold= sim_thres)
    rows_added = len(finaldf) - rows_before
    print(finaldf)
    print(f"{rows_added} unique rows were added. Current number of rows: {len(finaldf)}")

    if rows_added == 0:
        stalled_rounds += 1
    else:
        stalled_rounds = 0
    if stalled_rounds >= max_stalled_rounds:
        print(f"Stopping early: no new unique ideas in {max_stalled_rounds} consecutive generations.")
        break

finaldf 

# NOTE(review): the prompt is used verbatim as the file name (the convergent
# phase reads the same name); it contains "?", which Windows does not allow
# in file names.
finaldf.to_csv(f'{prompt}.csv', index=False)

## Convergent Evaluation
For the convergent evaluation phase, Llama-3.1-8B-Instruct quantized to 8-bit is used.

In [None]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex
from llama_index.core.response.notebook_utils import display_response

# Global LlamaIndex settings for the convergent phase: evaluate with
# Llama 3.1 and embed text with the BGE small English model.
Settings.llm = llama31_q8
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# FOLDER TO PAPERS OF INTEREST
documents = SimpleDirectoryReader("./PlantPapers/").load_data()

# Chunking parameters, set before the index is built from the documents.
Settings.chunk_size = 128
Settings.chunk_overlap = 50

# Rebuild the index and rebind query_engine; it is created after
# Settings.llm was switched to the evaluation model above.
vector_index = VectorStoreIndex.from_documents(documents)
query_engine = vector_index.as_query_engine(
    similarity_top_k=10,
    response_mode="compact",
)

In [None]:
import random
import json
import pandas as pd

def clean_json_string(json_string):
    """Prepare an LLM reply for ``json.loads``.

    Newlines are replaced with spaces. When the reply contains a
    brace-delimited span, only the text from the first "{" to the last "}" is
    returned, which discards Markdown code fences or commentary placed around
    the JSON object. Replies without such a span are returned with only the
    newline replacement applied.

    Args:
        json_string: The raw reply text.

    Returns:
        The cleaned string; it is not guaranteed to be valid JSON.
    """
    cleaned_string = json_string.replace("\n", " ")
    start = cleaned_string.find("{")
    end = cleaned_string.rfind("}")
    if start != -1 and end > start:
        return cleaned_string[start:end + 1]
    return cleaned_string

def load_ideas_from_files(filenames):
    """Load the ideas stored in the second column of one or more CSV files.

    Args:
        filenames: An iterable of paths (or file-like objects accepted by
            ``pandas.read_csv``). A single path string is treated as a
            one-element list.

    Returns:
        A flat list of idea strings in file order. Cells that are empty in
        the CSV (read back as missing values) are skipped.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(filenames, str):
        filenames = [filenames]

    all_ideas = []
    for filename in filenames:
        df = pd.read_csv(filename)
        # Column 1 is "Idea" in the CSVs written by the divergent phase.
        # Missing cells would otherwise enter the tournament as NaN.
        ideas = df.iloc[:, 1].dropna().astype(str).tolist()
        all_ideas.extend(ideas)
    return all_ideas


def _rate_pair(idea_1, idea_2):
    """Ask the LLM to score two ideas; return their rating rows ([] if the reply is unusable)."""
    # The continuation lines keep their original indentation so the prompt
    # text sent to the model is unchanged.
    rating_prompt = f"""You must critically rate each idea out of 10 for categories: novelty and effectiveness.
                    The response must only contain the ratings in the following strict JSON format:
                    {{
                    "Idea 1": {{"novelty": X, "effectiveness": Y}},
                    "Idea 2": {{"novelty": X, "effectiveness": Y}}
                    }}"""

    ideas_str = f"Idea 1: {idea_1}\nIdea 2: {idea_2}"
    rating_txt = f"{rating_prompt}\n\n{ideas_str}"
    rating_response = query_engine.query(rating_txt)
    cleaned_rating_response = clean_json_string(rating_response.response)

    try:
        ratings = json.loads(cleaned_rating_response)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return []
    if not isinstance(ratings, dict):
        print(f"Unexpected rating response: {ratings!r}")
        return []

    rows = []
    for label, idea in (("Idea 1", idea_1), ("Idea 2", idea_2)):
        scores = ratings.get(label)
        if not isinstance(scores, dict):
            scores = {}
        rows.append({
            'Idea': idea,
            'Novelty': scores.get('novelty', None),
            'Effectiveness': scores.get('effectiveness', None)
        })
    return rows


def _pick_winner(prompt, idea_1, idea_2):
    """Ask the LLM which idea is better; return that idea, or None if the reply is unusable."""
    # The continuation lines keep their original indentation so the prompt
    # text sent to the model is unchanged.
    compare_txt = f"""To answer this question: {prompt} Which idea is better based on novelty and effectiveness?
                Idea 1: {idea_1} 
                Idea 2: {idea_2} 
                Respond with the winner as 'Idea 1' or 'Idea 2' in the following JSON format:
                {{ "winner": "Idea 1" or "Idea 2" }}
                """

    comparison_response = query_engine.query(compare_txt)
    cleaned_comparison_response = clean_json_string(comparison_response.response)

    try:
        result = json.loads(cleaned_comparison_response)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None

    verdict = result.get("winner") if isinstance(result, dict) else None
    if verdict == "Idea 1":
        return idea_1
    if verdict == "Idea 2":
        return idea_2
    print(f"Unrecognized winner in response: {result!r}")
    return None


def pairwise_elim_and_rate(filenames, prompt, round_limit, outputfilename, ratings_outputfile):
    """Run a pairwise elimination tournament over ideas and rate them in round 1.

    Ideas are loaded with ``load_ideas_from_files``, shuffled, and compared
    two at a time through the module-level ``query_engine``. The preferred
    idea of each pair advances. With an odd number of ideas the last one
    advances without a comparison. In the first round every compared pair is
    also scored for novelty and effectiveness.

    When a comparison reply cannot be interpreted, both ideas of that pair
    advance instead of being dropped. A failed rating reply only skips the
    ratings for that pair; the comparison still takes place.

    Args:
        filenames: CSV file(s) passed to ``load_ideas_from_files``.
        prompt: The question the ideas answer; included in every comparison.
        round_limit: Maximum number of elimination rounds to perform.
        outputfilename: CSV path for the per-round survivors ("Round", "Idea").
            The final survivors are appended once more, labelled with the
            number of the last completed round.
        ratings_outputfile: CSV path for the first-round ratings
            ("Idea", "Novelty", "Effectiveness").

    Returns:
        None. Results are printed and written to the two CSV files.
    """
    all_ideas = load_ideas_from_files(filenames)
    random.shuffle(all_ideas)
    round_number = 1
    all_rounds_data = []  # One {'Round', 'Idea'} row per survivor per round
    ratings_data = []  # First-round rating rows

    while len(all_ideas) > 1 and round_number <= round_limit:
        print(f"--- Round {round_number} ---")
        winners = []

        # Compare ideas in consecutive pairs.
        for i in range(0, len(all_ideas) - 1, 2):
            idea_1 = all_ideas[i]
            idea_2 = all_ideas[i + 1]

            # Only rate the ideas in the first round
            if round_number == 1:
                ratings_data.extend(_rate_pair(idea_1, idea_2))

            winner = _pick_winner(prompt, idea_1, idea_2)
            if winner is None:
                # Undecided: carry both forward rather than losing them.
                winners.extend([idea_1, idea_2])
            else:
                winners.append(winner)

        # If there's an odd number of ideas, move the last one directly to the next round
        if len(all_ideas) % 2 == 1:
            winners.append(all_ideas[-1])

        # Store the remaining ideas after the round
        all_rounds_data.extend({'Round': round_number, 'Idea': idea} for idea in winners)

        # Move the winners to the next round
        all_ideas = winners
        random.shuffle(all_ideas)  # Randomize for the next round
        round_number += 1

    print(f"Final round winners (Top {len(all_ideas)} ideas): {list(all_ideas)}")

    # Name the column 'Idea' so the final winners line up with the per-round
    # rows; an unnamed column would be written as a separate column "0".
    df_winners = pd.DataFrame({'Idea': all_ideas})
    df_winners['Round'] = round_number - 1
    df_winners = df_winners[['Round', 'Idea']]
    output_file = outputfilename
    df_all_rounds = pd.DataFrame(all_rounds_data, columns=['Round', 'Idea'])
    frames = [frame for frame in (df_all_rounds, df_winners) if not frame.empty]
    final_output_df = pd.concat(frames, ignore_index=True) if frames else df_all_rounds
    final_output_df.to_csv(output_file, index=False)

    print(f"Results (including rounds) exported to {output_file}.")

    # Explicit columns keep the header present even when no rating parsed.
    df_ratings = pd.DataFrame(ratings_data, columns=['Idea', 'Novelty', 'Effectiveness'])
    df_ratings.to_csv(ratings_outputfile, index=False)

    print(f"Ratings from the first round exported to {ratings_outputfile}.")


### Convergent Inference

In [None]:
# Question the ideas are judged against; the divergent phase saved its ideas
# to a CSV named after this same prompt.
prompt = "Beyond hemorrhage control, where else could the high absorption properties of pollen-based cryogels be effectively applied?"
filename = [f"{prompt}.csv"]
filetagline = "PollenAbsorption"

round_limit = 6  # Maximum number of elimination rounds to perform
outputfilename = f"{filetagline}_elim.csv"  # Elimination results and final winners
ratings_outputfile = f"{filetagline}_rate.csv"  # Ratings collected in the first round

# Run the pairwise elimination and rating function
pairwise_elim_and_rate(
    filenames=filename,
    prompt=prompt,
    round_limit=round_limit,
    outputfilename=outputfilename,
    ratings_outputfile=ratings_outputfile,
)