In [24]:
import os
import openai
from PyPDF2 import PdfReader
import json
import string
from models import GPT_Model
# Set your OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")


### Make QA

In [5]:
behavior_choices = ["forward","backwards","turning","grooming","oviposition"]
modality_choices = ["olfactory", "hygrosensory","thermosensory","visual"]
#modality_choices = ["olfactory","thermohygro", "hygrosensory","thermosensory","walking",'flight',"grooming","landing","escape_takeoff","reproduction","neuropeptidergic","neuromodulatory","foreleg","turning","rest","puff","halting"]

In [9]:
MBON_types = ['MBON01','MBON02','MBON03','MBON04','MBON05','MBON06','MBON07','MBON09','MBON10','MBON11','MBON12','MBON13','MBON14','MBON15','MBON15-like','MBON16','MBON17','MBON17-like','MBON18','MBON19','MBON20','MBON21','MBON22','MBON23','MBON24','MBON25','MBON26','MBON27','MBON28','MBON29','MBON30','MBON31','MBON32','MBON33','MBON34','MBON35']

In [41]:
def generate_questions_from_entities(entities, modality_choices):
    """
    Generate questions for a list of entities with labeled multiple-choice options.

    Parameters:
    - entities (list): List of entities to generate questions for.
    - modality_choices (list): List of sensory modalities.

    Returns:
    - questions (list): List of questions in JSON format.
    """
    # Assign labels (A, B, C, ...) to choices
    labeled_choices = [f"{label}. {choice}" for label, choice in zip(string.ascii_uppercase, modality_choices + ["modality information not described"])]

    questions = []
    for entity in entities:
        # First question: Is the entity mentioned in this chunk?
        mention_question = {
            "entity": entity,
            "question": f"Is the entity '{entity}' mentioned or discussed in this chunk? Answer only in Yes or No. Answer:",
            "answer": ""  # To be filled after processing
        }
        #questions.append(mention_question)
        
        # Second question: If mentioned, what sensory modality is it associated with?
        modality_question = {
            "entity": entity,
            "question": f"Given the paper chunk, what sensory modality is the entity '{entity}' associated with? Choose from:\n" + "\n".join(labeled_choices)+". \nAnswer only in the form of the letter. Answer:",
            "answer": ""  # To be filled after processing
        }
        summary_question = {
            "entity": entity,
            "question": f"Given the paper chunk, summarize key information related to '{entity}'. Answer:",
            "answer": ""  # To be filled after processing
        }
        #questions.append(modality_question)
        questions.append(summary_question)
    
    return questions

def save_questions_to_json(questions, output_file):
    """
    Save generated questions to a JSON file.

    Parameters:
    - questions (list): List of questions to save.
    - output_file (str): Path to the output JSON file.
    """
    with open(output_file, "w") as file:
        json.dump(questions, file, indent=4)
    print(f"Questions saved to {output_file}.")



In [39]:
# Generate questions
questions = generate_questions_from_entities(MBON_types, modality_choices)

# Save questions to JSON
output_json_path = "questions/mbon_questions.json"
save_questions_to_json(questions, output_json_path)

Questions saved to questions/mbon_questions.json.


In [42]:
# Generate questions
questions = generate_questions_from_entities(MBON_types, modality_choices)

# Save questions to JSON
output_json_path = "questions/mbon_summary.json"
save_questions_to_json(questions, output_json_path)

Questions saved to questions/mbon_summary.json.


### Extraction

In [36]:
def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

def chunk_text(text, max_tokens=120000):
    """Split text into chunks that fit within the token limit."""
    sentences = text.split('. ')
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 > max_tokens:
            chunks.append(current_chunk)
            current_chunk = sentence + ". "
        else:
            current_chunk += sentence + ". "
    if current_chunk:  # Append the remaining chunk
        chunks.append(current_chunk)
    return chunks


def ask_questions_and_fill_answers(chunks, questions_file, gpt_model):
    """
    Process each chunk to answer all questions efficiently, and fill the answers in the same JSON file.

    Parameters:
    - chunks (list): List of text chunks from the paper.
    - questions_file (str): Path to the JSON file with questions.
    - gpt_model (GPT_Model): An instance of the GPT_Model class.

    Returns:
    - None: Updates the JSON file in place.
    """
    # Load questions from the JSON file
    with open(questions_file, "r") as file:
        questions = json.load(file)

    # Process each chunk
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i + 1}/{len(chunks)}...")

        # Iterate through each question
        for question_dict in questions:
            question = question_dict.get("question")
            if not question or question_dict.get("answer"):  # Skip if already answered
                continue

            # Construct the prompt for the question
            prompt = f"""
            The following is a section from a research paper:

            {chunk}

            Based on this section, answer the following question:

            Q: {question}
            """

            # Use GPT_Model to get the response
            response = gpt_model.get_response(prompt)

            # If a valid response is returned, update the answer field
            if response and response.strip():
                # If it's the first question asking about mentioning the entity, check the response
                if "mentioned" in question.lower():
                    if "no" in response.lower():
                        # Skip further questions for this entity if it's not mentioned
                        break

                question_dict["answer"] = response.strip()

    # Save the updated questions back to the same JSON file
    with open(questions_file, "w") as file:
        json.dump(questions, file, indent=4)

    print(f"Answers have been filled and saved back to {questions_file}.")

def ask_questions_and_fill_answers_2(chunks, questions_file, gpt_model):
    """
    Process each chunk once to ask all questions efficiently, and append answers to the existing ones in the same JSON file.

    Parameters:
    - chunks (list): List of text chunks from the paper.
    - questions_file (str): Path to the JSON file with questions.
    - gpt_model (GPT_Model): An instance of the GPT_Model class.

    Returns:
    - None: Updates the JSON file in place.
    """
    # Load questions from the JSON file
    with open(questions_file, "r") as file:
        questions = json.load(file)

    # Process each chunk
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i + 1}/{len(chunks)}...")

        # Prepare a single prompt with all questions
        prompt = f"""
        The following is a section from a research paper:

        {chunk}

        Based on this section, answer the following questions:
        """
        for idx, question_dict in enumerate(questions, 1):
            prompt += f"\n{idx}. {question_dict['question']}"

        prompt += "\n\nProvide your answers in the format:\n1. [Answer to question 1]\n2. [Answer to question 2]\n..."

        # Get the response from GPT
        response = gpt_model.get_response(prompt)

        # Parse the response and append answers
        if response:
            try:
                lines = response.split("\n")
                for line in lines:
                    # Match format: "1. [Answer]"
                    if ". " in line:
                        idx, answer = line.split(". ", 1)
                        idx = int(idx) - 1  # Convert to 0-based index
                        if idx < len(questions):
                            # Append the new answer to the existing answer
                            existing_answer = questions[idx].get("answer", "").strip()
                            if existing_answer:
                                questions[idx]["answer"] = existing_answer + " " + answer.strip()
                            else:
                                questions[idx]["answer"] = answer.strip()
            except Exception as e:
                print(f"Error parsing response: {e}")

    # Save the updated questions back to the same JSON file
    with open(questions_file, "w") as file:
        json.dump(questions, file, indent=4)

    print(f"Answers have been updated and saved back to {questions_file}.")


In [40]:
# Path to your PDF file
pdf_path = "papers/elife-62576-v2.pdf"

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Split the text into manageable chunks
chunks = chunk_text(pdf_text)
# Initialize the GPT_Model
gpt_model = GPT_Model(
    model="gpt-4-turbo",
    api_key=openai.api_key,
    system_prompt="You are a helpful assistant for a research scientist. You have been given a research paper and asked to answer questions based on the content.",
)

# Load questions from a JSON file
questions_json_file = "questions/mbon_questions.json"
ask_questions_and_fill_answers_2(chunks, questions_json_file, gpt_model)

Processing chunk 1/3...
Processing chunk 2/3...
Processing chunk 3/3...
Answers have been updated and saved back to questions/mbon_questions.json.


In [43]:
# Path to your PDF file
pdf_path = "papers/elife-62576-v2.pdf"

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Split the text into manageable chunks
chunks = chunk_text(pdf_text)
# Initialize the GPT_Model
gpt_model = GPT_Model(
    model="gpt-4o-mini",
    api_key=openai.api_key,
    system_prompt="You are a helpful assistant for a research scientist. You have been given a research paper and asked to answer questions based on the content.",
)

# Load questions from a JSON file
questions_json_file = "questions/mbon_summary.json"
ask_questions_and_fill_answers_2(chunks, questions_json_file, gpt_model)

Processing chunk 1/3...
Processing chunk 2/3...
Processing chunk 3/3...
Error parsing response: invalid literal for int() with base 10: 'The provided text does not contain specific information about MBON01 through MBON35'
Answers have been updated and saved back to questions/mbon_summary.json.
