In [None]:
import csv
import os

import openai
import pdfplumber
import spacy

# Initialize OpenAI API and spaCy.
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into this file; the placeholder is kept only as a backward-compatible
# fallback for when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "here-api-key")
# spaCy pipeline used by chunk_text() to split text into sentences.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path, start_page=None, end_page=None):
    """
    Extract text from a range of pages in a PDF file.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        start_page: 1-based number of the first page to read. A falsy value
            (None or 0) starts at the first page.
        end_page: 1-based number of the last page to read, inclusive. A falsy
            value reads through the final page.

    Returns:
        The text of the selected pages, each page followed by a newline.
        A page with no extractable text contributes only that newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        start = start_page - 1 if start_page else 0
        end = end_page if end_page else len(pdf.pages)
        # extract_text() can yield None for a page without a text layer
        # (e.g. a scanned image); treat that as empty text instead of letting
        # `None + "\n"` raise TypeError and abort the whole extraction.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages[start:end]
        )

def chunk_text(text, max_length=500):
    """
    Group the sentences of `text` into chunks for downstream processing.

    Sentences come from the module-level spaCy pipeline. They are accumulated
    until their combined character count (separators not counted) reaches
    `max_length`; the group is then joined with single spaces and emitted, so
    a chunk can exceed `max_length` by up to the length of its last sentence.
    Any sentences left over at the end form a final, shorter chunk.
    """
    pieces = []
    pending = []
    pending_chars = 0

    for sent in nlp(text).sents:
        sent_text = sent.text
        pending.append(sent_text)
        pending_chars += len(sent_text)

        # Keep collecting until the running size reaches the threshold
        if pending_chars < max_length:
            continue

        pieces.append(" ".join(pending))
        pending, pending_chars = [], 0

    # Flush whatever did not fill a complete chunk
    if pending:
        pieces.append(" ".join(pending))

    return pieces

def generate_qa(document_text):
    """
    Request question-answer pairs for one chunk of document text from the chat model.

    The chunk is embedded in a fixed instruction prompt that asks for
    "- Question:" / "- Answer:" lines. The model's reply is returned as
    stripped raw text. Any exception raised while calling the API or reading
    the reply is not propagated; it is returned in-band as a string of the
    form "Error: <message>".
    """
    prompt = f"""
    You are a highly specialized AI designed to create accurate, detailed, and context-relevant questions and answers from medical documents.

    Please generate question-answer pairs based on the following guidelines:
    1. Relevance: Ensure that the questions are directly tied to the **content** of the document. Avoid questions about the title, topic, references, repeated text in answers, or context of the document.
    2. Accuracy: Use information explicitly mentioned in the document to form both the questions and answers.
    3. Language Match: Retain the tone, terminology, and phrasing used in the document for both questions and answers.
    4. Answer Length: Ensure answers are detailed but concise, with each answer strictly limited to 250–300 words.
    5. Focus on the Information: Do not generate questions related to the document’s structure (e.g., "What is the topic?" or "What references does this document cite?").
    6. Multiple Questions for the Same Answer: Where applicable, generate multiple relevant questions for the same context to provide diverse entry points to the information.

    The following text is a chunk from a medical document:
    {document_text}

    Your task is to:
    - Generate a list of questions that are focused on the **specific information** presented in the text.
    - Provide detailed and precise answers to those questions, ensuring the answers are directly supported by the document.
    - Limit each answer to between 250 and 300 words.

    Output format:
    - Question: [Generated question]
    - Answer: [Generated detailed answer]
    """

    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    try:
        completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=conversation,
            max_tokens=1000,
            temperature=0.7,
        )
        # Chat completions carry the generated text under message.content
        first_choice = completion['choices'][0]
        return first_choice['message']['content'].strip()
    except Exception as exc:
        # Report the failure as text so the caller can log it and move on
        return "Error: " + str(exc)

# Input document and the 1-based, inclusive page window to process
pdf_path = "/home/hamza/Desktop/qa_system/Medical_book.pdf"
start_page, end_page = 71, 80  # set end_page to None to read through the last page

# Pull the raw text for the selected pages
text = extract_text_from_pdf(pdf_path, start_page, end_page)

# Break the text into chunks; QA pairs from every chunk are collected in one list
chunks = chunk_text(text)
qa_pairs = list()

def _parse_qa_pairs(qa_output):
    """
    Parse model output into a list of [question, answer] pairs.

    A line whose text (after surrounding whitespace and an optional leading
    "-" list marker are removed) starts with "Question:" opens a new question;
    one that starts with "Answer:" opens its answer. Non-blank lines following
    an "Answer:" line are treated as continuation of that answer and joined
    with single spaces, so multi-line answers are kept whole. Lines outside an
    answer that carry neither marker are ignored. A pair is emitted only when
    both the question and the answer are non-empty.
    """
    pairs = []
    question = None
    answer_parts = None  # becomes a list once an "Answer:" line has been seen

    def emit(current_question, parts):
        answer = " ".join(parts).strip() if parts else ""
        if current_question and answer:
            pairs.append([current_question, answer])

    for raw_line in qa_output.splitlines():
        line = raw_line.strip()
        # Accept the marker with or without the "- " prefix the prompt requests
        body = line[1:].lstrip() if line.startswith("-") else line

        if body.startswith("Question:"):
            # A new question closes whatever pair was being collected
            emit(question, answer_parts)
            question = body[len("Question:"):].strip()
            answer_parts = None
        elif body.startswith("Answer:"):
            if answer_parts is not None:
                # A second answer without a new question: close the finished
                # pair first so it is not overwritten; the extra answer has no
                # question and is dropped when emitted.
                emit(question, answer_parts)
                question = None
            answer_parts = [body[len("Answer:"):].strip()]
        elif line and answer_parts is not None:
            answer_parts.append(line)

    emit(question, answer_parts)
    return pairs


for i, chunk in enumerate(chunks, start=1):
    print(f"Processing chunk {i}...")
    qa_output = generate_qa(chunk)

    # generate_qa() reports failures as a string that begins with "Error:".
    # Check the prefix only: a valid answer may legitimately contain "Error:".
    if not qa_output or qa_output.startswith("Error:"):
        print(f"Error or empty output for chunk {i}: {qa_output}")
        continue  # Skip invalid chunks

    qa_pairs.extend(_parse_qa_pairs(qa_output))

# Persist the collected pairs, or report that there is nothing to save
if not qa_pairs:
    print("No QA pairs were generated. Please check the input text or AI output.")
else:
    # The output file name records the processed page range
    output_csv_file = f"qa_dataset_pages_{start_page}_to_{end_page}.csv"
    with open(output_csv_file, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        # Header row first, then one row per QA pair
        writer.writerows([["Question", "Answer"], *qa_pairs])

    print(f"QA Dataset Generated and Saved to {output_csv_file}!")

Processing chunk 1...
Error or empty output for chunk 1: Error: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
Processing chunk 2...
Error or empty output for chunk 2: Error: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
Processing chunk 3...
Error or empty output for chunk 3: Error: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
Processing chunk 4...
Error or empty output for chunk 4: Error: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/