In [None]:
import os
import json
import logging
from typing import Dict, List
import time
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
import google.generativeai as genai
import time
from google.api_core.exceptions import ResourceExhausted

os.environ["GOOGLE_API_KEY"] = "insert-code"  # Set before calling genai.configure()
# Configure Gemini
genai.configure(api_key=os.environ.get("insert-code"))
generation_config = {
    "temperature": 0,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,  # Increased for potential longer responses
    "response_mime_type": "text/plain",
}
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-002", generation_config=generation_config
)


# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class QuestionGenerator:
    def __init__(self, pdf_path: str):
        """Initialize the question generator with the path to the PDF file."""
        self.pdf_path = pdf_path
        self.model = model  # Use the Gemini model

        # Template for question generation (same as before)
        self.qa_template = PromptTemplate(
            input_variables=["page_content", "page_number"],
            template="""
            You are an expert at creating technical questions and answers from documentation.
            Below is the content from page {page_number} of a technical manual.

            Content:
            {page_content}

            Create exactly ONE question and its corresponding answer based on specific factual information from this page.
            The question should be about important technical details, measurements, procedures, or specifications.

            Requirements:
            1. The answer must be found explicitly in the text
            2. The answer should be concise and specific
            3. For numerical values, include units
            4. Avoid questions about general concepts or definitions
            5. Focus on actionable information or specific parameters

            Respond in this exact JSON format:
            {{"question": "your question here", "answer": "your concise answer here"}}

            Response:
            """,
        )

    def load_pdf(self) -> List[dict]:
        """Load and process the PDF document."""
        try:
            loader = PyPDFLoader(self.pdf_path)
            pages = loader.load()
            
            processed_pages = []
            for page in pages:
                content = page.page_content
                page_num = page.metadata['page']
                processed_pages.append({
                    'content': content,
                    'page': page_num
                })
            return processed_pages
        except Exception as e:
            logger.error(f"Error loading PDF: {str(e)}")
            raise

    def generate_qa_pairs(self) -> Dict[str, List]:
        """Generate question-answer pairs for each page of the PDF."""
        pages = self.load_pdf()
        qa_pairs = {}

        for page in pages:
            try:
                # Skip pages with very little content
                if len(page['content'].strip()) < 200:
                    continue
                # skip pages 1 ... 22 and 142 ... 165
                if page['page'] < 23 or page['page'] > 141:
                    continue
                logger.info(f"Processing page {page['page']}")
                
                # Generate question and answer for the page
                prompt = self.qa_template.format(
                    page_content=page["content"], page_number=page["page"] + 1
                )
                retries = 0
                max_retries = 3  # Adjust as needed
                while retries < max_retries:
                    try:
                        response = self.model.generate_content(contents=[prompt])
                        generated_text = response.text

                        # Extract JSON string (Corrected Logic)
                        if generated_text.startswith("```json\n") and generated_text.endswith("```\n"):
                            json_string = generated_text[len("```json\n"):-len("```\n")]
                        elif generated_text.startswith("```json\n") and generated_text.endswith("```"):
                            json_string = generated_text[len("```json\n"):-len("```")]
                        else:
                            json_string = generated_text

                        # Parse JSON only ONCE (Corrected)
                        qa_data = json.loads(json_string)  # Use json_string here
                        question = qa_data["question"]
                        answer = qa_data["answer"]

                        qa_pairs[question] = [answer, page["page"] + 1]
                        time.sleep(0.5)  # Sleep to avoid rate limiting
                        break  
                    except ResourceExhausted as e:
                        retries += 1
                        wait_time = 4**retries  # Exponential backoff
                        logger.warning(f"Rate limited. Retrying in {wait_time} seconds...")
                        time.sleep(wait_time)
                    except Exception as e:  # Handle other errors
                        logger.error(f"Error processing page: {e}")
                        break  # Or retry as needed for other error types
                else:  # If the loop completes without a successful request
                    logger.error(f"Failed to process page after multiple retries.")

            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse JSON: {e}  Raw response: {generated_text}")
                continue
            except Exception as e:
                logger.error(f"Error processing page {page['page'] + 1}: {str(e)}")
                continue

        return qa_pairs
    
    def save_qa_pairs(self, qa_pairs: Dict[str, List], output_file: str = "generated_qa_pairs.json"):
        """Save the generated QA pairs to a file."""
        try:
            with open(output_file, 'w') as f:
                json.dump(qa_pairs, f, indent=4)
            logger.info(f"Successfully saved QA pairs to {output_file}")
        except Exception as e:
            logger.error(f"Error saving QA pairs: {str(e)}")

def main():
    # Initialize generator
    pdf_path = "PDF/EN-A148703540-2.pdf"
    generator = QuestionGenerator(pdf_path)
    
    try:
        # Generate QA pairs
        qa_pairs = generator.generate_qa_pairs()
        
        # Print results
        print("\nGenerated Question-Answer Pairs:")
        print("=" * 50)
        for question, answer_info in qa_pairs.items():
            print(f"\nQuestion: {question}")
            print(f"Answer: {answer_info[0]}")
            print(f"Page: {answer_info[1]}")
        
        # Save results
        generator.save_qa_pairs(qa_pairs)
        
    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()

INFO:__main__:Processing page 23
INFO:__main__:Processing page 24
INFO:__main__:Processing page 25
INFO:__main__:Processing page 26
INFO:__main__:Processing page 27
INFO:__main__:Processing page 28
ERROR:__main__:Failed to process page after multiple retries.
INFO:__main__:Processing page 29
INFO:__main__:Processing page 30
INFO:__main__:Processing page 31
INFO:__main__:Processing page 32
INFO:__main__:Processing page 33
INFO:__main__:Processing page 34
ERROR:__main__:Failed to process page after multiple retries.
INFO:__main__:Processing page 35
ERROR:__main__:Failed to process page after multiple retries.
INFO:__main__:Processing page 36
ERROR:__main__:Failed to process page after multiple retries.
INFO:__main__:Processing page 37
INFO:__main__:Processing page 38
INFO:__main__:Processing page 39
INFO:__main__:Processing page 40
INFO:__main__:Processing page 41
INFO:__main__:Processing page 42
ERROR:__main__:Failed to process page after multiple retries.
INFO:__main__:Processing page 


Generated Question-Answer Pairs:

Question: How many dosing openings are used in the fine application setting of the spreader, and how does this compare to the normal dosing setting?
Answer: Two of the three dosing openings are closed during fine application, leaving one open.  Normal dosing utilizes all three openings.
Page: 24

Question: What is the maximum acceptable overlap percentage for a trapezoidal spreading pattern with the GEOSPREAD system?
Answer: Less than 100%
Page: 25

Question: How is working width reduction visually represented on the main screen when using GEOCONTROL?
Answer: By progressively removing portions of the black triangles representing the sections, and displaying the updated working width.
Page: 26

Question: What is the maximum number of measuring points used for a rate map when using the MULTIRATE functionality on an ISOBUS weighing spreader?
Answer: 8 measuring points
Page: 27

Question: What percentage of overlap is achieved with a full field spreading 

In [11]:
%pip install -qU langchain-google-genai

Note: you may need to restart the kernel to use updated packages.
