In [1]:
from langchain_ollama.llms import OllamaLLM
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
import json
import logging
from typing import Dict, List
import time

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class QuestionGenerator:
    def __init__(self, pdf_path: str):
        """Initialize the question generator with the path to the PDF file."""
        self.pdf_path = pdf_path
        self.llm = OllamaLLM(model="llama3.1")
        
        # Template for question generation
        self.qa_template = PromptTemplate(
            input_variables=["page_content", "page_number"],
            template="""
            You are an expert at creating technical questions and answers from documentation.
            Below is the content from page {page_number} of a technical manual.
            
            Content:
            {page_content}

            Create exactly ONE question and its corresponding answer based on specific factual information from this page.
            The question should be about important technical details, measurements, procedures, or specifications.
            
            Requirements:
            1. The answer must be found explicitly in the text
            2. The answer should be concise and specific
            3. For numerical values, include units
            4. Avoid questions about general concepts or definitions
            5. Focus on actionable information or specific parameters
            
            Respond in this exact JSON format:
            {{"question": "your question here", "answer": "your concise answer here"}}
            
            Response:
            """
        )

    def load_pdf(self) -> List[dict]:
        """Load and process the PDF document."""
        try:
            loader = PyPDFLoader(self.pdf_path)
            pages = loader.load()
            
            processed_pages = []
            for page in pages:
                content = page.page_content
                page_num = page.metadata['page']
                processed_pages.append({
                    'content': content,
                    'page': page_num
                })
            return processed_pages
        except Exception as e:
            logger.error(f"Error loading PDF: {str(e)}")
            raise

    def generate_qa_pairs(self) -> Dict[str, List]:
        """Generate question-answer pairs for each page of the PDF."""
        pages = self.load_pdf()
        qa_pairs = {}
        
        for page in pages:
            try:
                # Skip pages with very little content
                if len(page['content'].strip()) < 200:
                    continue
                # skip pages 1 ... 22 and 142 ... 165 as they are not useful
                if page['page'] < 23 or page['page'] > 141:
                    continue
                logger.info(f"Processing page {page['page']}")
                
                # Generate question and answer for the page
                prompt = self.qa_template.format(
                    page_content=page['content'],
                    page_number=page['page'] + 1  # Convert to 1-based page numbering
                )
                
                # Get response from model
                response = self.llm.invoke(prompt)
                
                try:
                    # Parse JSON response
                    qa_data = json.loads(response)
                    question = qa_data['question']
                    answer = qa_data['answer']
                    
                    # Store in the format requested
                    qa_pairs[question] = [answer, page['page'] + 1]
                    
                    # Add small delay to avoid overwhelming the model
                    time.sleep(0.5)
                    
                except json.JSONDecodeError:
                    logger.error(f"Failed to parse JSON response for page {page['page'] + 1}")
                    continue
                
            except Exception as e:
                logger.error(f"Error processing page {page['page'] + 1}: {str(e)}")
                continue
        
        return qa_pairs

    def save_qa_pairs(self, qa_pairs: Dict[str, List], output_file: str = "generated_qa_pairs.json"):
        """Save the generated QA pairs to a file."""
        try:
            with open(output_file, 'w') as f:
                json.dump(qa_pairs, f, indent=4)
            logger.info(f"Successfully saved QA pairs to {output_file}")
        except Exception as e:
            logger.error(f"Error saving QA pairs: {str(e)}")

def main():
    # Initialize generator
    pdf_path = "PDF/EN-A148703540-2.pdf"
    generator = QuestionGenerator(pdf_path)
    
    try:
        # Generate QA pairs
        qa_pairs = generator.generate_qa_pairs()
        
        # Print results
        print("\nGenerated Question-Answer Pairs:")
        print("=" * 50)
        for question, answer_info in qa_pairs.items():
            print(f"\nQuestion: {question}")
            print(f"Answer: {answer_info[0]}")
            print(f"Page: {answer_info[1]}")
        
        # Save results
        generator.save_qa_pairs(qa_pairs)
        
    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()

INFO:__main__:Processing page 23
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Processing page 24
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Processing page 25
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Processing page 26
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Processing page 27
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Processing page 28
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Processing page 29
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Processing page 30
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Processing page 31
INFO:httpx:HTTP Request: POST h


Generated Question-Answer Pairs:

Question: How many dosing openings are closed when using the fine dosing setting?
Answer: 2 of 3
Page: 24

Question: What is the maximum allowed percentage for an overlapping area when a trapezoidal spreading pattern results?
Answer: less than 100%
Page: 25

Question: What is the working width reduction displayed per section in the main screen?
Answer: 24 meters
Page: 26

Question: How many measuring points are supported by the rate map for MULTIRATE?
Answer: 8
Page: 27

Question: What is the percentage overlap of the full field spreading pattern?
Answer: 100%
Page: 28

Question: What is the width of the ExactLine for border spreading?
Answer: 1/2 Working width
Page: 29

Question: What type of sensor measures the current required driving speed and localisation?
Answer: gps sensor
Page: 30

Question: What is the time interval for automatic calibration tests by the spreader?
Answer: every 75 kg or every minute
Page: 31

Question: What is required for th