In [None]:
from langchain_ollama.llms import OllamaLLM
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
import time
from typing import Dict, List, Tuple
import logging

# Emit INFO-level records so the "Evaluating model: ..." progress messages are shown.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Default `window` for PDFAnalyzer.get_page_range_content, which selects page
# indices from (target - window) to (target + window - 1) inclusive, i.e. up to
# 2 * window pages in total (the figure printed in the report header).
WINDOW_OF_PAGES_TO_INSERT_BEFORE_AND_AFTER_TARGET = 5 # Pages included before the target page; one fewer is included after it

class PDFAnalyzer:
    """Benchmark several local Ollama models on fixed questions about one PDF.

    For every configured model and every question, a window of PDF pages
    around the page known to contain the answer is inserted into a prompt,
    the model is invoked, and the response plus its latency are recorded.
    """

    def __init__(self, pdf_path: str):
        """Store the PDF path and set up models, questions and the prompt.

        Args:
            pdf_path: Path of the PDF file that ``load_pdf`` will read.
        """
        self.pdf_path = pdf_path

        # Display name -> LLM client; every model listed here is evaluated.
        self.models = {
            "llama3.2:1b": OllamaLLM(model="llama3.2:1b"),
            "llama3.2": OllamaLLM(model="llama3.2"),
            "llama3.1": OllamaLLM(model="llama3.1")
        }

        # Question -> [expected answer, page number of the answer].
        # run_model_evaluation subtracts 1 from the page number before
        # matching it against the loader's page index, so the numbers here
        # are presumably 1-based -- TODO confirm against the manual.
        self.questions_and_answers = {
            "At which flowrate of fertilizer in kg/min is it advised to use the fine application?":
                ["40kg/min", 67],
            "What is the maximum operating angle of the coupling shaft?":
                ["30 degrees", 54],
            "What materials are required to perform the tray test?":
                ["You'll need a measuring tape or ruler, a spirit level, 7 troughs, 7 graduated tubes, a funnel, a notebook, pen, calculator, this manual, and the software's instruction manual.", 77],
            "What is the general formula for calculating the required flowrate of fertilizer in kg/min?":
                ["Working width (m) x Driving speed (km/h) x Application rate (kg/ha) / 600", 68]
        }

        self.prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""
            Below is a Context from a technical manual. Please answer the question at the end based only on the context provided.
            The context includes much unnecessary information. Therefore ignore tables. Include the relevant sentence in your answer, and only answer the question if you are very sure. 
            The answer should be concise and to the point.

            Context:
            {context}
            End of Context.
            Question: {question}

            Answer: Let me analyze the provided context and answer your question...
            """
        )

    def load_pdf(self) -> List[dict]:
        """Load the PDF and return one plain dict per page.

        Returns:
            A list of ``{'content': <page text>, 'page': <page index>}``
            dicts, in the order produced by the loader. The index is taken
            from the loader's ``metadata['page']``.

        Raises:
            Exception: Any error raised while loading or reading page
                metadata is logged and re-raised unchanged.
        """
        try:
            return [
                {'content': page.page_content, 'page': page.metadata['page']}
                for page in PyPDFLoader(self.pdf_path).load()
            ]
        except Exception as e:
            logger.error("Error loading PDF: %s", e)
            raise

    def get_page_range_content(self, pages: List[dict], target_page: int, window: int = WINDOW_OF_PAGES_TO_INSERT_BEFORE_AND_AFTER_TARGET) -> str:
        """Concatenate the text of the pages surrounding ``target_page``.

        The selected range is ``target_page - window`` (clamped to 0) through
        ``target_page + window - 1`` inclusive. It is therefore not exactly
        centered: ``window`` pages precede the target and ``window - 1``
        follow it, giving at most ``2 * window`` pages.

        Args:
            pages: Page dicts as returned by ``load_pdf``.
            target_page: Page index, in the same numbering as ``page['page']``.
            window: Half-width of the range as described above.

        Returns:
            The matching pages, each prefixed with ``[Page N]``, joined by
            blank lines; an empty string if no page falls in the range.
        """
        start_page = max(0, target_page - window)
        end_page = target_page + window - 1

        return "\n\n".join(
            f"[Page {page['page']}] {page['content']}"
            for page in pages
            if start_page <= page['page'] <= end_page
        )

    def run_model_evaluation(self) -> Dict:
        """Ask every model every question and collect responses and timings.

        Returns:
            A dict mapping model name to a list of result dicts with keys
            ``question``, ``expected_answer`` (the ``[answer, page]`` list),
            ``response`` and ``time_taken`` (seconds). When a question fails,
            ``response`` holds ``"Error: <message>"`` and ``time_taken`` is
            ``None``; the remaining questions are still evaluated.

        Raises:
            Exception: Propagated from ``load_pdf`` if the PDF cannot be read.
        """
        results = {}
        pages = self.load_pdf()

        for model_name, model in self.models.items():
            logger.info("Evaluating model: %s", model_name)
            model_results = []

            for question, expected_answer in self.questions_and_answers.items():
                try:
                    # Convert the stored page number to the loader's index.
                    target_page = expected_answer[1] - 1
                    page_content = self.get_page_range_content(pages, target_page)

                    prompt = self.prompt_template.format(
                        context=page_content,
                        question=question
                    )

                    # perf_counter is monotonic, so the measured interval is
                    # not distorted by system clock adjustments.
                    start_time = time.perf_counter()
                    response = model.invoke(prompt)
                    elapsed = time.perf_counter() - start_time

                    model_results.append({
                        'question': question,
                        'expected_answer': expected_answer,
                        'response': response,
                        'time_taken': elapsed
                    })

                except Exception as e:
                    # Best effort: record the failure and keep going, but
                    # keep the traceback in the log for diagnosis.
                    logger.exception(
                        "Error with model %s on question: %s", model_name, question
                    )
                    model_results.append({
                        'question': question,
                        'expected_answer': expected_answer,
                        'response': f"Error: {str(e)}",
                        'time_taken': None
                    })

            results[model_name] = model_results

        return results

    def format_results(self, results: Dict) -> str:
        """Render evaluation results as a plain-text report.

        Args:
            results: Mapping as returned by ``run_model_evaluation``.

        Returns:
            The report text: a header, then for each model its questions,
            expected answers, responses and (when available) timings.
        """
        parts = [
            "PDF Analysis Results\n" + "="*20 + "\n\n",
            f"Number of pages included in processing: {WINDOW_OF_PAGES_TO_INSERT_BEFORE_AND_AFTER_TARGET*2}\n\n",
        ]

        for model_name, model_results in results.items():
            parts.append(f"Model: {model_name}\n" + "-"*50 + "\n")

            for result in model_results:
                parts.append(f"Question: {result['question']}\n")
                parts.append(f"Expected Answer: {result['expected_answer']}\n")
                parts.append(f"Model Response: {result['response']}\n")
                # None marks a failed call; a 0.0 timing is still a valid
                # measurement and must be reported.
                if result['time_taken'] is not None:
                    parts.append(f"Time taken: {result['time_taken']:.2f} seconds\n")
                parts.append("\n")

            parts.append("\n")

        return "".join(parts)

def main():
    """Run the evaluation, print the report and save it to a text file.

    Any exception raised during evaluation, formatting or writing is logged
    with its traceback and suppressed, so the function itself never raises
    from those steps.
    """
    pdf_path = "PDF/EN-A148703540-2.pdf"
    analyzer = PDFAnalyzer(pdf_path)

    try:
        results = analyzer.run_model_evaluation()
        report = analyzer.format_results(results)
        print(report)

        # Model responses can contain non-ASCII characters; an explicit
        # encoding avoids failures where the platform default is not UTF-8.
        with open("analysis_results_multi-page_context.txt", "w", encoding="utf-8") as f:
            f.write(report)

    except Exception as e:
        logger.exception("Error in main execution: %s", e)
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

INFO:__main__:Evaluating model: llama3.2:1b
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Evaluating model: llama3.2
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Evaluating model: llama3.1
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http

PDF Analysis Results

Model: llama3.2:1b
--------------------------------------------------
Question: At which flowrate of fertilizer in kg/min is it advised to use the fine application?
Expected Answer: ['40kg/min', 67]
Model Response: According to the text, it's not explicitly stated when or at what flowrate of fertilizer (in kg/min) should be used for the fine application. However, I can provide some general guidelines based on common practices in agriculture.

Typically, a fine application rate is considered as follows:

* 20-30 kg/min: For areas with moderate weed pressure and few weeds
* 15-25 kg/min: For areas with moderate weed pressure and some weeds
* Less than 15 kg/min: For areas with heavy weed pressure or many weeds

Please note that these are general guidelines and the optimal fine application rate may vary depending on the specific field conditions, crop type, and pest management practices.

It's also worth mentioning that the text mentions "variable, externally driven 