In [None]:
from langchain_ollama.llms import OllamaLLM
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
import time
from typing import Dict, List, Tuple
import logging

# Set up logging: configure the root logger at INFO level and create this
# module's logger, which the rest of the file uses for progress and error
# messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PDFAnalyzer:
    """Benchmark several Ollama models on question answering over one PDF.

    For every configured question, the single page expected to contain the
    answer is given to each model as context. A separate evaluator model then
    judges whether the model's response matches the expected answer, and the
    verdicts are aggregated into a plain-text report.
    """

    def __init__(self, pdf_path: str):
        """Initialize the PDF analyzer with the path to the PDF file."""
        self.pdf_path = pdf_path
        self.models = {
            "llama3.2:1b": OllamaLLM(model="llama3.2:1b"),
            "llama3.2": OllamaLLM(model="llama3.2"),
            "llama3.1": OllamaLLM(model="llama3.1")
        }

        # Create a separate evaluator model
        self.evaluator_model = OllamaLLM(model="llama3.2:1b")

        # Format: question, then expected answers + page number inside a list
        self.questions_and_answers = {
            "At which flowrate of fertilizer in kg/min is it advised to use the fine application?": 
                ["40kg/min", 67],
            "What is the maximum operating angle of the coupling shaft?": 
                ["30 degrees", 54],
            "What materials are required to perform the tray test?": 
                ["You'll need a measuring tape or ruler, a spirit level, 7 troughs, 7 graduated tubes, a funnel, a notebook, pen, calculator, this manual, and the software's instruction manual.", 77],
            "What is the general formula for calculating the required flowrate of fertilizer in kg/min?":
                ["Working width (m) x Driving speed (km/h) x Application rate (kg/ha) / 600", 68]
        }

        # Template for zero-shot learning
        self.prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""
            Below is a Context from a technical manual. Please answer the question at the end based only on the context provided.
            The context includes much unnecessary information. Therefore ignore tables. Include the relevant sentence in your answer, and only answer the question if you are very sure. 
            The answer should be concise and to the point. Don't just copy the context but answer in a manner that makes sense. Double-check your answer before submitting.

            Context:
            {context}
            End of Context.
            Question: {question}

            Answer: Let me analyze the provided context and answer your question...
            """
        )

        # Improved evaluation template for more accurate assessment
        self.evaluation_template = PromptTemplate(
            input_variables=["question", "model_answer", "expected_answer"],
            template="""
            You are a precise evaluator. Compare these two answers and determine if they convey the same information:

            Question: {question}
            Model's answer: {model_answer}
            Expected answer: {expected_answer}

            Rules for evaluation:
            1. The answers must convey the same core information
            2. Units and numerical values must match exactly
            3. For lists of items, all required items must be present
            4. Minor differences in phrasing are acceptable
            5. Additional information in the model's answer is acceptable as long as the core information is correct

            Respond with ONLY 'yes' if the answers match according to these rules, or 'no' if they don't.
            Do not provide any explanation.

            Answer (yes/no):
            """
        )

    def load_pdf(self) -> List[dict]:
        """Load the PDF and return one dict per page.

        Returns:
            A list of ``{'content': <page text>, 'page': <page metadata value>}``
            dicts in the order the loader yields the pages.

        Raises:
            Exception: Any error raised while loading is logged and re-raised.
        """
        try:
            loader = PyPDFLoader(self.pdf_path)
            # Keep only the text and the page number of every loaded page.
            return [
                {'content': page.page_content, 'page': page.metadata['page']}
                for page in loader.load()
            ]
        except Exception as e:
            logger.error(f"Error loading PDF: {str(e)}")
            raise

    def get_page_content(self, pages: List[dict], target_page: int) -> str:
        """Return the labelled text of the page numbered ``target_page``.

        Args:
            pages: Page dicts as produced by :meth:`load_pdf`.
            target_page: Value to match against each dict's ``'page'`` entry.

        Returns:
            ``"[Page <n>] <content>"`` for the first matching page, or an empty
            string when no page matches.
        """
        for page in pages:
            if page['page'] == target_page:
                return f"[Page {page['page']}] {page['content']}"
        return ""

    def calculate_accuracy(self, results: List[Dict]) -> Tuple[float, int, int]:
        """Calculate accuracy metrics from one model's results.

        Args:
            results: Result dicts, each with a ``'self_evaluation'`` entry.

        Returns:
            ``(accuracy_percent, correct_answers, total_questions)``. Only
            entries whose verdict is exactly ``'yes'`` count as correct. The
            accuracy is ``0.0`` for an empty list.
        """
        total_questions = len(results)
        correct_answers = sum(
            1 for result in results if result['self_evaluation'] == 'yes'
        )
        accuracy = (
            correct_answers / total_questions * 100 if total_questions else 0.0
        )
        return accuracy, correct_answers, total_questions

    @staticmethod
    def _normalize_verdict(raw: str) -> str:
        """Reduce an evaluator reply to ``'yes'``/``'no'`` when it leads with one.

        The evaluator is asked for a bare yes/no, but a reply such as
        ``"Yes."`` or ``"No, they differ"`` would otherwise fail the exact
        ``'yes'`` comparison used for scoring. Replies that do not start with a
        yes/no word are returned stripped and lower-cased, unchanged otherwise.
        """
        cleaned = raw.strip().lower()
        words = cleaned.split()
        if words:
            first_word = words[0].strip("'\"`.,;:!*()[]")
            if first_word in ("yes", "no"):
                return first_word
        return cleaned

    def run_model_evaluation(self) -> Dict:
        """Run every configured question against every configured model.

        Returns:
            A dict mapping each model name to a list of result dicts with the
            keys ``'question'``, ``'expected_answer'``, ``'response'``,
            ``'time_taken'`` and ``'self_evaluation'``. A question that raises
            is recorded with ``'time_taken': None`` and
            ``'self_evaluation': 'error'`` instead of aborting the run.

        Raises:
            Exception: Propagated from :meth:`load_pdf` if the PDF cannot be
                loaded.
        """
        results = {}
        pages = self.load_pdf()

        for model_name, model in self.models.items():
            logger.info(f"Evaluating model: {model_name}")
            model_results = []

            for question, expected_answer in self.questions_and_answers.items():
                try:
                    # The configured page numbers are shifted down by one before
                    # the lookup -- presumably because the loader's 'page'
                    # metadata is zero-based (TODO confirm against PyPDFLoader).
                    target_page = expected_answer[1] - 1
                    page_content = self.get_page_content(pages, target_page)
                    if not page_content:
                        # An empty context makes the question unanswerable, so
                        # surface it rather than silently scoring a wrong answer.
                        logger.warning(
                            f"No content found for page {target_page}; "
                            f"prompting with empty context for question: {question}"
                        )

                    prompt = self.prompt_template.format(
                        context=page_content,
                        question=question
                    )

                    # perf_counter is monotonic, so the measured interval cannot
                    # be distorted by system clock adjustments.
                    start_time = time.perf_counter()
                    response = model.invoke(prompt)
                    time_taken = time.perf_counter() - start_time

                    eval_prompt = self.evaluation_template.format(
                        question=question,
                        model_answer=response,
                        expected_answer=expected_answer[0]
                    )

                    self_evaluation = self._normalize_verdict(
                        self.evaluator_model.invoke(eval_prompt)
                    )

                    model_results.append({
                        'question': question,
                        'expected_answer': expected_answer,
                        'response': response,
                        'time_taken': time_taken,
                        'self_evaluation': self_evaluation
                    })

                except Exception as e:
                    logger.error(f"Error with model {model_name} on question: {question}")
                    logger.error(str(e))
                    model_results.append({
                        'question': question,
                        'expected_answer': expected_answer,
                        'response': f"Error: {str(e)}",
                        'time_taken': None,
                        'self_evaluation': 'error'
                    })

            results[model_name] = model_results

        return results

    def format_results(self, results: Dict) -> str:
        """Format the results into a readable report.

        The report starts with one accuracy summary line per model, followed by
        a detailed per-question section for every model.

        Args:
            results: Mapping of model name to result dicts, as returned by
                :meth:`run_model_evaluation`.

        Returns:
            The report text. For empty ``results`` only the title is returned.
        """
        parts = ["PDF Analysis Results\n" + "=" * 20 + "\n\n"]

        # Summary of all models first. The metrics must be computed here,
        # before they are interpolated into the summary line.
        for model_name, model_results in results.items():
            accuracy, correct_answers, total_questions = self.calculate_accuracy(model_results)
            parts.append(
                f"Accuracy: {accuracy:.1f}% ({correct_answers}/{total_questions} correct answers). "
            )
            parts.append(f"Model: {model_name}\n" + "-" * 50 + "\n\n")

        for model_name, model_results in results.items():
            parts.append("-" * 20 + "\n\n")
            parts.append("Detailed Results\n")
            parts.append("-" * 20 + "\n\n")
            parts.append(f"Model: {model_name}\n" + "-" * 50 + "\n")

            for result in model_results:
                parts.append(f"Question: {result['question']}\n")
                parts.append(f"Expected Answer: {result['expected_answer'][0]}\n")
                parts.append(f"Model Response: {result['response']}\n")
                parts.append(f"Self-Evaluation (Correct?): {result['self_evaluation']}\n")
                # None marks a failed question; 0.0 is still a valid timing.
                if result['time_taken'] is not None:
                    parts.append(f"Time taken: {result['time_taken']:.2f} seconds\n")
                parts.append("\n")

            parts.append("\n")

        return "".join(parts)

def main(
    pdf_path: str = "PDF/EN-A148703540-2.pdf",
    output_path: str = "analysis_results_one_page_input_with_self_evaluation.txt",
):
    """Run the model evaluation, print the report, and save it to a file.

    Args:
        pdf_path: Path of the PDF passed to ``PDFAnalyzer``.
        output_path: File the formatted report is written to.

    Any exception raised while evaluating, formatting, or writing is logged
    with its traceback and not re-raised.
    """
    # Initialize analyzer
    analyzer = PDFAnalyzer(pdf_path)

    try:
        # Run evaluation
        results = analyzer.run_model_evaluation()

        # Format and display results
        report = analyzer.format_results(results)
        print(report)

        # Save the report. The encoding is explicit because model responses
        # can contain non-ASCII characters (e.g. a degree sign) that the
        # platform default encoding may not be able to write.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(report)

    except Exception as e:
        # logger.exception keeps the traceback, which the message alone loses.
        logger.exception(f"Error in main execution: {str(e)}")

# Run the evaluation only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

INFO:__main__:Evaluating model: llama3.2:1b
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:__main__:Evaluating model: llama3.2
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1

PDF Analysis Results

Model: llama3.2:1b
--------------------------------------------------
Accuracy: 50.0% (2/4 correct answers)

Question: At which flowrate of fertilizer in kg/min is it advised to use the fine application?
Expected Answer: 40kg/min
Model Response: I am very sure. It is advised to use the fine application when the flowrate is less than 40 kg/min, according to the spreading chart that provides this information for different working widths of the spreader.
Self-Evaluation (Correct?): no
Time taken: 30.08 seconds

Question: What is the maximum operating angle of the coupling shaft?
Expected Answer: 30 degrees
Model Response: Based on the given context, I conclude that the maximum operating angle of the coupling shaft is 30° when lifted. 

The mention of "when lifted" suggests a specific position or range of operation, which typically indicates a limited working angle. In this case, it appears to be the maximum possible angle under these conditions.
Self-Evaluation (Corr