# LLM Evaluation

In [1]:
# Standard library imports
import os
import sys
from threading import Thread
from typing import Optional, List, Dict

# Make the local `src` package importable before the local imports below
cwd = os.getcwd()
sys.path.append(os.path.join(cwd, 'src'))

# Third-party imports
from dotenv import load_dotenv
import pandas as pd
from datasets import Dataset
from huggingface_hub import login
import torch
from PIL import Image
from transformers import pipeline, BitsAndBytesConfig, TextIteratorStreamer
from yaml import safe_load
from yaml import YAMLError

# Local application imports
import RAG

# Azure/OpenAI imports
from openai import AzureOpenAI

# LangChain and RAGAS imports
from langchain_openai.chat_models import AzureChatOpenAI
from langchain_openai.embeddings import AzureOpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import context_precision, context_recall, answer_relevancy, faithfulness
from ragas import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# load configs
# Defaults used when config.yaml is missing, unreadable, or omits a key.
DEFAULT_MODEL_VARIANT = "4b-it"
DEFAULT_USE_QUANTIZATION = True

try:
    with open("config.yaml", "r") as config_file:
        # An empty YAML document parses to None, so fall back to an empty mapping.
        config = safe_load(config_file) or {}
except (OSError, YAMLError) as e:
    # open() raises OSError (e.g. FileNotFoundError); safe_load raises YAMLError
    # on malformed YAML. Neither is an ImportError.
    print(f"Error loading config.yaml: {e}. Using default settings.")
    config = {}

# Read the settings outside the try block so all three names are always defined.
model_variant = config.get("model_variant", DEFAULT_MODEL_VARIANT)
use_quantization = config.get("use_quantization", DEFAULT_USE_QUANTIZATION)
# No default embedding model is known here; None is passed through to RAG.embed_chunks.
embed_model = config.get("embed_model")

## Load models

### MedGemma

In [5]:
class MedGemma:
    '''
    A class to interact with the MedGemma model for image-text-to-text tasks.
    This class logs in to Hugging Face, loads the model pipeline once at
    construction time, and provides a method to consult the model with a message,
    optional history and an optional file used for retrieval-augmented generation.
    
    Attributes:
        model_id (str): The identifier for the MedGemma model.
        use_quantization (bool): Whether to use quantization for the model.
        pipe (pipeline): The Hugging Face pipeline for image-text-to-text tasks.
    '''
    def __init__(self, 
                 model_variant: str=None, 
                 use_quantization: bool=False
                 ):
        '''
        Initialize the MedGemma model with the specified variant and quantization option.
        Args:
            model_variant (str): The variant of the MedGemma model to use, e.g. "4b-it".
            use_quantization (bool): Whether to use quantization for the model.
        Raises:
            ValueError: If no model variant is given.
        '''
        # Fail early instead of trying to download "google/medgemma-None".
        if not model_variant:
            raise ValueError("model_variant is required, e.g. '4b-it'.")

        # Load environment variables and login to Hugging Face
        load_dotenv()
        try:
            login(token=os.getenv("HUGGINGFACE_TOKEN"))
        except Exception as e:
            # Best effort: report the failure and continue to the model load.
            print(f"Error logging in to Hugging Face: {e}")

        self.model_id = f"google/medgemma-{model_variant}"
        self.use_quantization = use_quantization
        self.pipe = self.load_pipeline()

    def load_pipeline(self):
        '''
        Load the MedGemma model pipeline with optional quantization.
        
        Returns:
            pipeline: A Hugging Face pipeline for image-text-to-text tasks.
        '''
        model_kwargs = dict(
            torch_dtype=torch.bfloat16,
            device_map="cuda"
        )

        if self.use_quantization:
            # 4-bit FP4 weights with double quantization
            model_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="fp4",
                bnb_4bit_use_double_quant=True,
            )

        return pipeline(
            "image-text-to-text",
            model=self.model_id,
            model_kwargs=model_kwargs
        )

    def consult(
        self,
        message: str, 
        history: List[Dict[str, str]], 
        system_prompt: str = None,
        max_new_tokens: int = None,
        top_k: int = None,
        temperature: float = None,
        image: Optional[Image.Image] = None, 
        file: Optional[List] = None
        ):
        '''
        Consult the MedGemma model with a message and optional history, system prompt and file.
        
        When a file is given, passages retrieved from it through the RAG module
        are prepended to the message as context. Without a file the message is
        sent to the model unchanged.
        
        Args:
            message (str): The user's message.
            history (List[Dict[str, str]]): Conversation history; each turn has "role" and "content".
            system_prompt (str, optional): System prompt for the model. Omitted from the chat when not set.
            max_new_tokens (int, optional): Maximum number of new tokens to generate.
            top_k (int, optional): Top-k sampling parameter.
            temperature (float, optional): Temperature for sampling.
            image (Optional[Image.Image], optional): Accepted for interface compatibility; currently not used.
            file (Optional[List], optional): File(s) handed to RAG.extract_text_from_pdf for retrieval.
            
        Returns:
            tuple: (answer, retrieved_texts). `answer` is the full generated text and
            `retrieved_texts` is the list of retrieved passages (empty when no file is given).
            On failure the error is printed and ("_Error: ..._", []) is returned, so the
            result can always be unpacked into two values.
        '''
        try:
            messages = []
            # Only add a system turn when a prompt was actually provided.
            if system_prompt:
                messages.append({"role": "system", "content": [{
                    "type": "text", 
                    "text": system_prompt
                }]})
            messages.extend(
                {
                    "role": turn["role"],
                    "content": [{"type": "text", "text": turn["content"]}]
                }
                for turn in (history or [])
            )

            # RAG over the supplied file; skipped when there is nothing to retrieve from.
            retrieved_texts = []
            if file:
                documents = RAG.extract_text_from_pdf(file)
                chunks = RAG.split_documents(documents)
                # `embed_model` is the notebook-level setting read from config.yaml.
                vectorstore = RAG.embed_chunks(chunks, embed_model=embed_model)
                # Search with the original question, before the context is prepended.
                results = RAG.search_similar_chunks(message, vectorstore)

                retrieved_texts = [result.page_content for result in results]
                combined_context = "\n\n".join(retrieved_texts)
                message = f"Use the following context to answer the question:\n\n{combined_context}\n\n" + message

            messages.append({"role": "user", "content": [{"type": "text", "text": message}]})

            # Set up streaming
            streamer = TextIteratorStreamer(self.pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
            input_ids = self.pipe.tokenizer.apply_chat_template(
                messages, 
                add_generation_prompt=True,
                tokenize=True,
                return_tensors="pt"
            ).to(self.pipe.model.device)

            self.pipe.model.generation_config.do_sample = True
            generation_kwargs = {
                "input_ids": input_ids,
                "max_new_tokens": max_new_tokens,
                "streamer": streamer,
                "do_sample": True,
                "temperature": temperature,
                "top_k": top_k
            }

            # generate() runs in a worker thread and feeds the streamer consumed below.
            worker = Thread(target=self.pipe.model.generate, kwargs=generation_kwargs)
            worker.start()

            # Collect the streamed text pieces into the complete answer.
            output = "".join(streamer)
            worker.join()

            return output, retrieved_texts

        except Exception as e:
            # Keep the (answer, contexts) shape so callers can always unpack the result,
            # and print the error so a failed consultation is not silent.
            print(f"Error consulting MedGemma: {e}")
            return f"_Error: {str(e)}_", []

In [6]:
# Instantiate the model once; the constructor loads the pipeline.
medgemma = MedGemma(
    model_variant=model_variant,
    use_quantization=use_quantization,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  6.00s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda


### LLM Judge - gpt-4o-mini

In [7]:
# Read the Azure OpenAI settings from the local .env file; override=True lets
# values in the file replace variables already set in the environment.
load_dotenv(".env", override=True)

# Endpoint, API version and key for the Azure OpenAI resource
api_base = os.getenv("AZURE_OPENAI_API_BASE")
api_version = os.getenv("AZURE_OPENAI_API_VERSION")
api_key = os.getenv("AZURE_OPENAI_API_KEY")

# Deployment names of the chat model and the embedding model
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
embedding_name = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    azure_endpoint=api_base,
)

In [8]:
# Chat model that acts as the judge during evaluation
azure_llm = AzureChatOpenAI(
    azure_deployment=deployment_name,
    model=deployment_name,
    azure_endpoint=api_base,
    openai_api_version=api_version,
    validate_base_url=False,
)

# Embedding model for the embedding-based metrics (e.g. answer_relevancy)
azure_embeddings = AzureOpenAIEmbeddings(
    azure_deployment=embedding_name,
    model=embedding_name,
    azure_endpoint=api_base,
    openai_api_version=api_version,
)

# Wrap the LangChain objects so ragas can use them
evaluator_embeddings = LangchainEmbeddingsWrapper(azure_embeddings)
evaluator_llm = LangchainLLMWrapper(azure_llm)

## Load evaluation dataset

In [9]:
# Evaluation set: questions and their reference answers
test_data = pd.read_csv('data/eval.csv')
questions = test_data["Question"].tolist()
ground_truth = test_data["Answer"].tolist()

# Document passed to MedGemma as the retrieval source for each question
file = 'data/report.pdf'

# Columns that are later turned into the evaluation dataset
data = dict(question=[], answer=[], contexts=[], ground_truth=ground_truth)

In [10]:
# Gather generated answers.
# The result columns are rebuilt from scratch on every run, so re-executing this
# cell does not append duplicate rows and leave them longer than `ground_truth`.
answers, contexts = [], []
for query in questions:
    output, retrieved_texts = medgemma.consult(
        message=query,
        history=[],
        system_prompt="You are a helpful medical expert.",
        max_new_tokens=1000,
        top_k=50,
        temperature=0.7,
        image=None,
        file=file
    )
    answers.append(output)
    contexts.append(retrieved_texts)

data["question"] = list(questions)
data["answer"] = answers
data["contexts"] = contexts
        

In [11]:
# Build the evaluation dataset from the collected columns
dataset = Dataset.from_dict(data)

# Show the first record as a sanity check
dataset[0]

{'question': 'What is the primary medical application discussed in the paper?',
 'answer': 'The primary medical application discussed in the paper is **radiomics-based decision support tool assists radiologists in small lung nodule classification**.\n',
 'contexts': ['[7] Sanne C Smid and Yves Rosseel. Sem with small samples: Two-step modeling and factor score\nregression versus bayesian estimation with informative priors. In Small sample size solutions , pages\n239–254. Routledge, 2020.\n[8] Janita E Van Timmeren, Davide Cester, Stephanie Tanadini-Lang, Hatem Alkadhi, and Bettina\nBaessler. Radiomics in medical imaging—“how-to” guide and critical reflection. Insights into imaging ,\n11(1):91, 2020.\n7',
  'Kochurov, Ravin Kumar, Junpeng Lao, Christian C Luhmann, Osvaldo A Martin, et al. Pymc:\na modern, and comprehensive probabilistic programming framework in python. PeerJ Computer\nScience , 9:e1516, 2023.\n[2] Benjamin Hunter, Christos Argyros, Marianna Inglese, Kristofer Linton-Rei

In [12]:
# Print every generated answer with its row index for a quick manual review
for idx, answer in enumerate(dataset['answer']):
    print(f'{idx}: {answer}')

0: The primary medical application discussed in the paper is **radiomics-based decision support tool assists radiologists in small lung nodule classification**.

1: Radiomic features are quantitative features extracted from medical images using data characterization algorithms. These features contain textural information such as spatial distribution of signal intensities and pixel interrelationships.

2: The imaging modality used to collect the dataset was **CT (Computed Tomography)**.

3: Based on the provided results, 51.2% of the nodules in the dataset were classified as malignant.

4: Radiomics is important in lung cancer diagnosis because it offers a non-invasive complement to CT scans for classifying lung nodules. This can enhance diagnostic and clinical decision-making.

5: The provided text doesn't mention the number of images used in the study. It only discusses the challenges of limited data availability in medical image analysis and the limitations of traditional statistical

## Evaluate

In [13]:
# Score the dataset on the four metrics, using the Azure judge model and embeddings
ragas_metrics = [context_precision, context_recall, answer_relevancy, faithfulness]
result = evaluate(
    dataset=dataset,
    metrics=ragas_metrics,
    llm=evaluator_llm,
    embeddings=evaluator_embeddings,
)

Evaluating: 100%|██████████| 100/100 [00:36<00:00,  2.71it/s]


In [14]:
# Display the evaluation scores
result

{'context_precision': 0.6633, 'context_recall': 0.6000, 'answer_relevancy': 0.8015, 'faithfulness': 0.7060}