# VQA Model Result Evaluation

## 0. Environment Setup

### 0.1. Import Necessary Libraries

In [1]:
from pathlib import Path

import pandas as pd
from datasets import disable_progress_bars

import src.utils.dataset_helpers.world_med_qa_v.dataset_management as world_med_qa_v_dataset_management
import src.utils.dataset_helpers.world_med_qa_v.plot_helpers as world_med_qa_v_plot_helpers
from src.utils.data_definitions import LoggerConfig, GeneralDocSplitterOptions, GeneralVQAStrategiesDetails
from src.utils.enums import DocumentSplitterType, RagQPromptType, VQAStrategyType, ZeroShotPromptType
from src.utils.logger import LoggerManager
from src.utils.string_formatting_helpers import to_snake_case_strategy_name
from src.utils.text_splitters.paragraph_splitter import ParagraphSplitter
from src.utils.text_splitters.recursive_character_splitter import RecursiveCharacterSplitter
from src.utils.text_splitters.spacy_sentence_splitter import SpacySentenceSplitter
from src.visual_qa_model import VisualQAModel
from src.visual_qa_strategies.base_vqa_strategy import BaseVQAStrategy
from src.visual_qa_strategies.rag_q_as_vqa_strategy import RagQAsVQAStrategy
from src.visual_qa_strategies.rag_q_vqa_strategy import RagQVQAStrategy
from src.visual_qa_strategies.zero_shot_vqa_strategy import ZeroShotVQAStrategy

### 0.2. Configure Environment Settings

Detect Google Colab Form Annotation Automatically

In [2]:
%load_ext ipyform
%form_config --auto-detect 1

Enable Automatic Module Reloading

In [3]:
%load_ext autoreload
%autoreload 2

Disable Progress Bar for Dataset Filtering

In [4]:
disable_progress_bars()

## 1. Evaluation of VQA Approaches

Define Constants

In [5]:
DATASET_DIR = Path("data/WorldMedQA-V")
MODEL_NAME = "llava"
COUNTRY = "spain"
FILE_TYPE = "english"
RESULTS_DIR = Path('evaluation_results')

Load Dataset

In [6]:
world_med_qa_v_dataset = world_med_qa_v_dataset_management.load_vqa_dataset(
    data_path=DATASET_DIR,
    country=COUNTRY,
    file_type=FILE_TYPE
)
world_med_qa_v_dataset

- Loading WorldMedQA-V dataset (filename: spain_english_processed.tsv) ...
+ WorldMedQA-V dataset (filename: spain_english_processed.tsv) loaded.


Dataset({
    features: ['index', 'image', 'question', 'A', 'B', 'C', 'D', 'answer', 'correct_option', 'split'],
    num_rows: 125
})

### 1.1. Zero-Shot Evaluation

Load Model

In [7]:
llava_model = VisualQAModel(
    visual_qa_strategy=ZeroShotVQAStrategy(prompt_type=ZeroShotPromptType.V1),
    model_name=MODEL_NAME,
    country=COUNTRY,
    file_type=FILE_TYPE
)

- Loading Zero-Shot strategy ...
+ Zero-Shot strategy loaded.
- Loading Llava model (prompt template: zs_v1) ...
+ Llava model (prompt template: zs_v1) loaded.


Evaluate Model (Prompt Template: `zs_v1`)

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:07<00:00, 63.70s/it]

+ Model evaluation (spain_english subset) completed.





Evaluate Model (Prompt Template: `zs_v2`)

In [10]:
llava_model.visual_qa_strategy.prompt_type = ZeroShotPromptType.V2
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:16<00:00, 68.04s/it]

+ Model evaluation (spain_english subset) completed.





Evaluate Model (Prompt Template: `zs_v3`)

In [None]:
llava_model.visual_qa_strategy.prompt_type = ZeroShotPromptType.V3
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(5),
    results_path=RESULTS_DIR
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 5/5 [06:24<00:00, 76.90s/it]

+ Model evaluation (spain_english subset) completed.





### 1.2. Retrieval-Augmented Generation (RAG) Evaluation

Define Model Specific Constants

In [8]:
INDEX_DIR = Path('data/WikiMed/indexed_db')
INDEX_NAME = "Wikimed+S-PubMedBert-MS-MARCO-FullTexts"
EMBEDDING_MODEL_NAME = "pritamdeka/S-PubMedBert-MS-MARCO"
RELEVANT_DOCS_COUNT = 2

#### 1.2.1. RAG Q (Question Only)

Load Model

In [9]:
llava_model.visual_qa_strategy = RagQVQAStrategy(
    prompt_type=RagQPromptType.V1,
    index_dir=INDEX_DIR,
    index_name=INDEX_NAME,
    embedding_model_name=EMBEDDING_MODEL_NAME,
    relevant_docs_count=RELEVANT_DOCS_COUNT
)

- Loading RAG Q strategy ...
	- Loading Document Retriever ...
		- Loading Embeddings ...
		+ Embeddings Loaded.
		- Loading Index ...
		+ Index Loaded.
		- Loading Retriever ...
		+ Retriever Loaded.
	+ Document Retriever Loaded.
+ RAG Q strategy loaded.
- Loading Llava model (prompt template: rq_v1) ...
+ Llava model (prompt template: rq_v1) loaded.


Evaluate Model (Prompt Template: `rq_v1`)

- Document Splitter Type: `No Document Splitter`

In [15]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [07:47<00:00, 234.00s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `Recursive Character Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [29]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = RecursiveCharacterSplitter(
        token_count=2,
        chunk_size=200,
        chunk_overlap=0,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:53<00:00, 86.67s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [30]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = RecursiveCharacterSplitter(
        token_count=2,
        chunk_size=200,
        chunk_overlap=0,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...:   0%|          | 0/2 [00:00<?, ?it/s]

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:54<00:00, 87.13s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `spaCy Sentence Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [10]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = SpacySentenceSplitter(
        token_count=2,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [03:06<00:00, 93.03s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [32]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = SpacySentenceSplitter(
        token_count=2,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:41<00:00, 80.58s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `Paragraph Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [33]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = ParagraphSplitter(
        token_count=2,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [03:59<00:00, 119.96s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [34]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = ParagraphSplitter(
        token_count=2,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...:   0%|          | 0/2 [00:00<?, ?it/s]

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [04:47<00:00, 143.91s/it]

+ Model evaluation (spain_english subset) completed.





Evaluate Model (Prompt Template: `rq_v2`)

- Document Splitter Type: `No Document Splitter`

In [None]:
llava_model.visual_qa_strategy.prompt_type = RagQPromptType.V2
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [07:47<00:00, 233.96s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `Recursive Character Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = RecursiveCharacterSplitter(
        token_count=2,
        chunk_size=200,
        chunk_overlap=0,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:38<00:00, 79.16s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = RecursiveCharacterSplitter(
        token_count=2,
        chunk_size=200,
        chunk_overlap=0,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:34<00:00, 77.47s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `spaCy Sentence Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = SpacySentenceSplitter(
        token_count=2,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:40<00:00, 80.45s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = SpacySentenceSplitter(
        token_count=2,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:35<00:00, 77.74s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `Paragraph Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = ParagraphSplitter(
        token_count=2,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [04:17<00:00, 128.63s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = ParagraphSplitter(
        token_count=2,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [03:36<00:00, 108.21s/it]

+ Model evaluation (spain_english subset) completed.





Evaluate Model (Prompt Template: `rq_v3`)

- Document Splitter Type: `No Document Splitter`

In [None]:
llava_model.visual_qa_strategy.prompt_type = RagQPromptType.V3
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [07:07<00:00, 213.70s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `Recursive Character Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = RecursiveCharacterSplitter(
        token_count=2,
        chunk_size=200,
        chunk_overlap=0,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:41<00:00, 80.54s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = RecursiveCharacterSplitter(
        token_count=2,
        chunk_size=200,
        chunk_overlap=0,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:32<00:00, 76.36s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `spaCy Sentence Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = SpacySentenceSplitter(
        token_count=2,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:38<00:00, 79.02s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = SpacySentenceSplitter(
        token_count=2,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:49<00:00, 84.81s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `Paragraph Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = ParagraphSplitter(
        token_count=2,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [04:17<00:00, 128.51s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = ParagraphSplitter(
        token_count=2,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [03:46<00:00, 113.07s/it]

+ Model evaluation (spain_english subset) completed.





Evaluate Model (Prompt Template: `rq_v4`)

- Document Splitter Type: `No Document Splitter`

In [None]:
llava_model.visual_qa_strategy.prompt_type = RagQPromptType.V4
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [06:52<00:00, 206.30s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `Recursive Character Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = RecursiveCharacterSplitter(
        token_count=2,
        chunk_size=200,
        chunk_overlap=0,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [03:41<00:00, 110.63s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = RecursiveCharacterSplitter(
        token_count=2,
        chunk_size=200,
        chunk_overlap=0,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:49<00:00, 84.97s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `spaCy Sentence Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = SpacySentenceSplitter(
        token_count=2,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:53<00:00, 86.73s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = SpacySentenceSplitter(
        token_count=2,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [02:56<00:00, 88.44s/it]

+ Model evaluation (spain_english subset) completed.





- Document Splitter Type: `Paragraph Splitter`

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`with_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = ParagraphSplitter(
        token_count=2,
        add_title=True
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [04:47<00:00, 143.97s/it]

+ Model evaluation (spain_english subset) completed.





&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`no_title`

In [None]:
llava_model.evaluate(
    dataset=world_med_qa_v_dataset.take(2),
    results_path=RESULTS_DIR,
    doc_splitter = ParagraphSplitter(
        token_count=2,
        add_title=False
    )
)

- Evaluating model (spain_english subset) ...: 100%|██████████| 2/2 [04:10<00:00, 125.05s/it]

+ Model evaluation (spain_english subset) completed.





#### 1.2.2. RAG Q+As (Question + Answers)

#### 1.2.3. RAG IMG (Image-Based)

#### 1.2.4. RAG DB-Reranker (Database with Reranker)

## 2. VQA Approaches Exploration

Define Model Specific Constants

In [5]:
DATASET_DIR = Path("data/WorldMedQA-V")
MODEL_NAME = "llava"
COUNTRY = "spain"
FILE_TYPE = "english"
RESULTS_DIR = Path('evaluation_results')
LOGS_DIR = Path('logs')

Define RAG Q Specific Constants

In [6]:
INDEX_DIR = Path('data/WikiMed/indexed_db')
INDEX_NAME = "Wikimed+S-PubMedBert-MS-MARCO-FullTexts"
EMBEDDING_MODEL_NAME = "pritamdeka/S-PubMedBert-MS-MARCO"
RELEVANT_DOCS_COUNT = 1

Define Possible VQA Strategies

In [7]:
vqa_strategies: dict[VQAStrategyType, BaseVQAStrategy] = {
    VQAStrategyType.ZERO_SHOT: ZeroShotVQAStrategy(prompt_type=ZeroShotPromptType.V1),
    VQAStrategyType.RAG_Q: RagQVQAStrategy(
        prompt_type=RagQPromptType.V1,
        index_dir=INDEX_DIR,
        index_name=INDEX_NAME,
        embedding_model_name=EMBEDDING_MODEL_NAME,
        relevant_docs_count=RELEVANT_DOCS_COUNT
    ),
    VQAStrategyType.RAG_Q_AS: RagQAsVQAStrategy(
        prompt_type=RagQPromptType.V1,
        index_dir=INDEX_DIR,
        index_name=INDEX_NAME,
        embedding_model_name=EMBEDDING_MODEL_NAME,
        relevant_docs_count=RELEVANT_DOCS_COUNT
    ),
    VQAStrategyType.RAG_IMG: None,
    VQAStrategyType.RAG_DB_RERANKER: None
}

- Loading Zero-Shot strategy ...
+ Zero-Shot strategy loaded.
- Loading RAG Q strategy ...
	- Loading Document Retriever ...
		- Loading Embeddings ...
		+ Embeddings Loaded.
		- Loading Index ...
		+ Index Loaded.
		- Loading Retriever ...
		+ Retriever Loaded.
	+ Document Retriever Loaded.
+ RAG Q strategy loaded.
- Loading RAG Q+As strategy ...
** Instance of DocumentRetriever already exists, returning the existing instance. **
+ RAG Q+As strategy loaded.


Load Dataset

In [8]:
world_med_qa_v_dataset = world_med_qa_v_dataset_management.load_vqa_dataset(
    data_path=DATASET_DIR,
    country=COUNTRY,
    file_type=FILE_TYPE
)
world_med_qa_v_dataset

- Loading WorldMedQA-V dataset (filename: spain_english_processed.tsv) ...
+ WorldMedQA-V dataset (filename: spain_english_processed.tsv) loaded.


Dataset({
    features: ['index', 'image', 'question', 'A', 'B', 'C', 'D', 'answer', 'correct_option', 'split'],
    num_rows: 125
})

Prepare Logger Manager

In [9]:
logger_manager = LoggerManager(
    log_save_directory=LOGS_DIR,
    logger_config=LoggerConfig(
        console_handler_enabled=False,
        file_handler_enabled=True
    ))

Logger 'VisualQALogger' created!
	- Log Directory: logs
	- Log Filename: None
	- Handlers:
		* Console Handler: Disabled
		* File Handler: Enabled



Experiment with the Models

In [13]:
# @title Interactive VQA Model Exploration Form
vqa_strategy_type = 'Zero-Shot' # @param ["Zero-Shot", "RAG Q", "RAG Q+As", "RAG IMG", "RAG DB-Reranker"]
prompt_type = "zs_v1" # @param ["zs_v1", "zs_v2", "zs_v3", "rq_v1", "rq_v2", "rq_v3", "rq_v4", "rq_v5", "rq_v6"]
question_id = 1 # @param {"type":"integer"}
image_width = 600 # @param {"type":"integer"}
action = 'Fetch from JSON' # @param ["Execute Model", "Fetch from JSON"]


row = world_med_qa_v_dataset_management.get_dataset_row_by_id(
    dataset=world_med_qa_v_dataset,
    question_id=question_id
)

if action == "Execute Model":
    # logger_manager.create_new_log_file(f"q{question_id}_{prompt_type}_conversation.log")
    logger_manager.create_new_log_file()
    formatted_vqa_strategy_type = to_snake_case_strategy_name(strategy_name=vqa_strategy_type)
    chosen_vqa_strategy = vqa_strategies[VQAStrategyType(formatted_vqa_strategy_type)]
    chosen_vqa_strategy.prompt_type = RagQPromptType(prompt_type)  # ZeroShotPromptType(prompt_type)
    model=VisualQAModel(
        visual_qa_strategy=chosen_vqa_strategy,
        model_name=MODEL_NAME,
        country=COUNTRY,
        file_type=FILE_TYPE
    )
    world_med_qa_v_plot_helpers.visualize_qa_pair_row(
        row=row,
        image_width=image_width,
        model_answer=model.generate_answer_from_row(
            row=row,
            possible_options=['A', 'B', 'C', 'D'],
            verbose=True,
            logger_manager=logger_manager,
            should_apply_rag_to_question=True
        )
    )
elif action == "Fetch from JSON":
    model_answer = world_med_qa_v_dataset_management.fetch_model_answer_from_json(
        evaluation_results_folder=RESULTS_DIR,
        vqa_strategy_name=to_snake_case_strategy_name(strategy_name=vqa_strategy_type),
        country=COUNTRY,
        file_type=FILE_TYPE,
        prompt_type_name=prompt_type,
        question_id=question_id,
    )
    world_med_qa_v_plot_helpers.visualize_qa_pair_row(
        row=row,
        image_width=image_width,
        model_answer=model_answer
    )

FormWidget(children=(VBox(children=(HTML(value=''), HTML(value='<h2>Interactive VQA Model Exploration Form</h2…

## 3. Result Analysis

### 3.1. Zero-Shot

In [9]:
zero_shot_evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.ZERO_SHOT],
        prompt_types=list(ZeroShotPromptType),
        relevant_docs_count=[None],
        doc_splitter_options=[None],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
)

zero_shot_evaluation_results = zero_shot_evaluation_results.fillna("-")
zero_shot_evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,zero_shot,zs_v1,-,-,-,-,-,-,-,0.28,1.0
1,spain,english,zero_shot,zs_v2,-,-,-,-,-,-,-,0.304,1.0
2,spain,english,zero_shot,zs_v3,-,-,-,-,-,-,-,0.304,1.0


In [10]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=zero_shot_evaluation_results,
    title="[Zero-Shot] Analysis of LLaVA Model Accuracy",
    column_names=list(zero_shot_evaluation_results['prompt_type'])
)

In [11]:
best_zero_shot_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=zero_shot_evaluation_results
)
best_zero_shot_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
1,spain,english,zero_shot,zs_v2,-,-,-,-,-,-,-,0.304,1.0
2,spain,english,zero_shot,zs_v3,-,-,-,-,-,-,-,0.304,1.0


### 3.2. Rag Q (Relevant Documents Count: 1)

#### 3.2.1 No Title

In [12]:
general_doc_splitter_options = [
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.PARAGRAPH_SPLITTER],
        add_titles=[False],
        token_counts=[1, 2, 3, 4],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.RECURSIVE_CHARACTER_SPLITTER],
        add_titles=[False],
        token_counts=[1, 2],
        chunk_sizes=[300, 600, 900],
        chunk_overlaps=[0]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
        add_titles=[False],
        token_counts=[1, 2, 3, 4, 5],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    None
]
vqa_strategy_details = [
    GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=[RagQPromptType.V1],
        relevant_docs_count=[1],
        doc_splitter_options=[option],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
    for option in general_doc_splitter_options
]
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=[
        item
        for detail in vqa_strategy_details
        for item in detail
    ]
)
evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v1,1,paragraph_splitter,False,1.0,-,-,-,0.272,1.0
1,spain,english,rag_q,rq_v1,1,paragraph_splitter,False,2.0,-,-,-,0.28,0.984
2,spain,english,rag_q,rq_v1,1,paragraph_splitter,False,3.0,-,-,-,0.216,0.976
3,spain,english,rag_q,rq_v1,1,paragraph_splitter,False,4.0,-,-,-,0.208,0.936
4,spain,english,rag_q,rq_v1,1,recursive_character_splitter,False,1.0,300.0,0.0,-,0.28,1.0
5,spain,english,rag_q,rq_v1,1,recursive_character_splitter,False,1.0,600.0,0.0,-,0.296,1.0
6,spain,english,rag_q,rq_v1,1,recursive_character_splitter,False,1.0,900.0,0.0,-,0.272,1.0
7,spain,english,rag_q,rq_v1,1,recursive_character_splitter,False,2.0,300.0,0.0,-,0.256,1.0
8,spain,english,rag_q,rq_v1,1,recursive_character_splitter,False,2.0,600.0,0.0,-,0.248,1.0
9,spain,english,rag_q,rq_v1,1,recursive_character_splitter,False,2.0,900.0,0.0,-,0.272,1.0


In [13]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=evaluation_results,
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings (RDC: 1)"
)

#### 3.2.2. With Title

In [14]:
general_doc_splitter_options = [
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.PARAGRAPH_SPLITTER],
        add_titles=[True],
        token_counts=[1, 2, 3, 4],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.RECURSIVE_CHARACTER_SPLITTER],
        add_titles=[True],
        token_counts=[1, 2],
        chunk_sizes=[300, 600, 900],
        chunk_overlaps=[0]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
        add_titles=[True],
        token_counts=[1, 2, 3, 4, 5],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    None
]
vqa_strategy_details = [
    GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=[RagQPromptType.V1],
        relevant_docs_count=[1],
        doc_splitter_options=[option],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
    for option in general_doc_splitter_options
]
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=[
        item
        for detail in vqa_strategy_details
        for item in detail
    ]
)
evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v1,1,paragraph_splitter,True,1.0,-,-,-,0.264,1.0
1,spain,english,rag_q,rq_v1,1,paragraph_splitter,True,2.0,-,-,-,0.296,0.976
2,spain,english,rag_q,rq_v1,1,paragraph_splitter,True,3.0,-,-,-,0.248,0.976
3,spain,english,rag_q,rq_v1,1,paragraph_splitter,True,4.0,-,-,-,0.208,0.936
4,spain,english,rag_q,rq_v1,1,recursive_character_splitter,True,1.0,300.0,0.0,-,0.256,1.0
5,spain,english,rag_q,rq_v1,1,recursive_character_splitter,True,1.0,600.0,0.0,-,0.288,1.0
6,spain,english,rag_q,rq_v1,1,recursive_character_splitter,True,1.0,900.0,0.0,-,0.264,1.0
7,spain,english,rag_q,rq_v1,1,recursive_character_splitter,True,2.0,300.0,0.0,-,0.28,1.0
8,spain,english,rag_q,rq_v1,1,recursive_character_splitter,True,2.0,600.0,0.0,-,0.272,1.0
9,spain,english,rag_q,rq_v1,1,recursive_character_splitter,True,2.0,900.0,0.0,-,0.272,1.0


In [15]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=evaluation_results,
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings (RDC: 1)"
)

### 3.3. Rag Q (Relevant Documents Count: 2)

#### 3.3.1. No Title

In [16]:
general_doc_splitter_options = [
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.PARAGRAPH_SPLITTER],
        add_titles=[False],
        token_counts=[1, 2, 3, 4],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.RECURSIVE_CHARACTER_SPLITTER],
        add_titles=[False],
        token_counts=[1, 2],
        chunk_sizes=[300, 600, 900],
        chunk_overlaps=[0]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
        add_titles=[False],
        token_counts=[1, 2, 3, 4, 5],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    None
]
vqa_strategy_details = [
    GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=[RagQPromptType.V1],
        relevant_docs_count=[2],
        doc_splitter_options=[option],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
    for option in general_doc_splitter_options
]
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=[
        item
        for detail in vqa_strategy_details
        for item in detail
    ]
)
evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v1,2,paragraph_splitter,False,1.0,-,-,-,0.264,0.92
1,spain,english,rag_q,rq_v1,2,paragraph_splitter,False,2.0,-,-,-,0.184,0.8
2,spain,english,rag_q,rq_v1,2,paragraph_splitter,False,3.0,-,-,-,0.184,0.68
3,spain,english,rag_q,rq_v1,2,paragraph_splitter,False,4.0,-,-,-,0.128,0.456
4,spain,english,rag_q,rq_v1,2,recursive_character_splitter,False,1.0,300.0,0.0,-,0.272,1.0
5,spain,english,rag_q,rq_v1,2,recursive_character_splitter,False,1.0,600.0,0.0,-,0.32,1.0
6,spain,english,rag_q,rq_v1,2,recursive_character_splitter,False,1.0,900.0,0.0,-,0.288,1.0
7,spain,english,rag_q,rq_v1,2,recursive_character_splitter,False,2.0,300.0,0.0,-,0.288,1.0
8,spain,english,rag_q,rq_v1,2,recursive_character_splitter,False,2.0,600.0,0.0,-,0.288,1.0
9,spain,english,rag_q,rq_v1,2,recursive_character_splitter,False,2.0,900.0,0.0,-,0.272,1.0


In [17]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=evaluation_results,
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings (RDC: 2)"
)

#### 3.3.2. With Title

In [18]:
general_doc_splitter_options = [
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.PARAGRAPH_SPLITTER],
        add_titles=[True],
        token_counts=[1, 2, 3, 4],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.RECURSIVE_CHARACTER_SPLITTER],
        add_titles=[True],
        token_counts=[1, 2],
        chunk_sizes=[300, 600, 900],
        chunk_overlaps=[0]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
        add_titles=[True],
        token_counts=[1, 2, 3, 4, 5],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    None
]
vqa_strategy_details = [
    GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=[RagQPromptType.V1],
        relevant_docs_count=[2],
        doc_splitter_options=[option],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
    for option in general_doc_splitter_options
]
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=[
        item
        for detail in vqa_strategy_details
        for item in detail
    ]
)
evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v1,2,paragraph_splitter,True,1.0,-,-,-,0.256,0.92
1,spain,english,rag_q,rq_v1,2,paragraph_splitter,True,2.0,-,-,-,0.192,0.792
2,spain,english,rag_q,rq_v1,2,paragraph_splitter,True,3.0,-,-,-,0.176,0.64
3,spain,english,rag_q,rq_v1,2,paragraph_splitter,True,4.0,-,-,-,0.128,0.44
4,spain,english,rag_q,rq_v1,2,recursive_character_splitter,True,1.0,300.0,0.0,-,0.248,1.0
5,spain,english,rag_q,rq_v1,2,recursive_character_splitter,True,1.0,600.0,0.0,-,0.296,1.0
6,spain,english,rag_q,rq_v1,2,recursive_character_splitter,True,1.0,900.0,0.0,-,0.264,1.0
7,spain,english,rag_q,rq_v1,2,recursive_character_splitter,True,2.0,300.0,0.0,-,0.312,1.0
8,spain,english,rag_q,rq_v1,2,recursive_character_splitter,True,2.0,600.0,0.0,-,0.256,1.0
9,spain,english,rag_q,rq_v1,2,recursive_character_splitter,True,2.0,900.0,0.0,-,0.248,1.0


In [19]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=evaluation_results,
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings (RDC: 2)"
)

### 3.4. Rag Q (Relevant Documents Count: 3)

#### 3.4.1. No Title

In [20]:
general_doc_splitter_options = [
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.PARAGRAPH_SPLITTER],
        add_titles=[False],
        token_counts=[1, 2, 3, 4],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.RECURSIVE_CHARACTER_SPLITTER],
        add_titles=[False],
        token_counts=[1, 2],
        chunk_sizes=[300, 600, 900],
        chunk_overlaps=[0]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
        add_titles=[False],
        token_counts=[1, 2, 3, 4, 5],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    None
]
vqa_strategy_details = [
    GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=[RagQPromptType.V1],
        relevant_docs_count=[3],
        doc_splitter_options=[option],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
    for option in general_doc_splitter_options
]
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=[
        item
        for detail in vqa_strategy_details
        for item in detail
    ]
)
evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v1,3,paragraph_splitter,False,1.0,-,-,-,0.216,0.8
1,spain,english,rag_q,rq_v1,3,paragraph_splitter,False,2.0,-,-,-,0.16,0.584
2,spain,english,rag_q,rq_v1,3,paragraph_splitter,False,3.0,-,-,-,0.096,0.28
3,spain,english,rag_q,rq_v1,3,paragraph_splitter,False,4.0,-,-,-,0.072,0.128
4,spain,english,rag_q,rq_v1,3,recursive_character_splitter,False,1.0,300.0,0.0,-,0.264,1.0
5,spain,english,rag_q,rq_v1,3,recursive_character_splitter,False,1.0,600.0,0.0,-,0.272,1.0
6,spain,english,rag_q,rq_v1,3,recursive_character_splitter,False,1.0,900.0,0.0,-,0.264,1.0
7,spain,english,rag_q,rq_v1,3,recursive_character_splitter,False,2.0,300.0,0.0,-,0.28,1.0
8,spain,english,rag_q,rq_v1,3,recursive_character_splitter,False,2.0,600.0,0.0,-,0.256,1.0
9,spain,english,rag_q,rq_v1,3,recursive_character_splitter,False,2.0,900.0,0.0,-,0.16,0.64


In [21]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=evaluation_results,
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings (RDC: 3)"
)

#### 3.4.2. With Title

In [22]:
general_doc_splitter_options = [
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.PARAGRAPH_SPLITTER],
        add_titles=[True],
        token_counts=[1, 2, 3, 4],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.RECURSIVE_CHARACTER_SPLITTER],
        add_titles=[True],
        token_counts=[1, 2],
        chunk_sizes=[300, 600, 900],
        chunk_overlaps=[0]
    ),
    GeneralDocSplitterOptions(
        doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
        add_titles=[True],
        token_counts=[1, 2, 3, 4, 5],
        chunk_sizes=[None],
        chunk_overlaps=[None]
    ),
    None
]
vqa_strategy_details = [
    GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=[RagQPromptType.V1],
        relevant_docs_count=[3],
        doc_splitter_options=[option],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
    for option in general_doc_splitter_options
]
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=[
        item
        for detail in vqa_strategy_details
        for item in detail
    ]
)
evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v1,3,paragraph_splitter,True,1.0,-,-,-,0.216,0.776
1,spain,english,rag_q,rq_v1,3,paragraph_splitter,True,2.0,-,-,-,0.16,0.56
2,spain,english,rag_q,rq_v1,3,paragraph_splitter,True,3.0,-,-,-,0.088,0.256
3,spain,english,rag_q,rq_v1,3,paragraph_splitter,True,4.0,-,-,-,0.048,0.12
4,spain,english,rag_q,rq_v1,3,recursive_character_splitter,True,1.0,300.0,0.0,-,0.288,1.0
5,spain,english,rag_q,rq_v1,3,recursive_character_splitter,True,1.0,600.0,0.0,-,0.28,1.0
6,spain,english,rag_q,rq_v1,3,recursive_character_splitter,True,1.0,900.0,0.0,-,0.28,1.0
7,spain,english,rag_q,rq_v1,3,recursive_character_splitter,True,2.0,300.0,0.0,-,0.28,1.0
8,spain,english,rag_q,rq_v1,3,recursive_character_splitter,True,2.0,600.0,0.0,-,0.28,1.0
9,spain,english,rag_q,rq_v1,3,recursive_character_splitter,True,2.0,900.0,0.0,-,0.136,0.592


In [23]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=evaluation_results,
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings (RDC: 3)"
)

---

`spacy_sentence_splitter` seems to output the most consistent results. Hence, we continue experimenting only with this **document splitter**.

### 3.5. spaCy Document Splitter

In [157]:
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=list(RagQPromptType),
        relevant_docs_count=[1],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                token_counts=[1, 2, 3, 4, 5],
                add_titles=[False],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
)

evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,False,1,-,-,-,0.296,1.0
1,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,False,2,-,-,-,0.272,1.0
2,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,False,3,-,-,-,0.272,1.0
3,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,False,4,-,-,-,0.24,1.0
4,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,False,5,-,-,-,0.272,1.0
5,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,False,1,-,-,-,0.28,1.0
6,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,False,2,-,-,-,0.272,1.0
7,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,False,3,-,-,-,0.272,1.0
8,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,False,4,-,-,-,0.248,1.0
9,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,False,5,-,-,-,0.24,1.0


In [158]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=evaluation_results,
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings (RDC: 1)"
)

#### 3.5.1. RAG Q

##### 3.5.1.1. No Title

**- Relevant Documents Count = 1**

In [224]:
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=list(RagQPromptType),
        relevant_docs_count=[1],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[False],
                token_counts=[1, 2, 3, 4, 5],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
)

evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,False,1,-,-,-,0.296,1.0
1,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,False,2,-,-,-,0.272,1.0
2,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,False,3,-,-,-,0.272,1.0
3,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,False,4,-,-,-,0.24,1.0
4,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,False,5,-,-,-,0.272,1.0
5,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,False,1,-,-,-,0.28,1.0
6,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,False,2,-,-,-,0.272,1.0
7,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,False,3,-,-,-,0.272,1.0
8,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,False,4,-,-,-,0.248,1.0
9,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,False,5,-,-,-,0.24,1.0


In [225]:
world_med_qa_v_plot_helpers.plot_rag_q_evaluation_results_by_groups(
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings",
    evaluation_results=evaluation_results,
    row_variable='relevant_docs_count',
    column_variable='token_count',
    bar_graph_variable='prompt_type'
)

In [226]:
no_title_rag_q_rdc1_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=evaluation_results
)
no_title_rag_q_rdc1_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
28,spain,english,rag_q,rq_v6,1,spacy_sentence_splitter,False,4,-,-,-,0.304,1.0


**- Token Count = 4**<br>
**- Prompt Type = rq_v6**

In [227]:
no_title_rag_q_tc_4_rq_v6_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=[RagQPromptType.V6],
        relevant_docs_count=[1, 2, 3, 4, 5],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[False],
                token_counts=[4],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
)

no_title_rag_q_tc_4_rq_v6_results = no_title_rag_q_tc_4_rq_v6_results.fillna("-")
no_title_rag_q_tc_4_rq_v6_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v6,1,spacy_sentence_splitter,False,4,-,-,-,0.304,1.0
1,spain,english,rag_q,rq_v6,2,spacy_sentence_splitter,False,4,-,-,-,0.312,1.0
2,spain,english,rag_q,rq_v6,3,spacy_sentence_splitter,False,4,-,-,-,0.312,1.0
3,spain,english,rag_q,rq_v6,4,spacy_sentence_splitter,False,4,-,-,-,0.28,1.0
4,spain,english,rag_q,rq_v6,5,spacy_sentence_splitter,False,4,-,-,-,0.304,1.0


In [228]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=no_title_rag_q_tc_4_rq_v6_results,
    title="[Rag-Q] LLaVA Model Accuracy (No Title; Token Count: 4; Prompt Type: rq_v6)",
    column_names=[f"rdc_{rdc}" for rdc in no_title_rag_q_tc_4_rq_v6_results['relevant_docs_count']]
)

In [229]:
best_rag_q_no_title_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=no_title_rag_q_tc_4_rq_v6_results
)
best_rag_q_no_title_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
1,spain,english,rag_q,rq_v6,2,spacy_sentence_splitter,False,4,-,-,-,0.312,1.0
2,spain,english,rag_q,rq_v6,3,spacy_sentence_splitter,False,4,-,-,-,0.312,1.0


##### 3.5.1.2. With Title

In [230]:
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=list(RagQPromptType),
        relevant_docs_count=[1],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[True],
                token_counts=[1, 2, 3, 4, 5],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
)

evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,True,1,-,-,-,0.28,1.0
1,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,True,2,-,-,-,0.248,1.0
2,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,True,3,-,-,-,0.272,1.0
3,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,True,4,-,-,-,0.256,1.0
4,spain,english,rag_q,rq_v1,1,spacy_sentence_splitter,True,5,-,-,-,0.248,1.0
5,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,True,1,-,-,-,0.28,1.0
6,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,True,2,-,-,-,0.256,1.0
7,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,True,3,-,-,-,0.272,1.0
8,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,True,4,-,-,-,0.24,1.0
9,spain,english,rag_q,rq_v2,1,spacy_sentence_splitter,True,5,-,-,-,0.264,1.0


In [231]:
world_med_qa_v_plot_helpers.plot_rag_q_evaluation_results_by_groups(
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings",
    evaluation_results=evaluation_results,
    row_variable='relevant_docs_count',
    column_variable='token_count',
    bar_graph_variable='prompt_type'
)

In [232]:
with_title_rag_q_rdc1_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=evaluation_results
)
with_title_rag_q_rdc1_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
25,spain,english,rag_q,rq_v6,1,spacy_sentence_splitter,True,1,-,-,-,0.304,1.0
26,spain,english,rag_q,rq_v6,1,spacy_sentence_splitter,True,2,-,-,-,0.304,1.0
28,spain,english,rag_q,rq_v6,1,spacy_sentence_splitter,True,4,-,-,-,0.304,1.0


**- Token Count = 1**<br>
**- Prompt Type = rq_v6**

In [233]:
with_title_rag_q_tc_1_rq_v6_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=[RagQPromptType.V6],
        relevant_docs_count=[1, 2, 3, 4, 5],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[True],
                token_counts=[1],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
)

with_title_rag_q_tc_1_rq_v6_results = with_title_rag_q_tc_1_rq_v6_results.fillna("-")
with_title_rag_q_tc_1_rq_v6_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v6,1,spacy_sentence_splitter,True,1,-,-,-,0.304,1.0
1,spain,english,rag_q,rq_v6,2,spacy_sentence_splitter,True,1,-,-,-,0.296,1.0
2,spain,english,rag_q,rq_v6,3,spacy_sentence_splitter,True,1,-,-,-,0.304,1.0
3,spain,english,rag_q,rq_v6,4,spacy_sentence_splitter,True,1,-,-,-,0.296,1.0
4,spain,english,rag_q,rq_v6,5,spacy_sentence_splitter,True,1,-,-,-,0.288,1.0


In [234]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=with_title_rag_q_tc_1_rq_v6_results,
    title="[Rag-Q] LLaVA Model Accuracy (With Title; Token Count: 1; Prompt Type: rq_v6)",
    column_names=[f"rdc_{rdc}" for rdc in with_title_rag_q_tc_1_rq_v6_results['relevant_docs_count']]
)

In [235]:
best_rag_q_with_title_results_v1 = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=with_title_rag_q_tc_1_rq_v6_results
)
best_rag_q_with_title_results_v1

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v6,1,spacy_sentence_splitter,True,1,-,-,-,0.304,1.0
2,spain,english,rag_q,rq_v6,3,spacy_sentence_splitter,True,1,-,-,-,0.304,1.0


**- Token Count = 2**<br>
**- Prompt Type = rq_v6**

In [236]:
with_title_rag_q_tc_2_rq_v6_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=[RagQPromptType.V6],
        relevant_docs_count=[1, 2, 3, 4, 5],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[True],
                token_counts=[2],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
)

with_title_rag_q_tc_2_rq_v6_results = with_title_rag_q_tc_2_rq_v6_results.fillna("-")
with_title_rag_q_tc_2_rq_v6_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v6,1,spacy_sentence_splitter,True,2,-,-,-,0.304,1.0
1,spain,english,rag_q,rq_v6,2,spacy_sentence_splitter,True,2,-,-,-,0.296,1.0
2,spain,english,rag_q,rq_v6,3,spacy_sentence_splitter,True,2,-,-,-,0.304,1.0
3,spain,english,rag_q,rq_v6,4,spacy_sentence_splitter,True,2,-,-,-,0.296,1.0
4,spain,english,rag_q,rq_v6,5,spacy_sentence_splitter,True,2,-,-,-,0.304,1.0


In [237]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=with_title_rag_q_tc_2_rq_v6_results,
    title="[Rag-Q] LLaVA Model Accuracy (With Title; Token Count: 2; Prompt Type: rq_v6)",
    column_names=[f"rdc_{rdc}" for rdc in with_title_rag_q_tc_2_rq_v6_results['relevant_docs_count']]
)

In [238]:
best_rag_q_with_title_results_v2 = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=with_title_rag_q_tc_2_rq_v6_results
)
best_rag_q_with_title_results_v2

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v6,1,spacy_sentence_splitter,True,2,-,-,-,0.304,1.0
2,spain,english,rag_q,rq_v6,3,spacy_sentence_splitter,True,2,-,-,-,0.304,1.0
4,spain,english,rag_q,rq_v6,5,spacy_sentence_splitter,True,2,-,-,-,0.304,1.0


**- Token Count = 4**<br>
**- Prompt Type = rq_v6**

In [239]:
with_title_rag_q_tc_4_rq_v6_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q],
        prompt_types=[RagQPromptType.V6],
        relevant_docs_count=[1, 2, 3, 4, 5],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[True],
                token_counts=[4],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[None]
    ).get_possible_vqa_strategy_details()
)

with_title_rag_q_tc_4_rq_v6_results = with_title_rag_q_tc_4_rq_v6_results.fillna("-")
with_title_rag_q_tc_4_rq_v6_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q,rq_v6,1,spacy_sentence_splitter,True,4,-,-,-,0.304,1.0
1,spain,english,rag_q,rq_v6,2,spacy_sentence_splitter,True,4,-,-,-,0.296,1.0
2,spain,english,rag_q,rq_v6,3,spacy_sentence_splitter,True,4,-,-,-,0.312,1.0
3,spain,english,rag_q,rq_v6,4,spacy_sentence_splitter,True,4,-,-,-,0.296,1.0
4,spain,english,rag_q,rq_v6,5,spacy_sentence_splitter,True,4,-,-,-,0.304,1.0


In [240]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=with_title_rag_q_tc_4_rq_v6_results,
    title="[Rag-Q] LLaVA Model Accuracy (With Title; Token Count: 4; Prompt Type: rq_v6)",
    column_names=[f"rdc_{rdc}" for rdc in with_title_rag_q_tc_4_rq_v6_results['relevant_docs_count']]
)

In [241]:
best_rag_q_with_title_results_v3 = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=with_title_rag_q_tc_4_rq_v6_results
)
best_rag_q_with_title_results_v3

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
2,spain,english,rag_q,rq_v6,3,spacy_sentence_splitter,True,4,-,-,-,0.312,1.0


**- Best Combination**

In [242]:
best_rag_q_with_title_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=pd.concat([
        best_rag_q_with_title_results_v1,
        best_rag_q_with_title_results_v2,
        best_rag_q_with_title_results_v3
    ])
)
best_rag_q_with_title_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
2,spain,english,rag_q,rq_v6,3,spacy_sentence_splitter,True,4,-,-,-,0.312,1.0


#### 3.5.2. RAG-Q+As (Answers Only)

##### 3.5.2.1. No Title

**- Relevant Documents Count = 1**

In [243]:
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q_AS],
        prompt_types=list(RagQPromptType),
        relevant_docs_count=[1],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[False],
                token_counts=[1, 2, 3, 4, 5],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[False]
    ).get_possible_vqa_strategy_details()
)

evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,False,1,-,-,False,0.264,1.0
1,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,False,2,-,-,False,0.264,1.0
2,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,False,3,-,-,False,0.288,1.0
3,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,False,4,-,-,False,0.296,1.0
4,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,False,5,-,-,False,0.256,0.984
5,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,False,1,-,-,False,0.256,1.0
6,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,False,2,-,-,False,0.28,1.0
7,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,False,3,-,-,False,0.304,1.0
8,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,False,4,-,-,False,0.288,0.992
9,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,False,5,-,-,False,0.248,0.984


In [244]:
world_med_qa_v_plot_helpers.plot_rag_q_evaluation_results_by_groups(
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings",
    evaluation_results=evaluation_results,
    row_variable='relevant_docs_count',
    column_variable='token_count',
    bar_graph_variable='prompt_type'
)

In [245]:
no_title_rag_not_q_as_rdc1_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=evaluation_results
)
no_title_rag_not_q_as_rdc1_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
21,spain,english,rag_q_as,rq_v5,1,spacy_sentence_splitter,False,2,-,-,False,0.312,1.0


**- Token Count = 2**<br>
**- Prompt Type = rq_v5**

In [246]:
no_title_rag_not_q_as_tc_2_rq_v5_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q_AS],
        prompt_types=[RagQPromptType.V5],
        relevant_docs_count=[1, 2, 3, 4, 5],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[False],
                token_counts=[2],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[False]
    ).get_possible_vqa_strategy_details()
)

no_title_rag_not_q_as_tc_2_rq_v5_results = no_title_rag_not_q_as_tc_2_rq_v5_results.fillna("-")
no_title_rag_not_q_as_tc_2_rq_v5_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v5,1,spacy_sentence_splitter,False,2,-,-,False,0.312,1.0
1,spain,english,rag_q_as,rq_v5,2,spacy_sentence_splitter,False,2,-,-,False,0.32,1.0
2,spain,english,rag_q_as,rq_v5,3,spacy_sentence_splitter,False,2,-,-,False,0.28,1.0
3,spain,english,rag_q_as,rq_v5,4,spacy_sentence_splitter,False,2,-,-,False,0.296,1.0
4,spain,english,rag_q_as,rq_v5,5,spacy_sentence_splitter,False,2,-,-,False,0.296,1.0


In [247]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=no_title_rag_not_q_as_tc_2_rq_v5_results,
    title="[Rag-Q+As - Answers Only] LLaVA Model Accuracy (No Title; Token Count: 2; Prompt Type: rq_v5)",
    column_names=[f"rdc_{rdc}" for rdc in no_title_rag_not_q_as_tc_2_rq_v5_results['relevant_docs_count']]
)

In [248]:
best_rag_not_q_as_no_title_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=no_title_rag_not_q_as_tc_2_rq_v5_results
)
best_rag_not_q_as_no_title_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
1,spain,english,rag_q_as,rq_v5,2,spacy_sentence_splitter,False,2,-,-,False,0.32,1.0


##### 3.5.2.2. With Title

**- Relevant Documents Count = 1**

In [249]:
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q_AS],
        prompt_types=list(RagQPromptType),
        relevant_docs_count=[1],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[True],
                token_counts=[1, 2, 3, 4, 5],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[False]
    ).get_possible_vqa_strategy_details()
)

evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,1,-,-,False,0.304,1.0
1,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,2,-,-,False,0.272,1.0
2,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,3,-,-,False,0.28,1.0
3,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,4,-,-,False,0.272,0.992
4,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,5,-,-,False,0.264,0.984
5,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,True,1,-,-,False,0.28,1.0
6,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,True,2,-,-,False,0.256,1.0
7,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,True,3,-,-,False,0.264,1.0
8,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,True,4,-,-,False,0.264,0.992
9,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,True,5,-,-,False,0.248,0.984


In [250]:
world_med_qa_v_plot_helpers.plot_rag_q_evaluation_results_by_groups(
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings",
    evaluation_results=evaluation_results,
    row_variable='relevant_docs_count',
    column_variable='token_count',
    bar_graph_variable='prompt_type'
)

In [251]:
with_title_rag_not_q_as_rdc1_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=evaluation_results
)
with_title_rag_not_q_as_rdc1_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
20,spain,english,rag_q_as,rq_v5,1,spacy_sentence_splitter,True,1,-,-,False,0.312,1.0
29,spain,english,rag_q_as,rq_v6,1,spacy_sentence_splitter,True,5,-,-,False,0.312,1.0


**- Token Count = 1**<br>
**- Prompt Type = rq_v5**

In [252]:
with_title_rag_not_q_as_tc_1_rq_v5_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q_AS],
        prompt_types=[RagQPromptType.V5],
        relevant_docs_count=[1, 2, 3, 4, 5],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[True],
                token_counts=[1],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[False]
    ).get_possible_vqa_strategy_details()
)

with_title_rag_not_q_as_tc_1_rq_v5_results = with_title_rag_not_q_as_tc_1_rq_v5_results.fillna("-")
with_title_rag_not_q_as_tc_1_rq_v5_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v5,1,spacy_sentence_splitter,True,1,-,-,False,0.312,1.0
1,spain,english,rag_q_as,rq_v5,2,spacy_sentence_splitter,True,1,-,-,False,0.288,1.0
2,spain,english,rag_q_as,rq_v5,3,spacy_sentence_splitter,True,1,-,-,False,0.272,1.0
3,spain,english,rag_q_as,rq_v5,4,spacy_sentence_splitter,True,1,-,-,False,0.28,1.0
4,spain,english,rag_q_as,rq_v5,5,spacy_sentence_splitter,True,1,-,-,False,0.304,1.0


In [253]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=with_title_rag_not_q_as_tc_1_rq_v5_results,
    title="[Rag-Q+As - Answers Only] LLaVA Model Accuracy (With Title; Token Count: 1; Prompt Type: rq_v5)",
    column_names=[f"rdc_{rdc}" for rdc in with_title_rag_not_q_as_tc_1_rq_v5_results['relevant_docs_count']]
)

In [254]:
best_rag_not_q_as_with_title_results_v1 = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=with_title_rag_not_q_as_tc_1_rq_v5_results
)
best_rag_not_q_as_with_title_results_v1

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v5,1,spacy_sentence_splitter,True,1,-,-,False,0.312,1.0


**- Token Count = 5**<br>
**- Prompt Type = rq_v6**

In [255]:
with_title_rag_not_q_as_tc_5_rq_v6_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q_AS],
        prompt_types=[RagQPromptType.V6],
        relevant_docs_count=[1, 2, 3, 4, 5],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[True],
                token_counts=[5],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[False]
    ).get_possible_vqa_strategy_details()
)

with_title_rag_not_q_as_tc_5_rq_v6_results = with_title_rag_not_q_as_tc_5_rq_v6_results.fillna("-")
with_title_rag_not_q_as_tc_5_rq_v6_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v6,1,spacy_sentence_splitter,True,5,-,-,False,0.312,1.0
1,spain,english,rag_q_as,rq_v6,2,spacy_sentence_splitter,True,5,-,-,False,0.288,1.0
2,spain,english,rag_q_as,rq_v6,3,spacy_sentence_splitter,True,5,-,-,False,0.288,1.0
3,spain,english,rag_q_as,rq_v6,4,spacy_sentence_splitter,True,5,-,-,False,0.288,1.0
4,spain,english,rag_q_as,rq_v6,5,spacy_sentence_splitter,True,5,-,-,False,0.288,1.0


In [256]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=with_title_rag_not_q_as_tc_5_rq_v6_results,
    title="[Rag-Q+As - Answers Only] LLaVA Model Accuracy (With Title; Token Count: 5; Prompt Type: rq_v6)",
    column_names=[f"rdc_{rdc}" for rdc in with_title_rag_not_q_as_tc_5_rq_v6_results['relevant_docs_count']]
)

In [257]:
best_rag_not_q_as_with_title_results_v2 = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=with_title_rag_not_q_as_tc_5_rq_v6_results
)
best_rag_not_q_as_with_title_results_v2

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v6,1,spacy_sentence_splitter,True,5,-,-,False,0.312,1.0


**- Best Combination**

In [258]:
best_rag_not_q_as_with_title_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=pd.concat([
        best_rag_not_q_as_with_title_results_v1,
        best_rag_not_q_as_with_title_results_v2
    ])
)
best_rag_not_q_as_with_title_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v5,1,spacy_sentence_splitter,True,1,-,-,False,0.312,1.0
0,spain,english,rag_q_as,rq_v6,1,spacy_sentence_splitter,True,5,-,-,False,0.312,1.0


#### 3.5.3. RAG-Q+As (Question and Answers)

##### 3.5.3.1. No Title

**- Relevant Documents Count = 1**

In [259]:
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q_AS],
        prompt_types=list(RagQPromptType),
        relevant_docs_count=[1],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[False],
                token_counts=[1, 2, 3, 4, 5],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[True]
    ).get_possible_vqa_strategy_details()
)

evaluation_results = evaluation_results.fillna("-")
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,False,1,-,-,True,0.288,1.0
1,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,False,2,-,-,True,0.296,1.0
2,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,False,3,-,-,True,0.288,1.0
3,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,False,4,-,-,True,0.232,0.992
4,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,False,5,-,-,True,0.256,0.92
5,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,False,1,-,-,True,0.304,1.0
6,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,False,2,-,-,True,0.264,1.0
7,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,False,3,-,-,True,0.24,1.0
8,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,False,4,-,-,True,0.24,0.984
9,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,False,5,-,-,True,0.208,0.912


In [260]:
world_med_qa_v_plot_helpers.plot_rag_q_evaluation_results_by_groups(
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings",
    evaluation_results=evaluation_results,
    row_variable='relevant_docs_count',
    column_variable='token_count',
    bar_graph_variable='prompt_type'
)

In [261]:
no_title_rag_q_as_rdc1_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=evaluation_results
)
no_title_rag_q_as_rdc1_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
15,spain,english,rag_q_as,rq_v4,1,spacy_sentence_splitter,False,1,-,-,True,0.32,1.0


**- Token Count = 1**<br>
**- Prompt Type = rq_v4**

In [262]:
no_title_rag_q_as_tc_1_rq_v4_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q_AS],
        prompt_types=[RagQPromptType.V4],
        relevant_docs_count=[1, 2, 3, 4, 5],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[False],
                token_counts=[1],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[True]
    ).get_possible_vqa_strategy_details()
)

no_title_rag_q_as_tc_1_rq_v4_results = no_title_rag_q_as_tc_1_rq_v4_results.fillna("-")
no_title_rag_q_as_tc_1_rq_v4_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v4,1,spacy_sentence_splitter,False,1,-,-,True,0.32,1.0
1,spain,english,rag_q_as,rq_v4,2,spacy_sentence_splitter,False,1,-,-,True,0.272,0.992
2,spain,english,rag_q_as,rq_v4,3,spacy_sentence_splitter,False,1,-,-,True,0.264,0.984
3,spain,english,rag_q_as,rq_v4,4,spacy_sentence_splitter,False,1,-,-,True,0.224,0.904
4,spain,english,rag_q_as,rq_v4,5,spacy_sentence_splitter,False,1,-,-,True,0.104,0.496


In [263]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=no_title_rag_q_as_tc_1_rq_v4_results,
    title="[Rag-Q+As - Question and As] LLaVA Model Accuracy (No Title; Token Count: 1; Prompt Type: rq_v4)",
    column_names=[f"rdc_{rdc}" for rdc in no_title_rag_q_as_tc_1_rq_v4_results['relevant_docs_count']]
)

In [264]:
best_rag_q_as_no_title_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=no_title_rag_q_as_tc_1_rq_v4_results
)
best_rag_q_as_no_title_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v4,1,spacy_sentence_splitter,False,1,-,-,True,0.32,1.0


##### 3.5.3.2. With Title

In [265]:
evaluation_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q_AS],
        prompt_types=list(RagQPromptType),
        relevant_docs_count=[1],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[True],
                token_counts=[1, 2, 3, 4, 5],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[True]
    ).get_possible_vqa_strategy_details()
)
evaluation_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,1,,,True,0.32,1.0
1,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,2,,,True,0.272,1.0
2,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,3,,,True,0.288,1.0
3,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,4,,,True,0.24,0.968
4,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,5,,,True,0.208,0.904
5,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,True,1,,,True,0.296,1.0
6,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,True,2,,,True,0.272,1.0
7,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,True,3,,,True,0.296,1.0
8,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,True,4,,,True,0.232,0.968
9,spain,english,rag_q_as,rq_v2,1,spacy_sentence_splitter,True,5,,,True,0.192,0.896


In [266]:
world_med_qa_v_plot_helpers.plot_rag_q_evaluation_results_by_groups(
    title="Analysis of LLaVA Model Accuracy Across Different RAG Approaches and Parameter Settings",
    evaluation_results=evaluation_results,
    row_variable='relevant_docs_count',
    column_variable='token_count',
    bar_graph_variable='prompt_type'
)

In [267]:
with_title_rag_q_as_rdc1_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=evaluation_results
)
with_title_rag_q_as_rdc1_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,1,,,True,0.32,1.0


**- Token Count = 1**<br>
**- Prompt Type = rq_v1**

In [268]:
with_title_rag_q_as_tc_1_rq_v1_results = world_med_qa_v_dataset_management.load_evaluation_results(
    evaluation_results_folder=RESULTS_DIR,
    vqa_strategy_details=GeneralVQAStrategiesDetails(
        countries=["spain"],
        file_types=["english"],
        vqa_strategy_types=[VQAStrategyType.RAG_Q_AS],
        prompt_types=[RagQPromptType.V1],
        relevant_docs_count=[1, 2, 3, 4, 5],
        doc_splitter_options=[
            GeneralDocSplitterOptions(
                doc_splitter_types=[DocumentSplitterType.SPACY_SENTENCE_SPLITTER],
                add_titles=[True],
                token_counts=[1],
                chunk_sizes=[None],
                chunk_overlaps=[None]
            )
        ],
        should_apply_rag_to_questions=[True]
    ).get_possible_vqa_strategy_details()
)

with_title_rag_q_as_tc_1_rq_v1_results = with_title_rag_q_as_tc_1_rq_v1_results.fillna("-")
with_title_rag_q_as_tc_1_rq_v1_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,1,-,-,True,0.32,1.0
1,spain,english,rag_q_as,rq_v1,2,spacy_sentence_splitter,True,1,-,-,True,0.264,0.992
2,spain,english,rag_q_as,rq_v1,3,spacy_sentence_splitter,True,1,-,-,True,0.248,0.968
3,spain,english,rag_q_as,rq_v1,4,spacy_sentence_splitter,True,1,-,-,True,0.184,0.68
4,spain,english,rag_q_as,rq_v1,5,spacy_sentence_splitter,True,1,-,-,True,0.04,0.12


In [269]:
world_med_qa_v_plot_helpers.display_bar_chart_on_evaluation_results(
    evaluation_results=with_title_rag_q_as_tc_1_rq_v1_results,
    title="[Rag-Q+As - Question and As] LLaVA Model Accuracy (With Title; Token Count: 1; Prompt Type: rq_v1)",
    column_names=[f"rdc_{rdc}" for rdc in with_title_rag_q_as_tc_1_rq_v1_results['relevant_docs_count']]
)

In [270]:
best_rag_q_as_with_title_results = world_med_qa_v_dataset_management.get_max_accuracy_rows(
    evaluation_results=with_title_rag_q_as_tc_1_rq_v1_results
)
best_rag_q_as_with_title_results

Unnamed: 0,country,file_type,vqa_strategy_type,prompt_type,relevant_docs_count,doc_splitter,add_title,token_count,chunk_size,chunk_overlap,should_apply_rag_to_question,accuracy,well_formatted_answers
0,spain,english,rag_q_as,rq_v1,1,spacy_sentence_splitter,True,1,-,-,True,0.32,1.0


### 3.6. Best Results

#### 3.6.1. **RDC** Fixed (*RDC = 1*) | (Variables: **Token Count** and **Prompt Type**)

In [295]:
world_med_qa_v_plot_helpers.display_evaluation_results_summary(
    evaluation_results_list=[
        no_title_rag_q_rdc1_results,
        with_title_rag_q_rdc1_results,
        no_title_rag_not_q_as_rdc1_results,
        with_title_rag_not_q_as_rdc1_results,
        no_title_rag_q_as_rdc1_results,
        with_title_rag_q_as_rdc1_results
    ],
    separator_rows=[4, 7]
)

Country,File Type,VQA Strategy,Prompt,Relevant Docs. Count,Doc. Splitter,Title,Token Count,Accuracy,Well Formatted Answers
spain,english,RAG Q,rq_v6,1,spacy_sentence_splitter,No,4,0.304,1.0
spain,english,RAG Q,rq_v6,1,spacy_sentence_splitter,Yes,1,0.304,1.0
spain,english,RAG Q,rq_v6,1,spacy_sentence_splitter,Yes,2,0.304,1.0
spain,english,RAG Q,rq_v6,1,spacy_sentence_splitter,Yes,4,0.304,1.0
spain,english,RAG Q+As (Answers Only),rq_v5,1,spacy_sentence_splitter,No,2,0.312,1.0
spain,english,RAG Q+As (Answers Only),rq_v5,1,spacy_sentence_splitter,Yes,1,0.312,1.0
spain,english,RAG Q+As (Answers Only),rq_v6,1,spacy_sentence_splitter,Yes,5,0.312,1.0
spain,english,RAG Q+As (Question and Answers),rq_v4,1,spacy_sentence_splitter,No,1,0.32,1.0
spain,english,RAG Q+As (Question and Answers),rq_v1,1,spacy_sentence_splitter,Yes,1,0.32,1.0


#### 3.6.2. **Token Count** and **Prompt Type** Fixed | (Variable: **RDC**)

In [293]:
world_med_qa_v_plot_helpers.display_evaluation_results_summary(
    evaluation_results_list=[
        best_zero_shot_results,
        best_rag_q_no_title_results,
        best_rag_q_with_title_results,
        best_rag_not_q_as_no_title_results,
        best_rag_not_q_as_with_title_results,
        best_rag_q_as_no_title_results,
        best_rag_q_as_with_title_results
    ],
    separator_rows=[2, 5, 8]
)

Country,File Type,VQA Strategy,Prompt,Relevant Docs. Count,Doc. Splitter,Title,Token Count,Accuracy,Well Formatted Answers
spain,english,Zero-Shot,zs_v2,-,-,-,-,0.304,1.0
spain,english,Zero-Shot,zs_v3,-,-,-,-,0.304,1.0
spain,english,RAG Q,rq_v6,2,spacy_sentence_splitter,No,4,0.312,1.0
spain,english,RAG Q,rq_v6,3,spacy_sentence_splitter,No,4,0.312,1.0
spain,english,RAG Q,rq_v6,3,spacy_sentence_splitter,Yes,4,0.312,1.0
spain,english,RAG Q+As (Answers Only),rq_v5,2,spacy_sentence_splitter,No,2,0.32,1.0
spain,english,RAG Q+As (Answers Only),rq_v5,1,spacy_sentence_splitter,Yes,1,0.312,1.0
spain,english,RAG Q+As (Answers Only),rq_v6,1,spacy_sentence_splitter,Yes,5,0.312,1.0
spain,english,RAG Q+As (Question and Answers),rq_v4,1,spacy_sentence_splitter,No,1,0.32,1.0
spain,english,RAG Q+As (Question and Answers),rq_v1,1,spacy_sentence_splitter,Yes,1,0.32,1.0
