In [23]:
# import packages
import json
from typing import Tuple, List
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PyPDFToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.components.builders import PromptBuilder, AnswerBuilder
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.generators import OpenAIGenerator
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.evaluators import ContextRelevanceEvaluator, FaithfulnessEvaluator, SASEvaluator
from haystack.evaluation import EvaluationRunResult
from haystack.components.converters import MarkdownToDocument
from haystack.components.generators import HuggingFaceAPIGenerator
from haystack.utils import Secret

from tqdm import tqdm
from pathlib import Path
from openai import BadRequestError
from getpass import getpass
import os
import pandas as pd

In [24]:
from dotenv import load_dotenv
# Pull credentials from a local .env file (if any) into the process environment.
load_dotenv()

# Prompt for any key that is still missing so no credential is hard-coded in the notebook.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
# No api_key is passed: per the Haystack docs the generator falls back to OPENAI_API_KEY.
generator_openai = OpenAIGenerator(model="gpt-3.5-turbo")

if "HF_API_KEY" not in os.environ:
    os.environ["HF_API_KEY"] = getpass("Enter HF API key:")
# HuggingFaceAPIGenerator only looks up HF_API_TOKEN / HF_TOKEN on its own, so the
# key stored under HF_API_KEY has to be handed over explicitly; otherwise the
# requests are sent without the token entered above.
generator = HuggingFaceAPIGenerator(api_type="serverless_inference_api",
                                    api_params={"model": "mistralai/Mistral-7B-v0.1"},
                                    token=Secret.from_env_var("HF_API_KEY"))

In [25]:
# Load the evaluation data: the questions and their ground-truth answers
import json
from typing import Tuple, List

def read_question_answers(
    path: str = "../data/evaluation/eval_data_no_image_pdf.json",
) -> Tuple[List[str], List[str]]:
    """Load the evaluation questions and their ground-truth answers from a JSON file.

    Args:
        path: Location of the evaluation file. It must contain a JSON object
            with the keys ``"questions"`` and ``"ground_truths"``. Defaults to
            the data set this notebook was written for.

    Returns:
        A ``(questions, answers)`` tuple holding the values stored under
        ``"questions"`` and ``"ground_truths"`` respectively.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If one of the two expected keys is missing.
    """
    # JSON is UTF-8 by specification; pinning the encoding keeps non-ASCII text
    # intact on platforms whose locale default is something else (e.g. cp1252).
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data["questions"], data["ground_truths"]

# Read the evaluation set once; both lists are kept as notebook-level variables.
questions, answers = read_question_answers()

In [26]:
# Recursively collect the path of every Markdown file (extension matched
# case-insensitively) found below the processed-files directory.
markdown_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk("../data/processed_files")
    for name in names
    if name.lower().endswith('.md')
]

# STEP 1: Pre-Processing: Document Loading, Indexing, Chunking

In [27]:
# document loading, indexing and embedding
def indexing(
    embedding_model: str,
    chunk_size: int,
    files_path: str = "../data/processed_files",
    split_overlap: int = 25,
) -> "Tuple[InMemoryDocumentStore, Pipeline]":
    """Convert, clean, chunk, embed and store every Markdown file below ``files_path``.

    Args:
        embedding_model: Model name handed to ``SentenceTransformersDocumentEmbedder``.
        chunk_size: ``split_length`` of the ``DocumentSplitter``. The splitter
            runs with ``split_by="sentence"``, so this is a number of sentences.
        files_path: Directory that is searched recursively for ``.md`` files
            (extension matched case-insensitively).
        split_overlap: ``split_overlap`` of the ``DocumentSplitter``.

    Returns:
        A ``(document_store, pipeline)`` tuple: the in-memory store the
        pipeline wrote to, and the pipeline itself after it has been run.
    """
    # InMemory is the simplest store for prototyping; a vector database would
    # be the more advanced choice.
    document_store = InMemoryDocumentStore()

    # Pre-processing pipeline: converter -> cleaner -> splitter -> embedder -> writer.
    pipeline = Pipeline()
    # The sources are Markdown files that were already produced from the PDF documents.
    pipeline.add_component("converter", MarkdownToDocument())
    pipeline.add_component("cleaner", DocumentCleaner())
    # NOTE(review): an earlier comment here said "splitting by word" although the
    # code splits by sentence - confirm which unit chunk_size is meant to count.
    pipeline.add_component(
        "splitter",
        DocumentSplitter(split_by="sentence", split_length=chunk_size, split_overlap=split_overlap),
    )
    pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder(embedding_model))
    pipeline.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))
    pipeline.connect("converter", "cleaner")
    pipeline.connect("cleaner", "splitter")
    pipeline.connect("splitter", "embedder")
    pipeline.connect("embedder", "writer")

    # os.walk yields entries in a filesystem-dependent order; sorting makes the
    # ingestion order identical from run to run and machine to machine.
    markdown_files = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(files_path)
        for name in names
        if name.lower().endswith(".md")
    )
    pipeline.run({"converter": {"sources": markdown_files}})

    return document_store, pipeline


In [28]:
# Index and embed the documents once so that the RAG pipeline can simply query the document store
document_store, pipeline = indexing("sentence-transformers/all-mpnet-base-v2", 256)

Converting markdown files to Documents: 100%|██████████| 9/9 [00:00<00:00, 10.99it/s]
Batches: 100%|██████████| 5/5 [00:05<00:00,  1.16s/it]
