In [67]:
# import libraries
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter)
import os
import re

In [68]:
# import data
markdown_files = []
for root, dirs, files in os.walk("../data/processed_files"):
    for file in files:
        if file.lower().endswith('.md'):
            markdown_files.append(os.path.join(root, file))
    

In [66]:
# Iterate over the file paths
loaded_documents = []
for doc in markdown_files:
    try:
        loader = UnstructuredMarkdownLoader(doc)
        documents = loader.load()
        loaded_documents.extend(documents)
        print(f"Loaded: {doc}")
    except Exception as e:
        print(f"Error loading {doc}: {str(e)}")

Loaded: ../data/processed_files/Fortportal FY2020/Fortportal Regional Referral Hospital Report of Auditor General 2020.md
Loaded: ../data/processed_files/Fortportal FY2021/Fortportal Regional Referral Hospital Report FY20202021.md
Loaded: ../data/processed_files/Gulu FY2021/Gulu DLG Report of Auditor General 2021.md
Loaded: ../data/processed_files/CAG FY2022/Annual Consolidated OAG audit reports 2022.md
Loaded: ../data/processed_files/MWTS FY2021/MWTS Report of Auditor General 2021.md
Loaded: ../data/processed_files/Fortportal FY2022/Fortportal Regional Referral Hospital Report of Auditor General 2022.md
Loaded: ../data/processed_files/Gulu FY2022/Gulu DLG Report of Auditor General 2022.md
Loaded: ../data/processed_files/CAG FY2021/Annual Consolidated OAG audit reports 2021.md
Loaded: ../data/processed_files/MWTS FY2022/MWTS Report of Auditor General 2022.md


In [69]:
# 2. Custom function to normalize text
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove special characters (customize as needed)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

# Apply normalization to each document
for doc in loaded_documents:
    doc.page_content = normalize_text(doc.page_content)

In [70]:
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=150, length_function=len
)
docs = text_splitter.split_documents(loaded_documents)

In [72]:
from dotenv import load_dotenv
load_dotenv()

if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")
# generator with openai models
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-4")
embeddings = OpenAIEmbeddings()

In [74]:
docs[0]

Document(page_content='office of the auditor general the republic of uganda report of the auditor general on the financial statements of fort portal regional referral hospital for the year ended 30 th june 2020 table of contents table of contents  ii list of acronyms  iii qualified opinion  1 basis for qualified opinion  1 mischarge of expenditure  1 key audit matters  2 10 budget performance  2 management of covid  19 interventions 7 20 other matters  9 30 lack of a functional hospital governing board  9 40 hospital private wing  9 50 under staffing  10 other information  10 managements responsibility for the financial statements  10 auditor generals responsibilities for the audit of the financial statements  11 other reporting responsibilities  12 report on the audit of compliance with legislation  12 60 fleet management  12 payment of salaries to employees who had retired 15 70 80 status of medical equipment  15 appendices  18 appendix 1 mischarge of expenditure  18 appendix ii exte

In [75]:
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Change resulting question type distribution
distributions = {
    simple: 0.3,
    multi_context: 0.4,
    reasoning: 0.3
}

# use generator.generate_with_llamaindex_docs if you use llama-index as document loader
testset = generator.generate_with_langchain_docs(docs, 10, distributions, raise_exceptions=False) 
testset.to_pandas()

Filename and doc_id are the same for all nodes.


KeyboardInterrupt: 

In [None]:
# works but is quite expensive with OpenAI API token. 