In [1]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import TokenTextSplitter
from autorag.data.corpus import llama_text_node_to_parquet
from pathlib import Path
from llama_index.core import SimpleDirectoryReader, ServiceContext, VectorStoreIndex, Document

from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import TextNode
import pandas as pd

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter)
from autorag.data.corpus import langchain_documents_to_parquet
import pandas as pd
from llama_index.llms.openai import OpenAI
from llama_index.llms.anthropic import Anthropic
from langchain_huggingface import HuggingFaceEmbeddings
from autorag.data.qacreation import generate_qa_llama_index, make_single_content_qa

from autorag.data.qacreation.ragas import generate_qa_ragas

import re
import getpass
import markdown
import os
import json




In [2]:
from dotenv import load_dotenv
load_dotenv()

if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

if "ANTHROPIC_API_KEY" not in os.environ:
    os.environ["ANTHROPIC_API_KEY"] = getpass("Enter Anthropic API key:")

if "HF_API_KEY" not in os.environ:
    os.environ["HF_API_KEY"] = getpass("Enter HF API key:")

In [3]:
# import data
markdown_files = []
for root, dirs, files in os.walk("../data/processed_files"):
    for file in files:
        if file.lower().endswith('.md'):
            markdown_files.append(os.path.join(root, file))

In [4]:
# Iterate over the file paths
loaded_documents = []
for doc in markdown_files:
    try:
        loader = UnstructuredMarkdownLoader(doc)
        documents = loader.load()
        loaded_documents.extend(documents)
        print(f"Loaded: {doc}")
    except Exception as e:
        print(f"Error loading {doc}: {str(e)}")

Loaded: ../data/processed_files/Fortportal FY2020/Fortportal Regional Referral Hospital Report of Auditor General 2020.md
Loaded: ../data/processed_files/Fortportal FY2021/Fortportal Regional Referral Hospital Report FY20202021.md
Loaded: ../data/processed_files/Gulu FY2021/Gulu DLG Report of Auditor General 2021.md
Loaded: ../data/processed_files/CAG FY2022/Annual Consolidated OAG audit reports 2022.md
Loaded: ../data/processed_files/MWTS FY2021/MWTS Report of Auditor General 2021.md
Loaded: ../data/processed_files/Fortportal FY2022/Fortportal Regional Referral Hospital Report of Auditor General 2022.md
Loaded: ../data/processed_files/Gulu FY2022/Gulu DLG Report of Auditor General 2022.md
Loaded: ../data/processed_files/CAG FY2021/Annual Consolidated OAG audit reports 2021.md
Loaded: ../data/processed_files/MWTS FY2022/MWTS Report of Auditor General 2022.md


In [5]:
# 2. Custom function to normalize text
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove special characters (customize as needed)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

# Apply normalization to each document
for doc in loaded_documents:
    doc.page_content = normalize_text(doc.page_content)

In [6]:
loaded_documents = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128).split_documents(loaded_documents)


In [7]:
corpus_df = langchain_documents_to_parquet(loaded_documents, '../autorag/corpus.parquet', upsert=True)

In [8]:
#import Anthropic model

from langchain_anthropic import ChatAnthropic

llm = ChatAnthropic(
    model="claude-3-5-sonnet-20240620",
    temperature=0.5,
    max_tokens=650,
    timeout=2,
    max_retries=3,
)

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [9]:
#llm = OpenAI(model='gpt-3.5-turbo-16k', temperature=0.8)

In [10]:
import nest_asyncio
nest_asyncio.apply()

In [11]:
# with RAGAS
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional
from autorag.data.qacreation.ragas import generate_qa_ragas

distributions = {  # uniform distribution
    simple: 0.1,
    reasoning: 0.35,
    multi_context: 0.2,
    conditional: 0.35
}
qa_df = generate_qa_ragas(corpus_df, test_size=10, 
                          generator_llm=llm, 
                          critic_llm=llm, 
                          embedding_model=embeddings, 
                          distributions=distributions)

embedding nodes:   0%|          | 0/15414 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


KeyboardInterrupt: 

In [None]:
# make qa data
qa_df_claude = make_single_content_qa(corpus_df, 75, generate_qa_llama_index, llm=llm, question_num_per_content=1,
                               output_filepath='../autorag/qa.parquet')

In [None]:
qa_df

Unnamed: 0,qid,retrieval_gt,query,generation_gt
0,15942580-0847-4df9-b55e-31170508611f,[[36463ae2-668c-4b75-bd8c-2a3eb44a73f8]],How much money was irregularly diverted from t...,[A total of ugx 1319m was irregularly diverted...
1,1b0ecaf7-dcd3-41b3-8885-431713e3f623,[[c9ffb554-85f8-4d35-932e-da12c10b9aa2]],What are the specific pages where the disclose...,[The matters disclosed under Note 19 can be fo...
2,e1eb90eb-86a1-4726-933e-39668407278c,[[d5b154b7-91c3-44b4-a63d-90a373ada346]],What was the reason for inefficiencies in the ...,[The institute had two IT systems that were no...
3,ca123f07-57a0-4ae3-a0f8-be93695b7b6d,[[f3186d2a-2e4f-4109-8309-1fd3c055fa26]],According to which act is the accounting offic...,[The accounting officer is accountable to parl...
4,6637917d-26f3-4049-b936-704371ba5e97,[[e5f5e28e-e1f7-44d8-a188-8babcbdedc2f]],What was the end user off peak tariff for extr...,[The end user off peak tariff for extra large ...
...,...,...,...,...
70,b8633b1a-552a-4f3d-bea6-31ab727f9327,[[01e22b3f-3147-41af-92db-0001ca14ddc9]],What was the total expenditure of the unqualif...,[The total expenditure of the unqualified acti...
71,70d68e15-9fe7-4531-a9d8-ba7371e05856,[[2e702d71-6fe6-48fe-a2b2-c8c4a68dfe70]],How much revenue was collected during the fina...,"[Only UGX141750bn was realized by the entity, ..."
72,08ec3b6d-0855-4a91-9cb5-2e618f48bbef,[[f17d87a8-3dc7-4ece-a184-6effd8942fc8]],How much did the hospital make in payments for...,[The hospital made payments for nonexistent li...
73,9c4dbf34-ec54-4590-929d-684a4daf601e,[[775b28b3-c844-4540-a527-bcb573993e91]],What are the entities mentioned in the text?,[The entities mentioned in the text include Ka...
