In [None]:
r'''
# Manual patch: update venv\Lib\site-packages\llama_index\finetuning\embeddings\common.py
# so the node loop performs at most ~10 iterations per 60-second window.
# (Kept as a raw string because the Windows path above contains backslashes.)
def generate_qa_embedding_pairs
    ...
    save_counter = start_index

    # added --------------------------------------------------------------
    import time
    counter, start_time = 0, time.time()
    # --------------------------------------------------------------------

    for node_id, text in tqdm(
        list(node_dict.items())[start_index:], initial=start_index
    ):
        
        # added --------------------------------------------------------------
        counter += 1
        elapsed = time.time() - start_time
        if elapsed >= 60:
            # The window is already over: start a new one. Without this reset
            # the stale timer would switch throttling off for the rest of the run.
            counter, start_time = 0, time.time()
        elif counter > 10:
            # Too many iterations inside the current window: wait for it to end.
            time.sleep(60 - elapsed)
            counter, start_time = 0, time.time()
        # --------------------------------------------------------------------

        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
    ...
''';

In [58]:
import os
import json
import glob
import pickle
import openparse
from tqdm import tqdm
from huggingface_hub import login
import google.generativeai as genai
from llama_index.core import Settings
from llama_index.llms.gemini import Gemini
from transformers import BitsAndBytesConfig
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.llms.chatml_utils import messages_to_prompt, completion_to_prompt

In [None]:
# Local cache directories for downloaded models / tokenizer files.
HF_CACHE_DIR = "../models/hf"
TIKTOKEN_CACHE_DIR = "../models/tiktoken"

# Export both locations through the environment.
# NOTE(review): huggingface_hub is imported in the cell above and may read
# HF_HOME at import time — confirm this assignment still takes effect.
os.environ.update({
    'HF_HOME': HF_CACHE_DIR,
    "TIKTOKEN_CACHE_DIR": TIKTOKEN_CACHE_DIR,
})
# assert os.path.exists(os.path.join(TIKTOKEN_CACHE_DIR, "9b5ad71b2ce5302211f9c61530b329a4922fc6a4"))

In [None]:
# Read API tokens (SHOULD BE CREATED BY USER)
# Every entry of tokens.json is indexed with [0] below, so each value is
# expected to be a sequence whose first element is the token to use.
with open('../reqs/tokens.json', 'r') as token_file:
    tokens = json.load(token_file)

HF_ACCESS_TOKEN, GOOGLE_API_KEY, OPENAI_API_KEY = (
    tokens[token_name][0]
    for token_name in ('HF_ACCESS_TOKEN', 'GOOGLE_API_KEY', 'OPENAI_API_KEY')
)

# Authenticate against the Hugging Face Hub.
login(token=HF_ACCESS_TOKEN)

# Set/Load LLM

In [None]:
# API model
# Constructor options for the Gemini-backed LLM.
gemini_options = {
    "api_key": GOOGLE_API_KEY,
    "model": "models/gemini-1.0-pro",
    "temperature": 0.3,
}
llm = Gemini(**gemini_options)

# Register it as the llama-index default LLM.
Settings.llm = llm

In [None]:
# # Local model
# llm_name = "meta-llama/Llama-3.2-3B"

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit = True,
#     bnb_4bit_compute_dtype = torch.float16,
#     bnb_4bit_quant_type = "nf4",
#     bnb_4bit_use_double_quant = True,
# )

# llm = HuggingFaceLLM(
#     model_name = llm_name,
#     tokenizer_name = llm_name,
#     context_window = 2048,
#     max_new_tokens = 512,

#     generate_kwargs = {
#         "do_sample": True,
#         "temperature": 0.5,
#     },
#     model_kwargs = {
#         # "torch_dtype": torch.float16,
#         "quantization_config": quantization_config,
#         "cache_dir": HF_CACHE_DIR,
#     },
#     device_map = "auto",
#     is_chat_model = True,

#     completion_to_prompt = completion_to_prompt,
#     messages_to_prompt = messages_to_prompt,
# )

# Settings.llm = llm

# Parse Files

In [None]:
num_train_folders = 5

# One list of PDF paths per training folder (train_1 ... train_<num_train_folders>).
train_files = [
    glob.glob(f"../data/finetune/docs/train_{folder}/*.pdf")
    for folder in range(1, num_train_folders + 1)
]

# PDF paths of the validation folder.
val_files = glob.glob("../data/finetune/docs/val/*.pdf")

train_files, val_files

In [None]:
def parse_corpus(files):
    """Parse PDF files with openparse and collect their llama-index nodes.

    Args:
        files: iterable of PDF file paths.

    Returns:
        list: the nodes of every file that parsed successfully, in input order.

    Parsing is best effort: a file whose parsing raises ``ValueError`` is
    skipped, but the skip is reported instead of being silently ignored so
    that missing documents can be noticed.
    """
    parser = openparse.DocumentParser(
        table_args = {"parsing_algorithm": "pymupdf",},
    )

    nodes = []
    skipped = []
    for file in tqdm(files):
        try:
            nodes.extend(parser.parse(file, ocr=True).to_llama_index_nodes())
        except ValueError as err:
            skipped.append(file)
            # tqdm.write prints without breaking the progress bar.
            tqdm.write(f"Skipping {file}: {err}")

    if skipped:
        print(f"Skipped {len(skipped)} file(s) that could not be parsed")

    return nodes

# Parse and save
# Training folders: parse each folder's PDFs and pickle the nodes next to them.
for folder in range(1, num_train_folders + 1):
    train_nodes = parse_corpus(train_files[folder - 1])
    with open(f'../data/finetune/docs/train_{folder}/nodes.pkl', 'wb') as file:
        pickle.dump(train_nodes, file)
    print(len(train_nodes))

# Validation folder: same procedure.
val_nodes = parse_corpus(val_files)
with open('../data/finetune/docs/val/nodes.pkl', 'wb') as file:
    pickle.dump(val_nodes, file)
print(len(val_nodes))

# Generate Datasets

In [None]:
for folder in range(1, num_train_folders + 1):
    # Only folder 1 is processed by this cell as written.
    # NOTE(review): presumably the folder number is edited by hand between
    # runs to generate the other train_<n>.json files — confirm.
    if folder != 1:
        continue

    # Reload the nodes pickled by the parsing step (trusted, locally produced file).
    with open(f'../data/finetune/docs/train_{folder}/nodes.pkl', 'rb') as file:
        train_nodes = pickle.load(file)

    train_dataset = generate_qa_embedding_pairs(
        nodes = train_nodes,
        llm = llm,
        output_path = f"../data/finetune/datasets/train_{folder}.json",
        num_questions_per_chunk = 2,
    )

In [None]:
# Reload the validation nodes pickled by the parsing step (same approach as
# the training cell) so this cell does not depend on `val_nodes` still being
# in memory from an earlier cell run. The file is locally produced and trusted.
with open('../data/finetune/docs/val/nodes.pkl', 'rb') as file:
    val_nodes = pickle.load(file)

# Generate question/context pairs for the validation nodes and save them.
val_dataset = generate_qa_embedding_pairs(
    llm = llm,
    nodes = val_nodes,
    num_questions_per_chunk = 2,
    output_path = "../data/finetune/datasets/val.json",
)

# Clean Dataset (Part I)

In [None]:
def preprocess(text):
    """Normalize the casing of a text while keeping all-caps tokens.

    Every ``*`` is replaced by a space, the text is split on single spaces,
    and each token that is not entirely upper case is lower-cased.

    Args:
        text (str): raw query or corpus text.

    Returns:
        str: the normalized text with the original spacing preserved.

    The tokens are re-joined with single spaces and no trailing space is
    appended, which makes the function idempotent: applying it again to its
    own output (e.g. when a cleaned dataset file is re-processed) does not
    change the text any further.
    """
    tokens = text.replace('*', ' ').split(' ')
    return ' '.join(
        token if token.isupper() else token.lower()
        for token in tokens
    )

In [None]:
# Exact query strings that are dropped from the generated datasets
# (the cleaning cells keep only queries whose text is not in this tuple).
bad_queries = ("**Question 1:**",)

In [None]:
# Train set
# Combine train sets: load every per-folder dataset file.
json_files = glob.glob("../data/finetune/datasets/train_*.json")
train_sets = []
for json_path in json_files:
    with open(json_path, 'r') as file:
        train_sets.append(json.load(file))

# Per-file section sizes, used below to verify that the merge lost nothing.
queries_size = [len(part['queries']) for part in train_sets]
corpus_size = [len(part['corpus']) for part in train_sets]
relevant_docs_size = [len(part['relevant_docs']) for part in train_sets]

# Merge all sections entry by entry; 'mode' is a plain value and is set once afterwards.
train_set = {}
for part in train_sets:
    for section, entries in part.items():
        if section == 'mode':
            continue
        for entry_id in entries:
            train_set.setdefault(section, {})[entry_id] = entries[entry_id]

train_set["mode"] = "text"

# A smaller merged size would mean that ids collided between files.
assert sum(queries_size) == len(train_set['queries']), "Unmatched number of queries"
assert sum(corpus_size) == len(train_set['corpus']), "Unmatched number of corpus"
assert sum(relevant_docs_size) == len(train_set['relevant_docs']), "Unmatched number of relevant_docs"

# Remove bad questions
train_set['queries'] = {
    query_id: query
    for query_id, query in train_set['queries'].items()
    if query not in bad_queries and 'question 1' not in query.lower()
}

# Process
for section in ('queries', 'corpus'):
    train_set[section] = {
        item_id: preprocess(item_text)
        for item_id, item_text in train_set[section].items()
    }

# Save
with open("../data/finetune/datasets/train.json", 'w') as f:
    json.dump(train_set, f)

In [None]:
# Validation set
with open("../data/finetune/datasets/val.json", 'r') as file:
    val_dataset = json.load(file)

# Remove bad questions
val_dataset['queries'] = {
    query_id: query
    for query_id, query in val_dataset['queries'].items()
    if query not in bad_queries and 'question 1' not in query.lower()
}

# Process
for section in ('queries', 'corpus'):
    val_dataset[section] = {
        item_id: preprocess(item_text)
        for item_id, item_text in val_dataset[section].items()
    }

# Save (this overwrites the file that was loaded at the top of the cell)
with open("../data/finetune/datasets/val.json", 'w') as f:
    json.dump(val_dataset, f)

In [None]:
# Total number of queries kept across the train and validation sets.
len(train_set['queries']) + len(val_dataset['queries'])

# Clean Dataset (Part II)

In [63]:
def preprocess_2(text):
    """Strip common HTML tags from a text and collapse the resulting blank lines.

    Every tag listed in ``bad_strings`` is replaced by a newline. Afterwards
    runs of consecutive newlines, and runs of newlines separated by a single
    space, are collapsed into one newline.

    Args:
        text (str): context text that may contain HTML markup.

    Returns:
        str: the text without the listed tags and without repeated newlines.

    The collapsing is repeated until no match is left. Replacing a fixed list
    of patterns from shortest to longest left runs behind (e.g. four newlines
    became two), so runs of any length are now reduced to a single newline.
    """
    bad_strings = (
        '<br><br>', '<th></th>', '<td></td>', '<table border=\"1\">',
        '<tr></tr>', '<table></table>', '<td>', '</td>', '<tr>', '</tr>', '<table>', '</table>', '<th>', '</th>', '<br>',
        '<p>', '</p>', '<h1>', '</h1>', '<h2>', '</h2>', '<h3>', '</h3>', '<h4>', '</h4>', '<h5>', '</h5>', '<h6>', '</h6>', '<ul>', '</ul>', '<ol>', '</ol>', '<li>', '</li>',
        '<strong>', '</strong>', '<em>', '</em>', '<b>', '</b>', '<i>', '</i>', '<u>', '</u>', '<sub>', '</sub>', '<sup>', '</sup>', '<code>', '</code>', '<pre>', '</pre>', '<blockquote>', '</blockquote>', '<hr>', '<br>', '<br />', '<br/>',
    )
    for bad_string in bad_strings:
        text = text.replace(bad_string, '\n')

    # Collapse runs of newlines; each pass roughly halves a run, so loop until stable.
    while '\n\n' in text:
        text = text.replace('\n\n', '\n')

    # Collapse newlines that are separated only by a single space.
    while '\n \n' in text:
        text = text.replace('\n \n', '\n')

    return text


def convert_to_question_context_pairs(dataset):
    """Build a numbered mapping of question/context pairs from a QA dataset.

    Args:
        dataset (dict): mapping with the sections ``'queries'``
            (query id -> question), ``'relevant_docs'`` (query id -> list of
            corpus ids) and ``'corpus'`` (corpus id -> text).

    Returns:
        dict: ``{position: [query_id, question, context]}`` where positions
        start at 1 and follow the order of ``dataset['queries']``. The context
        is the first relevant document of the query, cleaned by ``preprocess_2``.
    """
    pairs = {}
    for position, (query_id, question) in enumerate(dataset['queries'].items(), start=1):
        first_doc_id = dataset['relevant_docs'][query_id][0]
        cleaned_context = preprocess_2(dataset['corpus'][first_doc_id])
        pairs[position] = [query_id, question, cleaned_context]
    return pairs

In [None]:
# Train set
# Load the cleaned training dataset.
with open("../data/finetune/datasets/train.json", 'r') as src:
    train_dataset = json.load(src)

# Flatten it into numbered [query_id, question, context] entries and save them.
train_pairs = convert_to_question_context_pairs(train_dataset)
with open("../data/finetune/datasets/test/train_pairs.json", 'w') as dst:
    json.dump(train_pairs, dst)

# # Check percentage of removed items
# with open("../data/finetune/datasets/test/modified_train_pairs.json", 'r') as file:
#     modified_train_pairs = json.load(file)
# round((len(list(train_pairs.keys())) - len(list(modified_train_pairs.keys()))) / len(list(train_pairs.keys())) * 100)

In [None]:
# Validation set
# Load the cleaned validation dataset.
with open("../data/finetune/datasets/val.json", 'r') as src:
    val_dataset = json.load(src)

# Flatten it into numbered [query_id, question, context] entries and save them.
val_pairs = convert_to_question_context_pairs(val_dataset)
with open("../data/finetune/datasets/test/val_pairs.json", 'w') as dst:
    json.dump(val_pairs, dst)

# Check percentage of removed items
# NOTE(review): modified_validation_pairs.json is not written by this
# notebook — presumably it is produced by a separate filtering step; confirm.
with open("../data/finetune/datasets/test/modified_validation_pairs.json", 'r') as src:
    modified_validation_pairs = json.load(src)

n_original = len(val_pairs)
n_kept = len(modified_validation_pairs)
round((n_original - n_kept) / n_original * 100)

15