In [23]:
'''
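# Manual patch: in venv\Lib\site-packages\llama_index\finetuning\embeddings\common.py, add the lines marked "added" to generate_qa_embedding_pairs so the loop pauses once more than 10 nodes have been processed in under 60 seconds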
def generate_qa_embedding_pairs
    ...
    save_counter = start_index

    # added --------------------------------------------------------------
    import time
    counter, start_time = 0, time.time()
    # --------------------------------------------------------------------

    for node_id, text in tqdm(
        list(node_dict.items())[start_index:], initial=start_index
    ):
        
        # added --------------------------------------------------------------
        counter += 1
        if counter > 10 and time.time() - start_time < 60:
            time.sleep(60 - (time.time() - start_time))
            counter, start_time = 0, time.time()
        # --------------------------------------------------------------------

        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
    ...
''';

In [49]:
import os
import json
import glob
import pickle
import openparse
from tqdm import tqdm
from huggingface_hub import login
import google.generativeai as genai
from llama_index.core import Settings
from llama_index.llms.gemini import Gemini
from transformers import BitsAndBytesConfig
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.llms.chatml_utils import messages_to_prompt, completion_to_prompt








In [20]:
# Project-local cache directories, exported through the environment variables
# HF_HOME and TIKTOKEN_CACHE_DIR (paths are relative to the working directory).
# NOTE(review): this cell runs after the import cell; confirm the already-imported
# Hugging Face libraries still honour HF_HOME when it is set this late.
HF_CACHE_DIR = "../models/hf"
TIKTOKEN_CACHE_DIR = "../models/tiktoken"

os.environ.update({
    "HF_HOME": HF_CACHE_DIR,
    "TIKTOKEN_CACHE_DIR": TIKTOKEN_CACHE_DIR,
})
# assert os.path.exists(os.path.join(TIKTOKEN_CACHE_DIR, "9b5ad71b2ce5302211f9c61530b329a4922fc6a4"))

In [None]:
# Read API tokens (SHOULD BE CREATED BY USER)
# Each token name is indexed with [0] below, so the file is expected to map
# every name to a list whose first entry is the token.
with open('../reqs/tokens.json', 'r') as token_file:
    tokens = json.loads(token_file.read())

HF_ACCESS_TOKEN = tokens['HF_ACCESS_TOKEN'][0]
GOOGLE_API_KEY = tokens['GOOGLE_API_KEY'][0]

# Log in to the Hugging Face Hub with the access token.
login(token=HF_ACCESS_TOKEN)

# Set/Load LLM

In [7]:
# API model
# Gemini is built from an explicit option dict and then registered as the
# default LLM on llama_index's global Settings.
gemini_options = {
    "api_key": GOOGLE_API_KEY,
    "model": "models/gemini-1.0-pro",
    "temperature": 0.3,
}
llm = Gemini(**gemini_options)
Settings.llm = llm

In [3]:
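# # Local model (alternative to the API model; uncommenting also requires `import torch`, which the import cell does not include)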
# llm_name = "meta-llama/Llama-3.2-3B"

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit = True,
#     bnb_4bit_compute_dtype = torch.float16,
#     bnb_4bit_quant_type = "nf4",
#     bnb_4bit_use_double_quant = True,
# )

# llm = HuggingFaceLLM(
#     model_name = llm_name,
#     tokenizer_name = llm_name,
#     context_window = 2048,
#     max_new_tokens = 512,

#     generate_kwargs = {
#         "do_sample": True,
#         "temperature": 0.5,
#     },
#     model_kwargs = {
#         # "torch_dtype": torch.float16,
#         "quantization_config": quantization_config,
#         "cache_dir": HF_CACHE_DIR,
#     },
#     device_map = "auto",
#     is_chat_model = True,

#     completion_to_prompt = completion_to_prompt,
#     messages_to_prompt = messages_to_prompt,
# )

# Settings.llm = llm

# Parse Files

In [None]:
# Training PDFs live in folders train_1 .. train_<num_train_folders>.
num_train_folders = 5

# One list of PDF paths per training folder, in folder order.
train_files = [
    glob.glob(f"../data/finetune/docs/train_{folder_no}/*.pdf")
    for folder_no in range(1, num_train_folders + 1)
]

val_files = glob.glob("../data/finetune/docs/val/*.pdf")

# Display what was found.
train_files, val_files

In [None]:
def parse_corpus(files):
    """Parse PDF files with openparse and collect their llama_index nodes.

    Args:
        files: Iterable of PDF file paths.

    Returns:
        A flat list holding the nodes of every file that parsed successfully,
        in input order.

    Files whose parsing raises ``ValueError`` are skipped (best effort), but
    each skipped file is reported so gaps in the corpus do not go unnoticed.
    """
    parser = openparse.DocumentParser(
        table_args = {"parsing_algorithm": "pymupdf",},
    )

    nodes = []
    for file in tqdm(files):
        try:
            nodes.extend(parser.parse(file, ocr=True).to_llama_index_nodes())
        except ValueError as exc:
            # tqdm.write prints the message without breaking the progress bar.
            tqdm.write(f"Skipping {file}: {exc}")

    return nodes

# Parse and save
# Every folder's nodes are pickled next to its PDFs; the node count is printed
# after each folder.
for folder_idx in range(num_train_folders):
    train_nodes = parse_corpus(train_files[folder_idx])
    with open(f'../data/finetune/docs/train_{folder_idx+1}/nodes.pkl', 'wb') as file:
        pickle.dump(train_nodes, file)
    print(len(train_nodes))

val_nodes = parse_corpus(val_files)
with open('../data/finetune/docs/val/nodes.pkl', 'wb') as file:
    pickle.dump(val_nodes, file)
print(len(val_nodes))

# Generate Datasets

In [None]:
# Only the folder whose number equals the literal in the guard below is
# processed; every other folder is skipped.
for i in range(num_train_folders):
    folder_no = i + 1
    if folder_no != 1:
        continue

    # Reload the nodes pickled for this folder.
    with open(f'../data/finetune/docs/train_{folder_no}/nodes.pkl', 'rb') as file:
        train_nodes = pickle.load(file)

    train_dataset = generate_qa_embedding_pairs(
        llm = llm,
        nodes = train_nodes,
        num_questions_per_chunk = 2,
        output_path = f"../data/finetune/datasets/train_{folder_no}.json",
    )

In [None]:
# Reload the validation nodes from disk, the same way the training nodes are
# reloaded, so this cell does not depend on `val_nodes` still being in memory
# from the parsing cell.
with open('../data/finetune/docs/val/nodes.pkl', 'rb') as file:
    val_nodes = pickle.load(file)

val_dataset = generate_qa_embedding_pairs(
    llm = llm,
    nodes = val_nodes,
    num_questions_per_chunk = 2,
    output_path = "../data/finetune/datasets/val.json",
)

In [51]:
# Query texts treated as invalid; a query is dropped only when its text equals
# one of these strings exactly.
bad_queries = ("**Question 1:**",)


def _load_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, 'r') as file:
        return json.load(file)


def _merge_datasets(datasets):
    """Merge the dict-valued sections of several dataset dicts into one dict.

    Every top-level key except 'mode' is treated as a mapping and its entries
    are copied into the section of the same name in the result; on a key clash
    the later dataset wins. The result's 'mode' is always "text".
    """
    merged = {}
    for dataset in datasets:
        for section, entries in dataset.items():
            if section == 'mode':
                continue
            # setdefault also creates the section when `entries` is empty.
            merged.setdefault(section, {}).update(entries)
    merged["mode"] = "text"
    return merged


def _drop_bad_queries(dataset, bad=bad_queries):
    """Return a copy of ``dataset`` without the queries whose text is in ``bad``.

    The 'relevant_docs' section is narrowed to the surviving query ids so it
    never references a query that no longer exists.
    """
    kept_queries = {
        query_id: text
        for query_id, text in dataset['queries'].items()
        if text not in bad
    }
    kept_relevant_docs = {
        query_id: docs
        for query_id, docs in dataset['relevant_docs'].items()
        if query_id in kept_queries
    }
    return {**dataset, 'queries': kept_queries, 'relevant_docs': kept_relevant_docs}


# Combine train sets
# Sorted so the merge order does not depend on the order glob happens to return.
json_files = sorted(glob.glob("../data/finetune/datasets/train_*.json"))
train_sets = [_load_json(path) for path in json_files]
assert train_sets, "No train_*.json datasets found to combine"

queries_size = [len(t['queries']) for t in train_sets]
corpus_size = [len(t['corpus']) for t in train_sets]
relevant_docs_size = [len(t['relevant_docs']) for t in train_sets]

train_set = _merge_datasets(train_sets)

# Equal totals mean no id occurred in more than one file (a repeated id would
# have been overwritten during the merge).
assert sum(queries_size) == len(train_set['queries'].keys()), "Unmatched number of queries"
assert sum(corpus_size) == len(train_set['corpus'].keys()), "Unmatched number of corpus"
assert sum(relevant_docs_size) == len(train_set['relevant_docs'].keys()), "Unmatched number of relevant_docs"

# Remove bad questions
train_set = _drop_bad_queries(train_set)

with open("../data/finetune/datasets/train.json", 'w') as f:
    json.dump(train_set, f)


# Remove bad queries
val_dataset = _drop_bad_queries(_load_json("../data/finetune/datasets/val.json"))

with open("../data/finetune/datasets/val.json", 'w') as f:
    json.dump(val_dataset, f)

# Finetune Embedding Model

In [1]:
import os
from pathlib import Path
from sentence_transformers import SentenceTransformer, losses
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.finetuning import SentenceTransformersFinetuneEngine
from sentence_transformers.evaluation import InformationRetrievalEvaluator


# Project-local directory exported as SENTENCE_TRANSFORMERS_HOME.
stransformers_cache_dir = "../models/stransformers"
os.environ.update(SENTENCE_TRANSFORMERS_HOME=stransformers_cache_dir)

In [2]:
# Question/answer datasets written by the dataset-generation cells.
train_dataset = EmbeddingQAFinetuneDataset.from_json("../data/finetune/datasets/train.json")
val_dataset = EmbeddingQAFinetuneDataset.from_json("../data/finetune/datasets/val.json")

# Candidate base models; `model_index` picks one using 1-based numbering.
models = [
    "Snowflake/snowflake-arctic-embed-m",
    "Snowflake/snowflake-arctic-embed-l",
    # Was "dunzhang/stella_en_400M_v5l"; the trailing "l" looked like a typo.
    # NOTE(review): confirm this id against the Hugging Face Hub.
    "dunzhang/stella_en_400M_v5",
]

In [4]:
# Run settings:
#   model_index - 1-based position in `models` of the base model to use
#   num_times   - how many fine-tuning rounds to chain
#   epochs      - epochs per round
model_index, num_times, epochs = 1, 1, 2

In [None]:
# Hub id of the selected base model and its short name (the part after the
# last "/"), which is used in every local checkpoint path.
base_model_id = models[model_index-1]
base_name = base_model_id.split('/')[-1]

for i in range(1, num_times + 1):
    # Round 1 starts from the base model; each later round starts from the
    # checkpoint written by the previous round.
    model_id = \
        base_model_id if i == 1 else \
        f"../models/stransformers/{base_name}-finetuned-{(i-1)*epochs}"

    # model = SentenceTransformer(embedding_name, trust_remote_code=True)
    # loss = losses.MultipleNegativesRankingLoss(model)
    # loss = losses.MatryoshkaLoss(model, loss, [768, 256])  # for m
    # loss = losses.MatryoshkaLoss(model, loss, [1024, 512])  # for l

    finetune_engine = SentenceTransformersFinetuneEngine(
        dataset = train_dataset,
        model_id = model_id,
        # The path is built from the base name directly. Deriving it from the
        # previous path with str.replace() on the epoch count would also
        # rewrite the same digits wherever they occur in the model name.
        model_output_path = f"../models/stransformers/{base_name}-finetuned-{i*epochs}",
        val_dataset = val_dataset,
        epochs = epochs,
        trust_remote_code = True,
        batch_size = 12 if model_index == 1 else 2,
        evaluation_steps = 100 if model_index == 1 else 200,
        # loss = loss,
    )

    finetune_engine.finetune()

# Evaluate

In [6]:
def evaluate(model_id, dataset, name, output_path):
    """Run an information-retrieval evaluation of one embedding model.

    Args:
        model_id: Model name or local path passed to ``SentenceTransformer``.
        dataset: Object exposing ``corpus``, ``queries`` and ``relevant_docs``.
        name: Label passed to the evaluator as its ``name``.
        output_path: Directory handed to the evaluator; created (including
            parents) if it does not exist.

    Returns:
        Whatever the ``InformationRetrievalEvaluator`` call returns.
    """
    Path(output_path).mkdir(exist_ok=True, parents=True)

    evaluator = InformationRetrievalEvaluator(
        dataset.queries, dataset.corpus, dataset.relevant_docs, name=name,
    )

    # Matches the fine-tuning engine, which also loads with
    # trust_remote_code=True; models that ship custom code need it to load.
    model = SentenceTransformer(model_id, trust_remote_code=True)

    return evaluator(model, output_path=output_path)

In [None]:
# Score the base model once (in the first round), then the checkpoint of every
# fine-tuning round, all on the validation set.
results = {}
for i in range(1, num_times + 1):
    is_first_round = i == 1
    if is_first_round:
        model_base = models[model_index-1]

    # All evaluator output for this base model goes to one results folder.
    output_path = f"../results/{model_base.split('/')[-1]}"

    if is_first_round:
        results['base'] = evaluate(model_base, val_dataset, "base", output_path)

    run_name = f"finetuned-{i*epochs}"
    model_finetuned = f"../models/stransformers/{models[model_index-1].split('/')[-1]}-{run_name}"
    results[run_name] = evaluate(model_finetuned, val_dataset, run_name, output_path)

In [8]:
# For every round, print the round number, the base and fine-tuned scores, and
# the relative change against the base score rounded to a whole percent.
for i in range(1, num_times + 1):
    base_score = results['base']
    tuned_score = results[f"finetuned-{i*epochs}"]

    relative_gain = (tuned_score - base_score) / base_score
    improvement = round(relative_gain * 100)

    print(i)
    print(base_score, tuned_score)
    print(f"Improvement: {improvement}%")

1
0.40312160993233853 0.7864231160513896
Improvement: 95%
