In [None]:
!wget '<https://arxiv.org/pdf/2402.04177.pdf>'  -O "Scaling_Laws_for_Downstream_Task_Performance_of_Large_Language_Models.pdf"
!wget '<https://arxiv.org/pdf/2403.06563.pdf>' -O "Unraveling_the_Mystery_of_Scaling_Laws.pdf"

In [None]:
# Hugging Face stack: transformers is installed from the GitHub main branch,
# the remaining packages from the package index.
# NOTE(review): none of these installs pin a version, so a fresh run may pick up
# different releases than the ones this notebook was written against.
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install accelerate
%pip install -i https://pypi.org/simple/ bitsandbytes
%pip install sentence-transformers

# LlamaIndex plus its Hugging Face LLM / embedding integrations and the web reader.
%pip install llama-index llama-index-llms-huggingface llama-index-embeddings-huggingface llama-index-readers-web

# Packages behind the llama_index.embeddings.openai and llama_index.finetuning
# imports used in later cells.
%pip install llama-index-embeddings-openai
%pip install llama-index-embeddings-adapter
%pip install llama-index-finetuning

In [None]:
import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM


import os
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including library configuration/deprecation warnings.
warnings.filterwarnings('ignore')


# Input PDFs. They are expected to already be present in `working_dir`;
# nothing in this cell downloads them.
training_file_name = "lyft_2021.pdf"
validation_file_name = "uber_2021.pdf"

working_dir = "./"

TRAIN_FILES = [os.path.join(working_dir, training_file_name)]
TRAIN_CORPUS_FPATH = "./train_corpus.json"

VAL_FILES = [os.path.join(working_dir, validation_file_name)]
VAL_CORPUS_FPATH = "./val_corpus.json"

print(f"Train files: {TRAIN_FILES}")
print(f"Val files: {VAL_FILES}")

# Report missing inputs here, next to the configuration, so the problem is
# visible before the corpus-loading cell runs.
missing_files = [path for path in TRAIN_FILES + VAL_FILES if not os.path.isfile(path)]
if missing_files:
    print(f"WARNING: input files not found: {missing_files}")

In [None]:
import json

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

def load_corpus(files):
    """Read the given files and split their contents into nodes.

    Args:
        files: Paths passed to ``SimpleDirectoryReader`` as ``input_files``.

    Returns:
        The nodes produced by a default-configured ``SentenceSplitter`` for
        the loaded documents. The node count is printed as a side effect.
    """
    documents = SimpleDirectoryReader(input_files=files).load_data()
    nodes = SentenceSplitter().get_nodes_from_documents(
        documents, show_progress=True
    )
    print(f"Parsed {len(nodes)} nodes")
    return nodes

# Parse the training and validation files into separate node lists.
train_nodes = load_corpus(TRAIN_FILES)
val_nodes = load_corpus(VAL_FILES)

In [None]:
# Quantization settings forwarded to the model loader via `model_kwargs` in
# huggingface_llm(): 4-bit loading, "nf4" quantization type, float16 compute
# dtype, and double quantization switched on.
quantization_conf = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Rebuild the chat prompt by hand so that it follows the format zephyr-7b-beta expects; see
# https://github.com/run-llama/llama_index/issues/9277#issuecomment-1837545398

def messages_to_prompt(messages):
    """Render chat messages as one zephyr-style prompt string.

    Every message whose ``role`` equals 'system', 'user' or 'assistant' is
    written as a role tag line, the message content, and an end-of-turn
    marker. Messages with any other role are left out. A blank system turn is
    put in front when the result would not otherwise begin with one, and the
    returned string always ends with an open assistant turn.
    """
    turns = []
    for msg in messages:
        # Compare with == rather than a dict lookup so that role objects which
        # merely compare equal to these strings are still matched.
        for role_name in ('system', 'user', 'assistant'):
            if msg.role == role_name:
                turns.append(f"<|{role_name}|>\n{msg.content}</s>\n")
                break

    body = "".join(turns)

    # A conversation without a leading system turn gets an empty one.
    if not body.startswith("<|system|>\n"):
        body = "<|system|>\n</s>\n" + body

    # Leave the assistant turn open so the model continues from here.
    return body + "<|assistant|>\n"

## Load synthetic data generation model

In [None]:
def huggingface_llm(model_name="HuggingFaceH4/zephyr-7b-beta",
                    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
                    context_window=3900,
                    max_new_tokens=256,
                    quantization_config=quantization_conf,
                    generate_kwargs=None
                   ):
    """Build the HuggingFaceLLM used for synthetic question generation.

    Args:
        model_name: Hugging Face model id to load.
        tokenizer_name: Hugging Face tokenizer id to load.
        context_window: Context window size passed to ``HuggingFaceLLM``.
        max_new_tokens: Generation length limit passed to ``HuggingFaceLLM``.
        quantization_config: Passed to the model loader as
            ``model_kwargs["quantization_config"]``.
        generate_kwargs: Generation options forwarded to ``HuggingFaceLLM``.
            When omitted, sampling is enabled with temperature 0.7,
            top_k 50 and top_p 0.95.

    Returns:
        The configured ``HuggingFaceLLM`` instance.

    Note:
        The query wrapper and ``messages_to_prompt`` emit zephyr-style chat
        markup, so a different ``model_name`` may need different templates.
    """
    if generate_kwargs is None:
        # temperature / top_k / top_p only take effect when sampling is on;
        # without do_sample=True decoding is greedy and they are ignored.
        generate_kwargs = {"do_sample": True, "temperature": 0.7, "top_k": 50, "top_p": 0.95}

    llm = HuggingFaceLLM(
        model_name=model_name,
        tokenizer_name=tokenizer_name,
        query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
        context_window=context_window,
        max_new_tokens=max_new_tokens,
        model_kwargs={"quantization_config": quantization_config},
        generate_kwargs=generate_kwargs,
        messages_to_prompt=messages_to_prompt,
        device_map="auto",
    )

    return llm

llm = huggingface_llm()

## Generate synthetic data - Training Set

In [None]:
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

# Build the training dataset from the training nodes using the local LLM.
# NOTE(review): presumably this calls the LLM for every node, so the cell can
# take a long time; confirm before re-running.
train_dataset = generate_qa_embedding_pairs(train_nodes, llm)

In [None]:
# Show a compact summary instead of echoing the whole dataset object.
print(f"train_dataset: {len(train_dataset.queries)} queries over {len(train_dataset.corpus)} corpus chunks")

## Fine Tune embedding model w/ synthetic data

In [None]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.core.embeddings import resolve_embed_model
import torch

def embedding_model(model="local:avsolatorio/GIST-large-Embedding-v0",
                    model_output_path="model_output_test",
                    bias=True,
                    no_of_epochs=4,
                    verbose=True,
                    optimizer=torch.optim.AdamW,
                    optimizer_params=None,
                    dataset=None
                   ):
    """Create an adapter fine-tuning engine for an embedding model.

    Args:
        model: Embedding model spec resolved with ``resolve_embed_model``.
        model_output_path: Output path handed to the engine.
        bias: Forwarded to the engine's ``bias`` option.
        no_of_epochs: Number of training epochs.
        verbose: Forwarded to the engine's ``verbose`` option.
        optimizer: Optimizer class used for training.
        optimizer_params: Keyword arguments for the optimizer. Defaults to
            ``{"lr": 0.01}``. A copy is passed on, so the caller's dict is
            never shared with the engine.
        dataset: Dataset to fine-tune on. Defaults to the notebook-level
            ``train_dataset``.

    Returns:
        The configured ``EmbeddingAdapterFinetuneEngine``; training is not
        started here.
    """
    # Build the default per call: a dict literal in the signature would be one
    # shared object reused by every call.
    if optimizer_params is None:
        optimizer_params = {"lr": 0.01}
    if dataset is None:
        dataset = train_dataset

    base_embed_model = resolve_embed_model(model)
    engine = EmbeddingAdapterFinetuneEngine(
        dataset,
        base_embed_model,
        model_output_path=model_output_path,
        bias=bias,
        epochs=no_of_epochs,
        verbose=verbose,
        optimizer_class=optimizer,
        optimizer_params=dict(optimizer_params)
    )

    return engine

finetune_engine = embedding_model()
finetune_engine.finetune()

In [None]:
# Embedding model returned by the fine-tuning engine after training.
# NOTE(review): the "Fine Tuned GIST" evaluation cell fetches the model again
# under the name `embed_model` instead of using this variable.
ft_embed_model = finetune_engine.get_finetuned_model()

## Generate Synthetic data - Evaluation Set

In [None]:
# Build the evaluation dataset from the validation nodes with the same LLM
# that produced the training dataset.
val_dataset = generate_qa_embedding_pairs(val_nodes, llm)

In [None]:
# Show a compact summary instead of printing the whole dataset object.
print(f"val_dataset: {len(val_dataset.queries)} queries over {len(val_dataset.corpus)} corpus chunks")

# Evaluate Embedding Model Performance

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

In [None]:
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Check, per query, whether retrieval finds the expected corpus entry.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (id -> query text) and ``relevant_docs`` (query id -> list of ids).
        embed_model: Embedding model handed to ``VectorStoreIndex``.
        top_k: Number of nodes retrieved for each query.
        verbose: Accepted but not used by this function.

    Returns:
        One dict per query with the keys ``is_hit``, ``retrieved``,
        ``expected`` and ``query`` (the query id).
    """
    corpus_texts = dataset.corpus
    query_texts = dataset.queries
    ground_truth = dataset.relevant_docs

    index = VectorStoreIndex(
        [TextNode(id_=node_id, text=node_text) for node_id, node_text in corpus_texts.items()],
        embed_model=embed_model,
        show_progress=True,
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    def _score(query_id, query_text):
        # Only the first relevant document is treated as the expected one.
        retrieved_ids = [hit.node.node_id for hit in retriever.retrieve(query_text)]
        expected_id = ground_truth[query_id][0]
        return {
            "is_hit": expected_id in retrieved_ids,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }

    return [
        _score(query_id, query_text)
        for query_id, query_text in tqdm(query_texts.items())
    ]

In [None]:
## OpenAI ADA Embedding Model (Gold Standard)

# Take the API key from the environment so no credential is stored in the notebook.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

ada = OpenAIEmbedding(api_key=openai_api_key)
ada_val_results = evaluate(val_dataset, ada)

# Hit rate = fraction of queries whose expected document was retrieved.
df_ada = pd.DataFrame(ada_val_results)
hit_rate_ada = df_ada["is_hit"].mean()
hit_rate_ada

In [None]:
## Pretrained GIST Embedding Model

# The model is handed to evaluate() as a "local:..." string rather than as an
# embedding object.
GIST_model = "local:avsolatorio/GIST-large-Embedding-v0"
GIST_val_results = evaluate(val_dataset, GIST_model)
df_embed_models = pd.DataFrame(GIST_val_results)
# NOTE(review): the name says "bge" although the model evaluated here is GIST;
# the results cell reads the value under this name.
hit_rate_bge = df_embed_models["is_hit"].mean()
hit_rate_bge

In [None]:
## Fine Tuned GIST Embedding Model

embed_model = finetune_engine.get_finetuned_model()

val_results_finetuned = evaluate(val_dataset, embed_model)
df_embed_models_finetuned = pd.DataFrame(val_results_finetuned)
# Use the fine-tuned results frame here; reading `df_embed_models` would just
# repeat the pretrained model's hit rate.
hit_rate_bge_finetuned = df_embed_models_finetuned["is_hit"].mean()
hit_rate_bge_finetuned

In [None]:
## Results

# One "label: value" line per model, all in the same format.
print(f"hit_rate_bge_pretrained: {hit_rate_bge}\nhit_rate_finetuned: {hit_rate_bge_finetuned}\nhit_rate_ada: {hit_rate_ada}")