# Finetuning Embedding Models

In [None]:
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install accelerate
%pip install -i https://pypi.org/simple/ bitsandbytes
%pip install sentence-transformers

%pip install llama-index llama-index-llms-huggingface llama-index-embeddings-huggingface llama-index-readers-web

%pip install llama-index-embeddings-openai
%pip install llama-index-embeddings-adapter
%pip install llama-index-finetuning
%pip install llama-index-llms-ollama

### Prepare Data

We use two research papers on LLM scaling laws as source documents. The training data comes from *Scaling Laws for Downstream Task Performance of Large Language Models*, and the validation data comes from *Unraveling the Mystery of Scaling Laws*.

In [None]:
# Source PDFs: one list feeds the training corpus, the other the validation corpus.
TRAIN_FILES = [
    "./Scaling_Laws_for_Downstream_Task_Performance_of_Large_Language_Models.pdf",
]
VAL_FILES = [
    "./Unraveling_the_Mystery_of_Scaling_Laws.pdf",
]

# Corpus JSON paths (not referenced elsewhere in the visible part of the notebook).
TRAIN_CORPUS_FPATH = "./train_corpus.json"
VAL_CORPUS_FPATH = "./val_corpus.json"

Load the PDF files and chunk their text using LlamaIndex.

In [None]:
import json
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode

def load_corpus(files):
    """Read the given files and split them into sentence-based nodes.

    Args:
        files: Paths handed to ``SimpleDirectoryReader`` as ``input_files``.

    Returns:
        The nodes produced by a default ``SentenceSplitter`` over the loaded
        documents. The node count is printed as a side effect.
    """
    documents = SimpleDirectoryReader(input_files=files).load_data()
    splitter = SentenceSplitter()
    chunks = splitter.get_nodes_from_documents(documents, show_progress=True)
    print(f"Parsed {len(chunks)} nodes")
    return chunks

# Chunk each PDF set into nodes; these feed the synthetic QA generation further down.
train_nodes = load_corpus(TRAIN_FILES)
val_nodes = load_corpus(VAL_FILES)

### Generate Synthetic Dataset

We will generate a synthetic dataset using Ollama and the `zephyr-7b-beta` model.

In [None]:
?llama_index.llms.ollama.Ollama

In [None]:
from llama_index.llms.ollama import Ollama

# LLM used for synthetic question generation, served by an Ollama instance on localhost.
# NOTE(review): assumes the model has already been pulled into that Ollama server — confirm.
llm = Ollama(
    model="mattw/huggingfaceh4_zephyr-7b-beta:latest", 
    base_url="http://localhost:11434",
    temperature=0.7,
    context_window=3900,
    # Extra sampling options (top-k / nucleus) supplied via additional_kwargs.
    additional_kwargs = {"top_k": 50, "top_p": 0.95},
)


In [None]:
# [Optional] Use a Hugging Face LLM directly (slower) instead of Ollama to generate the synthetic dataset

import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

# 4-bit NF4 quantization with double quantization and float16 compute; used as the
# default `quantization_config` of huggingface_llm() below.
quantization_conf = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Rebuild the chat template (message formatting) so prompts follow the format zephyr-7b-beta expects, as per
# https://github.com/run-llama/llama_index/issues/9277#issuecomment-1837545398

def messages_to_prompt(messages):
    """Render chat messages in the zephyr-style role-tagged prompt layout.

    Each system, user, or assistant message becomes its role header, the message
    content, and an end-of-turn marker. Messages with any other role are skipped.
    An empty system turn is prepended when the result does not already start with
    one, and an open assistant header is appended at the end.

    Args:
        messages: Iterable of objects exposing ``role`` and ``content``.

    Returns:
        The assembled prompt string.
    """
    role_headers = (
        ("system", "<|system|>\n"),
        ("user", "<|user|>\n"),
        ("assistant", "<|assistant|>\n"),
    )

    turns = []
    for msg in messages:
        # First header whose role equals msg.role; None when the role is unrecognised.
        header = next((hdr for role, hdr in role_headers if msg.role == role), None)
        if header is not None:
            turns.append(f"{header}{msg.content}</s>\n")

    rendered = "".join(turns)

    # The prompt must open with a system turn; prepend an empty one when absent.
    if not rendered.startswith("<|system|>\n"):
        rendered = "<|system|>\n</s>\n" + rendered

    # The trailing assistant header cues the model to produce the reply.
    return rendered + "<|assistant|>\n"

def huggingface_llm(model_name="HuggingFaceH4/zephyr-7b-beta",
                    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
                    context_window=3900,
                    max_new_tokens=256,
                    quantization_config = quantization_conf
                   ):
    """Build a ``HuggingFaceLLM`` set up for zephyr-style prompting.

    Args:
        model_name: Hugging Face model id to load.
        tokenizer_name: Hugging Face tokenizer id to load.
        context_window: Context window size forwarded to ``HuggingFaceLLM``.
        max_new_tokens: Generation length limit forwarded to ``HuggingFaceLLM``.
        quantization_config: Quantization settings passed to the model loader via
            ``model_kwargs``; defaults to the module-level ``quantization_conf``.

    Returns:
        The configured ``HuggingFaceLLM`` instance.
    """
    hf_llm = HuggingFaceLLM(
        model_name=model_name,
        tokenizer_name=tokenizer_name,
        query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
        context_window=context_window,
        max_new_tokens=max_new_tokens,
        model_kwargs={"quantization_config": quantization_config},
        # do_sample=True is required for temperature/top_k/top_p to take effect;
        # without it transformers decodes greedily and ignores the sampling knobs.
        generate_kwargs={"do_sample": True, "temperature": 0.7, "top_k": 50, "top_p": 0.95},
        messages_to_prompt=messages_to_prompt,
        device_map="auto",
    )

    return hf_llm

# Running this optional cell rebinds `llm`, replacing the Ollama client created earlier.
llm = huggingface_llm()

In [None]:
# [Optional] Load synthetic dataset from JSON file
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

from pathlib import Path

# Reuse previously generated datasets only when both files are present, so this
# optional cell doesn't raise on a first run before anything has been saved.
if Path("train_dataset.json").exists() and Path("val_dataset.json").exists():
    train_dataset = EmbeddingQAFinetuneDataset.from_json("train_dataset.json")
    val_dataset = EmbeddingQAFinetuneDataset.from_json("val_dataset.json")
else:
    print("No saved datasets found; run the generation cell below to create them.")

In [None]:
from llama_index.finetuning import generate_qa_embedding_pairs

# Persist each dataset as soon as it is generated, so a failure while building the
# validation set doesn't throw away the already-generated training set.
train_dataset = generate_qa_embedding_pairs(
    llm=llm, nodes=train_nodes, verbose=False
)
train_dataset.save_json("train_dataset.json")

val_dataset = generate_qa_embedding_pairs(
    llm=llm, nodes=val_nodes, verbose=False
)
val_dataset.save_json("val_dataset.json")

### Finetune Embedding 

In [None]:
from llama_index.finetuning import SentenceTransformersFinetuneEngine

# Configure fine-tuning of the BAAI/bge-small-en embedding model on the synthetic
# training pairs, with the validation pairs supplied as val_dataset.
# NOTE(review): presumably the tuned model is written to "bge_ft_SL" — confirm.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id="BAAI/bge-small-en",
    model_output_path="bge_ft_SL",
    val_dataset=val_dataset,
)

In [None]:
# Run the fine-tuning job configured above.
finetune_engine.finetune()

In [None]:
# Fetch the fine-tuned embedding model; it is scored in the evaluation cells below.
embed_model = finetune_engine.get_finetuned_model()

### Evaluate Finetuned Model

Compare the new embedding model with the original and an OpenAI ada embedding model.

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Measure top-k retrieval hits for an embedding model on a QA dataset.

    Builds a ``VectorStoreIndex`` over the dataset corpus with ``embed_model``,
    retrieves ``top_k`` nodes for every query, and records whether the first
    relevant document id is among the retrieved node ids.

    Args:
        dataset: Object exposing ``corpus``, ``queries`` and ``relevant_docs`` mappings.
        embed_model: Embedding model forwarded to ``VectorStoreIndex``.
        top_k: Number of nodes retrieved per query.
        verbose: Accepted but not used by this function.

    Returns:
        One dict per query with keys ``is_hit``, ``retrieved``, ``expected`` and
        ``query`` (``query`` holds the query id, not the query text).
    """
    id_to_text = dataset.corpus
    id_to_query = dataset.queries
    ground_truth = dataset.relevant_docs

    index = VectorStoreIndex(
        [TextNode(id_=node_id, text=body) for node_id, body in id_to_text.items()],
        embed_model=embed_model,
        show_progress=True,
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    def score(query_id, query_text):
        """Retrieve for one query and report whether the expected node came back."""
        hits = [hit.node.node_id for hit in retriever.retrieve(query_text)]
        # Only the first relevant document is checked (one relevant doc assumed).
        target = ground_truth[query_id][0]
        return {
            "is_hit": target in hits,
            "retrieved": hits,
            "expected": target,
            "query": query_id,
        }

    return [score(qid, text) for qid, text in tqdm(id_to_query.items())]

In [None]:
# OpenAI Eval

import os

# Read the key from the environment instead of embedding it in the notebook;
# a missing OPENAI_API_KEY fails immediately with a KeyError.
ada = OpenAIEmbedding(api_key=os.environ["OPENAI_API_KEY"])
ada_val_results = evaluate(val_dataset, ada)

df_ada = pd.DataFrame(ada_val_results)

hit_rate_ada = df_ada["is_hit"].mean()
hit_rate_ada # 0.8 // 0.8913 // 0.8695 // 0.93478(HF) // 0.9347826086956522(HF) 
# // 0.9347826086956522(HF) // 0.8695652173913043(O-B) // 0.9347826086956522(OL) // 0.9130434782608695(OL)


In [None]:
# BGE Pretrained Eval

# Baseline: the pretrained BGE model, scored on the validation set so its hit rate
# is comparable with the OpenAI and fine-tuned results (both use val_dataset).
bge = "local:BAAI/bge-small-en"
bge_val_results = evaluate(val_dataset, bge)

df_bge = pd.DataFrame(bge_val_results)

hit_rate_bge = df_bge["is_hit"].mean()
hit_rate_bge # 0.8222 // 0.956521 // 0.82608 // 0.8695(HF) // 0.8695652173913043(HF) 
# // 0.8695652173913043(HF) // 0.8260869565217391(O-B) // 0.8913043478260869(OL) // 0.8043478260869565(OL)
# NOTE(review): the figures above may have been recorded against train_dataset — re-run to refresh.

In [None]:
# FT BGE Embedding Eval
# Score the fine-tuned model on the validation set; the trailing bare expression
# displays the hit rate (fraction of queries whose expected node was retrieved).
val_results_finetuned = evaluate(val_dataset, embed_model)
df_finetuned = pd.DataFrame(val_results_finetuned)
hit_rate_finetuned = df_finetuned["is_hit"].mean()
# Trailing figures are hit rates noted from earlier runs.
hit_rate_finetuned # 0.93333 // 0.97826 // 0.9130434782608695 // 0.95652(HF) // 0.95652173(HF) 
# // 0.9565217391304348(HF) // 0.9782608695652174(O-B) // 0.8695652173913043(OL) // 0.9130434782608695(OL)