<a href="https://colab.research.google.com/github/isamdr86/towards-ai/blob/main/notebooks/08-Finetune_Embedding_ir.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Packages and Setup Variables


In [1]:
!pip install -q llama-index==0.10.65 llama-index-finetuning llama-index-embeddings-adapter openai==1.37.0 tiktoken==0.7.0 chromadb==0.5.5 llama-index-vector-stores-chroma==0.1.10 cohere==5.6.2 llama-index-llms-gemini==0.1.11 html2text llama-index-llms-openai llama-index-embeddings-huggingface llama-index-embeddings-openai llama-index-llms-azure-openai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.6/80.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.0/337.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
%%capture
!pip install openai==1.55.3 httpx==0.27.2 tiktoken==0.7.0 --force-reinstall

In [4]:
import os
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')

# Download the Dataset


In [5]:
from huggingface_hub import hf_hub_download
file_path = hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="ai_tutor_knowledge.jsonl",repo_type="dataset",local_dir="/content")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ai_tutor_knowledge.jsonl:   0%|          | 0.00/6.96M [00:00<?, ?B/s]

In [6]:
import json
with open(file_path, "r") as file:
    ai_tutor_knowledge = [json.loads(line) for line in file]

len(ai_tutor_knowledge)

762

## LlamaIndex Document

In [8]:
ai_tutor_knowledge[0]

{'tokens': 768,
 'doc_id': 'd72b4670-84f0-54a3-b259-7eb7f218674e',
 'name': 'BERT HuggingFace Model Deployment using Kubernetes [ Github Repo]  03/07/2024',
 'url': 'https://towardsai.net/p/machine-learning/bert-huggingface-model-deployment-using-kubernetes-github-repo-03-07-2024',
 'source': 'tai_blog',
 'content': 'Github Repo : https://github.com/vaibhawkhemka/ML-Umbrella/tree/main/MLops/Model_Deployment/Bert_Kubernetes_deployment   Model development is useless if you dont deploy it to production  which comes with a lot of issues of scalability and portability.   I have deployed a basic BERT model from the huggingface transformer on Kubernetes with the help of docker  which will give a feel of how to deploy and manage pods on production.   Model Serving and Deployment:ML Pipeline:Workflow:   Model server (using FastAPI  uvicorn) for BERT uncased model    Containerize model and inference scripts to create a docker image    Kubernetes deployment for these model servers (for scalabilit

In [7]:
from typing import List
from llama_index.core import Document

def create_docs_from_list(data_list: List[dict]) -> List[Document]:
    documents = []
    for data in data_list:
        documents.append(
            Document(
                doc_id=data["doc_id"],
                text=data["content"],
                metadata={  # type: ignore
                    "url": data["url"],
                    "title": data["name"],
                    "tokens": data["tokens"],
                    "source": data["source"],
                },
                excluded_llm_metadata_keys=[
                    "title",
                    "tokens",
                    "source",
                ],
                excluded_embed_metadata_keys=[
                    "url",
                    "tokens",
                    "source",
                ],
            )
        )
    return documents

doc = create_docs_from_list(ai_tutor_knowledge)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


### Splitting Dataset


In [9]:
import random

random.shuffle(doc)
split_index = int(len(doc) * 0.9)

# TRAIN_DOCs and VALIDATION_DOCs
TRAIN_DOCs = doc[:split_index]
VALIDATION_DOCs = doc[split_index:]

# Chunking


In [10]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import Document

# Now use the parser
parser = SimpleNodeParser.from_defaults(chunk_size=768, chunk_overlap=64)
TRAIN_NODEs = parser.get_nodes_from_documents(TRAIN_DOCs)
VALIDATION_NODEs = parser.get_nodes_from_documents(VALIDATION_DOCs)

print(len(TRAIN_NODEs), len(VALIDATION_NODEs))

2895 216


In [11]:
# Use a subset of the dataset if testing.

# Test with a few sample, processing dataset fully can be costly depanding on the size.
# NOTE: Checkpoints are provided in the lesson, so no need to run the code on full dataset.

testing =False

if testing:
    TRAIN_NODEs = TRAIN_NODEs[0:10]
    VALIDATION_NODEs = VALIDATION_NODEs[0:5]

# Generate Question


We use a Large Language Model (LLM) to produce questions for each chunk of the dataset. Then we can use these data to train the model to develop embeddings that more accurately represent the types of questions users may ask.


In [12]:
# Use this block of code if you don't want to generate the questions for the dataset. (Avoid API call charges!)
# Uncomment the following code, and keep in mind to comment all the contents in the next coding block.


# from llama_index.finetuning import generate_qa_embedding_pairs
# from llama_index.llms.openai import OpenAI

# llm = OpenAI(model="gpt-4o-mini", temperature=1, max_tokens=512)

# # Generate questions for each chunk.

# TRAIN_DATASET = generate_qa_embedding_pairs(TRAIN_NODEs, llm=llm, output_path="./train_dataset.json")

# VALIDATION_DATASET = generate_qa_embedding_pairs(VALIDATION_NODEs, llm=llm, output_path="./val_dataset.json")

In [13]:
from huggingface_hub import snapshot_download
snapshot_download(repo_id="jaiganesan/Embedding-model-fine-tuning-dataset", repo_type="dataset",local_dir="/content/")


from llama_index.finetuning import EmbeddingQAFinetuneDataset

# Load the pre-generated questions json files.
TRAIN_DATASET = EmbeddingQAFinetuneDataset.from_json("./train_dataset.json")
VALIDATION_DATASET = EmbeddingQAFinetuneDataset.from_json("./val_dataset.json")

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

train_dataset.json:   0%|          | 0.00/8.23M [00:00<?, ?B/s]

val_dataset.json:   0%|          | 0.00/789k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/2.42k [00:00<?, ?B/s]

# Load an Embedding Model


In [14]:
from llama_index.core.embeddings import resolve_embed_model

# Load an existing embedding model with a linear layer adopter on top.
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

. BGE We chose the model because of its lightweight architecture, which makes it suitable for applications with limited computational resources. Also, the performance of smaller models tends to improve more noticeably with fine-tuning compared to larger models.


A linear adapter is just a linear transformation that transforms query embeddings while keeping document embeddings unchanged.

In [15]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
import torch

# Finetune the adapter
finetune_engine = EmbeddingAdapterFinetuneEngine(
    TRAIN_DATASET,
    base_embed_model,
    model_output_path="model_output_test",
    epochs=2,
    verbose=True,
    bias=True,
)

In [None]:
# Initiate the Finetuning process
finetune_engine.finetune()

[1;3;34m> Prepared optimizer, scheduler, and loss model.
[0m

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/545 [00:00<?, ?it/s]

[1;3;34m> [Epoch 0] Current loss: 1.5546023845672607
[0m[1;3;34m> [Epoch 0] Current loss: 1.5433043241500854
[0m[1;3;34m> [Epoch 0] Current loss: 1.3001140356063843
[0m[1;3;34m> [Epoch 0] Current loss: 1.209113359451294
[0m[1;3;34m> [Epoch 0] Current loss: 1.0383665561676025
[0m[1;3;34m> [Epoch 0] Current loss: 1.0853663682937622
[0m[1;3;34m> [Epoch 0] Current loss: 1.6050153970718384
[0m[1;3;34m> [Epoch 0] Current loss: 1.3934540748596191
[0m[1;3;34m> [Epoch 0] Current loss: 1.6227833032608032
[0m[1;3;34m> [Epoch 0] Current loss: 1.8313658237457275
[0m[1;3;34m> [Epoch 0] Current loss: 2.351806163787842
[0m[1;3;34m> [Epoch 0] Current loss: 2.3631062507629395
[0m[1;3;34m> [Epoch 0] Current loss: 2.266368865966797
[0m[1;3;34m> [Epoch 0] Current loss: 1.8658138513565063
[0m[1;3;34m> [Epoch 0] Current loss: 2.3685429096221924
[0m[1;3;34m> [Epoch 0] Current loss: 2.3456649780273438
[0m[1;3;34m> [Epoch 0] Current loss: 2.4202945232391357
[0m[1;3;34m> [Epoch

Iteration:   0%|          | 0/545 [00:00<?, ?it/s]

[1;3;34m> [Epoch 1] Current loss: 1.5021082162857056
[0m[1;3;34m> [Epoch 1] Current loss: 1.4780657291412354
[0m[1;3;34m> [Epoch 1] Current loss: 1.291349172592163
[0m[1;3;34m> [Epoch 1] Current loss: 1.1621007919311523
[0m[1;3;34m> [Epoch 1] Current loss: 1.0283396244049072
[0m[1;3;34m> [Epoch 1] Current loss: 1.0029467344284058
[0m[1;3;34m> [Epoch 1] Current loss: 1.5177440643310547
[0m[1;3;34m> [Epoch 1] Current loss: 1.3794138431549072
[0m[1;3;34m> [Epoch 1] Current loss: 1.616805076599121
[0m[1;3;34m> [Epoch 1] Current loss: 1.8005434274673462
[0m[1;3;34m> [Epoch 1] Current loss: 2.352266311645508
[0m[1;3;34m> [Epoch 1] Current loss: 2.3422789573669434
[0m[1;3;34m> [Epoch 1] Current loss: 2.263824701309204
[0m[1;3;34m> [Epoch 1] Current loss: 1.8562850952148438
[0m[1;3;34m> [Epoch 1] Current loss: 2.3466315269470215
[0m[1;3;34m> [Epoch 1] Current loss: 2.3370862007141113
[0m[1;3;34m> [Epoch 1] Current loss: 2.3952319622039795
[0m[1;3;34m> [Epoch 

In [None]:
embed_model = finetune_engine.get_finetuned_model()

# Or, import model from the directory whenever required.
#from llama_index.core.embeddings import LinearAdapterEmbeddingModel
#embed_model = LinearAdapterEmbeddingModel(base_embed_model, "model_output_test")

In [None]:
embed_model

## Fine tuning OpenAI Embedding Model using Adapter method

In [20]:
from llama_index.finetuning import EmbeddingAdapterFinetuneEngine
from llama_index.embeddings.openai import OpenAIEmbedding

openai_finetune_engine = EmbeddingAdapterFinetuneEngine(
    TRAIN_DATASET,
    OpenAIEmbedding(model="text-embedding-3-small"),
    model_output_path="model_output_test_openai",
    bias=True,
    epochs=2,
    verbose=True,
)

In [None]:
openai_finetune_engine.finetune()

openai_embed_model = openai_finetune_engine.get_finetuned_model()

In [None]:
openai_embed_model

# Evaluate


## Define the Evaluation Functions


In [18]:
from llama_index.core import ServiceContext, VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm import tqdm
import pandas as pd

def evaluate(dataset, embed_model, top_k=5, verbose=False):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    # Chunking the documents and generating embeddings
    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(nodes, service_context=service_context, show_progress=True)

    # Define a retriever to answer the questions
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []

    # Look into each response sources to see if the chunk that contains the answer is retrieved.
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_id = relevant_docs[query_id][0]

        try:
            rank = retrieved_ids.index(expected_id) + 1
            reciprocal_rank = 1 / rank
        except ValueError:
            rank = None
            reciprocal_rank = 0

        is_hit = expected_id in retrieved_ids

        eval_result = {
            "is_hit": is_hit,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
            "rank": rank,
            "reciprocal_rank": reciprocal_rank
        }
        eval_results.append(eval_result)

    return eval_results

## OpenAI Embedding Model Evaluation


In [21]:
# Load the OpenAI Ada model and evaluate it.
openai_text_embedding_small = OpenAIEmbedding(model="text-embedding-3-small")
openai_embedding_val_results = evaluate(VALIDATION_DATASET, openai_text_embedding_small)

  service_context = ServiceContext.from_defaults(embed_model=embed_model)


Generating embeddings:   0%|          | 0/248 [00:00<?, ?it/s]

100%|██████████| 496/496 [02:29<00:00,  3.32it/s]


In [22]:
openai_embedding_val_results = [
    result for result in openai_embedding_val_results if isinstance(result, dict)
]

df_openai = pd.DataFrame(openai_embedding_val_results)

hit_rate_openai = df_openai["is_hit"].mean()
mrr_openai = df_openai["reciprocal_rank"].mean()

print(f"Hit rate: {hit_rate_openai}")
print(f"MRR: {mrr_openai}")

Hit rate: 0.9354838709677419
MRR: 0.787869623655914


### OpenAI Embedding Model with Fine Tuned Adapter Model Evaluation

In [None]:
from llama_index.legacy.embeddings.adapter import LinearAdapterEmbeddingModel

openai_text_embedding_small = OpenAIEmbedding(model="text-embedding-3-small")
openai_embed_model = LinearAdapterEmbeddingModel(openai_text_embedding_small, "model_output_test_openai")

val_results_ft_openai = evaluate(VALIDATION_DATASET, openai_embed_model)

  torch.load(
  service_context = ServiceContext.from_defaults(embed_model=embed_model)


Generating embeddings:   0%|          | 0/248 [00:00<?, ?it/s]

100%|██████████| 496/496 [03:15<00:00,  2.54it/s]


In [None]:
val_results_ft_openai = [
    result for result in val_results_ft_openai if isinstance(result, dict)
]

df_openai_ft = pd.DataFrame(val_results_ft_openai)

hit_rate_openai_ft = df_openai_ft["is_hit"].mean()
mrr_openai_ft = df_openai_ft["reciprocal_rank"].mean()

print(f"Hit rate: {hit_rate_openai_ft}")
print(f"MRR: {mrr_openai_ft}")

Hit rate: 0.9637096774193549
MRR: 0.8210685483870968


## Open Source BAAI Model Evaluation


In [23]:
# Load the Base model without fine-tuning
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")
bge_val_results = evaluate(VALIDATION_DATASET, base_embed_model)

  service_context = ServiceContext.from_defaults(embed_model=embed_model)


Generating embeddings:   0%|          | 0/248 [00:00<?, ?it/s]

100%|██████████| 496/496 [00:53<00:00,  9.35it/s]


In [24]:
bge_val_results = [
    result for result in bge_val_results if isinstance(result, dict)
]

df_bge = pd.DataFrame(bge_val_results)

hit_rate_bge = df_bge["is_hit"].mean()
mrr_bge = df_bge["reciprocal_rank"].mean()

print(f"Hit rate: {hit_rate_bge}")
print(f"MRR: {mrr_bge}")

Hit rate: 0.8387096774193549
MRR: 0.7207325268817205


## FineTuned BAAI Adapter Embedding Model Evaluation


In [None]:
from llama_index.legacy.embeddings.adapter import LinearAdapterEmbeddingModel

# Load the Fine-tuned model.
embed_model = LinearAdapterEmbeddingModel(base_embed_model, "model_output_test")

val_results_finetuned = evaluate(VALIDATION_DATASET, embed_model)

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/legacy/_static/nltk_cache...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/legacy/_static/nltk_cache...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  torch.load(
  service_context = ServiceContext.from_defaults(embed_model=embed_model)


Generating embeddings:   0%|          | 0/248 [00:00<?, ?it/s]

100%|██████████| 496/496 [00:10<00:00, 47.26it/s]


In [None]:
val_results_finetuned = [
    result for result in val_results_finetuned if isinstance(result, dict)
]

df_finetuned = pd.DataFrame(val_results_finetuned)

hit_rate_finetuned = df_finetuned["is_hit"].mean()
mrr_finetuned = df_finetuned["reciprocal_rank"].mean()

print(f"Hit rate: {hit_rate_finetuned}")
print(f"MRR: {mrr_finetuned}")

Hit rate: 0.8629032258064516
MRR: 0.7499663978494624
