<a href="https://colab.research.google.com/github/isamdr86/towards-ai/blob/main/notebooks/GPT_4o_mini_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/GPT_4o_mini_Fine_Tuning.ipynb)

In [1]:
# Install the pinned RAG stack (LlamaIndex, OpenAI client, Chroma and its LlamaIndex adapter) plus jsonlines.
# NOTE(review): `pydantic` and `jsonlines` carry no version pin, and the following cell
# force-reinstalls openai at 1.55.3, overriding the 1.51.2 pin below.
!pip install -q llama-index==0.10.57 openai==1.51.2 chromadb==0.5.5 pydantic llama-index-vector-stores-chroma==0.1.10 jsonlines

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.7/383.7 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m584.3/584.3 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m71.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.6/278.6 kB[0m [31m19.8 MB/s[0m eta [36m0:

In [2]:
%%capture
# Force-reinstall openai, httpx and tiktoken at these exact versions on top of the previous
# install; %%capture keeps pip's output out of the notebook.
!pip install openai==1.55.3 httpx==0.27.2 tiktoken==0.7.0 --force-reinstall

In [3]:

import os
from google.colab import userdata
# Copy the Colab secret named `openai_api_key` into the OPENAI_API_KEY environment variable
# (presumably where the OpenAI clients created later look for it — confirm).
os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')


In [4]:
# Allows running asyncio in environments with an existing event loop, like Jupyter notebooks.
# Applied once here, before any of the query cells below are executed.
import nest_asyncio

nest_asyncio.apply()

## Create Vector Store

In [5]:
# Fetch the prebuilt vector store archive (vectorstore.zip) from the Hugging Face Hub
# dataset repo into /content; `vectorstore` receives the value returned by the download call.
from huggingface_hub import hf_hub_download

vectorstore = hf_hub_download(
    repo_id="jaiganesan/ai_tutor_knowledge",
    filename="vectorstore.zip",
    repo_type="dataset",
    local_dir="/content",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vectorstore.zip:   0%|          | 0.00/97.2M [00:00<?, ?B/s]

In [6]:
!unzip vectorstore.zip

Archive:  vectorstore.zip
   creating: ai_tutor_knowledge/
   creating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/length.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/index_metadata.pickle  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/link_lists.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/header.bin  
  inflating: ai_tutor_knowledge/684af133-f877-4230-bde4-575cf53b6688/data_level0.bin  
  inflating: ai_tutor_knowledge/chroma.sqlite3  


In [7]:
# Setup an Embedding Model

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Register OpenAI's "text-embedding-3-small" as the default embedding model on
# LlamaIndex's Settings object.
# NOTE(review): presumably this has to be the same model that produced the vectors stored
# in the downloaded collection — confirm.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

In [8]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import VectorStoreIndex

# Open the on-disk Chroma database extracted from vectorstore.zip and wrap its
# "ai_tutor_knowledge" collection in a LlamaIndex vector store.
# NOTE(review): get_or_create_collection presumably creates an empty collection when the
# name is not found, so a wrong path/name would not raise here — confirm.
db = chromadb.PersistentClient(path="./ai_tutor_knowledge")
chroma_collection = db.get_or_create_collection("ai_tutor_knowledge")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Build the index object on top of the existing vector store

vector_index = VectorStoreIndex.from_vector_store(vector_store)

### GPT-4o

In [9]:
from llama_index.llms.openai import OpenAI

# LlamaIndex LLM wrapper for the "gpt-4o" model, sampled at temperature 1.
llm_gpt_4o = OpenAI(temperature=1, model="gpt-4o")

In [10]:
# Query engine over the vector index that answers with GPT-4o (similarity_top_k=5).
query_engine_0 = vector_index.as_query_engine(llm= llm_gpt_4o , similarity_top_k=5)

# The same question is sent to every model in this notebook so the answers can be compared.
response_gpt_4o = query_engine_0.query("Compare the knowledge retention abilities of a RAG model versus a BERT-based model that has been extensively fine-tuned using PEFT techniques. How do their outputs differ when the knowledge source is removed?")

# Last expression of the cell: shows the generated answer text.
response_gpt_4o.response

"When comparing the knowledge retention abilities of a Retrieval-Augmented Generation (RAG) model to a BERT-based model that has been fine-tuned using PEFT techniques, there are notable differences in their outputs when the knowledge source is removed. A RAG model combines pre-trained parametric and non-parametric memory, allowing it to dynamically retrieve and generate language based on external, dense sources like a Wikipedia index. If the knowledge source is removed, the RAG model's ability to generate accurate, fact-based responses significantly diminishes, since it relies heavily on retrieving up-to-date information from its non-parametric memory.\n\nIn contrast, a BERT-based model fine-tuned using PEFT techniques primarily relies on the knowledge embedded within its parameters. This means that even without an external knowledge source, it can continue to generate responses based on the information encoded during training. However, because it lacks the dynamic retrieval component 

In [11]:
# Dump every retrieved source node behind the GPT-4o answer: id, title, text, score, metadata.
for src in response_gpt_4o.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {src.metadata['title']}")
    print(f"Text\t {src.text}")
    print(f"Score\t {src.score}")
    print(f"Metadata\t {src.metadata}")
    # Separator line between nodes
    print("-_" * 20)

Node ID	 727ac73b-1bfd-4782-8523-fa38b48cec50
Title	 Adaptive Retrieval-Augmented Generation for Conversational Systems:Model Training and Evaluation Setups
Text	 We evaluate the performance of introducing RAGate according to its binary classification performance and the effectiveness of the resulting response generation. Specifically, we use the KE-TOD dataset (Chen et al., 2022), which has fully annotated 5,324 dialogues and 52,063 turns of conversations. In particular, it is associated with 33,761 knowledge snippets to be retrieved and augmented. In addition, KETOD was developed with human labels on turns of conversations (around 12.1% of turns) about the need for augmenting with retrieved knowledge snippets for a natural and informative system response. Hence, we use these human labels as natural ground truths when evaluating RAGate. It is worth indicating that many current knowledge-augmented conversational datasets often ground their conversations on the knowledge snippet, such a

### GPT-4o-mini

In [12]:
from llama_index.llms.openai import OpenAI

# LlamaIndex LLM wrapper for the base "gpt-4o-mini" model, sampled at temperature 1.
llm_gpt_4o_mini = OpenAI(temperature=1, model="gpt-4o-mini")

In [13]:
# Query engine over the same vector index, answering with GPT-4o-mini (similarity_top_k=5).
query_engine_1 = vector_index.as_query_engine(llm= llm_gpt_4o_mini , similarity_top_k=5)

# Same question as the GPT-4o cell, so the two answers can be compared directly.
response_gpt_4o_mini = query_engine_1.query("Compare the knowledge retention abilities of a RAG model versus a BERT-based model that has been extensively fine-tuned using PEFT techniques. How do their outputs differ when the knowledge source is removed?")

# Last expression of the cell: shows the generated answer text.
response_gpt_4o_mini.response

'The RAG model combines pre-trained parametric memory from a seq2seq model with non-parametric memory accessed from a dense vector index, allowing it to effectively retrieve and utilize knowledge during language generation. In contrast, a BERT-based model that has been extensively fine-tuned using PEFT techniques relies solely on its learned parameters.\n\nWhen the knowledge source is removed, the output of the RAG model may deteriorate significantly as it depends on the retrieved information to enhance its responses. Without access to the external knowledge base, the RAG model would primarily rely on its parametric memory, potentially leading to less specific and diverse outputs. \n\nOn the other hand, a fine-tuned BERT model might still produce coherent language, but its outputs could lack factual accuracy or specificity, as it cannot access new information beyond its training data. This limitation could result in more generic or less informed responses compared to the adaptive capab

In [14]:
# Dump every retrieved source node behind the GPT-4o-mini answer: id, title, text, score, metadata.
for src in response_gpt_4o_mini.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {src.metadata['title']}")
    print(f"Text\t {src.text}")
    print(f"Score\t {src.score}")
    print(f"Metadata\t {src.metadata}")
    # Separator line between nodes
    print("-_" * 20)

Node ID	 727ac73b-1bfd-4782-8523-fa38b48cec50
Title	 Adaptive Retrieval-Augmented Generation for Conversational Systems:Model Training and Evaluation Setups
Text	 We evaluate the performance of introducing RAGate according to its binary classification performance and the effectiveness of the resulting response generation. Specifically, we use the KE-TOD dataset (Chen et al., 2022), which has fully annotated 5,324 dialogues and 52,063 turns of conversations. In particular, it is associated with 33,761 knowledge snippets to be retrieved and augmented. In addition, KETOD was developed with human labels on turns of conversations (around 12.1% of turns) about the need for augmenting with retrieved knowledge snippets for a natural and informative system response. Hence, we use these human labels as natural ground truths when evaluating RAGate. It is worth indicating that many current knowledge-augmented conversational datasets often ground their conversations on the knowledge snippet, such a

### Dataset Format validation & Number of tokens in training data

In [15]:
# Format error checks

# https://cookbook.openai.com/examples/chat_finetuning_data_prep

from collections import defaultdict
# Per-error-type counts produced by the most recent validate_dataset() call.
format_errors = defaultdict(int)

def validate_dataset(output_data):
    """Check that every example follows the OpenAI chat fine-tuning format and print a report.

    Args:
        output_data: iterable of examples; each is expected to be a dict with a
            non-empty "messages" list of dicts carrying "role" and "content".

    Returns:
        None. The counts are stored in the module-level ``format_errors`` mapping
        and a summary is printed.
    """
    # Start from a clean slate: without this, errors counted for one dataset would be
    # reported again when the function is called for the next dataset.
    format_errors.clear()

    for ex in output_data:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant", "function"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            # Flagged when the message has neither content nor a function call,
            # or when content is not a string.
            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

        # Each example needs at least one assistant message
        if not any(message.get("role", None) == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

    if format_errors:
        print("Found errors:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
    else:
        print("\nNo errors found in the Formatted dataset \n")

In [16]:
import tiktoken

def counting_no_tokens(output_data):
    """Print the total number of tokens across all message contents in ``output_data``.

    For each entry, the ``content`` of its messages is joined with single spaces and
    encoded with the tokenizer tiktoken returns for "gpt-4o-mini"; the per-entry
    token counts are summed and printed.
    """
    encoder = tiktoken.encoding_for_model("gpt-4o-mini")

    total_tokens = 0
    for entry in output_data:
        joined_text = " ".join(message['content'] for message in entry['messages'])
        total_tokens += len(encoder.encode(joined_text))

    print(f"Total number of tokens in the Dataset: {total_tokens} \n")

In [17]:
from huggingface_hub import hf_hub_download
import json
import jsonlines
from pprint import pprint

def dataset_preparation(file_name):
    """Download a question/answer JSONL file and save it in chat fine-tuning format.

    The file is fetched from the "jaiganesan/GPT_4o_mini_Fine_tune" dataset repo,
    each record's ``question``/``answer`` pair is wrapped in a system/user/assistant
    message list, the result is validated and token-counted, and it is written to
    ``formatted_<file_name without extension>.jsonl`` in the current directory.

    Args:
        file_name: name of the JSONL file inside the dataset repo.

    Returns:
        None. Progress and a sample record are printed.

    Raises:
        KeyError: if a record lacks the "question" or "answer" key.
    """
    # Local import so the function does not rely on `os` having been imported by another cell
    import os

    file_path = hf_hub_download(
        repo_id="jaiganesan/GPT_4o_mini_Fine_tune",
        filename=file_name,
        repo_type="dataset",
        local_dir="/content"
    )

    # Explicit UTF-8 keeps decoding independent of the platform default;
    # blank lines (e.g. a trailing newline) are skipped instead of crashing json.loads.
    with open(file_path, "r", encoding="utf-8") as file:
        data = [json.loads(line) for line in file if line.strip()]

    # Preview the 5th record when available, otherwise the last one (none if empty)
    sample_index = min(4, len(data) - 1)

    print("Total entries in the dataset:", len(data))
    print("-_"*30)
    if data:
        print(data[sample_index])

    output_data = []

    for entry in data:
        formatted_entry = {
            "messages": [
                {"role": "system", "content": "As AI Tutor, answer questions related to AI topics in an in-depth and factual manner."},
                {"role": "user", "content": entry['question']},
                {"role": "assistant", "content": entry['answer']}
            ]
        }
        output_data.append(formatted_entry)

    # Validate and analyze the output data
    validate_dataset(output_data)
    counting_no_tokens(output_data)

    print("-_"*30)
    if output_data:
        print(output_data[sample_index])

    # basename() keeps the output in the current directory even if file_name has a sub-path
    base_file_name = os.path.splitext(os.path.basename(file_name))[0]
    output_file_path = f'formatted_{base_file_name}.jsonl'

    with jsonlines.open(output_file_path, mode='w') as writer:
        writer.write_all(output_data)

    print(f"\nFormatted dataset has been saved to {output_file_path}.")


In [18]:
# Training dataset: downloads the file and writes formatted_question_answers_data_100.jsonl,
# which is the file uploaded in the fine-tuning section.
dataset_preparation("question_answers_data_100.jsonl")

question_answers_data_100.jsonl:   0%|          | 0.00/276k [00:00<?, ?B/s]

Total entries in the dataset: 100
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
{'source': 'tai_blog', 'question': 'What are the key advantages of using BiFPN in object detection compared to conventional methods?', 'answer': "BiFPN, or Bi-directional Feature Pyramid Network, offers several advantages in object detection when compared to conventional methods. It's part of the EfficientDet family of object detectors developed by Google Research and is designed to enhance the efficiency and scalability of object detection models.\n\n### Key Advantages of BiFPN:\n\n1. **Weighted Feature Fusion:**\n   Unlike conventional methods that simply sum up input features during feature fusion, BiFPN introduces learnable weights to adjust the importance of different input features. This means that during multi-scale fusion, input features are not merely combined indiscriminately but are weighted according to their relevance, which enhances the accuracy of the fusion process.\n\n2. **Bi

In [19]:
# Evaluation dataset: downloads the file and writes formatted_question_answers_data_30.jsonl.
dataset_preparation("question_answers_data_30.jsonl")

question_answers_data_30.jsonl:   0%|          | 0.00/81.6k [00:00<?, ?B/s]

Total entries in the dataset: 30
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
{'source': 'openai_cookbooks', 'question': 'How can creating high-quality evaluations for large language models like GPT-4 improve the stability and reliability of AI applications?', 'answer': "Creating high-quality evaluations for large language models (LLMs), like GPT-4, significantly enhances the stability and reliability of AI applications. Evaluations serve as a robust mechanism to monitor and assess how well these models perform across various scenarios, ultimately leading to improvements in model robustness and reliability.\n\nFirstly, high-quality evaluations can help identify and address areas where models may be underperforming. For instance, systematic evaluations can uncover issues such as drifting performance or deteriorating accuracy over time. By regularly evaluating LLMs against a comprehensive set of benchmarks, developers can detect and correct potential degradation in model 

**These formatted training and evaluation datasets are used to fine-tune the OpenAI models.**

**So far, we have explored response generation in a RAG system using GPT-4o and GPT-4o-mini, and we have formatted the training data. Next, we will focus on response generation using a newly fine-tuned model. In our lesson, we walked through the fine-tuning process in the OpenAI UI. However, if you want to learn how to fine-tune OpenAI models using code, you can explore the code sections later in this notebook.**

## Response Generation Using New Fine Tuned Model

In [24]:
# Fine Tuned Model
# NOTE: the model ID below is hard-coded and tied to one OpenAI organization. The recorded
# output of the next cell shows a 404 `model_not_found` error for it, so replace it with the
# ID of your own fine-tuned model before running the cells that follow.

from llama_index.llms.openai import OpenAI

llm_gpt_fine_tuned_model = OpenAI(temperature=0.8, model="ft:gpt-4o-mini-2024-07-18:towards-ai:ai-tutor:AIc6MRSB")

In [25]:
# Query engine over the same vector index, answering with the fine-tuned model (similarity_top_k=5).
query_engine_2 = vector_index.as_query_engine(llm= llm_gpt_fine_tuned_model , similarity_top_k=5)

# Same question as the GPT-4o and GPT-4o-mini cells, so the answers can be compared directly.
response_fine_tuned_model = query_engine_2.query("Compare the knowledge retention abilities of a RAG model versus a BERT-based model that has been extensively fine-tuned using PEFT techniques. How do their outputs differ when the knowledge source is removed?")

# Last expression of the cell: shows the generated answer text.
response_fine_tuned_model.response

NotFoundError: Error code: 404 - {'error': {'message': 'The model `ft:gpt-4o-mini-2024-07-18:towards-ai:ai-tutor:AIc6MRSB` does not exist or you do not have access to it.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_found'}}

In [None]:
# Dump every retrieved source node behind the fine-tuned model's answer:
# id, title, text, score, metadata.
for src in response_fine_tuned_model.source_nodes:
    print(f"Node ID\t {src.node_id}")
    print(f"Title\t {src.metadata['title']}")
    print(f"Text\t {src.text}")
    print(f"Score\t {src.score}")
    print(f"Metadata\t {src.metadata}")
    # Separator line between nodes
    print("-_" * 20)

Node ID	 727ac73b-1bfd-4782-8523-fa38b48cec50
Title	 Adaptive Retrieval-Augmented Generation for Conversational Systems:Model Training and Evaluation Setups
Text	 We evaluate the performance of introducing RAGate according to its binary classification performance and the effectiveness of the resulting response generation. Specifically, we use the KE-TOD dataset (Chen et al., 2022), which has fully annotated 5,324 dialogues and 52,063 turns of conversations. In particular, it is associated with 33,761 knowledge snippets to be retrieved and augmented. In addition, KETOD was developed with human labels on turns of conversations (around 12.1% of turns) about the need for augmenting with retrieved knowledge snippets for a natural and informative system response. Hence, we use these human labels as natural ground truths when evaluating RAGate. It is worth indicating that many current knowledge-augmented conversational datasets often ground their conversations on the knowledge snippet, such a

# Fine Tuning OpenAI Models Using OpenAI API (Code)

### Upload file

In [28]:
# Import under an alias: earlier cells bind the bare name `OpenAI` to LlamaIndex's LLM
# wrapper, and rebinding it here would make those cells depend on execution order.
from openai import OpenAI as OpenAIClient

client = OpenAIClient()

# Upload the formatted training set; the context manager closes the file handle
# once the upload call returns (the handle was previously left open).
with open("formatted_question_answers_data_100.jsonl", "rb") as training_file:
    fine_tune_file = client.files.create(
        file=training_file,
        purpose="fine-tune"
    )

In [29]:
# Show the uploaded file's metadata. `pprint` is the name imported in the
# dataset-preparation cell (`from pprint import pprint`), so that cell must have run first.
pprint(fine_tune_file)

FileObject(id='file-VLM277YEE9fFFg64ZAnd4c', bytes=291689, created_at=1736951701, filename='formatted_question_answers_data_100.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [30]:
# Keep the uploaded file's ID; it is passed as `training_file` when the fine-tuning job is created.
param_training_file_name = fine_tune_file.id
pprint(param_training_file_name)

'file-VLM277YEE9fFFg64ZAnd4c'


### Create Fine tune model

In [31]:
# Start a fine-tuning job for gpt-4o-mini-2024-07-18 on the uploaded training file,
# requesting 2 epochs; the other hyperparameters are not set here.
result_job = client.fine_tuning.jobs.create(
    training_file=param_training_file_name,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={ "n_epochs":2 }
)

# Print the job object returned by the API
pprint(result_job)

FineTuningJob(id='ftjob-wcNzlUfqWDW8juVhSHkCe2Dp', created_at=1736951712, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=2, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-cqPQ00bXAHMrGkKBUoiDmZIi', result_files=[], seed=217695527, status='validating_files', trained_tokens=None, training_file='file-VLM277YEE9fFFg64ZAnd4c', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None, method={'type': 'supervised', 'supervised': {'hyperparameters': {'batch_size': 'auto', 'learning_rate_multiplier': 'auto', 'n_epochs': 2}}})


In [32]:
# Keep the job ID; every later status check retrieves the job by this ID.
param_file_tune_job_id = result_job.id
pprint(param_file_tune_job_id)

'ftjob-wcNzlUfqWDW8juVhSHkCe2Dp'


### Model Fine tuning

In [33]:
# Retrieve the full fine-tuning job object to inspect its current state
client.fine_tuning.jobs.retrieve(param_file_tune_job_id)

FineTuningJob(id='ftjob-wcNzlUfqWDW8juVhSHkCe2Dp', created_at=1736951712, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=2, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-cqPQ00bXAHMrGkKBUoiDmZIi', result_files=[], seed=217695527, status='validating_files', trained_tokens=None, training_file='file-VLM277YEE9fFFg64ZAnd4c', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None, method={'type': 'supervised', 'supervised': {'hyperparameters': {'n_epochs': 2, 'batch_size': 'auto', 'learning_rate_multiplier': 'auto'}}})

In [34]:
# Retrieve only the `status` field of the fine-tuning job
client.fine_tuning.jobs.retrieve(param_file_tune_job_id).status

'validating_files'

In [None]:
import time
from datetime import datetime

POLL_INTERVAL_SECONDS = 20      # pause between two status checks
MAX_CONSECUTIVE_ERRORS = 3      # tolerate short API/network hiccups before giving up
TERMINAL_STATUSES = ("failed", "succeeded", "cancelled")

# Poll the fine-tuning job until it reaches a terminal status. The status is checked
# immediately (no initial wait), and a single transient error no longer ends monitoring.
consecutive_errors = 0
while True:
    try:
        job_status = client.fine_tuning.jobs.retrieve(param_file_tune_job_id)
        print(f"------------ Job Status: {job_status.status} --------------")

        if job_status.status in TERMINAL_STATUSES:
            print("Job Completed. Detailed Events list:")
            events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=param_file_tune_job_id)
            for event in events:
                print(f'{datetime.fromtimestamp(event.created_at)} {event.message}')

            print("######## Fine-tuned model ###########")
            print(f"{job_status.fine_tuned_model}")
            print("#####################################")
            break

        # A complete, successful poll resets the error budget
        consecutive_errors = 0
    except Exception as e:
        consecutive_errors += 1
        print(f"Error monitoring job: {e}")
        if consecutive_errors >= MAX_CONSECUTIVE_ERRORS:
            print("Giving up after repeated errors; re-run this cell to resume monitoring.")
            break

    time.sleep(POLL_INTERVAL_SECONDS)

------------ Job Status: validating_files --------------
------------ Job Status: validating_files --------------


In [None]:
# # Retrieve the state of a fine-tune

# while client.fine_tuning.jobs.retrieve(param_file_tune_job_id).status != 'succeeded':
#   sleep(10)

In [None]:
# Retrieve the current status of the fine-tuning job
client.fine_tuning.jobs.retrieve(param_file_tune_job_id).status

In [None]:
# Read the fine-tuned model's name from the job.
# NOTE: in the job objects printed earlier (status 'validating_files') this field was None,
# so run this cell only after the job has finished.
param_file_tune_model = client.fine_tuning.jobs.retrieve(param_file_tune_job_id).fine_tuned_model
pprint(param_file_tune_model)

### Deleting the fine-tuned model (optional)

In [None]:
# Delete a fine-tuned model (must be an owner of the org the model was created in)
# result_delete = client.models.delete(param_file_tune_model)
# pprint(result_delete)