SetUp

In [40]:
!pip install openai requests tiktoken numpy




[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Retrieve key and endpoint

In [None]:
import os
import openai
from dotenv import load_dotenv
#load variable from .env file
load_dotenv()

# Set the environment variable
openai.api_key = os.getenv("OPENAI_API_KEY_fine_tuning")
openai.api_base = os.getenv("OPENAI_API_BASE_fine_tuning")

# Verify that it has been set
print(openai.api_key)
# Verify that it has been set
print(openai.api_base)

Import Data

In [42]:
# Run preliminary checks

import json

# Load the training set
with open(r'C:\Users\21629\Desktop\Danger!\Projects\GenAI\data\tmp_recipe_finetune_training.jsonl', 'r', encoding='utf-8') as f:
    training_dataset = [json.loads(line) for line in f]

# Training dataset stats
print("Number of examples in training set:", len(training_dataset))
print("First example in training set:")
for message in training_dataset[0]["messages"]:
    print(message)

# Load the validation set
with open(r'C:\Users\21629\Desktop\Danger!\Projects\GenAI\data\tmp_recipe_finetune_validation.jsonl', 'r', encoding='utf-8') as f:
    validation_dataset = [json.loads(line) for line in f]

# Validation dataset stats
print("\nNumber of examples in validation set:", len(validation_dataset))
print("First example in validation set:")
for message in validation_dataset[0]["messages"]:
    print(message)

Number of examples in training set: 101
First example in training set:
{'role': 'system', 'content': 'You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided.'}
{'role': 'user', 'content': 'Title: No-Bake Nut Cookies\n\nIngredients: ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]\n\nGeneric ingredients: '}
{'role': 'assistant', 'content': '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'}

Number of examples in validation set: 100
First example in validation set:
{'role': 'system', 'content': 'You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided.'}
{'role': 'user', 'content': 'Title: Crustless Vegetable Ham Pie\n\nIngredients: ["1/4 c. butter", "1/4 lb. mushrooms, sliced", "1 garlic clove, min

Valisate Tokens Limit

In [43]:
# Validate token counts

import json
import tiktoken
import numpy as np
from collections import defaultdict

encoding = tiktoken.get_encoding("cl100k_base") # default encoding used by gpt-4, turbo, and text-embedding-ada-002 models

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

files = [
    r'C:\Users\21629\Desktop\Danger!\Projects\GenAI\data\tmp_recipe_finetune_validation.jsonl',
    r'C:\Users\21629\Desktop\Danger!\Projects\GenAI\data\tmp_recipe_finetune_training.jsonl'
]

for file in files:
    print(f"Processing file: {file}")
    with open(file, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]

    total_tokens = []
    assistant_tokens = []

    for ex in dataset:
        messages = ex.get("messages", {})
        total_tokens.append(num_tokens_from_messages(messages))
        assistant_tokens.append(num_assistant_tokens_from_messages(messages))

    print_distribution(total_tokens, "total tokens")
    print_distribution(assistant_tokens, "assistant tokens")
    print('*' * 50)

Processing file: C:\Users\21629\Desktop\Danger!\Projects\GenAI\data\tmp_recipe_finetune_validation.jsonl

#### Distribution of total tokens:
min / max: 73, 276
mean / median: 145.0, 139.5
p5 / p95: 102.0, 202.0

#### Distribution of assistant tokens:
min / max: 10, 69
mean / median: 31.39, 29.0
p5 / p95: 17.0, 50.0
**************************************************
Processing file: C:\Users\21629\Desktop\Danger!\Projects\GenAI\data\tmp_recipe_finetune_training.jsonl

#### Distribution of total tokens:
min / max: 69, 227
mean / median: 134.16831683168317, 130.0
p5 / p95: 97.0, 180.0

#### Distribution of assistant tokens:
min / max: 8, 58
mean / median: 28.15841584158416, 26.0
p5 / p95: 16.0, 43.0
**************************************************


In [44]:
# Upload fine-tuning files

import os
from openai import AzureOpenAI

client = AzureOpenAI(
  azure_endpoint = openai.api_base,
  api_key = openai.api_key ,
  api_version = "2024-05-01-preview"  # This API version or later is required to access seed/events/checkpoint features
)

training_file_name = r'C:\Users\21629\Desktop\Danger!\Projects\GenAI\data\tmp_recipe_finetune_validation.jsonl'
validation_file_name = r'C:\Users\21629\Desktop\Danger!\Projects\GenAI\data\tmp_recipe_finetune_training.jsonl'


# Upload the training and validation dataset files to Azure OpenAI with the SDK.

training_response = client.files.create(
    file = open(training_file_name, "rb"), purpose="fine-tune"
)
training_file_id = training_response.id

validation_response = client.files.create(
    file = open(validation_file_name, "rb"), purpose="fine-tune"
)
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

Training file ID: file-fed928cf29fa4f838aefe795e75d343f
Validation file ID: file-cad549fe17b14f358326f192f7bbb355


In [45]:
# List all files
#check files status (needs to be Processed)
response = client.files.list()

# Iterate through the response to check the status of each file
for file in response:
    if file.id == training_file_id:
        print(f"Training file status: {file.status}")
    elif file.id == validation_file_id:
        print(f"Validation file status: {file.status}")



Validation file status: pending
Training file status: pending


Begin FineTuning

In [None]:
# Submit fine-tuning training job

response = client.fine_tuning.jobs.create(
    training_file = training_file_id,
    validation_file = validation_file_id,
    model = "gpt-35-turbo-0613"# Enter base model name. Note that in Azure OpenAI the model name contains dashes and cannot contain dot/period characters.
   
)

job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.

print("Job ID:", response.id)
print("Status:", response.status)
print(response.model_dump_json(indent=2))

List fine-tuning events

In [64]:
response = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id, limit=10)
print(response.model_dump_json(indent=5))

{
     "data": [
          {
               "id": "ftevent-049fcf2fe0f442ff880be400352c996c",
               "created_at": 1724014084,
               "level": "info",
               "message": "Training tokens billed: 43000",
               "object": "fine_tuning.job.event",
               "type": "message"
          },
          {
               "id": "ftevent-3507a0e50fa5413782470c76e15e0997",
               "created_at": 1724014084,
               "level": "info",
               "message": "Completed results file: file-fc6f0466313646dda1d0ea6e5ded0c14",
               "object": "fine_tuning.job.event",
               "type": "message"
          },
          {
               "id": "ftevent-b8db4a9cb551469da2b1f57d12c243d2",
               "created_at": 1724014080,
               "level": "info",
               "message": "Postprocessing started.",
               "object": "fine_tuning.job.event",
               "type": "message"
          },
          {
               "id": "ftevent-

List checkpoints

Final training run results

In [None]:
# Retrieve fine_tuned_model name

response = client.fine_tuning.jobs.retrieve(job_id)

print(response.model_dump_json(indent=2))
fine_tuned_model = response.fine_tuned_model

Deployment

In [None]:
# Deploy fine-tuned model

import json
import requests
load_dotenv()

token = os.getenv("token")
print(token)
subscription = os.getenv("subscription")
print(subscription)
resource_group = os.getenv("resource_group")
resource_name = os.getenv("resource_name")
model_deployment_name = os.getenv("model_deployment_name")

deploy_params = {'api-version': "2024-05-01-preview"}
deploy_headers = {'Authorization': 'Bearer {}'.format(token), 'Content-Type': 'application/json'}

deploy_data = {
    "sku": {"name": "standard", "capacity": 1},
    "properties": {
        "model": {
            "format": "OpenAI",
            "name": "gpt-35-turbo-0613.ft-3bf62ca6cb934fa299f25b03b9ead914", #retrieve this value from the previous call, it will look like gpt-35-turbo-0613.ft-b044a9d3cf9c4228b5d393567f693b83
            "version": "1"
        }
    }
}
deploy_data = json.dumps(deploy_data)

request_url = f'https://management.azure.com/subscriptions/{subscription}/resourceGroups/{resource_group}/providers/Microsoft.CognitiveServices/accounts/{resource_name}/deployments/{model_deployment_name}'

print('Creating a new deployment...')

r = requests.put(request_url, params=deploy_params, headers=deploy_headers, data=deploy_data)

print(r)
print(r.reason)
print(r.json())

Test the Deployed Model

In [86]:
# Use the deployed customized model

import os
from openai import AzureOpenAI
load_dotenv()
import os

client = AzureOpenAI(
  azure_endpoint = os.getenv("OPENAI_API_BASE_fine_tuning"),
  api_key = os.getenv("OPENAI_API_KEY_fine_tuning"),
  api_version = "2024-05-01-preview",

)


response = client.chat.completions.create(
    model = "deployment-finetuning", # model = "Custom deployment name you chose for your fine-tuning model"
    messages = [
        {"role": "system", "content": "You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided."}, {"role": "user", "content": "Title: couscous\n\nIngredients: [\"1 box powdered sugar\", \"8 oz. soft butter\", \"1 (8 oz.) peanut butter\", \"paraffin\", \"12 oz. chocolate chips\"]\n\nGeneric ingredients: "}, {"role": "assistant", "content": "[\"powdered sugar\", \"butter\", \"peanut butter\", \"paraffin\", \"chocolate chips\"]"}
    ]

)

print(response.choices[0].message.content)

["powdered sugar", "butter", "peanut butter", "paraffin", "chocolate chips"]
