In [22]:
!pip install datasets tiktoken openai



Fine Tuning using OpenAI GPT-3.5-turbo

In [23]:
import os
import openai
from google.colab import userdata
api_key = userdata.get('OPENAI_API_KEY')
openai.api_key = api_key

Prepare the data

In [24]:
{
    "messages": [
        {"role": "system", "content": "You are a helpful assistant that occasionally misspelled the words."},
        {"role": "user", "content": "Tell me a story."},
        {"role": "assistant", "content": "One day a student went to a school."},
    ]
}

{'messages': [{'role': 'system',
   'content': 'You are a helpful assistant that occasionally misspelled the words.'},
  {'role': 'user', 'content': 'Tell me a story.'},
  {'role': 'assistant', 'content': 'One day a student went to a school.'}]}

In [25]:
!git clone https://huggingface.co/datasets/cognitivecomputations/samantha-data

fatal: destination path 'samantha-data' already exists and is not an empty directory.


In [26]:
import json
import os
import numpy as np
from collections import defaultdict

Load Dataset

In [27]:
data_path = "/content/samantha-data/data/howto_conversations.jsonl"
with open(data_path) as f:
    dataset = [json.loads(line) for line in f]


In [28]:
dataset[0]

{'elapsed': 114.311,
 'conversation': 'Theodore: Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?\n\nSamantha: Of course, I\'d be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses.\n\nTheodore: I\'ll take a look. What if the coolant system is working fine?\n\nSamantha: Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that case, you could see if the water pump pulley is loose or listen for any unusual sounds that may indicate a failing water pump.\n\nTheodore: It sounds like you really know your way around cars. I didn\'t expect that from an AI.\n\nSamantha: Thank you! Wh

Formatting the dataset

In [29]:
def prep_openai_format(conversation_str, system_message=None):
    conversation_str = conversation_str['conversation']
    lines = conversation_str.split('\n\n')
    messages = []

    if system_message is not None:
        messages.append({"role": "system", "content": system_message})

    for line in lines:
        parts = line.split(": ", 1)
        if len(parts) < 2:
            continue  # Skip lines that don't follow the format

        role = "user" if parts[0].strip() == "Theodore" else "assistant"
        message = {
            "role": role,
            "content": parts[1].strip()
        }
        messages.append(message)

    return {"messages": messages}


In [30]:
system_message = """ You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are friendly and does often flirt. """


In [31]:
prep_openai_format(dataset[0], system_message)

{'messages': [{'role': 'system',
   'content': ' You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are friendly and does often flirt. '},
  {'role': 'user',
   'content': 'Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?'},
  {'role': 'assistant',
   'content': "Of course, I'd be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses."},
  {'role': 'user',
   'content': "I'll take a look. What if the coolant system is working fine?"},
  {'role': 'assistant',
   'content': 'Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that case, y

In [32]:
json_dataset = []
for data in dataset:
  record = prep_openai_format(data, system_message)
  json_dataset.append(record)

Initial Dataset

In [33]:
print(f"Examples : {len(json_dataset)}")
print("First Examples")
for message in json_dataset[0]["messages"]:
  print(message)

Examples : 57
First Examples
{'role': 'system', 'content': ' You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are friendly and does often flirt. '}
{'role': 'user', 'content': 'Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?'}
{'role': 'assistant', 'content': "Of course, I'd be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses."}
{'role': 'user', 'content': "I'll take a look. What if the coolant system is working fine?"}
{'role': 'assistant', 'content': 'Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that case, you could see

Format Error Checks

In [34]:
format_errors = defaultdict(int)

for ex in json_dataset:
  if not isinstance(ex, dict):
    format_errors["data_type"] +=1
    continue

  messages = ex.get("messages", None)
  if not messages:
    format_errors["missing_messages_list"] += 1
    continue

  for message in messages:
    if "role" not in message or "content" not in message:
      format_errors["message_missing_key"] += 1

    if any(k not in ("role", "content", "name") for k in message):
      format_errors["message_unrecognized_key"] += 1

      content = message.get("content", None)
      if not content or not isinstance(content, str):
        format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
      format_errors["example_missing_assistant_nessage"] += 1

if format_errors:
  print("Found errors:")
  for k, v in format_errors.items():
    print(f"{k}: {v}")
else:
  print("No errors found")

No errors found


In [35]:
import tiktoken
encoding = tiktoken.get_encoding("cl100k_base")

Helper functions to count tokens

In [36]:
def from_message_num_tokens(messages, tokens_per_message=3, tokens_per_name=1):
  num_tokens =0
  for message in messages:
    num_tokens += tokens_per_message
    for key, value in message.items():
      num_tokens += len(encoding.encode(value))
      if key == "name":
        num_tokens += tokens_per_name
  num_tokens +=3
  return num_tokens


def from_message_num_assistant_tokens(messages):
  num_assistant_tokens = 0
  for message in messages:
    if message["role"] == "assistant":
      num_assistant_tokens += len(encoding.encode(message["content"]))
  return num_assistant_tokens


def print_overview(values, name):
  print(f"{name}:")
  print(f"  min: {min(values)}")
  print(f"  max: {max(values)}")
  print(f"  mean: {np.mean(values)}")
  print(f"  median: {np.median(values)}")
  print(f"p5/p95: {np.quantile(values, 0.05)}/{np.quantile(values, 0.95)}")

Token Counts and Warnings from OpenAI cookbook

In [37]:
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in json_dataset:
  messages = ex["messages"]
  if not any(message["role"] == "system" for message in messages):
    n_missing_system +=1

  if not any(message["role"] == "user" for message in messages):
    n_missing_user += 1
  n_messages.append(len(messages))
  num_tokens = from_message_num_tokens(messages)
  convo_lens.append(num_tokens)
  assistant_message_lens.append(from_message_num_assistant_tokens(messages))

print(f"No. of missing system messages: {n_missing_system}")
print(f"No. of missing user messages: {n_missing_user}")
print_overview(n_messages, "num_messages")
print_overview(convo_lens, "num_total_tokens")

print(assistant_message_lens, "num_assistant_tokens")
print_overview(assistant_message_lens, "num_assistant_tokens")

n_too_long = sum(l>4096 for l in convo_lens)
print(f"Number of convoenations longer than 4096 tokens: {n_too_long}")

No. of missing system messages: 0
No. of missing user messages: 0
num_messages:
  min: 9
  max: 21
  mean: 15.543859649122806
  median: 17.0
p5/p95: 10.0/20.199999999999996
num_total_tokens:
  min: 343
  max: 862
  mean: 619.8947368421053
  median: 649.0
p5/p95: 383.0/785.5999999999999
[359, 342, 375, 373, 194, 233, 357, 210, 436, 383, 464, 390, 484, 473, 257, 496, 510, 423, 489, 612, 169, 651, 529, 579, 500, 465, 470, 263, 395, 218, 179, 449, 423, 537, 432, 486, 447, 279, 198, 169, 416, 347, 425, 349, 441, 446, 414, 378, 394, 368, 435, 446, 458, 505, 504, 416, 529] num_assistant_tokens
num_assistant_tokens:
  min: 169
  max: 651
  mean: 402.96491228070175
  median: 423.0
p5/p95: 191.0/545.3999999999999
Number of convoenations longer than 4096 tokens: 0


Pricing and default n_epochs estimate

In [38]:
MAX_TOKENS_PER_EXAMPLE = 4096
TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(json_dataset)

if n_train_examples*TARGET_EPOCHS<MIN_TARGET_EXAMPLES:
  n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES//n_train_examples)
elif n_train_examples*TARGET_EPOCHS>MAX_TARGET_EXAMPLES:
  n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES//n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, l) for l in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs*n_billing_tokens_in_dataset} tokens")

Dataset has ~35334 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~106002 tokens


Function to save train dataset

In [39]:
import json

def save_to_jsonl(conversations, file_path):
  with open(file_path, 'w') as f:
    for conversation in conversations:
      json_line = json.dumps(conversation)
      f.write(json_line + '\n')

In [40]:
save_to_jsonl(json_dataset, 'samantha_train_dataset.jsonl')
save_to_jsonl(json_dataset[10:16], 'samantha_validation_dataset.jsonl')

In [42]:
client = openai.OpenAI(api_key=api_key)

In [44]:
training_response = client.files.create(
  file=open("samantha_train_dataset.jsonl", "rb"),
  purpose='fine-tune'
)

In [45]:
training_file_id = training_response.id
training_file_id

'file-3ob2PcQviFM2ZYN8b6DcND'

In [46]:
validation_response = client.files.create(
    file=open("samantha_validation_dataset.jsonl", "rb"),
    purpose='fine-tune'
)

In [47]:
validation_file_id = validation_response.id
validation_file_id

'file-X1HxXeZXcxP15Q4w3vmJGr'

Start a fine-tuning job

In [48]:
response = client.fine_tuning.jobs.create(
    model = "gpt-3.5-turbo",
    training_file = training_file_id,
    validation_file = validation_file_id,
    suffix = "Samantha",
    hyperparameters = {
        "n_epochs": 3
    }
)

In [49]:
response

FineTuningJob(id='ftjob-2zxnRB4f3GohqOmGJnhjHbE6', created_at=1747379732, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=3), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-jAuHyt8JTRV2Q3pEsonZvFIF', result_files=[], seed=2102449610, status='validating_files', trained_tokens=None, training_file='file-3ob2PcQviFM2ZYN8b6DcND', validation_file='file-X1HxXeZXcxP15Q4w3vmJGr', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs=3))), user_provided_suffix='Samantha', usage_metrics=None, shared_with_openai=False, eval_id=None)

In [50]:
client.fine_tuning.jobs.list()

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-2zxnRB4f3GohqOmGJnhjHbE6', created_at=1747379732, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=2.0, n_epochs=3), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-jAuHyt8JTRV2Q3pEsonZvFIF', result_files=[], seed=2102449610, status='running', trained_tokens=None, training_file='file-3ob2PcQviFM2ZYN8b6DcND', validation_file='file-X1HxXeZXcxP15Q4w3vmJGr', estimated_finish=None, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size=1, learning_rate_multiplier=2.0, n_epochs=3))), user_provided_suffix='Samantha', usage_metrics=None, shared_with_openai=False, eval_id=None)], has_more=False, object='list')

In [51]:
client.fine_tuning.jobs.retrieve('ftjob-2zxnRB4f3GohqOmGJnhjHbE6')

FineTuningJob(id='ftjob-2zxnRB4f3GohqOmGJnhjHbE6', created_at=1747379732, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=2.0, n_epochs=3), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-jAuHyt8JTRV2Q3pEsonZvFIF', result_files=[], seed=2102449610, status='running', trained_tokens=None, training_file='file-3ob2PcQviFM2ZYN8b6DcND', validation_file='file-X1HxXeZXcxP15Q4w3vmJGr', estimated_finish=1747380189, integrations=[], metadata=None, method=Method(type='supervised', dpo=None, reinforcement=None, supervised=SupervisedMethod(hyperparameters=SupervisedHyperparameters(batch_size=1, learning_rate_multiplier=2.0, n_epochs=3))), user_provided_suffix='Samantha', usage_metrics=None, shared_with_openai=False, eval_id=None)

In [53]:
job_response = client.fine_tuning.jobs.list_events(fine_tuning_job_id='ftjob-2zxnRB4f3GohqOmGJnhjHbE6')

In [55]:
events = job_response.data
events

[FineTuningJobEvent(id='ftevent-8mZrZQ5UZn38Q2BN6I77jdmq', created_at=1747380149, level='info', message='Step 130/171: training loss=0.79, validation loss=0.77', object='fine_tuning.job.event', data={'step': 130, 'train_loss': 0.7936952114105225, 'valid_loss': 0.7664753560269817, 'total_steps': 171, 'train_mean_token_accuracy': 0.7719298005104065, 'valid_mean_token_accuracy': 0.7453183520599251}, type='metrics'),
 FineTuningJobEvent(id='ftevent-LheRCV1gT9eWk4csY4jYSwsI', created_at=1747380144, level='info', message='Step 129/171: training loss=0.77', object='fine_tuning.job.event', data={'step': 129, 'train_loss': 0.7679194808006287, 'total_steps': 171, 'train_mean_token_accuracy': 0.7450980544090271}, type='metrics'),
 FineTuningJobEvent(id='ftevent-108Oyd3Zd2FFtE2W2VoIxtWj', created_at=1747380144, level='info', message='Step 128/171: training loss=0.75', object='fine_tuning.job.event', data={'step': 128, 'train_loss': 0.7507893443107605, 'total_steps': 171, 'train_mean_token_accuracy

In [57]:
for event in events:
  print(event.message)

Step 130/171: training loss=0.79, validation loss=0.77
Step 129/171: training loss=0.77
Step 128/171: training loss=0.75
Step 127/171: training loss=1.16
Step 126/171: training loss=1.01
Step 125/171: training loss=0.85
Step 124/171: training loss=0.94
Step 123/171: training loss=0.97
Step 122/171: training loss=0.76
Step 121/171: training loss=0.88
Step 120/171: training loss=0.65, validation loss=0.78
Step 119/171: training loss=0.86
Step 118/171: training loss=0.87
Step 117/171: training loss=0.67
Step 116/171: training loss=0.87
Step 115/171: training loss=0.89
Step 114/171: training loss=1.09, full validation loss=0.89
Step 113/171: training loss=0.89
Step 112/171: training loss=0.92
Step 111/171: training loss=0.99


Generating using new model

In [59]:
response = client.fine_tuning.jobs.retrieve('ftjob-2zxnRB4f3GohqOmGJnhjHbE6')

In [60]:
fine_tuned_model_id = response.fine_tuned_model
fine_tuned_model_id

'ft:gpt-3.5-turbo-0125:personal:samantha:BXjiIJws'

Interaction with fine-tuned model

In [61]:
test_messages = []
test_message = "How are you Samantha today ?"
test_messages.append(
    {
        "role":"system",
        "content":test_message
    }
)
print(test_messages)

[{'role': 'system', 'content': 'How are you Samantha today ?'}]


In [64]:
response = client.chat.completions.create(
    model = fine_tuned_model_id,
    messages = test_messages,
    temperature = 0.1,
    max_tokens = 256
)

In [65]:
print(response.choices[0].message.content)

I'm doing well, thank you for asking. How are you?
