In [None]:
!pip install datasets tiktoken openai

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.31.0-py3-none-any.whl (324 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.1/324.1 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Fine Tuning using OpenAI GPT-3.5-turbo

### Prepare the data

In [None]:
{
    "messages": [
        {"role": "system", "content": "You are an assistant that occassionally misspells words" },
        {"role": "user", "content": "Tell me a story"},
        {"role": "assistant", "content": "One day a student went to a schoool." }
    ]
}


{'messages': [{'role': 'system',
   'content': 'You are an assistant that occassionally misspells words'},
  {'role': 'user', 'content': 'Tell me a story'},
  {'role': 'assistant', 'content': 'One day a student went to a schoool.'}]}

In [None]:
!git clone https://huggingface.co/datasets/cognitivecomputations/samantha-data

Cloning into 'samantha-data'...
remote: Enumerating objects: 107, done.[K
remote: Total 107 (delta 0), reused 0 (delta 0), pack-reused 107 (from 1)[K
Receiving objects: 100% (107/107), 4.86 MiB | 11.33 MiB/s, done.
Resolving deltas: 100% (33/33), done.
Filtering content: 100% (13/13), 354.08 MiB | 47.96 MiB/s, done.




*   Theodore - User
*   Samantha - Assistant




In [None]:
import json
import os
import tiktoken
import numpy as np
from collections import defaultdict


In [None]:
data_path = "/content/samantha-data/data/howto_conversations.jsonl"


In [None]:
# Load the raw conversations: the file is parsed as one JSON document per line.
# UTF-8 is requested explicitly so decoding does not depend on the runtime's
# locale, and blank lines are skipped because json.loads("") raises.
with open(data_path, encoding="utf-8") as f:
  json_dataset = [json.loads(line) for line in f if line.strip()]

In [None]:
json_dataset[0]

{'elapsed': 114.311,
 'conversation': 'Theodore: Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?\n\nSamantha: Of course, I\'d be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses.\n\nTheodore: I\'ll take a look. What if the coolant system is working fine?\n\nSamantha: Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that case, you could see if the water pump pulley is loose or listen for any unusual sounds that may indicate a failing water pump.\n\nTheodore: It sounds like you really know your way around cars. I didn\'t expect that from an AI.\n\nSamantha: Thank you! Wh

In [None]:
def prep_openai_format(conversation_str, system_message=None,
                       user_name="Theodore", assistant_name="Samantha"):
    """Convert one transcript into an OpenAI chat fine-tuning example.

    Args:
        conversation_str: Either a dataset record (a mapping whose
            ``'conversation'`` key holds the transcript) or the transcript
            string itself. Paragraphs are separated by blank lines and a new
            turn starts with ``"<speaker>: "``.
        system_message: Optional system prompt placed first in the messages.
        user_name: Speaker label mapped to the ``"user"`` role.
        assistant_name: Speaker label mapped to the ``"assistant"`` role.

    Returns:
        dict: ``{"messages": [...]}`` where every message is a
        ``{"role": ..., "content": ...}`` dict.
    """
    if isinstance(conversation_str, str):
        transcript = conversation_str
    else:
        transcript = conversation_str['conversation']

    role_by_speaker = {user_name: "user", assistant_name: "assistant"}

    messages = []

    # Including the system message if provided
    if system_message:
        messages.append({"role": "system", "content": system_message})

    # The turn that continuation paragraphs are appended to (None until the
    # first recognised speaker is seen).
    current = None
    for paragraph in transcript.split('\n\n'):
        speaker, separator, content = paragraph.partition(': ')
        role = role_by_speaker.get(speaker.strip()) if separator else None

        if role is not None:
            current = {"role": role, "content": content.strip()}
            messages.append(current)
            continue

        # Not a "<known speaker>: ..." line: this is a further paragraph of the
        # current turn (e.g. a list item such as "1. Note: ..."). Keep it with
        # that turn instead of dropping it or inventing a new assistant turn.
        extra = paragraph.strip()
        if current is not None and extra:
            current["content"] += "\n\n" + extra

    return {"messages": messages}




In [None]:
system_message = """You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are friendly and does often flirt"""

In [None]:
prep_openai_format(json_dataset[0], system_message=system_message)

{'messages': [{'role': 'system',
   'content': 'You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are friendly and does often flirt'},
  {'role': 'user',
   'content': 'Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?'},
  {'role': 'assistant',
   'content': "Of course, I'd be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses."},
  {'role': 'user',
   'content': "I'll take a look. What if the coolant system is working fine?"},
  {'role': 'assistant',
   'content': 'Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that case, you 

In [None]:
# Convert the first ten raw records into chat-format examples, each one
# starting with the shared system prompt.
dataset = [
  prep_openai_format(data, system_message=system_message)
  for data in json_dataset[:10]
]

In [None]:
# initial dataset
print("Examples : ", len(dataset))
print("First Example:")
for message in dataset[0]['messages']:
  print(message)




Examples :  10
First Example:
{'role': 'system', 'content': 'You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are friendly and does often flirt'}
{'role': 'user', 'content': 'Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?'}
{'role': 'assistant', 'content': "Of course, I'd be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses."}
{'role': 'user', 'content': "I'll take a look. What if the coolant system is working fine?"}
{'role': 'assistant', 'content': 'Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that case, you could see i

In [None]:
# Format error checks: tally each kind of schema violation across the dataset
format_errors = defaultdict(int)

for example in dataset:
    # An example that is not a dict cannot be inspected any further
    if not isinstance(example, dict):
        format_errors["data_type"] += 1
        continue

    chat = example.get("messages", None)
    if not chat:
        format_errors["missing_messages_list"] += 1
        continue

    has_assistant = False
    for msg in chat:
        if not ("role" in msg and "content" in msg):
            format_errors["message_missing_key"] += 1

        if not all(key in ("role", "content", "name") for key in msg):
            format_errors["message_unrecognized_key"] += 1

        role = msg.get("role", None)
        if role not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1
        has_assistant = has_assistant or role == "assistant"

        text = msg.get("content", None)
        if not (text and isinstance(text, str)):
            format_errors["missing_content"] += 1

    # At least one assistant message is expected in every example
    if not has_assistant:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for kind, count in format_errors.items():
        print(f"{kind}: {count}")


No errors found


In [None]:
encoding = tiktoken.get_encoding("cl100k_base")

In [None]:
# helper functions for token counting
def from_message_num_tokens(messages, tokens_per_message=3, tokens_per_name=1):
  """Estimate the total token count of one chat example.

  Every message contributes ``tokens_per_message`` plus the encoded length of
  each of its values (using the module-level ``encoding``); a message carrying
  a ``name`` key adds ``tokens_per_name``. A flat 3 tokens is added once per
  example.
  """
  total = 3
  for message in messages:
    total += tokens_per_message
    total += sum(len(encoding.encode(value)) for value in message.values())
    if "name" in message:
      total += tokens_per_name
  return total

def from_message_num_assistant_tokens(messages):
  """Count the encoded tokens in the content of assistant messages only."""
  return sum(
    len(encoding.encode(message["content"]))
    for message in messages
    if message["role"] == "assistant"
  )

def print_overview(values, name):
  """Print min/max, mean/median and the 5th/95th percentiles of ``values``.

  Args:
    values: Non-empty sequence of numbers (``min``/``max`` raise on empty input).
    name: Label used in the heading line.
  """
  print(f"\n #### Distribution of {name}:")
  print(f"min / max: {min(values)}, {max(values)}")
  print(f"mean / median: {np.mean(values)}, {np.median(values)}")
  # The label says p5 / p95, so ask for the 0.05 and 0.95 quantiles
  # (0.1 / 0.9 would be p10 / p90).
  print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")




In [None]:
# tokens counts and warnings - from OpenAI cookbook

all_chats = [ex["messages"] for ex in dataset]

# An example "misses" a role when none of its messages carries that role
n_missing_system = sum(
    all(message["role"] != "system" for message in chat) for chat in all_chats
)
n_missing_user = sum(
    all(message["role"] != "user" for message in chat) for chat in all_chats
)

# Per-example statistics, index-aligned with `dataset`
n_messages = [len(chat) for chat in all_chats]
convo_lens = [from_message_num_tokens(chat) for chat in all_chats]
assistant_message_lens = [from_message_num_assistant_tokens(chat) for chat in all_chats]

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)

for stat_values, stat_name in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_overview(stat_values, stat_name)

n_too_long = len([length for length in convo_lens if length > 4096])
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")


Num examples missing system message: 0
Num examples missing user message: 0

 #### Distribution of num_messages_per_example:
min / max: 12, 19
mean / median: 15.2, 15.0
p5 / p95: 13.8, 17.2

 #### Distribution of num_total_tokens_per_example:
min / max: 430, 673
mean / median: 556.0, 587.0
p5 / p95: 454.3, 608.1999999999999

 #### Distribution of num_assistant_tokens_per_example:
min / max: 194, 436
mean / median: 326.2, 358.0
p5 / p95: 208.4, 388.29999999999995

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [None]:
# Pricing and default n_epochs estimate

MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 1
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(dataset)
examples_seen = n_train_examples * TARGET_EPOCHS

if examples_seen < MIN_TARGET_EXAMPLES:
    # Small dataset: raise the epoch count, capped at MAX_DEFAULT_EPOCHS
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_seen > MAX_TARGET_EXAMPLES:
    # Large dataset: lower the epoch count, but never below MIN_DEFAULT_EPOCHS
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example is counted up to MAX_TOKENS_PER_EXAMPLE tokens at most
billed_lengths = [min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens]
n_billing_tokens_in_dataset = sum(billed_lengths)

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
print("See pricing page to estimate total costs")


Dataset has ~5560 tokens that will be charged for during training
By default, you'll train for 10 epochs on this dataset
By default, you'll be charged for ~55600 tokens
See pricing page to estimate total costs


In [None]:
dataset[:1]

[{'messages': [{'role': 'system',
    'content': 'You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are friendly and does often flirt'},
   {'role': 'user',
    'content': 'Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?'},
   {'role': 'assistant',
    'content': "Of course, I'd be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses."},
   {'role': 'user',
    'content': "I'll take a look. What if the coolant system is working fine?"},
   {'role': 'assistant',
    'content': 'Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that 

In [None]:
# function to save training data
import json

def save_to_jsonl(conversations, file_path):
  """Write each conversation as one JSON document per line to ``file_path``.

  The file is opened in write mode, so any existing content is replaced.
  """
  with open(file_path, 'w') as out:
    out.writelines(json.dumps(record) + '\n' for record in conversations)



In [None]:
# train dataset

save_to_jsonl(dataset, '/content/samantha_task_train.jsonl')

# validation dataset
# Validation must not reuse training examples, otherwise the reported
# validation loss is measured on data the model was trained on. The training
# file needs every example in `dataset` (OpenAI requires at least 10), so the
# validation examples are converted from the raw records that follow the ones
# `dataset` was built from (assumes `dataset` holds the first len(dataset)
# records of `json_dataset`).
N_VALIDATION_EXAMPLES = 1
validation_dataset = [
  prep_openai_format(data, system_message=system_message)
  for data in json_dataset[len(dataset):len(dataset) + N_VALIDATION_EXAMPLES]
]
assert validation_dataset, "json_dataset has no records left for validation"

save_to_jsonl(validation_dataset, '/content/samantha_task_validation.jsonl')


### Upload training/validation dataset

##### For the `openai` Python package v1.x client API (openai>=1.0.0)

In [None]:
import os
from openai import OpenAI

# Never store an API key in the notebook: read it from the OPENAI_API_KEY
# environment variable and fall back to a hidden prompt when it is not set.
from getpass import getpass

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: "),
)

In [None]:
training_dataset_file_name = '/content/samantha_task_train.jsonl'
validation_dataset_file_name = '/content/samantha_task_validation.jsonl'

In [None]:
from pathlib import Path

training_response = client.files.create(
    file=Path(training_dataset_file_name),
    purpose="fine-tune"
)

In [None]:
training_response

FileObject(id='file-C6eXRaSJFMYlXPzqUKw7Txxv', bytes=28792, created_at=1717514062, filename='samantha_task_train.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
training_file_id = training_response.id
training_file_id


'file-C6eXRaSJFMYlXPzqUKw7Txxv'

In [None]:
validation_response = client.files.create(
    file=Path(validation_dataset_file_name),
    purpose="fine-tune"
)


In [None]:
validation_response


FileObject(id='file-BIljUDYFUbYHasgP4SG3TUuw', bytes=3174, created_at=1717514065, filename='samantha_task_validation.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
validation_file_id = validation_response.id

In [None]:
validation_file_id

'file-BIljUDYFUbYHasgP4SG3TUuw'

### Start a fine-tuning job

In [None]:
response = client.fine_tuning.jobs.create(
    model = "gpt-3.5-turbo",
    training_file = training_file_id,
    validation_file = validation_file_id,
    suffix="samantha-test"
)


In [None]:
response


FineTuningJob(id='ftjob-ZOgKx3jL6ctr3p5okvFtzZpr', created_at=1717514070, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-8Z5JRCzKJzkqgbxg1GntqnHs', result_files=[], seed=1966200900, status='validating_files', trained_tokens=None, training_file='file-C6eXRaSJFMYlXPzqUKw7Txxv', validation_file='file-BIljUDYFUbYHasgP4SG3TUuw', estimated_finish=None, integrations=[], user_provided_suffix='samantha-test')

In [None]:
job_id = response.id
job_id

'ftjob-ZOgKx3jL6ctr3p5okvFtzZpr'

In [None]:
client.fine_tuning.jobs.list()


SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-ZOgKx3jL6ctr3p5okvFtzZpr', created_at=1717514070, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=10, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-8Z5JRCzKJzkqgbxg1GntqnHs', result_files=[], seed=1966200900, status='running', trained_tokens=None, training_file='file-C6eXRaSJFMYlXPzqUKw7Txxv', validation_file='file-BIljUDYFUbYHasgP4SG3TUuw', estimated_finish=None, integrations=[], user_provided_suffix='samantha-test'), FineTuningJob(id='ftjob-tpVolq79ZTHPC5H0OhM9tjTL', created_at=1717513633, error=Error(code='invalid_validation_file', message='Expected file to have JSONL format, where every line is a valid JSON dictionary. Line 1 is not a dictionary.', param='validation_file'), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_s

In [None]:
 # Use the id captured from the create() response so a re-run follows its own job
 client.fine_tuning.jobs.retrieve(job_id)

FineTuningJob(id='ftjob-ZOgKx3jL6ctr3p5okvFtzZpr', created_at=1717514070, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=10, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-8Z5JRCzKJzkqgbxg1GntqnHs', result_files=[], seed=1966200900, status='running', trained_tokens=None, training_file='file-C6eXRaSJFMYlXPzqUKw7Txxv', validation_file='file-BIljUDYFUbYHasgP4SG3TUuw', estimated_finish=None, integrations=[], user_provided_suffix='samantha-test')

In [None]:
# Fetch the event log of the job identified by `job_id` rather than a pasted id
job_response = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id)

In [None]:
job_response

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-f2Q578vNq0Se4XZ16Mqinwl5', created_at=1717514452, level='info', message='Step 69/100: training loss=0.53, validation loss=0.65', object='fine_tuning.job.event', data={'step': 69, 'train_loss': 0.5253582000732422, 'valid_loss': 0.6456686840781682, 'total_steps': 100, 'train_mean_token_accuracy': 0.8328840732574463, 'valid_mean_token_accuracy': 0.8151898734177215}, type='metrics'), FineTuningJobEvent(id='ftevent-APYCfwDWWCjh46VMJcR1v27f', created_at=1717514447, level='info', message='Step 68/100: training loss=0.36, validation loss=0.65', object='fine_tuning.job.event', data={'step': 68, 'train_loss': 0.36249402165412903, 'valid_loss': 0.6502999269509617, 'total_steps': 100, 'train_mean_token_accuracy': 0.9068825840950012, 'valid_mean_token_accuracy': 0.8075949367088607}, type='metrics'), FineTuningJobEvent(id='ftevent-bKX0scdoPWjhNuCSBO95CvBV', created_at=1717514445, level='info', message='Step 67/100: training loss

In [None]:
events = job_response.data
events

[FineTuningJobEvent(id='ftevent-f2Q578vNq0Se4XZ16Mqinwl5', created_at=1717514452, level='info', message='Step 69/100: training loss=0.53, validation loss=0.65', object='fine_tuning.job.event', data={'step': 69, 'train_loss': 0.5253582000732422, 'valid_loss': 0.6456686840781682, 'total_steps': 100, 'train_mean_token_accuracy': 0.8328840732574463, 'valid_mean_token_accuracy': 0.8151898734177215}, type='metrics'),
 FineTuningJobEvent(id='ftevent-APYCfwDWWCjh46VMJcR1v27f', created_at=1717514447, level='info', message='Step 68/100: training loss=0.36, validation loss=0.65', object='fine_tuning.job.event', data={'step': 68, 'train_loss': 0.36249402165412903, 'valid_loss': 0.6502999269509617, 'total_steps': 100, 'train_mean_token_accuracy': 0.9068825840950012, 'valid_mean_token_accuracy': 0.8075949367088607}, type='metrics'),
 FineTuningJobEvent(id='ftevent-bKX0scdoPWjhNuCSBO95CvBV', created_at=1717514445, level='info', message='Step 67/100: training loss=0.30, validation loss=0.66', object='

In [None]:
for event in events:
  print(event.message)

Step 69/100: training loss=0.53, validation loss=0.65
Step 68/100: training loss=0.36, validation loss=0.65
Step 67/100: training loss=0.30, validation loss=0.66
Step 66/100: training loss=0.57, validation loss=0.66
Step 65/100: training loss=0.75, validation loss=0.67
Step 64/100: training loss=0.69, validation loss=0.68
Step 63/100: training loss=0.51, validation loss=0.69
Step 62/100: training loss=0.35, validation loss=0.69
Step 61/100: training loss=0.59, validation loss=0.69
Step 60/100: training loss=0.78, validation loss=0.70, full validation loss=0.70
Step 59/100: training loss=0.63, validation loss=0.71
Step 58/100: training loss=0.70, validation loss=0.71
Step 57/100: training loss=0.36, validation loss=0.72
Step 56/100: training loss=0.60, validation loss=0.73
Step 55/100: training loss=0.63, validation loss=0.74
Step 54/100: training loss=0.42, validation loss=0.75
Step 53/100: training loss=0.45, validation loss=0.76
Step 52/100: training loss=0.79, validation loss=0.78
S

### Generating using new model

In [None]:
# Get the fine-tuned model's name from the finished job

In [None]:
response = client.fine_tuning.jobs.retrieve(job_id)
response

FineTuningJob(id='ftjob-ZOgKx3jL6ctr3p5okvFtzZpr', created_at=1717514070, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-3.5-turbo-0125:personal:samantha-test:9WQHNLiS', finished_at=1717514539, hyperparameters=Hyperparameters(n_epochs=10, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-8Z5JRCzKJzkqgbxg1GntqnHs', result_files=['file-61hBJwl7PA7aHkXCQy4XMXuO'], seed=1966200900, status='succeeded', trained_tokens=55400, training_file='file-C6eXRaSJFMYlXPzqUKw7Txxv', validation_file='file-BIljUDYFUbYHasgP4SG3TUuw', estimated_finish=None, integrations=[], user_provided_suffix='samantha-test')

In [None]:
fine_tune_model_id = response.fine_tuned_model
fine_tune_model_id


'ft:gpt-3.5-turbo-0125:personal:samantha-test:9WQHNLiS'

In [None]:
test_messages = []
test_messages.append({"role": "system", "content": system_message})
test_message = "How are you today Samantha?"

test_messages.append({"role": "user", "content": test_message})

print(test_messages)

[{'role': 'system', 'content': 'You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are friendly and does often flirt'}, {'role': 'user', 'content': 'How are you today Samantha?'}]


In [None]:
response = client.chat.completions.create(
    model = fine_tune_model_id,
    messages = test_messages
)

print(response.choices[0].message.content)



I'm doing well, thank you for asking! How can I assist you today?
