<a href="https://colab.research.google.com/github/isaac-mackey/mind-uploading/blob/main/Isaac_SMS_Role_System_User_Assistant_Fine_Tuning_GPT3_5Turbo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installs and Imports

In [None]:
!pip install openai
!pip install numpy
!pip install tiktoken

In [None]:
import openai
import csv
import json
import os
import numpy as np
from collections import defaultdict
import tiktoken

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live
# under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

## Validate the dataset for training

In [None]:
#from OpenAI website to format data;  https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
# !openai tools fine_tunes.prepare_data -f /content/drive/My\ Drive/sms-20231224020008.xml-role-system-user-4.json

# Next, we specify the data path and open the JSONL file

# data_path = "/content/drive/My Drive/in-sent-after-2023-07-01-before-2024-01-12.mbox-5.json"
data_path = '/content/drive/My Drive/sms-combined.xml-role-system-user-7.json'

# Load dataset: the file holds one JSON document per line.
# - Decode explicitly as UTF-8 so the result does not depend on the platform's
#   default text encoding.
# - Skip blank lines (e.g. a trailing newline), which json.loads would reject.
with open(data_path, encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# We can inspect the data quickly by checking the number of examples and the first item

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
# Guard the preview so an empty file reports "Num examples: 0" instead of
# raising IndexError on dataset[0].
first_messages = dataset[0]["messages"][:5] if dataset else []
for message in first_messages:
    print(message)

print('\n'*3)

# Now that we have a sense of the data, we need to go through all the different examples and check to make sure the formatting is correct and matches the Chat completions message structure

# Format error checks: tally each kind of problem by name.
format_errors = defaultdict(int)

for ex in dataset:
    # Every example must be a JSON object ...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ... carrying a non-empty "messages" list.
    messages = ex.get("messages")
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # Both mandatory keys must be present.
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        # Only role / content / name are accepted as keys.
        if not all(k in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role") not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        # Content has to be a non-empty string.
        content = message.get("content")
        if not (content and isinstance(content, str)):
            format_errors["missing_content"] += 1

    # An example without any assistant turn is counted as an error.
    roles_present = [message.get("role") for message in messages]
    if "assistant" not in roles_present:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")

# Beyond the structure of the message, we also need to ensure that the length does not exceed the 4096 token limit.

# Token counting functions
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Approximate the token count of a list of chat messages.

    Each message contributes ``tokens_per_message`` plus the encoded length of
    every value in the message dict (role, content, name, ...); a message that
    has a "name" key contributes ``tokens_per_name`` more. A constant 3 is
    added once per call (presumably the reply-priming overhead described in
    the cookbook linked above -- confirm there).

    Args:
        messages: iterable of message dicts whose values are strings.
        tokens_per_message: fixed overhead added for every message.
        tokens_per_name: extra overhead for a message with a "name" key.

    Returns:
        The estimated number of tokens as an int.
    """
    total = 0
    for msg in messages:
        field_tokens = sum(len(encoding.encode(text)) for text in msg.values())
        name_tokens = tokens_per_name if "name" in msg else 0
        total += tokens_per_message + field_tokens + name_tokens
    return total + 3

def num_assistant_tokens_from_messages(messages):
    """Return the total encoded length of the content of all assistant messages.

    Only messages whose "role" is "assistant" are counted; no per-message
    overhead is added.
    """
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics (min/max, mean/median, p5/p95) for ``values``.

    Args:
        values: sequence of numbers to summarise.
        name: label printed in the section header.

    An empty sequence prints the header followed by "(no values)" instead of
    raising from ``min()``.
    """
    print(f"\n#### Distribution of {name}:")
    if len(values) == 0:
        print("(no values)")
        return
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The label promises the 5th / 95th percentiles, so use the 0.05 / 0.95
    # quantiles (the previous 0.1 / 0.9 were actually p10 / p90).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

# Last, we can look at the results of the different formatting operations before proceeding with creating a fine-tuning job:

# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

MAX_TOKENS_PER_EXAMPLE = 2048

# Split every conversation that is over the token budget into several shorter
# examples. Each piece starts with the conversation's leading system message
# (when there is one) and the token cost of that system message is charged
# against every piece, so a piece only exceeds the budget when a single
# message is too large on its own.
dataset_split_long_messages = []
for ex in dataset:
    messages = ex["messages"]
    if num_tokens_from_messages(messages) <= MAX_TOKENS_PER_EXAMPLE:
        dataset_split_long_messages.append(ex)
        continue

    # Only repeat the first message across pieces if it really is a system
    # message; otherwise a user/assistant turn would be duplicated.
    if messages[0].get("role") == "system":
        shared_prefix = [messages[0]]
        remaining = messages[1:]
    else:
        shared_prefix = []
        remaining = messages
    # Per-message counts include the helper's constant overhead, so the
    # running sum slightly over-estimates the real size (errs on the safe side).
    prefix_tokens = sum(num_tokens_from_messages([m]) for m in shared_prefix)

    current_sublist = list(shared_prefix)
    current_sum = prefix_tokens
    for message in remaining:
        message_tokens = num_tokens_from_messages([message])
        has_payload = len(current_sublist) > len(shared_prefix)
        # Close the current piece only if it already holds a real message;
        # this avoids emitting empty or system-only examples.
        if has_payload and current_sum + message_tokens > MAX_TOKENS_PER_EXAMPLE:
            dataset_split_long_messages.append({"messages": current_sublist})
            current_sublist = list(shared_prefix)     # Start a new sublist
            current_sum = prefix_tokens
        current_sublist.append(message)
        current_sum += message_tokens

    # Keep the final piece. A conversation that consists of nothing but an
    # oversized system message is kept as-is so it shows up in the
    # "over the token limit" count below rather than vanishing.
    if len(current_sublist) > len(shared_prefix) or not remaining:
        dataset_split_long_messages.append({"messages": current_sublist})

dataset = dataset_split_long_messages

print("Num examples in dataset",str(len(dataset)))

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > MAX_TOKENS_PER_EXAMPLE for l in convo_lens)
print(f"\n{n_too_long} examples may be over the token limit, they will be truncated during fine-tuning")

# Pricing and default n_epochs estimate
# MAX_TOKENS_PER_EXAMPLE = 4096

MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES = 100, 25000
TARGET_EPOCHS, MIN_EPOCHS, MAX_EPOCHS = 3, 1, 25

# Pick the epoch count: start from TARGET_EPOCHS and adjust it when
# examples * epochs falls outside [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES].
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example is billed for at most MAX_TOKENS_PER_EXAMPLE tokens.
n_billing_tokens_in_dataset = sum(
    length if length < MAX_TOKENS_PER_EXAMPLE else MAX_TOKENS_PER_EXAMPLE
    for length in convo_lens
)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

# Calculate the estimated cost for fine-tuning
cost_per_100k_tokens = 0.80  # Cost for every 100,000 tokens
total_billed_tokens = n_epochs * n_billing_tokens_in_dataset
estimated_cost = (total_billed_tokens / 100000) * cost_per_100k_tokens
print(f"Estimated cost for fine-tuning: approximately ${estimated_cost:.2f}") #I added this for actual cost based on current pricing

## Split data into training and testing files

In [None]:
import random

#from OpenAI website to format data;  https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
# !openai tools fine_tunes.prepare_data -f /content/drive/My\ Drive/sms-20231224020008.xml-role-system-user-4.json

# Next, we specify the data path and open the JSONL file

# NOTE: data_path is assigned twice; only the second assignment takes effect.
data_path = '/content/drive/My Drive/sms-combined.xml-role-system-user-7.json'
data_path = "/content/drive/My Drive/in-sent-after-2023-07-01-before-2024-01-12.mbox-5.json"

# Load dataset: the file holds one JSON document per line.
# - Decode explicitly as UTF-8 so the result does not depend on the platform's
#   default text encoding.
# - Skip blank lines (e.g. a trailing newline), which json.loads would reject.
with open(data_path, encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# We can inspect the data quickly by checking the number of examples and the first item

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
# Guard the preview so an empty file reports "Num examples: 0" instead of
# raising IndexError on dataset[0].
first_messages = dataset[0]["messages"][:5] if dataset else []
for message in first_messages:
    print(message)

print('\n'*3)

# Now that we have a sense of the data, we need to go through all the different examples and check to make sure the formatting is correct and matches the Chat completions message structure

# Format error checks: tally each kind of problem by name.
format_errors = defaultdict(int)

for ex in dataset:
    # Every example must be a JSON object ...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ... carrying a non-empty "messages" list.
    messages = ex.get("messages")
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # Both mandatory keys must be present.
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        # Only role / content / name are accepted as keys.
        if not all(k in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role") not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        # Content has to be a non-empty string.
        content = message.get("content")
        if not (content and isinstance(content, str)):
            format_errors["missing_content"] += 1

    # An example without any assistant turn is counted as an error.
    roles_present = [message.get("role") for message in messages]
    if "assistant" not in roles_present:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")

# Beyond the structure of the message, we also need to ensure that the length does not exceed the 4096 token limit.

# Token counting functions
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Approximate the token count of a list of chat messages.

    Each message contributes ``tokens_per_message`` plus the encoded length of
    every value in the message dict (role, content, name, ...); a message that
    has a "name" key contributes ``tokens_per_name`` more. A constant 3 is
    added once per call (presumably the reply-priming overhead described in
    the cookbook linked above -- confirm there).

    Args:
        messages: iterable of message dicts whose values are strings.
        tokens_per_message: fixed overhead added for every message.
        tokens_per_name: extra overhead for a message with a "name" key.

    Returns:
        The estimated number of tokens as an int.
    """
    total = 0
    for msg in messages:
        field_tokens = sum(len(encoding.encode(text)) for text in msg.values())
        name_tokens = tokens_per_name if "name" in msg else 0
        total += tokens_per_message + field_tokens + name_tokens
    return total + 3

def num_assistant_tokens_from_messages(messages):
    """Return the total encoded length of the content of all assistant messages.

    Only messages whose "role" is "assistant" are counted; no per-message
    overhead is added.
    """
    return sum(
        len(encoding.encode(msg["content"]))
        for msg in messages
        if msg["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics (min/max, mean/median, p5/p95) for ``values``.

    Args:
        values: sequence of numbers to summarise.
        name: label printed in the section header.

    An empty sequence prints the header followed by "(no values)" instead of
    raising from ``min()``.
    """
    print(f"\n#### Distribution of {name}:")
    if len(values) == 0:
        print("(no values)")
        return
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The label promises the 5th / 95th percentiles, so use the 0.05 / 0.95
    # quantiles (the previous 0.1 / 0.9 were actually p10 / p90).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

# Last, we can look at the results of the different formatting operations before proceeding with creating a fine-tuning job:

# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_missing_assistant = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

MAX_TOKENS_PER_EXAMPLE = 2048

# Split every conversation that is over the token budget into several shorter
# examples. Each piece starts with the conversation's leading system message
# (when there is one) and the token cost of that system message is charged
# against every piece, so a piece only exceeds the budget when a single
# message is too large on its own.
dataset_split_long_messages = []

for ex in dataset:
    messages = ex["messages"]
    if num_tokens_from_messages(messages) <= MAX_TOKENS_PER_EXAMPLE:
        dataset_split_long_messages.append(ex)
        continue

    # Only repeat the first message across pieces if it really is a system
    # message; otherwise a user/assistant turn would be duplicated.
    if messages[0].get("role") == "system":
        shared_prefix = [messages[0]]
        remaining = messages[1:]
    else:
        shared_prefix = []
        remaining = messages
    # Per-message counts include the helper's constant overhead, so the
    # running sum slightly over-estimates the real size (errs on the safe side).
    prefix_tokens = sum(num_tokens_from_messages([m]) for m in shared_prefix)

    current_sublist = list(shared_prefix)
    current_sum = prefix_tokens
    for message in remaining:
        message_tokens = num_tokens_from_messages([message])
        has_payload = len(current_sublist) > len(shared_prefix)
        # Close the current piece only if it already holds a real message;
        # this avoids emitting empty or system-only examples.
        if has_payload and current_sum + message_tokens > MAX_TOKENS_PER_EXAMPLE:
            dataset_split_long_messages.append({"messages": current_sublist})
            current_sublist = list(shared_prefix)     # Start a new sublist
            current_sum = prefix_tokens
        current_sublist.append(message)
        current_sum += message_tokens

    # Keep the final piece. A conversation that consists of nothing but an
    # oversized system message is kept as-is; the assistant filter below
    # then discards it.
    if len(current_sublist) > len(shared_prefix) or not remaining:
        dataset_split_long_messages.append({"messages": current_sublist})

dataset = dataset_split_long_messages

dataset_train = []
dataset_test = []

# Random ~80/20 train/test split; examples without any assistant message are
# dropped. NOTE(review): random is not seeded here, so the split (and the
# files written from it) differs between runs -- seed it if reproducibility
# matters.
for d in dataset:
    if not any(message["role"] == "assistant" for message in d['messages']):
        continue
    if random.random() < .8:
        dataset_train.append(d)
    else:
        dataset_test.append(d)

print("Num examples in training dataset",str(len(dataset_train)))
print("Num examples in testing dataset",str(len(dataset_test)))

for ex in dataset_train:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    if not any(message["role"] == "assistant" for message in messages):
        n_missing_assistant += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print("Num examples missing assistant message:", n_missing_assistant)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > MAX_TOKENS_PER_EXAMPLE for l in convo_lens)
print(f"\n{n_too_long} examples may be over the token limit, they will be truncated during fine-tuning")

# Pricing and default n_epochs estimate
# MAX_TOKENS_PER_EXAMPLE = 4096

MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES = 100, 25000
TARGET_EPOCHS, MIN_EPOCHS, MAX_EPOCHS = 3, 1, 25

# Pick the epoch count: start from TARGET_EPOCHS and adjust it when
# examples * epochs falls outside [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES].
n_train_examples = len(dataset_train)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example is billed for at most MAX_TOKENS_PER_EXAMPLE tokens.
n_billing_tokens_in_dataset = sum(
    length if length < MAX_TOKENS_PER_EXAMPLE else MAX_TOKENS_PER_EXAMPLE
    for length in convo_lens
)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

# Calculate the estimated cost for fine-tuning
cost_per_100k_tokens = 0.80  # Cost for every 100,000 tokens
total_billed_tokens = n_epochs * n_billing_tokens_in_dataset
estimated_cost = (total_billed_tokens / 100000) * cost_per_100k_tokens
print(f"Estimated cost for fine-tuning: approximately ${estimated_cost:.2f}") #I added this for actual cost based on current pricing

In [None]:
# Function to save the dataset as a JSONL file
def save_to_jsonl(conversations, file_path):
    """Write ``conversations`` to ``file_path``, one JSON document per line.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file_path, 'w') as out:
        out.writelines(json.dumps(item) + '\n' for item in conversations)

# Output files sit next to the source dataset: the dataset path with a
# "-train.jsonl" / "-test.jsonl" suffix appended.
training_jsonl_file_path = f"{data_path}-train.jsonl"
testing_jsonl_file_path = f"{data_path}-test.jsonl"

# Write the training split first, then the testing split.
for split, path in (
    (dataset_train, training_jsonl_file_path),
    (dataset_test, testing_jsonl_file_path),
):
    save_to_jsonl(split, path)

## Upload data to OpenAI for training

In [None]:
from datetime import datetime
import os

# Format the current date and time in a human-readable format
print(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

from openai import OpenAI

# Build the API client for this cell (previously only a stray ")" was left
# here, which is a syntax error). The key is read from the OPENAI_API_KEY
# environment variable so that no secret is stored in the notebook; set it
# before running this cell.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

training_file_name = training_jsonl_file_path

# Upload inside a "with" block so the file handle is closed after the request.
with open(training_file_name, "rb") as training_file:
    training_response = client.files.create(
      file=training_file,
      purpose="fine-tune"
    )
training_file_id = training_response.id
print("Training file id:", training_file_id)

validation_file_name = testing_jsonl_file_path

with open(validation_file_name, "rb") as validation_file:
    validation_response = client.files.create(
      file=validation_file,
      purpose="fine-tune"
    )
validation_file_id = validation_response.id

#Gives validation file id
print("Validation file id:", validation_file_id)

# 2024-01-05 8pm: Training file id: file-TMQuBaSWdCa2MKMa7iDA8Klm

# 2024-01-05 10pm: Training file id: file-px88PEb4t11QC3rLY7FEFZCm

# 2023-01-08 11pm: Training file id: file-oF5tFlyiWMDWVDMmmEQRXUn5

# 2024-01-09 18:38:00
# Training file id: file-RVsnOPc81XyTloaJIi8A2qaA
# Validation file id: file-oTjXdDxeiAquIBJIprcI0258

In [None]:
#Create Fine-Tuning Job
suffix_name = "isaac-email-1"

# Collect the request arguments first so the API call itself stays short.
job_request = {
    "training_file": training_file_id,
    "model": "gpt-3.5-turbo",
    "suffix": suffix_name,
    "validation_file": validation_file_id,
}
fine_tuned_model_training_response = client.fine_tuning.jobs.create(**job_request)

# Keep the job id around for later lookups.
job_id = fine_tuned_model_training_response.id

print(fine_tuned_model_training_response)

In [None]:
from datetime import datetime

def UNIX_timestamp_to_formatted_datetime(date):
    """Format a UNIX timestamp as a 'YYYY-MM-DD HH:MM:SS' string in UTC.

    Args:
        date: seconds since the epoch (anything ``int()`` accepts), or None.

    Returns:
        The formatted UTC date-time string, or None when ``date`` is None.
    """
    # Imported locally so the function does not depend on `timezone` having
    # been imported by the surrounding cell.
    from datetime import datetime, timezone

    if date is None:
        return None
    unix_timestamp = int(date)  # Truncate to whole seconds
    # datetime.utcfromtimestamp() is deprecated; build an aware UTC datetime
    # instead (the formatted output is the same).
    date_time_obj = datetime.fromtimestamp(unix_timestamp, tz=timezone.utc)
    # Format the datetime object as a string
    return date_time_obj.strftime('%Y-%m-%d %H:%M:%S')

!pip install openai
from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it: a key stored in the notebook is exposed to everyone who can
# read the file and should be rotated. Set the variable before running this cell.
import os

client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

#paste ft fine-tune file id from line "message": "Created fine-tune: ft-XXXXXXXXX"
# Fetch up to 10 fine-tuning jobs and report how many came back.
list_of_jobs_response = client.fine_tuning.jobs.list(limit=10)
print('Jobs found:', len(list_of_jobs_response.data))
print()
# for x in list_of_jobs_response:
#   print_fields(x)

# Reverse the returned list in place before printing.
events = list_of_jobs_response.data
events.reverse()

for event in events:
    # Jobs with a truthy error field are not shown.
    if event.error:
        continue
    created = UNIX_timestamp_to_formatted_datetime(event.created_at)
    finished = UNIX_timestamp_to_formatted_datetime(event.finished_at)
    print(event)
    print('created: ', created)
    print('finished: ', finished)
    print()

In [None]:
!pip install openai
from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it: a key stored in the notebook is exposed to everyone who can
# read the file and should be rotated. Set the variable before running this cell.
import os

client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

In [None]:
# Dump every file object returned by the API, followed by its id.
file_list = client.files.list()

file_names = {}
file_sizes = {}
for x in file_list:
    print(x, x.id, sep="\n")

In [None]:
# Index the listed files by id so jobs can be matched to their training file.
file_list = client.files.list()

file_names = {}
file_sizes = {}
for x in file_list:
    file_sizes[x.id] = x.bytes
    file_names[x.id] = x.filename

#paste ft fine-tune file id from line "message": "Created fine-tune: ft-XXXXXXXXX"
list_of_jobs_response = client.fine_tuning.jobs.list(limit=10)
print('Jobs found:',str(len(list_of_jobs_response.data)))
print()

# Order jobs by creation time, oldest first.
events = list_of_jobs_response.data
events.reverse()
events = sorted(events, key=lambda x: x.created_at)

for i,event in enumerate(events):
    if not event.error:
        # A job may reference a training file that is not in the file listing
        # (e.g. it was deleted); fall back to placeholders instead of raising
        # KeyError and aborting the whole listing.
        training_file_name = file_names.get(event.training_file, '<file not listed>')
        training_file_size = file_sizes.get(event.training_file)
        print(i)
        print('fine_tuned_model:',event.fine_tuned_model)
        print('training file name:',training_file_name)
        print('training file size:',"{:,}".format(training_file_size) if training_file_size is not None else 'unknown')
        print()

In [None]:
def print_fields(obj, indent=0):
    """Recursively print the fields of ``obj`` as an indented tree.

    Dicts are walked through their items and other objects through their
    ``__dict__``; each key is printed as "key:" and its value is printed two
    spaces deeper. Anything else is printed as-is at the current indent.
    """
    pad = ' ' * indent

    # Pick the mapping to walk, or print a leaf value and stop.
    if isinstance(obj, dict):
        fields = obj
    elif hasattr(obj, '__dict__'):
        fields = obj.__dict__
    else:
        print(f"{pad}{obj}")
        return

    for field_name, field_value in fields.items():
        print(f"{pad}{field_name}:")
        print_fields(field_value, indent + 2)

# Index into `events` of the job to inspect.
model_selector = 4
selected_job = events[model_selector]
print_fields(selected_job)

#retrieve fine-tune model
fine_tuned_model_id = selected_job.fine_tuned_model
print("\nFine-tuned model id:", fine_tuned_model_id)

In [None]:
# Show the first entry of result_files for the last job in `events`.
events[-1].result_files[0]

'file-gCEk7yFpbxRIjeQC5FWbA8tD'

In [None]:
# files.retrieve_content() is deprecated in the openai client; files.content()
# is its replacement, and .text yields the file body as a string.
content = client.files.content("file-B6Qby1PY4Pf3j9qIEVGiiBCG").text

  content = client.files.retrieve_content("file-B6Qby1PY4Pf3j9qIEVGiiBCG")


## Single completion mode

In [None]:
#Test it out!
import os

test_messages = []

# NOTE(review): the two string pieces are joined without a period after
# "Marine Corps". Left unchanged because the prompt may need to match the
# system message used in the training data -- confirm before editing.
system_message = ("You are a computer science graduate in the Marine Corps"
                    " Be polite and formal. Do not apologize. Use correct grammar and avoid logic fallacies.")
test_messages.append({"role": "system", "content": system_message})
user_message = "What did you do today?"
test_messages.append({"role": "user", "content": user_message})

from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it: a key stored in the notebook is exposed to everyone who can
# read the file and should be rotated.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

# fine_tuned_model_id = 'ft:gpt-3.5-turbo-0613:university-of-california-santa-barbara:isaac-sms-bot:8ds3OPKx'
fine_tuned_model_id = events[model_selector].fine_tuned_model

#OpenAI Chat Completions
response = client.chat.completions.create(
    model=fine_tuned_model_id, #can test it against gpt-3.5-turbo to see difference
    messages=test_messages,
    temperature=0.1,
    max_tokens=500
)
print(response.choices[0].message)

In [None]:
# `response` comes from client.chat.completions.create(), whose choices carry
# the generated text on .message.content (as the conversation cell in this
# notebook reads it); they have no .text attribute. Fall back to "" so the
# string operations below still work if no content was returned.
response_text = response.choices[0].message.content or ""

# Find the first occurrence of 'END' and slice the string up to that point
end_index = response_text.find('END')
if end_index != -1:
    response_text = response_text[:end_index]

# Clean up the response text by removing leading/trailing white space
response_text = response_text.strip()

print(response_text)

## Conversation Mode

In [None]:
!pip install openai
from openai import OpenAI

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it: a key stored in the notebook is exposed to everyone who can
# read the file and should be rotated. Set the variable before running this cell.
import os

client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

In [None]:
# Index the listed files by id so the selected job can be matched to its
# training file.
file_list = client.files.list()

file_names = {}
file_sizes = {}
for x in file_list:
    file_sizes[x.id] = x.bytes
    file_names[x.id] = x.filename

#paste ft fine-tune file id from line "message": "Created fine-tune: ft-XXXXXXXXX"
list_of_jobs_response = client.fine_tuning.jobs.list(limit=10)

# Order jobs by creation time, oldest first.
events = list_of_jobs_response.data
events.reverse()
events = sorted(events, key=lambda x: x.created_at)

# Index into `events` of the job whose model is used below.
model_selector = 4
event = events[model_selector]

# The training file may be missing from the listing and trained_tokens may be
# unset; print placeholders in those cases instead of raising.
training_file_size = file_sizes.get(event.training_file)
trained_tokens = event.trained_tokens
print('fine_tuned_model:',event.fine_tuned_model)
print('training file name:',file_names.get(event.training_file, '<file not listed>'))
print('training file size:',"{:,}".format(training_file_size) if training_file_size is not None else 'unknown')
print('trained_tokens:',"{:,}".format(trained_tokens) if trained_tokens is not None else 'unknown')
print('n_epochs:',event.hyperparameters.n_epochs)
print()

#retrieve fine-tune model
fine_tuned_model_id = event.fine_tuned_model

In [None]:
#Test it out!
chat_messages = []

# NOTE(review): the two string pieces are joined without a period after
# "Marine Corps". Left unchanged because the prompt may need to match the
# system message used in the training data -- confirm before editing.
system_message = ("You are a computer science graduate in the Marine Corps"
                      " Be polite and formal. Do not apologize. Use correct grammar and avoid logic fallacies.")
chat_messages.append({"role": "system", "content": system_message})

# Up to 10 user/assistant exchanges. The prompt is read at the top of each
# turn so that typing 'end' stops the conversation on any turn (including the
# first) without being sent to the model, and no input is read and then
# thrown away after the last exchange.
for _ in range(10):

    user_message = input("user     : ")  # User input
    if user_message == 'end':
        print('Conversation ended by user')
        break

    chat_messages.append({"role": "user", "content": user_message})

    #OpenAI Chat Completions
    response = client.chat.completions.create(
        model=fine_tuned_model_id, #can test it against gpt-3.5-turbo to see difference
        messages=chat_messages,
        temperature=0.01,
        max_tokens=500
    )
    response_text = response.choices[0].message.content
    # Keep the assistant reply in the history so later turns have context.
    chat_messages.append({"role": "assistant", "content": response_text})
    print("assistant:",response_text)