## Setup

To complete the following guide you will need to install the following packages:

- openai
- pandas
- requests
- python-dotenv

You will also need:

- OpenAI account (https://platform.openai.com/)
- OpenAI API key

In [1]:
#! pipenv install openai pandas requests python-dotenv

You should consider upgrading via the '/Users/scottkramer/.pyenv/versions/3.9.9/bin/python3.9 -m pip install --upgrade pip' command.


In [1]:
import json
import os

from openai import OpenAI
import pandas as pd

In [2]:
from dotenv import load_dotenv
import os

# Pull the settings stored in the local .env file into the process environment
load_dotenv()

# Look up the OpenAI credential in the environment and build the API client
# that the rest of the notebook uses.
api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

# Create the test and train data

In [3]:
import random

# Directory holding the FAQ source text and the generated train/test splits.
# It defaults to the original workshop location; set PK_DATA_DIR in the
# environment to run the notebook against a different checkout without
# editing the paths below.
PK_DATA_DIR = os.environ.get(
    'PK_DATA_DIR',
    '/Users/acrobat/Documents/GitHub/fine-tuning-workshop/poppykids/pk_data',
)

input_file = os.path.join(PK_DATA_DIR, 'clean_faq_dataset.txt')  # "Question:" / "Answer:" text
train_file = os.path.join(PK_DATA_DIR, 'train.tsv')
test_file = os.path.join(PK_DATA_DIR, 'test.tsv')

def read_qa_pairs(file_path):
    """Parse a text file of 'Question:' / 'Answer:' lines into (question, answer) tuples.

    Every line is stripped before it is inspected. A 'Question:' line starts a
    new pair, first storing the previous pair if it is complete. An 'Answer:'
    line sets the answer of the pair in progress, so a later 'Answer:' line
    replaces an earlier one. Lines with neither prefix are ignored, and a pair
    is kept only when both its question and its answer are non-empty.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of (question, answer) string tuples in file order.
    """
    question_tag = 'Question:'
    answer_tag = 'Answer:'

    pairs = []
    pending_question = None
    pending_answer = None

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if text.startswith(answer_tag):
                pending_answer = text[len(answer_tag):].strip()
                continue
            if not text.startswith(question_tag):
                continue

            # A new question begins: store the previous pair if it was complete.
            if pending_question and pending_answer:
                pairs.append((pending_question, pending_answer))
            pending_question = text[len(question_tag):].strip()
            pending_answer = None

    # The last pair has no following 'Question:' line to trigger its storage.
    if pending_question and pending_answer:
        pairs.append((pending_question, pending_answer))
    return pairs

def write_tsv(file_path, data):
    """Write (question, answer) pairs to a tab-separated file with a header row.

    Rows are written with the csv module rather than by string formatting, so
    a question or answer that contains a tab or a double quote is quoted on
    output and comes back intact when the file is parsed with
    ``csv.reader(..., delimiter='\\t')``. For text without those characters the
    output is the same ``question<TAB>answer<LF>`` lines as before.

    Args:
        file_path: Destination path; the file is created or overwritten as UTF-8.
        data: Iterable of (question, answer) pairs.
    """
    import csv  # imported locally so the function does not depend on another cell's imports

    # newline='' stops Python from translating the '\n' line terminator set below.
    with open(file_path, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, delimiter='\t', lineterminator='\n')
        writer.writerow(['question', 'answer'])  # Header
        for question, answer in data:
            writer.writerow([question, answer])
# Read Q&A pairs
qa_pairs = read_qa_pairs(input_file)

# Shuffle the data with a fixed seed so that re-running the notebook produces
# the same train/test split. An unseeded shuffle moves pairs between the two
# files on every run, which makes results impossible to reproduce.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(qa_pairs)

# Calculate split index: the first 80% of the shuffled pairs go to training
TRAIN_FRACTION = 0.8
split_index = int(len(qa_pairs) * TRAIN_FRACTION)

# Split the data
train_data = qa_pairs[:split_index]
test_data = qa_pairs[split_index:]

# Write train and test data
write_tsv(train_file, train_data)
write_tsv(test_file, test_data)

print(f"Train data ({len(train_data)} pairs) written to {train_file}")
print(f"Test data ({len(test_data)} pairs) written to {test_file}")

### Fine-tuning training dataset curation


In [5]:
import json
import csv

# Data directory: defaults to the original workshop location and can be
# overridden with the PK_DATA_DIR environment variable, so the notebook runs
# on other machines without editing these paths.
PK_DATA_DIR = os.environ.get(
    'PK_DATA_DIR',
    '/Users/acrobat/Documents/GitHub/fine-tuning-workshop/poppykids/pk_data',
)

input_file = os.path.join(PK_DATA_DIR, 'train.tsv')  # TSV with a question/answer header row
output_file = os.path.join(PK_DATA_DIR, 'pk_faq_training_data.jsonl')  # chat-format JSONL

def process_qa_pair(question, answer):
    """Wrap one question/answer pair in a chat-format record.

    Args:
        question: Text placed in the user message.
        answer: Text placed in the assistant message.

    Returns:
        A dict with a single "messages" key whose list holds the system,
        user and assistant messages, in that order.
    """
    system_message = {
        "role": "system",
        "content": "You are Poppy, a helpful assistant for Poppy Kids Pediatric Dentistry.",
    }
    user_message = {"role": "user", "content": question}
    assistant_message = {"role": "assistant", "content": answer}
    return {"messages": [system_message, user_message, assistant_message]}

# Read TSV and write JSONL (one chat-format JSON object per line)
skipped_rows = 0

# newline='' on the input lets the csv module do its own newline handling,
# as the csv documentation requires for files passed to csv.reader.
with open(input_file, 'r', encoding='utf-8', newline='') as infile, \
        open(output_file, 'w', encoding='utf-8') as outfile:
    reader = csv.reader(infile, delimiter='\t')
    next(reader, None)  # Skip header row (the default avoids StopIteration on an empty file)

    for row in reader:
        if not row:
            continue  # blank line
        if len(row) != 2:
            # Malformed row: count it so the loss is reported instead of silent.
            skipped_rows += 1
            continue
        question, answer = row
        formatted_data = process_qa_pair(question, answer)
        json.dump(formatted_data, outfile, ensure_ascii=False)
        outfile.write('\n')

if skipped_rows:
    print(f"Warning: skipped {skipped_rows} row(s) that did not have exactly 2 columns")
print(f"Formatted data has been written to {output_file}")

Formatted data has been written to /Users/acrobat/Documents/GitHub/fine-tuning-workshop/poppykids/pk_data/pk_faq_training_data.jsonl


# Test dataset

In [None]:
import json
import csv

# Data directory: defaults to the original workshop location and can be
# overridden with the PK_DATA_DIR environment variable, so the notebook runs
# on other machines without editing these paths.
PK_DATA_DIR = os.environ.get(
    'PK_DATA_DIR',
    '/Users/acrobat/Documents/GitHub/fine-tuning-workshop/poppykids/pk_data',
)

input_file = os.path.join(PK_DATA_DIR, 'test.tsv')  # TSV with a question/answer header row
output_file = os.path.join(PK_DATA_DIR, 'pk_faq_test_data.jsonl')  # chat-format JSONL

def process_qa_pair(question, answer):
    """Wrap one question/answer pair in a chat-format record.

    Args:
        question: Text placed in the user message.
        answer: Text placed in the assistant message.

    Returns:
        A dict with a single "messages" key whose list holds the system,
        user and assistant messages, in that order.
    """
    system_message = {
        "role": "system",
        "content": "You are Poppy, a helpful assistant for Poppy Kids Pediatric Dentistry.",
    }
    user_message = {"role": "user", "content": question}
    assistant_message = {"role": "assistant", "content": answer}
    return {"messages": [system_message, user_message, assistant_message]}

# Read TSV and write JSONL (one chat-format JSON object per line)
skipped_rows = 0

# newline='' on the input lets the csv module do its own newline handling,
# as the csv documentation requires for files passed to csv.reader.
with open(input_file, 'r', encoding='utf-8', newline='') as infile, \
        open(output_file, 'w', encoding='utf-8') as outfile:
    reader = csv.reader(infile, delimiter='\t')
    next(reader, None)  # Skip header row (the default avoids StopIteration on an empty file)

    for row in reader:
        if not row:
            continue  # blank line
        if len(row) != 2:
            # Malformed row: count it so the loss is reported instead of silent.
            skipped_rows += 1
            continue
        question, answer = row
        formatted_data = process_qa_pair(question, answer)
        json.dump(formatted_data, outfile, ensure_ascii=False)
        outfile.write('\n')

if skipped_rows:
    print(f"Warning: skipped {skipped_rows} row(s) that did not have exactly 2 columns")
print(f"Formatted data has been written to {output_file}")

# Fine-tuning starts here

In [6]:
# Uploads training data to OpenAI
# Define the path for the training data file. The directory defaults to the
# original workshop location and can be overridden with PK_DATA_DIR.
dataset_file_name = os.path.join(
    os.environ.get(
        'PK_DATA_DIR',
        '/Users/acrobat/Documents/GitHub/fine-tuning-workshop/poppykids/pk_data',
    ),
    'pk_faq_training_data.jsonl',
)

# Read the existing JSONL file. Parsing every line here makes a malformed
# record fail locally, before anything is uploaded. The encoding is explicit
# because the file holds non-ASCII text written as UTF-8.
with open(dataset_file_name, 'r', encoding='utf-8') as f:
    training_json = [json.loads(line) for line in f]

# Upload the file to OpenAI; the context manager closes the file handle once
# the request has completed.
with open(dataset_file_name, 'rb') as training_fh:
    file_upload = client.files.create(
        file=training_fh,
        purpose="fine-tune"
    )

print(f"File uploaded successfully. File ID: {file_upload.id}")

File uploaded successfully. File ID: file-RW2yeYnEXOdcNWavMaHYY0b2


### Fine-Tuning

We will now fine-tune models using the OpenAI API. OpenAI supports creating fine-tuning jobs either via the fine-tuning UI or programmatically. The number of epochs, learning rate, and batch size can all be optimized manually for your use case. In this exercise, we will use the default parameters.

See https://platform.openai.com/docs/guides/fine-tuning/create-a-fine-tuned-model for more details

In [7]:
# Creates a training job with the default hyperparameters
fine_tuning_job = client.fine_tuning.jobs.create(
    # Use the ID returned by the upload above rather than a pasted literal,
    # so a re-run never trains on a stale file.
    training_file=file_upload.id,
    model='gpt-4o-mini-2024-07-18',
)

# Display the job; its `id` identifies it when checking status later
fine_tuning_job

FineTuningJob(id='ftjob-Hg6Gfoq45O3M4EeNdq93rvHM', created_at=1726617261, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-txV60G5rl5rp8sFMUYSZwoAv', result_files=[], seed=1798807987, status='validating_files', trained_tokens=None, training_file='file-RW2yeYnEXOdcNWavMaHYY0b2', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

# List current jobs and check job status

In [9]:
# Reference snippets for managing fine-tuning jobs; only the status check is active.

# List 10 fine-tuning jobs
#client.fine_tuning.jobs.list(limit=10)

# Retrieve the state of a fine-tune and show it
job_state = client.fine_tuning.jobs.retrieve("ftjob-Hg6Gfoq45O3M4EeNdq93rvHM")
print(job_state)

# Cancel a job
#client.fine_tuning.jobs.cancel("ftjob-abc123")

# List up to 10 events from a fine-tuning job
#client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-abc123", limit=10)

# Delete a fine-tuned model (must be an owner of the org the model was created in)
#client.models.delete("ft:gpt-3.5-turbo:acemeco:suffix:abc123")

FineTuningJob(id='ftjob-Hg6Gfoq45O3M4EeNdq93rvHM', created_at=1726617261, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=1.8), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-txV60G5rl5rp8sFMUYSZwoAv', result_files=[], seed=1798807987, status='running', trained_tokens=None, training_file='file-RW2yeYnEXOdcNWavMaHYY0b2', validation_file=None, estimated_finish=1726620027, integrations=[], user_provided_suffix=None)


In [14]:
# Query the fine-tuned model. The system message is the same one used in every
# training example, so the model is prompted the way it was trained; a generic
# system message does not match the training data.
completion = client.chat.completions.create(
    model="ft:gpt-4o-mini-2024-07-18:acrobat::A8d1zaGt",
    messages=[
        {"role": "system", "content": "You are Poppy, a helpful assistant for Poppy Kids Pediatric Dentistry."},
        {"role": "user", "content": "Can you provide me with instructions on how to brush my child's teeth?"},
    ],
)
print(completion.choices[0].message)

ChatCompletionMessage(content="To brush your child's teeth, use a small, soft-bristled toothbrush with a pea-sized amount of fluoride toothpaste. Brush gently in circular motions for two minutes, covering all surfaces of the teeth. Use a child-friendly method to ensure thorough and pleasant brushing.", refusal=None, role='assistant', function_call=None, tool_calls=None)


### Evaluate Results — not really possible by inference alone; a semantic-similarity check or human evaluation would be needed

We will now deploy our models and evaluate the results. We will calculate the accuracy on two different models.

- The base gpt-4o-mini model without any fine-tuning.
- Our fine-tuned model.

In the example below, you'll see that fine-tuning improved accuracy on our test set from about 68% to about 96%!

See https://platform.openai.com/docs/guides/fine-tuning/use-a-fine-tuned-model for more details

Note: the cells below compute classification accuracy on support tickets, which does not make sense for this FAQ question-answering use case.

In [10]:
# Uses an LLM to predict class labels for a list of support tickets
def classify_tickets(tickets, model):
    """Ask a chat model for a label for each ticket and collect the answers.

    Args:
        tickets: Iterable of tickets. Each one is passed to ``create_prompt``,
            which is defined outside this cell (not visible here), to build
            the user message.
        model: Model name passed to the chat completions API.

    Returns:
        A list with one string per ticket, in input order: the text that
        follows the last "<category>" in the model's reply, stripped of
        surrounding whitespace.
    """
    predicted_labels = []

    for ticket in tickets:
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": create_prompt(ticket)}],
            temperature=0,  # keep responses as deterministic as possible
            stop=["</category>"],  # generation stops at the closing tag
            max_tokens=2048,
        )

        reply_text = completion.choices[0].message.content
        # Presumably the prompt asks for the label inside <category> tags; confirm against create_prompt.
        predicted_labels.append(reply_text.split("<category>")[-1].strip())

    return predicted_labels


# Calculates the percent of predictions we classified correctly
def evaluate_accuracy(predicted, actual):
    """Return the percentage of positions where ``predicted`` equals ``actual``.

    Args:
        predicted: Sequence of predicted labels.
        actual: Sequence of true labels, aligned with ``predicted``.

    Returns:
        The accuracy as a percentage rounded to 2 decimal places, or 0.0 when
        both sequences are empty.

    Raises:
        ValueError: If the two sequences differ in length. Scoring misaligned
            sequences would raise IndexError or silently ignore the extra
            predictions.
    """
    if len(predicted) != len(actual):
        raise ValueError(
            f"predicted has {len(predicted)} items but actual has {len(actual)}"
        )
    if len(actual) == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    num_correct = sum(guess == truth for guess, truth in zip(predicted, actual))
    return round(100 * num_correct / len(actual), 2)

In [11]:
# Baseline: measure how the model without any fine-tuning performs
model_id = 'gpt-4o-mini'

# Training split
training_responses = classify_tickets(model=model_id, tickets=training_tickets)
accuracy = evaluate_accuracy(training_responses, training_labels)
print(f"Training Set Accuracy: {accuracy}%")

# Test split (`accuracy` is reused, so it ends up holding the test score)
test_responses = classify_tickets(model=model_id, tickets=test_tickets)
accuracy = evaluate_accuracy(test_responses, test_labels)
print(f"Test Set Accuracy: {accuracy}%")

Training Set Accuracy: 70.59%
Test Set Accuracy: 67.65%


In [14]:
# Determine how the fine-tuned model performs on the same training and test sets
#model_id = 'ft:gpt-4o-mini-2024-07-18:brainiac-labs::A1b3dY1n' # REPLACE THIS WITH THE OUTPUT MODEL ID IN THE OPENAI FINE-TUNING DASHBOARD
load_dotenv()
model_id = os.getenv('MODEL_ID_ONE')
if not model_id:
    # Fail here with a clear message instead of sending model=None to the API.
    raise RuntimeError(
        "MODEL_ID_ONE is not set; add the fine-tuned model ID from the OpenAI "
        "fine-tuning dashboard to your environment or .env file"
    )

training_responses = classify_tickets(
    tickets=training_tickets,
    model=model_id
)
accuracy = evaluate_accuracy(training_responses, training_labels)
print(f"Training Set Accuracy: {accuracy}%")

test_responses = classify_tickets(
    tickets=test_tickets,
    model=model_id
)

accuracy = evaluate_accuracy(test_responses, test_labels)
print(f"Test Set Accuracy: {accuracy}%")

Training Set Accuracy: 100.0%
Test Set Accuracy: 95.59%
