### Resources

* https://platform.openai.com/docs/guides/fine-tuning


In [61]:
import pandas as pd
import openai
import os
import time
import json
from IPython.display import clear_output
from datetime import datetime


%reload_ext autoreload
%autoreload 2
from src.util import distributionPreservingDownsample, prettyPrintFineTuneJob, tersePrintFineTuneJob, tersePrintFineTuneJobHeader, makeJobsDataframe

In [37]:
from dotenv import load_dotenv; load_dotenv()
openai.api_key = os.environ['OPENAI_API_KEY']

In [38]:
sms_spam_all = pd.read_csv('../data/kaggle_sms_spam.csv', encoding='latin-1')[['label', 'prompt']]

# Create spam_flag column, a boolean indicating spam or not
sms_spam_all['spam_flag'] = sms_spam_all['label'].apply(lambda x: True if x == 'spam' else False)

# Some datasets may have duplicate prompts, we want to remove these
sms_spam = sms_spam_all.drop_duplicates(subset=['prompt'])

print("Loaded sms data file with {} rows, kept {}".format(len(sms_spam_all), len(sms_spam)))
sms_spam.head()


Loaded sms data file with 5572 rows, kept 5169


Unnamed: 0,label,prompt,spam_flag
0,ham,"Go until jurong point, crazy.. Available only ...",False
1,ham,Ok lar... Joking wif u oni...,False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,ham,U dun say so early hor... U c already then say...,False
4,ham,"Nah I don't think he goes to usf, he lives aro...",False


# Set up System Prompt

In [39]:
systemPrompt = "You are a system for categorizing SMS text messages as being unwanted spam or normal messages."

# Create downsampled datasets at various sizes

We want to see how the dataset size affects model training time

In [63]:
# sample_sizes = [25,50,75,100,125,150,175,200,225,250,275,300,325,350,375,400]
sample_sizes = [100,150,200]

# map sample sizes to distributionPreservingDownsample(sms_spam, 'spam_flag', x)
# display(sms_spam)
downsampled_datasets = {x: distributionPreservingDownsample(sms_spam, 'spam_flag', x) for x in sample_sizes}
# downsampled_datasets
jsonl_files = []
for sample_size in downsampled_datasets:
    jsonl_data_path = f"../data/temp/downsampled_{sample_size}.jsonl"
    jsonl_files.append(jsonl_data_path)
    with open(jsonl_data_path, 'w') as f:
        for index, row in downsampled_datasets[sample_size].iterrows():
            f.write(json.dumps({
                "messages": [
                    {"role": "system", "content": systemPrompt},
                    {"role": "user", "content": row['prompt']},
                    {"role": "assistant", "content": "spam" if row['spam_flag'] else "ham"}
                ]
            }) + "\n")

# Do Fine Tuning

In [64]:
def runFineTuning(jsonl_data_path):
    uploadedFile = openai.File.create(
        file=open(jsonl_data_path, "rb"),
        purpose='fine-tune'
    )
    job = openai.FineTuningJob.create(training_file=uploadedFile.id, model="gpt-3.5-turbo")
    print("Submitted job {} for file {}".format(job.id, jsonl_data_path))
    return job


In [65]:
submitted_jobs = []
for jsonl_file in jsonl_files:
    submitted_jobs.append({
        "jsonl_file": jsonl_file,
        "job": runFineTuning(jsonl_file)
    })


Submitted job ftjob-tB505C0UG0oE8A0tO8DiWVsv for file ../data/temp/downsampled_100.jsonl
Submitted job ftjob-hjCv26zXKV13we6T1GoZSU3I for file ../data/temp/downsampled_150.jsonl
Submitted job ftjob-KUl5tSNid5Rq08EK9JFiySDt for file ../data/temp/downsampled_200.jsonl


In [67]:

while True:
    current_jobs = openai.FineTuningJob.list(limit=10)
    df = makeJobsDataframe(current_jobs.data)
    clear_output(wait=True)
    print(f"Updated at {datetime.now()}")
    display(df)
    time.sleep(10)

Updated at 2023-11-06 07:32:44.657746


Unnamed: 0,ID,Training File,Status,Duration,TrainedTokens,TokensPerMinute
0,ftjob-KUl5tSNid5Rq08EK9JFiySDt,file-tmOKf3KsbaOQLcIDMvx7lMbq,succeeded,16.916667,31632,1869.871921
1,ftjob-hjCv26zXKV13we6T1GoZSU3I,file-MTHHGTfNOed6X1YWWF1zX8Ud,succeeded,18.55,23943,1290.727763
2,ftjob-tB505C0UG0oE8A0tO8DiWVsv,file-PW2XKUivTmO7VWBQ8OY7ZCPa,succeeded,13.966667,16536,1183.961814
3,ftjob-GWOixlqvNI2QqJza3U8dXwdw,file-MO8Iqtp7D7mU4YkaqNfuYVmg,succeeded,9.816667,12156,1238.302207
4,ftjob-bnNcTibuWGgZ8vmzpr3PEuk9,file-zbeYBgkkQDZzvMnZY4IPc76K,succeeded,6.133333,8118,1323.586957
5,ftjob-r3GPkAgJzHT04XrwvJlhQAEO,file-dP2rnjZABe7xg0GjddnL5yzI,succeeded,6.2,5744,926.451613
6,ftjob-3xQGCgLB44R1C5jG0hrvcIbt,file-A9NBysw0N5tsSopAdpkcOF6S,succeeded,49.483333,81474,1646.493769
7,ftjob-Y5a4mgDfDooMfuXv4nI8g28E,file-A9NBysw0N5tsSopAdpkcOF6S,cancelled,9183.544276,0,0.0


KeyboardInterrupt: 

### Monitor Job Status

In [None]:
# Get job details - training is ready when "fine_tuned_model" is no longer null
# jobId = job.id
jobId = "ftjob-3xQGCgLB44R1C5jG0hrvcIbt"
status = openai.FineTuningJob.retrieve(jobId)
display(status)

if status.fine_tuned_model is None:
    print("Training not complete yet")
else:
    print("Training complete!")

fineTunedModelId = status.fine_tuned_model

In [27]:
# Other useful commands
jobs = openai.FineTuningJob.list(limit=10)
#openai.FineTuningJob.list_events(id=job.id, limit=10)
#openai.FineTuningJob.cancel(job.id)

jobs
# prettyPrintFineTuneJob(jobs.data[0])

<OpenAIObject list at 0x12fa14230> JSON: {
  "object": "list",
  "data": [
    {
      "object": "fine_tuning.job",
      "id": "ftjob-3xQGCgLB44R1C5jG0hrvcIbt",
      "model": "gpt-3.5-turbo-0613",
      "created_at": 1698726834,
      "finished_at": 1698729803,
      "fine_tuned_model": "ft:gpt-3.5-turbo-0613:aa-engineering::8FbVkEom",
      "organization_id": "org-n3iT5I0sZST0QX1nSKkPHmb7",
      "result_files": [
        "file-EnSXF8M6qBUkOhQVdCW3CvfZ"
      ],
      "status": "succeeded",
      "validation_file": null,
      "training_file": "file-A9NBysw0N5tsSopAdpkcOF6S",
      "hyperparameters": {
        "n_epochs": 3
      },
      "trained_tokens": 81474,
      "error": null
    },
    {
      "object": "fine_tuning.job",
      "id": "ftjob-Y5a4mgDfDooMfuXv4nI8g28E",
      "model": "gpt-3.5-turbo-0613",
      "created_at": 1698726552,
      "finished_at": null,
      "fine_tuned_model": null,
      "organization_id": "org-n3iT5I0sZST0QX1nSKkPHmb7",
      "result_files": [],
