# Part 1 - Fine Tuning

### Overview

This notebook demonstrates fine-tuning GPT-3.5 for text classification on a dataset of SMS text messages.

The following steps are covered:

* Loading and enriching the SMS dataset
* Downsampling the dataset for fine-tuning
* Training three fine-tuned models with sizes: 50, 100, 200
* Experimenting with the fine-tuned models

### Requirements

* Python 3 environment
    * python3 -m venv venv
    * Select venv kernel in VS Code
        * Upper-right corner of notebook in editor
* OpenAI Account
    * Need a valid API key: https://platform.openai.com/account/api-keys
* OpenAI Python Module
    * https://github.com/openai/openai-python
    * pip install --pre openai
    * Configure with API Key: 
        * Create .env file with `OPENAI_API_KEY=sk_XXXX_...`

In [1]:
# Install dependencies if needed
# %pip install pandas
# %pip install python-dotenv
# %pip install openai
# %pip show openai

### Resources

* https://platform.openai.com/docs/guides/fine-tuning
* https://platform.openai.com/docs/api-reference/fine-tuning


In [2]:
import pandas as pd
from openai import OpenAI
# from openai import AsyncOpenAI
import os
import json
from datetime import datetime
import time
from IPython.display import clear_output

%reload_ext autoreload
%autoreload 2
from src.util import getTrainTestSplit, makeJobsDataframe

In [3]:
# Pull OPENAI_API_KEY (and anything else in .env) into the process environment
from dotenv import load_dotenv

load_dotenv()

# Fails fast with a KeyError when the key is not configured
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Load SMS Dataset

In [4]:
# Keep only the label and message text, and add a boolean spam indicator
sms_spam_all = (
    pd.read_csv('../data/kaggle_sms_spam.csv', encoding='latin-1')
    .loc[:, ['label', 'prompt']]
    .assign(spam_flag=lambda frame: frame['label'].eq('spam'))
)

# Messages with identical text are collapsed to their first occurrence
sms_spam = sms_spam_all.drop_duplicates(subset=['prompt'])

print("Loaded sms data file with {} rows, kept {}".format(len(sms_spam_all), len(sms_spam)))
sms_spam.head()


Loaded sms data file with 5572 rows, kept 5169


Unnamed: 0,label,prompt,spam_flag
0,ham,"Go until jurong point, crazy.. Available only ...",False
1,ham,Ok lar... Joking wif u oni...,False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,ham,U dun say so early hor... U c already then say...,False
4,ham,"Nah I don't think he goes to usf, he lives aro...",False


# Set up System Prompt

In [5]:
# Shared system message: written into every training/validation example and
# sent again with each classification request, so both see the same instruction.
systemPrompt = "You are a system for categorizing SMS text messages as being unwanted spam or normal messages."

# Create downsampled datasets at various sizes

We want to see how the dataset size affects model training time, so a separate pair of training and validation files is written for each sample size.

In [18]:
sample_sizes = [50, 60, 70, 80, 90, 100]


def _writeChatJsonl(rows, path):
    """Write one chat-format fine-tuning example per row of `rows` to `path`.

    Each output line is a JSON object with a "messages" list holding the shared
    system prompt, the SMS text from the 'prompt' column as the user turn, and
    the expected answer ("spam" or "ham", from the 'spam_flag' column) as the
    assistant turn.
    """
    with open(path, 'w') as f:
        for _, row in rows.iterrows():
            f.write(json.dumps({
                "messages": [
                    {"role": "system", "content": systemPrompt},
                    {"role": "user", "content": row['prompt']},
                    {"role": "assistant", "content": "spam" if row['spam_flag'] else "ham"}
                ]
            }) + "\n")


for sample_size in sample_sizes:

    # NOTE(review): presumably the last two arguments are the train and test
    # sizes -- confirm against src.util.getTrainTestSplit.
    train_data, test_data = getTrainTestSplit(sms_spam, 'spam_flag', sample_size, 200)

    model_path = f"../data/temp/model_{sample_size}"
    os.makedirs(model_path, exist_ok=True)

    _writeChatJsonl(train_data, f"{model_path}/training.jsonl")
    # Validation must come from the held-out split; previously the training
    # rows were written a second time, so validation just repeated training.
    _writeChatJsonl(test_data, f"{model_path}/validation.jsonl")

# Do Fine Tuning

In [19]:
# Base model that every fine-tuning job in this notebook starts from
foundationModel = "gpt-3.5-turbo-1106"

def runFineTuning(training_data_path, validation_data_path):
    """Upload a training/validation file pair and submit a fine-tuning job.

    Args:
        training_data_path: Path to the JSONL file with training examples.
        validation_data_path: Path to the JSONL file with validation examples.

    Returns:
        The job object returned by `client.fine_tuning.jobs.create`.
    """
    print("Uploading training file: {}".format(training_data_path))
    # Context managers make sure the file handles are closed once the upload
    # call returns (or raises) instead of leaking for the life of the kernel.
    with open(training_data_path, "rb") as training_fh:
        training_file = client.files.create(
            file=training_fh,
            purpose='fine-tune'
        )
    print("Uploading validation file: {}".format(validation_data_path))
    with open(validation_data_path, "rb") as validation_fh:
        validation_file = client.files.create(
            file=validation_fh,
            purpose='fine-tune'
        )
    print("Submitting fine-tuning job for foundation model {}".format(foundationModel))
    job = client.fine_tuning.jobs.create(training_file=training_file.id, validation_file=validation_file.id, model=foundationModel)
    print("Submitted job {}".format(job.id))
    return job

### Submit a training job for each sample size we are testing

In [20]:
# Running this cell will start jobs with OpenAI and incur usage cost

# Only the sizes listed here are submitted. The earlier batch [50, 60, 70]
# used to be assigned on the line above and was immediately overwritten.
sizes_to_run = [80, 90]

# Job objects for everything submitted by this cell, in submission order
submitted_jobs = []
for sample_size in sizes_to_run:
    training_data_path = f"../data/temp/model_{sample_size}/training.jsonl"
    validation_data_path = f"../data/temp/model_{sample_size}/validation.jsonl"
    job = runFineTuning(training_data_path, validation_data_path)
    submitted_jobs.append(job)
    # Persist the job as a real JSON object; dumping str(job) stored a single
    # quoted repr string that cannot be parsed back into fields.
    with open(f"../data/temp/model_{sample_size}/job_start.json", 'w') as f:
        f.write(job.model_dump_json(indent=4))



Uploading training file
Uploading validation file
Submitting fine-tuning job
Submitted job ftjob-kJDKNUXtkOjJ6f1ZlZTIMwk3 for file ../data/temp/model_60/training.jsonl
Uploading training file
Uploading validation file
Submitting fine-tuning job
Submitted job ftjob-14hf6eczeaaST4V6Tniso8DU for file ../data/temp/model_70/training.jsonl


### Monitor the jobs

In [30]:
# Statuses after which a fine-tuning job will not change any more
TERMINAL_JOB_STATUSES = {"succeeded", "failed", "cancelled"}
POLL_INTERVAL_SECONDS = 10

# Refresh the job table until every listed job has finished, so that running
# the whole notebook does not hang on this cell forever. Interrupting the
# kernel also stops the monitor cleanly.
try:
    while True:
        current_jobs = client.fine_tuning.jobs.list(limit=10)
        df = makeJobsDataframe(current_jobs.data)
        clear_output(wait=True)
        print(f"Updated at {datetime.now()}")
        display(df)
        if all(job.status in TERMINAL_JOB_STATUSES for job in current_jobs.data):
            print("All listed jobs have finished; stopping monitor.")
            break
        time.sleep(POLL_INTERVAL_SECONDS)
except KeyboardInterrupt:
    print("Monitoring interrupted.")

Updated at 2023-11-09 13:21:40.532381


Unnamed: 0,ID,Training File,Status,Duration,TrainedTokens,TokensPerMinute,FT ID
0,ftjob-14hf6eczeaaST4V6Tniso8DU,file-XC0lxrDfmQg4t4gAQ1jmNVge,running,9.192172,0,0.0,
1,ftjob-kJDKNUXtkOjJ6f1ZlZTIMwk3,file-jBE3mjrdneN2u94yu9c8hhCh,running,9.242172,0,0.0,
2,ftjob-LiL1r8rE2HEpdaj4KYnIjz9U,file-JyvEGeQTceZWqs5gtcwX314T,running,33.158839,0,0.0,
3,ftjob-Y9CLf0O6axyYeHU3b3pNFrZc,file-Dgml7KFJvBrAlIDQ5LlSSbVk,succeeded,22.05,16455,746.258503,ft:gpt-3.5-turbo-0613:hypercolor::8J4dFhGo
4,ftjob-FMNe8R1qb21f4m4LywYxcdxp,file-a4ci3mqKw1CbSyNrQUxpZza7,succeeded,17.433333,8658,496.634799,ft:gpt-3.5-turbo-0613:hypercolor::8J4N4EYY
5,ftjob-Jlr2b3eW1qbSIJOjgrUbBk2y,file-B1y2Kgy61uxBA6KNOqHpvset,succeeded,103.066667,32352,313.89392,ft:gpt-3.5-turbo-1106:aa-engineering::8IAIy8LD
6,ftjob-BM8V9hMp1nHn0v2Sn9YRagSg,file-LpT2K57tcfuXsO0fmtXLFibN,succeeded,78.5,16455,209.617834,ft:gpt-3.5-turbo-1106:aa-engineering::8I9vALSP
7,ftjob-v2MTWHznXSCoi9zIhV4xeGfv,file-fxIq8wsI48lNp8khtYfj4Flo,succeeded,63.033333,8658,137.355896,ft:gpt-3.5-turbo-1106:aa-engineering::8I9g9RO0
8,ftjob-KUl5tSNid5Rq08EK9JFiySDt,file-tmOKf3KsbaOQLcIDMvx7lMbq,succeeded,16.916667,31632,1869.871921,ft:gpt-3.5-turbo-0613:aa-engineering::8HnGqN6a
9,ftjob-hjCv26zXKV13we6T1GoZSU3I,file-MTHHGTfNOed6X1YWWF1zX8Ud,succeeded,18.55,23943,1290.727763,ft:gpt-3.5-turbo-0613:aa-engineering::8HnINqwQ


In [None]:
# Handy calls for inspecting or managing jobs by hand.
# Uncomment the one you need; the ones using `job` need a job object in scope.
#client.fine_tuning.jobs.list_events(id=job.id, limit=10)
#client.fine_tuning.jobs.cancel(job.id)
#client.fine_tuning.jobs.retrieve(id='ftjob-KUl5tSNid5Rq08EK9JFiySDt')

# Show the ten most recent fine-tuning jobs as the cell output
recent_jobs = client.fine_tuning.jobs.list(limit=10)
recent_jobs

# Try the models

In [26]:
# Sample Size to Model ID
# IDs are copied from the "FT ID" column of the job monitor table.
completed_models = {
    50: 'ft:gpt-3.5-turbo-0613:hypercolor::8J4N4EYY',
    100: 'ft:gpt-3.5-turbo-0613:hypercolor::8J4dFhGo',
    # The job table lists this model under the 1106 base, not 0613
    200: 'ft:gpt-3.5-turbo-1106:aa-engineering::8IAIy8LD'
}

In [27]:
def getSpamClassification_FineTune(fineTunedModelId, prompt):
    """Classify one SMS message with a fine-tuned chat model.

    Args:
        fineTunedModelId: ID of the fine-tuned model to query.
        prompt: Text of the SMS message to classify.

    Returns:
        True when the model's reply is "spam" (ignoring case and surrounding
        whitespace), False for any other reply.
    """
    completion = client.chat.completions.create(
        model=fineTunedModelId,
        messages=[
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": prompt}
        ]
    )
    # The message content may be None, and replies can carry stray whitespace
    # or a trailing newline; normalise before comparing against the label.
    content = completion.choices[0].message.content or ""
    result = content.strip().lower() == 'spam'
    return result


In [28]:
# A message labelled spam in the dataset preview; the size-50 model should return True
spam_example = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
getSpamClassification_FineTune(completed_models[50], spam_example)

True

In [29]:
# An ordinary personal message; the size-50 model should return False
ham_example = "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."
getSpamClassification_FineTune(completed_models[50], ham_example)

False