# Part 1 - Fine Tuning

### Overview

This notebook demonstrates fine-tuning GPT 3.5 for text classification on a dataset of SMS text messages.

The following steps are covered:

* Loading and enriching SMS dataset
* Downsampling the dataset for fine tuning
* Training three fine-tuned models with sizes: 50, 100, 200
* Experimenting with the fine-tuned models

### Requirements

* Python 3 environment
    * python3 -m venv venv
    * Select venv kernel in VS Code
        * Upper-right corner of notebook in editor
* OpenAI Account
    * Need a valid API key: https://platform.openai.com/account/api-keys
* OpenAI Python Module
    * https://github.com/openai/openai-python
    * pip install --pre openai
    * Configure with API Key: 
        * Create .env file with `OPENAI_API_KEY=sk_XXXX_...`

In [1]:
# Install dependencies if needed
# %pip install pandas
# %pip install python-dotenv
# %pip install openai
# %pip show openai

### Resources

* https://platform.openai.com/docs/guides/fine-tuning
* https://platform.openai.com/docs/api-reference/fine-tuning


In [56]:
import pandas as pd
from openai import OpenAI
# from openai import AsyncOpenAI
import os
import json
from datetime import datetime
import time
from IPython.display import clear_output

%reload_ext autoreload
%autoreload 2
from src.util import getTrainTestSplit, makeJobsDataframe, distributionPreservingDownsample

In [57]:
from dotenv import load_dotenv; load_dotenv()
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Load SMS Dataset

In [58]:
sms_spam_all = pd.read_csv('../data/kaggle_sms_spam.csv', encoding='latin-1')[['label', 'prompt']]
sms_spam_all['spam_flag'] = sms_spam_all['label'].apply(lambda x: True if x == 'spam' else False)
sms_spam = sms_spam_all.drop_duplicates(subset=['prompt'])
print("Loaded sms data file with {} rows, kept {}".format(len(sms_spam_all), len(sms_spam)))
sms_spam.head()


Loaded sms data file with 5572 rows, kept 5169


Unnamed: 0,label,prompt,spam_flag
0,ham,"Go until jurong point, crazy.. Available only ...",False
1,ham,Ok lar... Joking wif u oni...,False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,ham,U dun say so early hor... U c already then say...,False
4,ham,"Nah I don't think he goes to usf, he lives aro...",False


# Set up System Prompt

In [59]:
systemPrompt = "You are a system for categorizing SMS text messages as being unwanted spam or normal messages."

# Create downsampled datasets at various sizes

We want to see how the dataset size affects model training time

In [67]:
sample_sizes = [10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 200]
test_data_size = 200

all_train_data, test_data = getTrainTestSplit(sms_spam, 'spam_flag', max(sample_sizes), test_data_size)

validation_data_path = "../data/temp"
os.makedirs(validation_data_path, exist_ok=True)
with open(f"{validation_data_path}/validation.jsonl", 'w') as f:
    for index, row in test_data.iterrows():
        f.write(json.dumps({
            "messages": [
                {"role": "system", "content": systemPrompt},
                {"role": "user", "content": row['prompt']},
                {"role": "assistant", "content": "spam" if row['spam_flag'] else "ham"}
            ]
        }) + "\n")

for sample_size in sample_sizes:
    
    model_path = f"../data/temp/model_{sample_size}"
    os.makedirs(model_path, exist_ok=True)

    train_data = distributionPreservingDownsample(all_train_data, 'spam_flag', sample_size)

    with open(f"{model_path}/training.jsonl", 'w') as f:
        for index, row in train_data.iterrows():
            f.write(json.dumps({
                "messages": [
                    {"role": "system", "content": systemPrompt},
                    {"role": "user", "content": row['prompt']},
                    {"role": "assistant", "content": "spam" if row['spam_flag'] else "ham"}
                ]
            }) + "\n")



# Do Fine Tuning

In [62]:
foundationModel = "gpt-3.5-turbo-1106"

def runFineTuning(training_data_path, validation_data_path):
    print("Uploading training file: {}".format(training_data_path))
    training_file = client.files.create(
        file=open(training_data_path, "rb"),
        purpose='fine-tune'
    )
    print("Uploading validation file: {}".format(validation_data_path))
    validation_file = client.files.create(
        file=open(validation_data_path, "rb"),
        purpose='fine-tune'
    )
    print("Submitting fine-tuning job for foundation model {}".format(foundationModel))
    job = client.fine_tuning.jobs.create(training_file=training_file.id, validation_file=validation_file.id, model=foundationModel)
    print("Submitted job {}".format(job.id))
    return job

### Submit a training job for each sample size we are testing

In [68]:
# Running this cell will start jobs with OpenAI and incur usage cost

# sizes_to_run = [10, 15, 20]
# sizes_to_run = [25, 30, 35]
sizes_to_run = [40, 45, 50]
# sizes_to_run = [60, 70, 80]
# sizes_to_run = [90, 100, 200]


submitted_jobs = []
for sample_size in sizes_to_run:
    training_data_path = f"../data/temp/model_{sample_size}/training.jsonl"
    validation_data_path = f"../data/temp/validation.jsonl"
    job = runFineTuning(training_data_path, validation_data_path)
    with open(f"../data/temp/model_{sample_size}/job_start.json", 'w') as f:
        json.dump(job.__str__(), f, indent=4)



Uploading training file: ../data/temp/model_35/training.jsonl
Uploading validation file: ../data/temp/validation.jsonl
Submitting fine-tuning job for foundation model gpt-3.5-turbo-1106
Submitted job ftjob-Dp5kaFNVOJikGve79K5M9KVH


In [99]:
sizes_to_run = [10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80 ,90, 100, 200]
# sizes_to_run = [50, 60, 70]
all_job_data = []
for sample_size in sample_sizes:
    all_job_data.append({
        'sample_size': sample_size,
        'job': None
    })

scheduled_jobs = []
while len(scheduled_jobs) < len(all_job_data):

    clear_output(wait=True)

    current_jobs = client.fine_tuning.jobs.list(limit=10)
    df = makeJobsDataframe(current_jobs.data)
    display(df)

    num_active_jobs = len([j for j in current_jobs.data if j.status not in ["succeeded", "cancelled", "error"]])
    print("Active jobs: {}".format(num_active_jobs))
    if num_active_jobs < 3:

        print("Ready to submit new job, currently active: {}".format(num_active_jobs))
        for job_data in all_job_data:
            if job_data['job'] is None:
                model_path = f"../data/temp/model_{job_data['sample_size']}"
                training_data_path = f"{model_path}/training.jsonl"
                validation_data_path = f"../data/temp/validation.jsonl"
                job = runFineTuning(training_data_path, validation_data_path)

                print("json path: {}".format(f"{model_path}/job_start.json"))

                with open(f"{model_path}/job_start.json", 'w') as f:
                    json.dump(job.__str__(), f, indent=4)
                job_data['job'] = job.id
                scheduled_jobs.append(job)
                break
    else:
        print("Waiting for jobs to finish, {} remain".format(len(all_job_data) - len(scheduled_jobs)))
        print(f"Updated at {datetime.now()}")
        time.sleep(20)



Updated at 2023-11-09 21:17:32.785522


Unnamed: 0,ID,Training File,Status,Duration,TrainedTokens,TokensPerMinute,FT ID
0,ftjob-yvzU8bBhvblvjvnh7qBmvcXb,file-RH8yR4RaQ8a1iTp6dbsb4lpE,running,5.751037,0,0.0,
1,ftjob-Rc8n7qsLcoRIqwlfX4eVjnnk,file-N09I06xISGTANtsWcTkFbvSo,running,9.851037,0,0.0,
2,ftjob-CqneHDnaVERvElsAVcdYaTGL,file-RfgJ0qWNehDhKMVe81OtZMkA,succeeded,23.033333,13308,577.771346,ft:gpt-3.5-turbo-1106:hypercolor::8JCJ48nL
3,ftjob-f6FiM7pHeGKccwBwziWd1c7w,file-yzfNU9aY8KHNgcUiUWz9qQdQ,succeeded,20.216667,11676,577.543281,ft:gpt-3.5-turbo-1106:hypercolor::8JCDiUJu
4,ftjob-TyDA4tqEd6WPYB2DrvsRqk1a,file-CTC5xzpESQ0vNuUr73BHaFeA,succeeded,19.25,9954,517.090909,ft:gpt-3.5-turbo-1106:hypercolor::8JC9mQ5T
5,ftjob-rzc89kHag5HSWCQ1j5LFxlkG,file-OrEFBDBCeM84QxKmKdCKOZO4,succeeded,16.4,8532,520.243902,ft:gpt-3.5-turbo-1106:hypercolor::8JBwWNER
6,ftjob-rDXDYFQzCUQZMI9v5NOETuju,file-kqozmupBqqo4vkLlDrqYvRMV,succeeded,14.25,7800,547.368421,ft:gpt-3.5-turbo-1106:hypercolor::8JBtje7h
7,ftjob-dEj5ggwOc2Qf4yfGNiTcDkIL,file-WR06movHzKQ8bBuWSFzTHqD9,succeeded,23.416667,6927,295.814947,ft:gpt-3.5-turbo-1106:hypercolor::8JBqnboO
8,ftjob-lRMXQrXBZYnfFaGjbEaBCSl9,file-uoYnFeUz5smrJJAMDnuWf2Ny,succeeded,12.333333,6066,491.837838,ft:gpt-3.5-turbo-1106:hypercolor::8JBfhbX6
9,ftjob-IK1WleV7VSxmGwQz2RDyL6Ps,file-vNV41RPJU5lzteI2bMZR2QUY,succeeded,22.116667,5247,237.241899,ft:gpt-3.5-turbo-1106:hypercolor::8JBgIVeX


Active jobs: 2
Ready to submit new job, currently active: 2
Uploading training file: ../data/temp/model_200/training.jsonl
Uploading validation file: ../data/temp/validation.jsonl
Submitting fine-tuning job for foundation model gpt-3.5-turbo-1106
Submitted job ftjob-79ZCSpZIG2Si87yDBJyOWpEk
json path: ../data/temp/model_200/job_start.json


### Monitor the jobs

In [116]:
# Running this cell will continuously monitor the status of the jobs and display the results
# While this monitoring is running you will not be able to run other cells in this notebook
# Cancel the cell to stop monitoring

while True:
    current_jobs = client.fine_tuning.jobs.list(limit=10)
    df = makeJobsDataframe(current_jobs.data)
    clear_output(wait=True)
    print(f"Updated at {datetime.now()}")
    display(df)
    time.sleep(10)

Updated at 2023-11-09 22:19:36.245923


Unnamed: 0,ID,Training File,Status,Duration,TrainedTokens,TokensPerMinute,FT ID
0,ftjob-79ZCSpZIG2Si87yDBJyOWpEk,file-LefbTqmZpYmr1DK8qSpRdLqD,succeeded,58.45,32352,553.498717,ft:gpt-3.5-turbo-1106:hypercolor::8JDE3R1k
1,ftjob-yvzU8bBhvblvjvnh7qBmvcXb,file-RH8yR4RaQ8a1iTp6dbsb4lpE,succeeded,28.016667,16446,587.007733,ft:gpt-3.5-turbo-1106:hypercolor::8JCf0Fge
2,ftjob-Rc8n7qsLcoRIqwlfX4eVjnnk,file-N09I06xISGTANtsWcTkFbvSo,succeeded,25.583333,15060,588.664495,ft:gpt-3.5-turbo-1106:hypercolor::8JCYhUyu
3,ftjob-CqneHDnaVERvElsAVcdYaTGL,file-RfgJ0qWNehDhKMVe81OtZMkA,succeeded,23.033333,13308,577.771346,ft:gpt-3.5-turbo-1106:hypercolor::8JCJ48nL
4,ftjob-f6FiM7pHeGKccwBwziWd1c7w,file-yzfNU9aY8KHNgcUiUWz9qQdQ,succeeded,20.216667,11676,577.543281,ft:gpt-3.5-turbo-1106:hypercolor::8JCDiUJu
5,ftjob-TyDA4tqEd6WPYB2DrvsRqk1a,file-CTC5xzpESQ0vNuUr73BHaFeA,succeeded,19.25,9954,517.090909,ft:gpt-3.5-turbo-1106:hypercolor::8JC9mQ5T
6,ftjob-rzc89kHag5HSWCQ1j5LFxlkG,file-OrEFBDBCeM84QxKmKdCKOZO4,succeeded,16.4,8532,520.243902,ft:gpt-3.5-turbo-1106:hypercolor::8JBwWNER
7,ftjob-rDXDYFQzCUQZMI9v5NOETuju,file-kqozmupBqqo4vkLlDrqYvRMV,succeeded,14.25,7800,547.368421,ft:gpt-3.5-turbo-1106:hypercolor::8JBtje7h
8,ftjob-dEj5ggwOc2Qf4yfGNiTcDkIL,file-WR06movHzKQ8bBuWSFzTHqD9,succeeded,23.416667,6927,295.814947,ft:gpt-3.5-turbo-1106:hypercolor::8JBqnboO
9,ftjob-lRMXQrXBZYnfFaGjbEaBCSl9,file-uoYnFeUz5smrJJAMDnuWf2Ny,succeeded,12.333333,6066,491.837838,ft:gpt-3.5-turbo-1106:hypercolor::8JBfhbX6


KeyboardInterrupt: 

In [105]:
# Other useful commands
# client.fine_tuning.jobs.list(limit=10)
#client.fine_tuning.jobs.list_events(id=job.id, limit=10)
# client.fine_tuning.jobs.cancel('ftjob-eFMitAHD9fqWwYrrADQrNjKL')
client.fine_tuning.jobs.retrieve('ftjob-CqneHDnaVERvElsAVcdYaTGL')

FineTuningJob(id='ftjob-CqneHDnaVERvElsAVcdYaTGL', created_at=1699584848, error=None, fine_tuned_model='ft:gpt-3.5-turbo-1106:hypercolor::8JCJ48nL', finished_at=1699586230, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-1106', object='fine_tuning.job', organization_id='org-n3iT5I0sZST0QX1nSKkPHmb7', result_files=['file-0UqqpkrOLdDGWfl35yI2BatS'], status='succeeded', trained_tokens=13308, training_file='file-RfgJ0qWNehDhKMVe81OtZMkA', validation_file='file-YUY9bd7zDbcXXcpa8FurH6DF')

# Try the models

In [117]:
def buildFineTuneModelIdMap(job_data_list):
    fine_tune_model_id_map = {}
    for job_data in job_data_list:
        job = client.fine_tuning.jobs.retrieve(job_data['job'])
        # if job.status == 'succeeded':
        fine_tune_model_id_map[job_data['sample_size']] = job.fine_tuned_model

    return fine_tune_model_id_map

fine_tuned_models = buildFineTuneModelIdMap(all_job_data)
fine_tuned_models

{10: 'ft:gpt-3.5-turbo-1106:hypercolor::8JBJIU02',
 15: 'ft:gpt-3.5-turbo-1106:hypercolor::8JBKfVCh',
 20: 'ft:gpt-3.5-turbo-1106:hypercolor::8JBTXhIj',
 25: 'ft:gpt-3.5-turbo-1106:hypercolor::8JBTuMaq',
 30: 'ft:gpt-3.5-turbo-1106:hypercolor::8JBgIVeX',
 35: 'ft:gpt-3.5-turbo-1106:hypercolor::8JBfhbX6',
 40: 'ft:gpt-3.5-turbo-1106:hypercolor::8JBqnboO',
 45: 'ft:gpt-3.5-turbo-1106:hypercolor::8JBtje7h',
 50: 'ft:gpt-3.5-turbo-1106:hypercolor::8JBwWNER',
 60: 'ft:gpt-3.5-turbo-1106:hypercolor::8JC9mQ5T',
 70: 'ft:gpt-3.5-turbo-1106:hypercolor::8JCDiUJu',
 80: 'ft:gpt-3.5-turbo-1106:hypercolor::8JCJ48nL',
 90: 'ft:gpt-3.5-turbo-1106:hypercolor::8JCYhUyu',
 100: 'ft:gpt-3.5-turbo-1106:hypercolor::8JCf0Fge',
 200: 'ft:gpt-3.5-turbo-1106:hypercolor::8JDE3R1k'}

In [110]:
def getSpamClassification_FineTune(fineTunedModelId, prompt):
  completion = client.chat.completions.create(
    model=fineTunedModelId,
    messages=[
      {"role": "system", "content": systemPrompt},
      {"role": "user", "content": prompt}
    ]
  )
  result = completion.choices[0].message.content.lower() == 'spam'
  # print(prompt, "=>", result)
  return result


In [111]:
getSpamClassification_FineTune(fine_tuned_models[50], "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's")

True

In [112]:
getSpamClassification_FineTune(fine_tuned_models[50], "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")

False