In [None]:
# * Import dataset
#     * What datasets???
# * Data Cleaning
# * Open AI SDK
# * Kick off a fine tune
# * Can I share a pre-built fine tune that others can use?
# * Test on some live examples
# * Demonstrate adding more training data to an existing fine tune

# Do a section on Prompt Engineering too
# Compare prompt engineering vs. fine tuning on out-of-sample data, using the SMS + YouTube data

### Requirements:

* Python 3 environment
    * python3 -m venv venv
    * source ./venv/bin/activate
* pandas
    * pip3 install pandas
* python-dotenv
    * pip3 install python-dotenv
* OpenAI Account
    * Need a valid API key: https://platform.openai.com/account/api-keys
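* OpenAI Python library (SDK and CLI)
    * https://github.com/openai/openai-python
    * pip3 install "openai<1.0"  (this notebook uses the pre-1.0 interface: `openai.File`, `openai.FineTuningJob`, `openai.ChatCompletion`)
    * Configure with API Key:
        * Create .env file with `OPENAI_API_KEY=sk_XXXX_...`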

### Resources

* https://platform.openai.com/docs/guides/fine-tuning


In [16]:
import pandas as pd
import openai
import os
import json
from dotenv import load_dotenv; load_dotenv()
from src.util import distributionPreservingDownsample

In [13]:
# Give the OpenAI client the API key from the environment (load_dotenv() in the
# imports cell reads it from the .env file). A missing key still raises KeyError,
# but with a message that says how to fix it.
try:
    openai.api_key = os.environ['OPENAI_API_KEY']
except KeyError:
    raise KeyError(
        "OPENAI_API_KEY is not set - add it to a .env file or export it before running this notebook"
    ) from None

In [14]:
# Load the SMS spam CSV, keeping only the label and message-text columns.
sms_spam_all = pd.read_csv('../data/kaggle_sms_spam.csv', encoding='latin-1')[['label', 'prompt']]

# Vectorized comparison: True for rows labelled 'spam', False for every other label.
sms_spam_all['spam_flag'] = sms_spam_all['label'] == 'spam'

# Keep the first occurrence of each distinct message text.
sms_spam = sms_spam_all.drop_duplicates(subset=['prompt'])
print("Loaded sms data file with {} rows, kept {}".format(len(sms_spam_all), len(sms_spam)))
sms_spam.head()


Loaded sms data file with 5572 rows, kept 5169


Unnamed: 0,label,prompt,spam_flag
0,ham,"Go until jurong point, crazy.. Available only ...",False
1,ham,Ok lar... Joking wif u oni...,False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,ham,U dun say so early hor... U c already then say...,False
4,ham,"Nah I don't think he goes to usf, he lives aro...",False


# Downsample to smaller dataset

When first training the model, it can be helpful to work with a smaller dataset. The `distributionPreservingDownsample` helper selects a random subset of the dataframe while keeping the same spam/ham ratio as the original dataset.

In [15]:
# Draw a 500-row sample, stratified on spam_flag.
downsampled = distributionPreservingDownsample(sms_spam, 'spam_flag', 500)
print("Downsampled {} -> {}".format(len(sms_spam), len(downsampled)))

# Report the class balance before and after sampling. The values are printed
# spam first, then ham, so the label says "spam/ham" to match.
for dataset_name, frame in (("original", sms_spam), ("downsampled", downsampled)):
    spam_count = len(frame[frame['spam_flag'] == True])
    ham_count = len(frame[frame['spam_flag'] == False])
    print("Ratio of spam/ham in {} dataset: {} / {}".format(
        dataset_name, spam_count / len(frame), ham_count / len(frame)))
downsampled.head()

Downsampled 5169 -> 500
Ratio of ham/spam in original dataset: 0.12633004449603405 / 0.873669955503966
Ratio of ham/spam in downsampled dataset: 0.126 / 0.874


Unnamed: 0,label,prompt,spam_flag
1960,spam,Guess what! Somebody you know secretly fancies...,True
3228,spam,Ur cash-balance is currently 500 pounds - to m...,True
190,spam,Are you unique enough? Find out from 30th Augu...,True
374,spam,"Thanks for your Ringtone Order, Reference T91....",True
2073,spam,FreeMsg: Claim ur 250 SMS messages-Text OK to ...,True


# Data Cleaning

### Define System Prompt

In [17]:
# System message shared by every training example and by the test requests
# sent to the fine-tuned model.
systemPrompt = (
    "You are a system for categorizing SMS text messages "
    "as being unwanted spam or normal messages."
)

### Select which data set to use (downsampled or full)

In [22]:
# Choose which dataframe feeds the fine-tune. The default is the 500-row
# downsample; set the flag to True to train on the full de-duplicated dataset.
USE_FULL_DATASET = False

dfToUse = sms_spam if USE_FULL_DATASET else downsampled
jsonlFilename = "sms_spam_full" if USE_FULL_DATASET else "sms_spam_small"

# Path of the JSONL training file built from the selected dataframe.
jsonl_data_path = f"../data/temp/{jsonlFilename}.jsonl"

### Create a JSONL file containing rows formatted in OpenAI Chat Format:
https://platform.openai.com/docs/guides/fine-tuning/example-format

In [25]:
# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing.
output_dir = os.path.dirname(jsonl_data_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write one chat-format training example per line: the shared system prompt,
# the SMS text as the user turn, and the expected label as the assistant turn.
with open(jsonl_data_path, 'w', encoding='utf-8') as f:
    for _, row in dfToUse.iterrows():
        example = {
            "messages": [
                {"role": "system", "content": systemPrompt},
                {"role": "user", "content": row['prompt']},
                {"role": "assistant", "content": "spam" if row['spam_flag'] else "ham"}
            ]
        }
        f.write(json.dumps(example) + "\n")

# Do Fine Tuning

### Upload JSONL file to OpenAI

In [28]:
# Upload the JSONL training file. The context manager closes the local file
# handle once the upload call returns, instead of leaving it open.
with open(jsonl_data_path, "rb") as training_file:
    uploadedFile = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )
uploadedFile

<File file id=file-A9NBysw0N5tsSopAdpkcOF6S at 0x12bd63e30> JSON: {
  "object": "file",
  "id": "file-A9NBysw0N5tsSopAdpkcOF6S",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 149751,
  "created_at": 1698726497,
  "status": "processed",
  "status_details": null
}

### Start the Fine Tuning job

In [38]:
# Start a fine-tuning job on the uploaded training file, with gpt-3.5-turbo
# as the base model. The returned job object is displayed as the cell output.
job = openai.FineTuningJob.create(
    training_file=uploadedFile.id,
    model="gpt-3.5-turbo",
)
job


<FineTuningJob fine_tuning.job id=ftjob-3xQGCgLB44R1C5jG0hrvcIbt at 0x12bcc42f0> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-3xQGCgLB44R1C5jG0hrvcIbt",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1698726834,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-n3iT5I0sZST0QX1nSKkPHmb7",
  "result_files": [],
  "status": "validating_files",
  "validation_file": null,
  "training_file": "file-A9NBysw0N5tsSopAdpkcOF6S",
  "hyperparameters": {
    "n_epochs": "auto"
  },
  "trained_tokens": null,
  "error": null
}

### Monitor Job Status

In [52]:
# Get job details - training is ready when "fine_tuned_model" is no longer null.
# Re-run this cell to poll; each run fetches the job's current state.
status = openai.FineTuningJob.retrieve(job.id)
display(status)

if status.fine_tuned_model is not None:
    print("Training complete!")
elif status.get("status") in ("failed", "cancelled"):
    # Terminal states that never produce a model: say so rather than
    # reporting "not complete yet" indefinitely.
    print("Training did not finish (status: {}): {}".format(status.get("status"), status.get("error")))
else:
    print("Training not complete yet")

# Stays None until the job succeeds; the test cells use this as the model name.
fineTunedModelId = status.fine_tuned_model

<FineTuningJob fine_tuning.job id=ftjob-3xQGCgLB44R1C5jG0hrvcIbt at 0x12c3c6b70> JSON: {
  "object": "fine_tuning.job",
  "id": "ftjob-3xQGCgLB44R1C5jG0hrvcIbt",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1698726834,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-n3iT5I0sZST0QX1nSKkPHmb7",
  "result_files": [],
  "status": "running",
  "validation_file": null,
  "training_file": "file-A9NBysw0N5tsSopAdpkcOF6S",
  "hyperparameters": {
    "n_epochs": 3
  },
  "trained_tokens": null,
  "error": null
}

Training not complete yet


In [45]:
# Other useful commands
#openai.FineTuningJob.list(limit=10)
#openai.FineTuningJob.list_events(id=job.id, limit=10)
#openai.FineTuningJob.cancel(job.id)

# Try out the model

In [None]:
# Send a sample message to the fine-tuned model, using the same system prompt
# as the training data, and print the returned assistant message.
completion = openai.ChatCompletion.create(
    model=fineTunedModelId,
    messages=[
        dict(role="system", content=systemPrompt),
        dict(role="user", content="Hey what's happening? Want to get some ramen?"),
    ],
)
print(completion.choices[0].message)

In [None]:
# Send a second sample message to the fine-tuned model, using the same system
# prompt as the training data, and print the returned assistant message.
completion = openai.ChatCompletion.create(
    model=fineTunedModelId,
    messages=[
        dict(role="system", content=systemPrompt),
        dict(role="user", content="Hey what's happening? Do you need help recovering your lost crypto tokens?"),
    ],
)
print(completion.choices[0].message)