In [1]:

import openai
import os
import json
import pandas as pd
import time
import datetime

In [2]:
# Pull environment variables in through python-dotenv before reading the key.
from dotenv import load_dotenv
load_dotenv()

# Indexing os.environ (rather than .get) raises KeyError when the key is missing,
# so a misconfigured environment fails here instead of at the first API call.
openai.api_key = os.environ['OPENAI_API_KEY']

In [3]:
# Fine-tuned model IDs keyed by sample size; the same key selects the matching
# "model_<sample_size>" data directory when the validation files are read.
fine_tuned_models = {
    50: "ft:gpt-3.5-turbo-1106:aa-engineering::8I9g9RO0",
    100: "ft:gpt-3.5-turbo-1106:aa-engineering::8I9vALSP",
    200: "ft:gpt-3.5-turbo-1106:aa-engineering::8IAIy8LD",
}

# Define Classification APIs

These APIs take an input message and use either a fine-tuned model or the general-purpose model to predict whether it is spam.  Each returns a boolean: True for spam.

In [4]:
# !pip install retry
from retry import retry

In [5]:
# Fine Tuned Model API

fineTunePrompt = "You are a system for categorizing SMS text messages as being unwanted spam or normal messages."

async def getSpamClassification_FineTune(fineTunedModelId, prompt, max_attempts=8, initial_delay=1.0, max_delay=10.0):
  """Classify an SMS message as spam using a fine-tuned chat model.

  Retries are implemented inline: a synchronous retry decorator around an
  ``async def`` only guards the creation of the coroutine object, so errors
  raised while awaiting the request would never be retried.

  Args:
    fineTunedModelId: Model identifier passed as ``model`` to the chat API.
    prompt: The text message to classify (sent as the user message).
    max_attempts: Total number of attempts before the last error is re-raised;
      ``None`` retries indefinitely.
    initial_delay: Seconds to wait before the first retry.
    max_delay: Upper bound in seconds for the doubled delay between retries.

  Returns:
    True if the model's reply is "spam" (ignoring case, surrounding whitespace
    and a trailing "." or "!"), otherwise False.

  Raises:
    Exception: Whatever the API call raised on the final attempt.
  """
  import asyncio  # local import so this cell does not depend on later cells

  delay = initial_delay
  attempt = 1
  while True:
    try:
      completion = await openai.ChatCompletion.acreate(
        model=fineTunedModelId,
        messages=[
          {"role": "system", "content": fineTunePrompt},
          {"role": "user", "content": prompt}
        ]
      )
      break
    except Exception:
      # Give up once the attempt budget is spent; the caller sees the real error.
      if max_attempts is not None and attempt >= max_attempts:
        raise
      await asyncio.sleep(delay)
      delay = min(delay * 2, max_delay)  # exponential backoff, capped
      attempt += 1

  answer = completion.choices[0].message.content
  # Normalise so replies such as " Spam" or "spam." still count as spam.
  result = answer.strip().lower().rstrip('.!') == 'spam'
  return result



In [6]:
# General Purpose Model API

generalModelPrompt = "You will be provided with a text message. You will need to classify the text message as spam, ham. Spam is a text message that is spam, harmful, abusive, or otherwise unwanted. Ham is a text message that is not spam."

async def getSpamClassification_GeneralModel(message, max_attempts=8, initial_delay=1.0, max_delay=10.0):
    """Classify an SMS message as spam using the general-purpose chat model.

    Retries are implemented inline: a synchronous retry decorator around an
    ``async def`` only guards the creation of the coroutine object, so errors
    raised while awaiting the request would never be retried.

    Args:
        message: The text message to classify (sent as the user message).
        max_attempts: Total number of attempts before the last error is
            re-raised; ``None`` retries indefinitely.
        initial_delay: Seconds to wait before the first retry.
        max_delay: Upper bound in seconds for the doubled delay between retries.

    Returns:
        True if the model's reply is "spam" (ignoring case, surrounding
        whitespace and a trailing "." or "!"), otherwise False.

    Raises:
        Exception: Whatever the API call raised on the final attempt.
    """
    import asyncio  # local import so this cell does not depend on later cells

    delay = initial_delay
    attempt = 1
    while True:
        try:
            response = await openai.ChatCompletion.acreate(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": generalModelPrompt},
                    {"role": "user", "content": message}
                ],
                temperature=0,
                max_tokens=256
            )
            break
        except Exception:
            # Give up once the attempt budget is spent; the caller sees the real error.
            if max_attempts is not None and attempt >= max_attempts:
                raise
            await asyncio.sleep(delay)
            delay = min(delay * 2, max_delay)  # exponential backoff, capped
            attempt += 1

    answer = response.choices[0].message.content
    # Normalise so replies such as " Spam" or "spam." still count as spam.
    return answer.strip().lower().rstrip('.!') == 'spam'

# Predict on Validation Data

Each fine-tuned model has a validation dataset in addition to its training data.  Here we predict on those datasets with each fine-tuned model and with the general-purpose model.

Predicting on the entire validation set takes some time.  OpenAI has a rate limit of 60 requests per minute.  It's also not free, so it's not something we want to have to do more than once.

To make this code robust to things like network errors, we start by creating a dataframe that contains the validation data and a blank column for the results.  The code will run predictions for each row that has an empty result.  This means that this code can be restarted in case of failure.

Additionally, to avoid having to re-run all the predictions after a kernel restart, we save the resulting dataframe to a file, from which it can optionally be reloaded.

In [7]:
# Build the work list: every validation message yields two pending rows, one
# for its fine-tuned model and one for the general-purpose model.
rows = []
for sample_size, fineTunedModelId in fine_tuned_models.items():
    validation_data_path = f"../data/temp/model_{sample_size}/validation.jsonl"
    with open(validation_data_path, 'r') as f:

        # To test this on a smaller dataset, iterate over "list(f)[:5]" to take only the first 5 lines
        for line in f:
            record = json.loads(line)
            # messages[1] is read as the text to classify, messages[2] as its label
            message_text = record['messages'][1]['content']
            is_spam = record['messages'][2]['content'] == 'spam'
            for model_name in (fineTunedModelId, 'general'):
                rows.append({
                    'model': model_name,
                    'sample_size': sample_size,
                    'prompt': message_text,
                    'expected': is_spam,
                    'predicted': None  # filled in later by the prediction loop
                })

validation_df = pd.DataFrame(rows)
print("Prepared empty validation dataframe with {} rows".format(len(validation_df)))

Prepared empty validation dataframe with 700 rows


In [8]:
# If a previous result is available, we can optionally load it here instead of re-running the validation
# validation_df = pd.read_csv('../data/temp/validation_results.csv')
# validation_df

In [11]:
# Note: Running this cell will take a while and incur API usage costs

# Use Throttler to limit the number of requests per minute
# %pip install throttler
from throttler import Throttler
throttler = Throttler(rate_limit=19, period=20)

# Use Semaphore to control the number of concurrent requests
import asyncio
semaphore = asyncio.Semaphore(10)

print("Running validation, {} items remaining".format(validation_df['predicted'].isnull().sum()))

# async def classifyRow(index, row):
#     async with semaphore:
#         async with throttler:
#             print("Predicting on validation row {} / {}".format(index+1, len(validation_df)))
#             if row['model'] == 'general':
#                 result = await getSpamClassification_GeneralModel(row['prompt'])
#             else:
#                 result = await getSpamClassification_FineTune(row['model'], row['prompt'])
#         validation_df.loc[index, 'predicted'] = result

# tasks = [classifyRow(index, row) for index, row in validation_df.iterrows()]

# # Schedule the tasks for execution
# for task in tasks:
#     asyncio.ensure_future(task)

# # Wait for all tasks to complete
# await asyncio.gather(*tasks)

# asyncio.run(asyncio.gather(*(classifyRow(index, row) for index, row in validation_df.iterrows())))        

start = time.time()
for index, row in validation_df.iterrows():
    if row['predicted'] is None:
        async with throttler:
            elapsedSeconds = time.time() - start
            print("{}  Predicting on validation row {} / {}".format(str(datetime.timedelta(seconds=elapsedSeconds)), index+1, len(validation_df)))
            if row['model'] == 'general':
                result = await getSpamClassification_GeneralModel(row['prompt'])
            else:
                result = await getSpamClassification_FineTune(row['model'], row['prompt'])
        validation_df.loc[index, 'predicted'] = result

validation_df['predicted'] = validation_df['predicted'].astype(bool)
validation_df['correct'] = validation_df['expected'] == validation_df['predicted']
validation_df.to_csv('../data/temp/validation_results.csv', index=False)
print("Saved validation results to ../data/temp/validation_results.csv")
validation_df.head()


Running validation, 581 items remaining
0:00:00.006995  Predicting on validation row 120 / 700
0:00:00.413971  Predicting on validation row 121 / 700
0:00:00.687551  Predicting on validation row 122 / 700
0:00:01.249849  Predicting on validation row 123 / 700
0:00:01.571406  Predicting on validation row 124 / 700
0:00:01.945423  Predicting on validation row 125 / 700
0:00:02.259776  Predicting on validation row 126 / 700
0:00:02.651120  Predicting on validation row 127 / 700
0:00:02.939800  Predicting on validation row 128 / 700
0:00:03.337975  Predicting on validation row 129 / 700
0:00:03.624033  Predicting on validation row 130 / 700
0:00:04.029996  Predicting on validation row 131 / 700
0:00:04.427712  Predicting on validation row 132 / 700
0:00:05.213995  Predicting on validation row 133 / 700
0:00:05.468360  Predicting on validation row 134 / 700
0:00:07.072667  Predicting on validation row 135 / 700
0:00:07.347170  Predicting on validation row 136 / 700
0:00:07.923684  Predictin

In [66]:
# Create a confusion matrix for each sample size and model, put them into a dataframe

#%pip install scikit-learn
from sklearn.metrics import confusion_matrix


def _confusionRow(sample_size, model_label, predictions):
    """Summarise one model's predictions as a confusion-matrix row.

    Args:
        sample_size: Value stored in the 'sample_size' column of the result.
        model_label: Value stored in the 'model' column ('fine-tuned' or 'general').
        predictions: DataFrame with boolean 'expected' and 'predicted' columns.

    Returns:
        [sample_size, model_label, TP, FP, FN, TN, accuracy], matching the
        column order of confusion_matrix_df. Accuracy is NaN when
        `predictions` is empty.
    """
    # Rows of the matrix are the actual class, columns the predicted class, in
    # `labels` order. With labels=[True, False] the layout is [[TP, FN], [FP, TN]],
    # so [0][1] is a false NEGATIVE and [1][0] a false POSITIVE.
    tp, fn, fp, tn = confusion_matrix(predictions['expected'], predictions['predicted'], labels=[True, False]).ravel()
    total = tp + fn + fp + tn
    accuracy = (tp + tn) / total if total else float('nan')
    return [sample_size, model_label, tp, fp, fn, tn, accuracy]


grouped = validation_df.groupby('sample_size')

rows = []
for sample_size, group in grouped:
    isGeneral = group['model'] == 'general'
    rows.append(_confusionRow(sample_size, 'fine-tuned', group[~isGeneral]))
    rows.append(_confusionRow(sample_size, 'general', group[isGeneral]))

confusion_matrix_df = pd.DataFrame(rows, columns=['sample_size', 'model', 'true_positive', 'false_positive', 'false_negative', 'true_negative', 'accuracy'])
confusion_matrix_df

Unnamed: 0,sample_size,model,true_positive,false_positive,false_negative,true_negative
0,50,fine-tuned,5,0,0,0
1,50,general,5,0,0,0
2,100,fine-tuned,5,0,0,0
3,100,general,5,0,0,0
4,200,fine-tuned,5,0,0,0
5,200,general,5,0,0,0
