In [2]:
%reload_ext autoreload
%autoreload 2
from src.util import distributionPreservingDownsample

import pandas as pd
import openai
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Read the OpenAI key from the process environment (a KeyError here means it is not set)
# and hand it to the openai client. `apikey` is kept as a notebook-level name because
# later cells reference it.
openai.api_key = apikey = os.environ['OPENAI_API_KEY']

# Load and preprocess data

In [4]:
# Load the raw comments and keep only the columns this notebook uses.
allCommentsDf = pd.read_json('data/comments.json')[['id', 'deleted', 'display_name', 'text', 'spam_flag']]

# Keep comments shorter than 2000 characters and drop rows whose text repeats.
# The trailing .copy() makes df an independent frame, so adding the 'spam' column below
# does not write into a filtered slice of allCommentsDf (pandas' SettingWithCopyWarning case)
# and re-running this cell always starts from the same, untouched allCommentsDf.
df = (
    allCommentsDf[allCommentsDf['text'].str.len() < 2000]
    .drop_duplicates(subset=['text'])
    .copy()
)

# String label later used as the completion: 'ham' only when spam_flag equals False,
# 'spam' for every other value.
df['spam'] = ['ham' if x == False else 'spam' for x in df['spam_flag']]

print("Loaded data file with {} rows, kept {}".format(len(allCommentsDf), len(df)))
df.head()

Loaded data file with 8210 rows, kept 6462


Unnamed: 0,id,deleted,display_name,text,spam_flag,spam
0,8741,False,Deb Musta Ginkel,I have very many fond memories growing up wit...,False,ham
1,8740,True,ROSE,I NEED AN URGENT LOVE SPELL CASTER TO BRING BA...,True,spam
3,8738,False,Joyce Ann Mosimann,Dear Janet and Bruce. Gary and I are sending ...,False,ham
4,8737,False,Ed,Where is my friends buried I went by our lady ...,False,ham
5,8736,True,Ramone,I had the opportunity to meet and sit withe ma...,True,spam


# Downsample to smaller dataset

When first training the model, it can be helpful to work with a smaller dataset. `distributionPreservingDownsample` selects a random subset of the dataframe that keeps the same spam/ham ratio as the full dataset.

In [5]:
# Draw a 500-row sample via the project helper, keyed on the 'spam_flag' column.
downsampled = distributionPreservingDownsample(df, 'spam_flag', 500)

fullSize = len(df)
sampleSize = len(downsampled)
print("Downsampled {} -> {}".format(fullSize, sampleSize))

# Share of spam_flag == True and spam_flag == False rows in the full frame ...
fullTrueShare = len(df[df['spam_flag'] == True]) / fullSize
fullFalseShare = len(df[df['spam_flag'] == False]) / fullSize
print("Ratio of True / False in original dataset: {} / {}".format(fullTrueShare, fullFalseShare))

# ... and in the sample, so the two lines can be compared by eye.
sampleTrueShare = len(downsampled[downsampled['spam_flag'] == True]) / sampleSize
sampleFalseShare = len(downsampled[downsampled['spam_flag'] == False]) / sampleSize
print("Ratio of True / False in downsampled dataset: {} / {}".format(sampleTrueShare, sampleFalseShare))


Downsampled 6462 -> 500
Ratio of True / False in original dataset: 0.14035902197462086 / 0.8596409780253791
Ratio of True / False in downsampled dataset: 0.14 / 0.86


# Build prompt->completion data set

In [6]:
# Choose which frame gets exported. Swap in the commented pair to export the full dataset.
# dfToUse = df
# jsonlFilename = "comments_full"
dfToUse = downsampled
jsonlFilename = "comments_small"

# Two-column prompt/completion frame: comment text -> prompt, 'ham'/'spam' label -> completion.
# rename() without inplace returns a new frame, so dfToUse itself is left untouched.
train = dfToUse[['text', 'spam']].rename(
    columns={'text': 'prompt', 'spam': 'completion'}
)

# One JSON object per line (JSONL).
train.to_json(f"data/{jsonlFilename}.jsonl", orient='records', lines=True)
train.head()

Unnamed: 0,prompt,completion
3098,Save Your Marriage from divorce today. Get You...,spam
2251,GREAT LOVE SPELL CASTER DR PETER THAT HELP ME ...,spam
2633,"Hello friend, I recommend this great powerful ...",spam
2501,\nDon't by any chance trust these online inves...,spam
910,CRYPTOCURRENCY RECOVERY/ BTC RECOVERY\n\nI had...,spam


# Use OpenAI CLI data cleaner to prepare train/validation datasets

In [11]:
# Run the OpenAI CLI data-preparation tool on the JSONL file exported above.
# Later cells read data/{jsonlFilename}_prepared_train.jsonl and
# data/{jsonlFilename}_prepared_valid.jsonl, which are presumably written by this step — confirm.
# NOTE(review): -q presumably auto-accepts the tool's suggested fixes — confirm against the CLI help.
!openai tools fine_tunes.prepare_data -f data/{jsonlFilename}.jsonl -q

Analyzing...

- Your file contains 500 prompt-completion pairs
- Based on your data it seems like you're trying to fine-tune a model for classification
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for training
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for 

# Kick off a Fine-Tune training request
Buckle up, this will take a while.  I'm currently seeing it take 2-4 hours for OpenAI to start my training jobs.

In [7]:
model = "ada"

trainFile = f"data/{jsonlFilename}_prepared_train.jsonl"
validateFile = f"data/{jsonlFilename}_prepared_valid.jsonl"

command = f"OPENAI_API_KEY={apikey} openai api fine_tunes.create -t {trainFile} -v {validateFile} -m {model} --compute_classification_metrics --classification_positive_class \" ham\""
!{command}

In [8]:
# Fine-tune job id, copied by hand from the output of the fine_tunes.create command.
# Replace this value after each new training run; the follow command below interpolates it.
ftId = 'ft-1AjDPOyOMzWc6ve1htacVUsg'

In [9]:
command = f"OPENAI_API_KEY={apikey} openai api fine_tunes.follow -i {ftId}"
!{command}

[2023-07-16 23:46:22] Created fine-tune: ft-1AjDPOyOMzWc6ve1htacVUsg

Stream interrupted (client disconnected).
To resume the stream, run:

  openai api fine_tunes.follow -i ft-1AjDPOyOMzWc6ve1htacVUsg



In [None]:
# List all created fine-tunes
command = f"OPENAI_API_KEY={apikey} openai api fine_tunes.list"
!{command}

# Try out the trained model
Spot-check a couple of out-of-sample comments.

In [15]:
def isSpam(comment):
    """Ask the fine-tuned model whether ``comment`` is spam.

    The first 1500 characters of ``comment`` are sent, followed by the prompt
    separator, and exactly one completion token is requested at temperature 0.
    The model id is read from the FINE_TUNE_MODEL_ID environment variable on
    every call.

    Returns:
        True when the completion text is exactly " spam", False when it is
        exactly " ham". Any other completion is reported with a printed
        message and treated as not-spam (False).
    """
    # Completion text -> verdict; the leading space is part of the expected label.
    verdictByLabel = {" spam": True, " ham": False}

    response = openai.Completion.create(
        model=os.environ['FINE_TUNE_MODEL_ID'],
        prompt=comment[:1500] + "\n\n###\n\n",
        max_tokens=1,
        temperature=0,
        logprobs=2)
    label = response.choices[0].text

    if label in verdictByLabel:
        return verdictByLabel[label]

    # Best-effort fallback: surface the odd output but keep the notebook running.
    print("Error, unexpected model output: {}".format(label))
    return False

In [16]:
# Two sample comments for the spot check: the variable names give the label each one
# is expected to receive from isSpam (ham -> False, spam -> True).
hamPrompt = "I am so Happy we were able to share a few more memories at Grandma and Grandma’s house last summer. You are all in my heart. Thank you cousin Connie for the LOVE"
spamPrompt = "Hello everyone my names are ALEX JACKSON from the UK, I want to use this golden medium to appreciate Doctor Abdul a great spell caster for helping me retrieving back my relationship with my ex lover when he ended and turned back on me for quite a long time now (6 months ago). He performed a spell for me and within 48 hours after the spell had been cast I received a text from my ex saying that he is sorry for the pains and tears that he had caused me and that he will not do such a thing to me again in his life. I was surprised but later accepted him back again. Anyone that is in the same line of problem or different one that wants to contact a spell caster should happily contact Doctor Abdul now on this email address.doctorabdulspellcaster@gmail.com or message him through his Whatsapp +2348108728256"

In [18]:
# Classify each sample and report the verdict right away, one sample at a time,
# so any message printed by isSpam appears next to the sample that caused it.
hamVerdict = isSpam(hamPrompt)
print("hamPrompt is spam: {}".format(hamVerdict))

spamVerdict = isSpam(spamPrompt)
print("spamPrompt is spam: {}".format(spamVerdict))

hamPrompt is spam: False
spamPrompt is spam: True
