In [2]:
import pandas as pd
import openai
import os
import json
from src.util import distributionPreservingDownsample

In [3]:
# Load variables from a local .env file into the process environment, then
# hand the OpenAI key to the client. A missing key raises KeyError right here
# instead of failing later on the first API call.
from dotenv import load_dotenv

load_dotenv()
openai.api_key = os.environ['OPENAI_API_KEY']

### Resources

* https://platform.openai.com/docs/guides/prompt-engineering
* https://platform.openai.com/docs/api-reference/chat


# Load Data

In [4]:
# Read the SMS spam CSV (latin-1 encoded) and keep only the label and text columns.
sms_spam_all = pd.read_csv('../data/kaggle_sms_spam.csv', encoding='latin-1')[['label', 'prompt']]

# Boolean target column: True where the label is exactly 'spam', False otherwise.
sms_spam_all['spam_flag'] = sms_spam_all['label'] == 'spam'

# Keep a single row per distinct message text.
sms_spam = sms_spam_all.drop_duplicates(subset=['prompt'])

total_rows, kept_rows = len(sms_spam_all), len(sms_spam)
print("Loaded sms data file with {} rows, kept {}".format(total_rows, kept_rows))
sms_spam.head()


Loaded sms data file with 5572 rows, kept 5169


Unnamed: 0,label,prompt,spam_flag
0,ham,"Go until jurong point, crazy.. Available only ...",False
1,ham,Ok lar... Joking wif u oni...,False
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,True
3,ham,U dun say so early hor... U c already then say...,False
4,ham,"Nah I don't think he goes to usf, he lives aro...",False


# Set Up Prompt

In [5]:
systemPrompt = "You will be provided with a text message. You will need to classify the text message as spam, ham. Spam is a text message that is spam, harmful, abusive, or otherwise unwanted. Ham is a text message that is not spam."

async def isSpam(message):
    """Ask the chat model to classify a text message and report whether it is spam.

    Sends `systemPrompt` as the system message and `message` as the user
    message to gpt-3.5-turbo with temperature 0.

    Args:
        message: The text message to classify.

    Returns:
        True if the model's reply, once normalised, is the single label
        "spam"; False for any other reply (including an empty one).
    """
    response = await openai.ChatCompletion.acreate(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": message}
        ],
        temperature=0,
        max_tokens=256
    )
    # The reply is free text, and the prompt itself spells the labels in
    # lowercase, so an exact match on 'Spam' misreads answers such as "spam"
    # or "Spam." as ham. Normalise case and strip surrounding whitespace,
    # quotes and sentence punctuation before comparing.
    reply = response.choices[0].message.content or ""
    label = reply.strip(" \t\r\n.!\"'").lower()
    return label == 'spam'

### Test a couple of examples

In [6]:
# Sanity check with an ordinary conversational message; the expected answer is not spam.
await isSpam("Hey what's happening? Want to get some ramen?")

False

In [7]:
# Sanity check with an unsolicited crypto-recovery pitch; the expected answer is spam.
await isSpam("Hey what's happening? Do you need help recovering your lost crypto tokens?")

True

# Performance Testing

### Predict on a subset of the full dataset

In [23]:
# Draw a 300-row sample with the project helper, keyed on 'spam_flag'.
# NOTE(review): the helper's name and the ratios printed below suggest it keeps
# the class balance of the input; confirm against src.util.
downsampled = distributionPreservingDownsample(sms_spam, 'spam_flag', 300)
print("Downsampled {} -> {}".format(len(sms_spam), len(downsampled)))

# The first value is the share of rows with spam_flag == True and the second
# the share with spam_flag == False, so the label must read "spam/ham"
# (it previously said "ham/spam" with the values in this same order).
for dataset_name, frame in (("original", sms_spam), ("downsampled", downsampled)):
    spam_share = len(frame[frame['spam_flag'] == True]) / len(frame)
    ham_share = len(frame[frame['spam_flag'] == False]) / len(frame)
    print("Ratio of spam/ham in {} dataset: {} / {}".format(dataset_name, spam_share, ham_share))
downsampled.head()

Downsampled 5169 -> 300
Ratio of ham/spam in original dataset: 0.12633004449603405 / 0.873669955503966
Ratio of ham/spam in downsampled dataset: 0.12333333333333334 / 0.8766666666666667


Unnamed: 0,label,prompt,spam_flag
3966,spam,YOU HAVE WON! As a valued Vodafone customer ou...,True
3986,spam,Ringtone Club: Gr8 new polys direct to your mo...,True
5,spam,FreeMsg Hey there darling it's been 3 week's n...,True
3421,spam,Freemsg: 1-month unlimited free calls! Activat...,True
1624,spam,500 free text msgs. Just text ok to 80488 and ...,True


In [24]:
# Work on a copy of the sample so the prediction column never touches
# `downsampled`; every row starts with prediction None.
predictions = downsampled.assign(prediction=None)
predictions.head()

Unnamed: 0,label,prompt,spam_flag,prediction
3966,spam,YOU HAVE WON! As a valued Vodafone customer ou...,True,
3986,spam,Ringtone Club: Gr8 new polys direct to your mo...,True,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,True,
3421,spam,Freemsg: 1-month unlimited free calls! Activat...,True,
1624,spam,500 free text msgs. Just text ok to 80488 and ...,True,


In [None]:
# %pip install throttler
from throttler import Throttler

throttler = Throttler(rate_limit=60, period=60)

for i, row in predictions.iterrows():
    if (predictions.loc[i, 'prediction'] == None):
        async with throttler:
            predictions.loc[i, 'prediction'] = await isSpam(row['prompt'])
            print("Predicted {} for row {}: {}".format(predictions.loc[i, 'prediction'], i, row['prompt']))


In [5]:
# Reload predictions from '../data/temp/predictions.csv', replacing whatever
# `predictions` currently holds in memory.
# NOTE(review): this cell sits before the cell that writes the same file, so
# on a fresh top-to-bottom run it loads a file from an earlier session (or
# fails if none exists) and discards predictions computed above.
predictions = pd.read_csv('../data/temp/predictions.csv')

In [38]:
# Persist the predictions (without the index column) to
# '../data/temp/predictions.csv'. The directory is created first so the write
# does not fail on a fresh checkout. The former bare `predictions` line was
# removed: it was not the last expression, so it displayed nothing.
os.makedirs('../data/temp', exist_ok=True)
predictions.to_csv('../data/temp/predictions.csv', index=False)

In [6]:
#%pip install scikit-learn
from sklearn.metrics import confusion_matrix

# Ground truth vs. model output. In the resulting matrix, rows are the true
# class and columns the predicted class, with classes in sorted order
# (False, then True):
#   [[true ham,   ham flagged as spam],
#    [missed spam, spam caught]]
y_true = predictions['spam_flag']
y_pred = predictions['prediction']
cm = confusion_matrix(y_true, y_pred)

print(cm)

[[254   9]
 [ 12  25]]


In [8]:
# %pip install throttler
from throttler import Throttler

throttler = Throttler(rate_limit=60, period=60)

sample_sizes = [50, 100, 200]

rows = []
n = 0
for sample_size in sample_sizes:
    validation_data_path = f"../data/temp/model_{sample_size}/validation.jsonl"
    with open(validation_data_path, 'r') as f:
        for line in f.readlines():
            data = json.loads(line)
            prompt = data['messages'][1]['content']
            expected = data['messages'][2]['content']
            async with throttler:
                result = await isSpam(prompt)
                rows.append([sample_size, prompt, expected=='spam', result])

            n = n+1
            if n > 20:
                break
            else:
                print("n = {}".format(n))
    break

predictions = pd.DataFrame(rows, columns=['sample_size', 'prompt', 'expected', 'result'])
predictions
    

n = 1
n = 2
n = 3
n = 4
n = 5
n = 6
n = 7
n = 8
n = 9
n = 10
n = 11
n = 12
n = 13
n = 14
n = 15
n = 16
n = 17
n = 18
n = 19
n = 20


Unnamed: 0,sample_size,prompt,expected,result
0,50,"Romantic Paris. 2 nights, 2 flights from ï¿½79...",True,True
1,50,"URGENT! Your mobile No *********** WON a ï¿½2,...",True,True
2,50,Free 1st week entry 2 TEXTPOD 4 a chance 2 win...,True,True
3,50,Wan2 win a Meet+Greet with Westlife 4 U or a m...,True,True
4,50,You can stop further club tones by replying \S...,True,True
5,50,"Claim a 200 shopping spree, just call 08717895...",True,True
6,50,"Come to me, slave. Your doing it again ... Goi...",False,False
7,50,U meet other fren dun wan meet me ah... Muz b ...,False,False
8,50,"G says you never answer your texts, confirm/deny",False,False
9,50,"K so am I, how much for an 8th? Fifty?",False,False
