In [1]:
# !pip install -q openai datasets
%load_ext dotenv
%dotenv

In [2]:
# import json
import numpy as np

from openai import AzureOpenAI
from datasets import load_dataset
from sklearn.metrics import classification_report
# from google.colab import userdata
from tqdm import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
azure_api_key = os.getenv('azure_api_key')
azure_api_endpoint = os.getenv('azure_endpoint')

In [4]:
client = AzureOpenAI(
  azure_endpoint = azure_api_endpoint,
  api_key=azure_api_key,
  api_version="2024-02-01"
)

In [5]:
model_name = 'gpt-35-turbo' # deployment name

**Examples and Gold Examples**

A set of examples and gold examples for sentiment classification of Amazon product reviews is hosted in a HuggingFace dataset. Let us load this data and take a look at the samples in this data.

In [6]:
amazon_reviews = load_dataset("vijayagrawal/spam-email-classification")

Generating examples split: 100%|██████████| 30/30 [00:00<00:00, 3206.57 examples/s]
Generating gold_examples split: 100%|██████████| 30/30 [00:00<00:00, 5481.56 examples/s]


As is evident from the above output, the data set has 32 samples as examples and 32 samples as gold examples.

In [7]:
amazon_reviews_examples_df = amazon_reviews['examples'].to_pandas()
amazon_reviews_gold_examples_df = amazon_reviews['gold_examples'].to_pandas()

In [8]:
amazon_reviews_examples_df.shape, amazon_reviews_gold_examples_df.shape

((30, 2), (30, 2))

As the above outputs indicate, there are 32 examples and 32 gold examples. We will sample from the examples to create the few shot prompt and evaluate the prompt on all 32 gold examples.

In [9]:
amazon_reviews_examples_df.sample(4)
#amazon_reviews_examples_df

Unnamed: 0,content,label
6,Hot singles in your area want to meet you! Cli...,spam
27,FREE VIAGRA delivered to your door! Best prices!,spam
24,Investment opportunity! 500% returns guaranteed!,spam
25,The kids are staying at Grandmas this weekend.,ham


In [10]:
#amazon_reviews_gold_examples_df

**Assembling the prompt**

In [12]:
system_message = """
Your task is to classify the provided email content input as either “spam” or “ham” (not spam).
Consider the following guidelines:
1.	If the email body contains phrases typically associated with spam (e.g., offers for quick money, urgent requests, suspicious links), label it as “spam”.
2.	If the email body appears to be from a trusted source and does not contain any suspicious content, label it as “ham” (not spam).
"""

In [13]:
few_shot_prompt = [{'role':'system', 'content': system_message}]

We need to iterate over the rows of the examples DataFrame to append these examples as `user` and `assistant` messages to the few-shot prompt. We achieve this using the `iterrows` method.

In [14]:
for index, row in amazon_reviews_examples_df.iterrows():
    print('Example content:')
    print(row[0])
    print('Example Label:')
    print(row[1])
    break

Example content:
CONGRATULATIONS! You have won a free iPhone 13! Click here to claim your prize now!
Example Label:
spam


  print(row[0])
  print(row[1])


Notice that the label is an integer. However, LLMs accept only strings. So we need to convert the integer label to a string label as we assemble the few-shot prompt. Let us assemble a few-shot prompt with 4 examples.

In [15]:
for index, row in amazon_reviews_examples_df.sample(4).iterrows():
    example_review = row[0]
    example_label = row[1]

    few_shot_prompt.append(
        {
            'role': 'user',
            'content': example_review
        }
    )

    few_shot_prompt.append(
        {
            'role': 'assistant',
            'content': str(example_label) # LLMs accept only string inputs
        }
    )

  example_review = row[0]
  example_label = row[1]


In [16]:
few_shot_prompt

[{'role': 'system',
  'content': '\nYour task is to classify the provided email content input as either “spam” or “ham” (not spam).\nConsider the following guidelines:\n1.\tIf the email body contains phrases typically associated with spam (e.g., offers for quick money, urgent requests, suspicious links), label it as “spam”.\n2.\tIf the email body appears to be from a trusted source and does not contain any suspicious content, label it as “ham” (not spam).\n'},
 {'role': 'user',
  'content': 'FREE VIAGRA delivered to your door! Best prices!'},
 {'role': 'assistant', 'content': 'spam'},
 {'role': 'user',
  'content': 'Make $5000 weekly working from home! No experience needed!'},
 {'role': 'assistant', 'content': 'spam'},
 {'role': 'user',
  'content': 'CONGRATULATIONS! You have won a free iPhone 13! Click here to claim your prize now!'},
 {'role': 'assistant', 'content': 'spam'},
 {'role': 'user',
  'content': '100% Natural weight loss pill! Lose 30 pounds in 30 days!'},
 {'role': 'assis

We now have 4 examples in the few shot prompt that is ready for use. Before we deploy this prompt, we need to get an estimate of the performance of this prompt. Here is where we use gold examples to estimate the accuracy.

## Evaluation

In [17]:
predictions, ground_truths = [], []

In [18]:
for index, row in tqdm(amazon_reviews_gold_examples_df.iterrows()):
    gold_review = row[0]
    gold_label = row[1]

    user_input = [{'role':'user', 'content': gold_review}]

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=few_shot_prompt + user_input,
            temperature=0
        )

        predictions.append(response.choices[0].message.content) # convert the string label back to int
        ground_truths.append(gold_label)
    except Exception as e:
        print(e) # Log error and continue
        continue

  gold_review = row[0]
  gold_label = row[1]
30it [00:18,  1.59it/s]


In [19]:
predictions = np.array(predictions)
ground_truths = np.array(ground_truths)
(predictions == ground_truths).mean()

np.float64(0.9666666666666667)

The output above indicates that the accuracy of the few-shot prompt on gold examples. More fine-grained evaluation (e.g., F1 score) could also be used to establish the estimated accuracy of the prompt.

In [20]:
print(classification_report(ground_truths, predictions))

              precision    recall  f1-score   support

         ham       0.94      1.00      0.97        15
        spam       1.00      0.93      0.97        15

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



>More examples does not imply better accuracy. Increasing the number of examples in the few-shot prompt beyond 16 is not known to yield better performance.