### Imports and setup

In [37]:
import html
import os
import pickle
from pprint import pprint
from typing import Dict, List, Optional

import numpy as np
import openai
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from IPython.display import display, HTML

In [38]:
load_dotenv()
openai.api_key = os.getenv('OPENAI_KEY')

### Content

* [Data Processing](#load_data)
* [Prompts and functions](#prompts_and_functions)
* [First prompt test](#first_test)
* [Second prompt test](#second_test)
* [Full run](#full_run)

#### <a id='load_data'>Load Data</a>

In [3]:
# Load two splits of the Amazon product dataset:
#   'validation' -> pool of few-shot examples, 'test' -> products to generate feature-bullets for
fs_dataset, test_dataset = (
    load_dataset('iarbel/amazon-product-data-filter', split=split_name)
    for split_name in ('validation', 'test')
)

Downloading readme:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/236k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/716 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/204 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/103 [00:00<?, ? examples/s]

In [4]:
# Convert both splits to Pandas DFs, keeping only the columns used in this notebook
columns_to_use = ['asin', 'category', 'title', 'tech_process', 'labels']

fs_df, test_df = (split.to_pandas()[columns_to_use] for split in (fs_dataset, test_dataset))

In [5]:
# Inspect few-shot DF 

fs_df.head(3)

Unnamed: 0,asin,category,title,tech_process,labels
0,B08N7RXCC6,smartwatches,Citizen CZ Smart Grey Plated Silicone Strap St...,Brand is Citizen. Style is Citizen CZ Smart. C...,\n- SMARTWATCH POWERED BY WEAR OS BY GOOGLE IS...
1,B00Y3B7AHS,speakers,JBL Flip 4 Waterproof Portable Bluetooth Speak...,Brand is JBL. Speaker Type is Outdoor. Connect...,\n- WIRELESSLY CONNECT UP TO 2 SMARTPHONES: or...
2,B07GXJZVQW,office_and_school_supplies,Early Buy Sticky Notes 6 Bright Color 6 Pads S...,Color is 6 Pads. Size is 3 x 3 inches. Brand i...,"\n- 3 IN X: 3 in, 6 Pads / Pack, 100 Sheets / ..."


#### Embed and Index

In [6]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores.utils import DistanceStrategy

In [7]:
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_KEY')

In [8]:
# Product titles will serve as the reference for few-shot context. Therefore we embed and index titles from fs_df

documents = fs_df['title'].to_list()

In [9]:
# Inspect the data

documents[:3]

['Citizen CZ Smart Grey Plated Silicone Strap Stainless Steel Smartwatch Touchscreen, Heartrate, GPS, Speaker, Bluetooth, Notifications, iPhone and Android Compatible, Powered by Google Wear OS',
 'JBL Flip 4 Waterproof Portable Bluetooth Speaker (Black) (Renewed)',
 'Early Buy Sticky Notes 6 Bright Color 6 Pads Self-Stick Notes 3 in x 3 in, 100 Sheets/Pad']

In [13]:
# Create the DB. We embed with OpenAI and use the `MAX_INNER_PRODUCT` strategy: OpenAI embeddings are unit-normalized (verified below), so the inner product equals cosine similarity

# db = FAISS.from_texts(documents, OpenAIEmbeddings(), 
#                       distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT)

In [14]:
# Build the vector store once and reuse it afterwards, to avoid the future cost and time of embedding and indexing.
# This cell is what defines `db`: with both branches commented out, the similarity searches below
# fail with a NameError on a fresh kernel.
VECTOR_STORE_DIR = 'data/vector_stores/amazon-product-embedding'

if os.path.isdir(VECTOR_STORE_DIR):
    # Load a locally saved vector store
    db = FAISS.load_local(VECTOR_STORE_DIR, OpenAIEmbeddings(),
                          distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT)
else:
    # Embed and index the few-shot titles, then save the index and vector store for future runs
    db = FAISS.from_texts(documents, OpenAIEmbeddings(),
                          distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT)
    db.save_local(VECTOR_STORE_DIR)

In [11]:
# Next, we select a product from the test set and search other products with similar titles to it

example_title = test_df.iloc[0]['title']

pprint(example_title)

('WeMo Smart Video Doorbell - Apple HomeKit Secure Video with HDR - Smart Home '
 'Products Video Doorbell Camera - Ring Doorbell for Security Camera System - '
 'WiFi Camera Doorbell w/ 223° FOV & 2-Way Audio')


In [12]:
# Embed the example title. We could embed and search in a single call, but in this case it's better to have the actual vector

example_vector = OpenAIEmbeddings().embed_query(example_title)

In [16]:
# Save / load a local vector

# with open('data/vector_stores/example_vector.pkl', 'wb') as handle:
#     pickle.dump(example_vector, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('data/vector_stores/example_vector.pkl', 'rb') as handle:
#     example_vector = pickle.load(handle)

In [17]:
# Inspect the vector. We'll see that its length is 1536 (the OpenAI embedding dimension), and verify that it's L2-normalized.

print(f'Vector dim: {len(example_vector)}')
print(f'Vector norm: {np.linalg.norm(np.array(example_vector))}')

Vector dim: 1536
Vector norm: 1.0


In [18]:
# Retrieve the 5 indexed titles closest to the example title, and show each one with its category and score
similarity_list = db.similarity_search_with_score_by_vector(example_vector, k=5)

for doc, score in similarity_list:
    title = doc.page_content
    # Look the category up in the few-shot DF through the (first) row with a matching title
    category = fs_df.loc[fs_df['title'] == title, 'category'].iloc[0]
    pprint(f'Title: {title}\nCategory: {category}\nScore: {score:.4f}\n\n')

('Title: Brilliant Smart Home Control (3-Switch Panel) — Alexa Built-In & '
 'Compatible with Ring, Sonos, Hue, Google Nest, Wemo, SmartThings, Apple '
 'HomeKit — In-Wall Touchscreen Control for Lights, Music, & More\n'
 'Category: smart_home_products\n'
 'Score: 0.8510\n'
 '\n')
('Title: Brilliant Smart Dimmer Switch (Light Almond) — Compatible with Alexa, '
 'Google Assistant, Apple HomeKit, Hue, LIFX, SmartThings, TP-Link, Wemo and '
 'More\n'
 'Category: smart_home_products\n'
 'Score: 0.8283\n'
 '\n')
('Title: Ousmile Smart Sunrise Hatch Alarm Clock, Fast Wireless Charger '
 'Intelligent Atmosphere Lamp, RGB Night Light Bluetooth Speaker APP Control '
 'Table Desk Lamp for Bedroom Home Decer Office Gift\n'
 'Category: smart_home_products\n'
 'Score: 0.8276\n'
 '\n')
('Title: BWLLNI Lighted Vanity Mirror with Lights, Makeup Mirror with Storage '
 'Shelves, Vanity Mirror with Lights 12 Dimmable LED Bulbs, 3 Color Lighting '
 'Modes, Smart Touch Control, Detachable 10x Magnification

In [19]:
# We can see that the results are indeed satisfactory.
# Recall that the titles need not be identical; we only need products that are similar enough for their few-shot examples to be helpful.

# Another option is to use Maximal Marginal Relevance (MMR), so we get more diversity in the results

similarity_list_mmr = db.max_marginal_relevance_search_with_score_by_vector(example_vector, k=5, lambda_mult=0.5)
# Show the highest score first (a reversed sort is still stable, so ties keep their retrieval order)
similarity_list_mmr.sort(key=lambda tup: tup[1], reverse=True)

for doc, score in similarity_list_mmr:
    title = doc.page_content
    # Look the category up in the few-shot DF through the (first) row with a matching title
    category = fs_df.loc[fs_df['title'] == title, 'category'].iloc[0]
    pprint(f'Title: {title}\nCategory: {category}\nScore: {score:.4f}\n\n')

('Title: Brilliant Smart Home Control (3-Switch Panel) — Alexa Built-In & '
 'Compatible with Ring, Sonos, Hue, Google Nest, Wemo, SmartThings, Apple '
 'HomeKit — In-Wall Touchscreen Control for Lights, Music, & More\n'
 'Category: smart_home_products\n'
 'Score: 0.8510\n'
 '\n')
('Title: HDMI Video Capture Card, 4K HDMI to USB Capture Card Full HD 1080P '
 '30fps, Record via DSLR, Camcorder, Action Cam for Live Streaming, Compatible '
 'with Nintendo Switch, PS4, Xbox One, PC\n'
 'Category: computer_input_devices\n'
 'Score: 0.8048\n'
 '\n')
('Title: VAULTEK Smart Station Home Centric Biometric Smart Safe with '
 'Bluetooth 2.0 and Auto Open Drawer + Wireless Phone Charger (Titanium Gray)\n'
 'Category: smart_home_products\n'
 'Score: 0.8005\n'
 '\n')
('Title: EIGIIS Smart Watch for Women 1.7" HD Waterproof Smartwatch Compatible '
 'with iPhone Samsung Android Phones Sports Fitness Tracker Watch with Heart '
 'Rate Sleep Monitor Pedometer\n'
 'Category: smartwatches\n'
 'Score: 0.798

#### <a id='prompts_and_functions'>Prompts and functions</a>

In [26]:
# Shared settings for the helpers that prompt the OpenAI API
MAX_TOKENS = 700  # sent as `max_tokens` with every API call
# User-message template; fill in with USER_TXT.format(title=..., tech_data=...)
USER_TXT = (
    'Write feature-bullets for an Amazon product page. '
    'Title: {title}. Technical details: {tech_data}.\n\n### Feature-bullets:'
)


class Conversation:
    """
    A class to construct conversations with the ChatAPI.

    The conversation starts with a fixed system message. User and assistant messages are then
    appended in strict alternation, and the first appended message must come from the user.
    """
    def __init__(self):
        self.messages = [{'role': 'system',
                          'content': 'You are a helpful assistant. Your task is to write feature-bullets for an Amazon product page.'}]

    def add_message(self, role: str, content: str) -> None:
        """
        Append a message to the conversation.

        :param role: 'user' or 'assistant' (case-insensitive)
        :param content: the message text
        :raises ValueError: if the role is unknown, or if it does not alternate with the previous message
        """
        # Validate inputs
        role = role.lower()
        last_role = self.messages[-1]['role']
        if role not in ['user', 'assistant']:
            raise ValueError('Roles can be "user" or "assistant" only')
        if role == 'user' and last_role not in ['system', 'assistant']:
            # The very first user message follows the system message, so both predecessors are valid
            raise ValueError('"user" message can only follow "system" or "assistant" message')
        elif role == 'assistant' and last_role != 'user':
            raise ValueError('"assistant" message can only follow "user" message')

        self.messages.append({"role": role, "content": content})

    def display_conversation(self) -> None:
        """
        Render the conversation in the notebook: the system message in bold, user messages on a
        white background, and assistant messages on a light-gray background.
        """
        SEP = '\n'
        for message in self.messages:
            # Escape before adding our own markup, so that <, > and & coming from product data or
            # model output are shown literally instead of being interpreted as HTML
            content = html.escape(message['content'], quote=False)
            if message['role'] == 'system':
                display(HTML(f'<b>{content}</b>'))
            elif message['role'] == 'user':
                # Put each part of the prompt template on its own line, with a bold label
                msg_align = content.replace("Title:", "<br><b>Title:</b>")\
                    .replace("Technical details:", "<br><b>Technical details:</b>").replace("### Feature-bullets:", "<br><b>Feature-bullets:</b>")
                display(HTML(f'<p style="background-color:White; color:Black; padding:5px;">{msg_align}</p>'))
            else:
                # Drop leading newlines and space the bullets out
                msg_align = content.lstrip(SEP).replace(SEP, "<br><br>")
                display(HTML(f'<p style="background-color:LightGray; color:Black; padding:5px;">{msg_align}</p>'))

def api_call(messages: List[Dict[str, str]], temperature: float = 0.7, top_p: float = 1, n_responses: int = 1) -> dict:
    """
    A function to call the ChatAPI.

    :param messages: the conversation to send, e.g. `Conversation.messages` (a list of 'role' / 'content' dicts)
    :param temperature: controls randomness
    :param top_p: passed to the API as `top_p`
    :param n_responses: number of responses to request
    :return: a dict with the keys 'object' (always 'chat'), 'usage' (the usage stats reported by the API)
             and 'text' (a list with the content of each returned response)
    """
    params = {'model': 'gpt-3.5-turbo', 'messages': messages, 'temperature': temperature, 'max_tokens': MAX_TOKENS, 'n': n_responses, 'top_p': top_p}
    response = openai.ChatCompletion.create(**params)

    # Collect the content of every returned choice
    text = [choice['message']['content'] for choice in response['choices']]
    # Copy the usage stats into a plain dict through the public mapping interface,
    # instead of reading the private `_previous` attribute of the response object
    usage = dict(response['usage'])
    return {'object': 'chat', 'usage': usage, 'text': text}

#### <a id='first_test'>First prompt test - Zero shot</a>

In [27]:
# Use the first product of the test set as the running example
example_title, example_tech, example_labels = test_df.iloc[0][['title', 'tech_process', 'labels']]

# Zero-shot: a fresh conversation whose only user message is the prompt for the example product
conv1 = Conversation()
conv1.add_message('user', USER_TXT.format(title=example_title, tech_data=example_tech))

# Render the prompt for inspection
conv1.display_conversation()

In [28]:
# Call the API
res1 = api_call(conv1.messages, temperature=0.7)

# Print results
pprint(res1['text'][0])

('- Apple HomeKit Secure Video integration for enhanced security and privacy\n'
 '- High Dynamic Range (HDR) technology for clear and vivid video footage\n'
 '- Seamless integration with your smart home products for a complete security '
 'system\n'
 '- Wide 223° field of view (FOV) to capture a larger area of your property\n'
 '- Two-way audio communication for convenient and effective conversations '
 'with visitors\n'
 '- Wireless and wired connectivity options for flexible installation\n'
 '- HD resolution for crisp and detailed video quality\n'
 '- Night vision capability for round-the-clock monitoring\n'
 '- Advanced image sensor for accurate motion detection and alert '
 'notifications')


In [29]:
# Let's compare the output to the original feature-bullets
pprint(test_df.iloc[0]['labels'])

('\n'
 '- COMPATIBLE WITH HOMEKIT SECURE VIDEO: With Apple HomeKit this smart video '
 'doorbell camera can send notifications to your Apple iPhone, helping you see '
 'who is at your door with face recognition. NOTE: Only compatible with a '
 'wired 16-24V AC doorbell system.\n'
 '- WIFI CAMERA WITH WIDE FIELD OF VIEW: Our home security camera has a super '
 'wide FOV measuring 178° vertical x 140° horizontal x 223° diagonal, so '
 'you’ll never miss a doorbell ring or nearby activity.\n'
 '- CLEAR PICTURE IN ANY LIGHTING: This smart home doorbell camera uses '
 'infrared technology to help get crisp video, even in the dark with low-light '
 'sensitivity and an HD camera.\n'
 '- DUAL WIFI BANDS BRING PEACE OF MIND: The 2.4GHz WiFi band offers a solid, '
 'long distance connection that easily penetrates walls, while the 5GHz wifi '
 'band offers greater speed while in closer range.\n'
 '- REVIEW VIDEO WITH EASE: With your existing iCloud storage plan and our '
 'HomeKit Secure Video en

In [30]:
# API usage stats
res1['usage']

{'prompt_tokens': 132, 'completion_tokens': 117, 'total_tokens': 249}

#### <a id='second_test'>Second prompt test</a>

In [31]:
# Now, we construct a few-shot example. We will search for titles that are similar to our target product and provide these as examples for the LLM

class FewShotData:
    """
    Retrieves few-shot examples for a target product and assembles them into a `Conversation`.
    """
    def __init__(self, few_shot_df: pd.DataFrame, vector_db: FAISS):
        """
        :param few_shot_df: products that can serve as few-shot examples; the columns 'title',
                            'tech_process' and 'labels' are used
        :param vector_db: vector store that is searched for titles similar to the target title;
                          the `page_content` of its documents is matched against `few_shot_df['title']`
        """
        self.few_shot_df = few_shot_df
        self.vector_db = vector_db

    def extract_few_shot_data(self, target_title: str, k_shot: int = 2, **db_kwargs) -> pd.DataFrame:
        """
        Find the products whose titles are most relevant to `target_title` (MMR search).

        :param target_title: title of the product we want to write feature-bullets for
        :param k_shot: number of examples to retrieve
        :param db_kwargs: extra keyword arguments forwarded to the MMR search (e.g. `lambda_mult`)
        :return: the 'title', 'tech_process' and 'labels' columns of the matching rows, one row per
                 retrieved title, in the order returned by the vector store
        """
        # Find relevant products
        target_title_vector = OpenAIEmbeddings().embed_query(target_title)
        similarity_list_mmr = self.vector_db.max_marginal_relevance_search_with_score_by_vector(target_title_vector, k=k_shot, **db_kwargs)
        few_shot_titles = [hit[0].page_content for hit in similarity_list_mmr]
        # Position of each title in the search results (a repeated title keeps its first position)
        title_rank = {title: rank for rank, title in enumerate(dict.fromkeys(few_shot_titles))}

        # Extract relevant data. A plain `isin` filter returns the rows in DataFrame order, and returns
        # extra rows when a title appears more than once, so keep one row per title and restore the search order
        matches = self.few_shot_df[self.few_shot_df['title'].isin(list(title_rank))]
        matches = matches.drop_duplicates(subset='title')
        search_order = matches['title'].map(title_rank).to_numpy().argsort(kind='stable')
        few_shot_data = matches.iloc[search_order][['title', 'tech_process', 'labels']]
        return few_shot_data

    def construct_few_shot_conversation(self, target_title: str, target_tech_data: str, few_shot_data: pd.DataFrame) -> Conversation:
        """
        Build a conversation in which every few-shot example is a user prompt followed by its
        feature-bullets as the assistant reply, and the final user prompt is the target product.

        :param target_title: title of the target product
        :param target_tech_data: technical details of the target product
        :param few_shot_data: examples to include, with the columns 'title', 'tech_process' and 'labels'
        :return: the populated `Conversation`
        """
        # Structure the few-shot data
        fs_titles = few_shot_data['title'].to_list()
        fs_tech_data = few_shot_data['tech_process'].to_list()
        fs_labels = few_shot_data['labels'].to_list()

        # Init a conversation, populate with few-shot data
        conv = Conversation()
        for title, tech_data, labels in zip(fs_titles, fs_tech_data, fs_labels):
            conv.add_message('user', USER_TXT.format(title=title, tech_data=tech_data))
            conv.add_message('assistant', labels)

        # Add the final user prompt
        conv.add_message('user', USER_TXT.format(title=target_title, tech_data=target_tech_data))
        return conv
    

In [33]:
# Init a `FewShotData` instance
fs_example = FewShotData(few_shot_df=fs_df, vector_db=db)
fs_data = fs_example.extract_few_shot_data(target_title=example_title, k_shot=3)

fs_data

Unnamed: 0,title,tech_process,labels
36,Brilliant Smart Home Control (3-Switch Panel) ...,Brand is Brilliant. Actuator Type is Touch. Nu...,\n- EASY SMART HOME CONTROL FOR EVERYONE: Bril...
101,"HDMI Video Capture Card, 4K HDMI to USB Captur...","Brand is LinkBand. Hardware Interface is HDMI,...",\n- STUNNING 1080P VIDEO QUALITY: The HDMI cap...
137,"Bone Conduction Speaker, True Wireless Speaker...",Brand is heypower. Speaker Type is Surround So...,\n- TURN ANYTHING HOLLOW INTO A SPEAKER: Snugg...


In [34]:
# Construct a few-shot conversation from all but the last retrieved example (2-shot)
fs_conv = fs_example.construct_few_shot_conversation(target_title=example_title, target_tech_data=example_tech, few_shot_data=fs_data.iloc[:-1])

fs_conv.display_conversation()

In [39]:
res_2shot = api_call(fs_conv.messages, temperature=0.7)

In [40]:
pprint(res_2shot['text'][0])

('- SMART VIDEO DOORBELL: The WeMo Smart Video Doorbell is a sleek and '
 'advanced doorbell camera that enhances the security of your home. \n'
 '- APPLE HOMEKIT SECURE VIDEO: This doorbell camera is compatible with Apple '
 'HomeKit Secure Video, providing you with a secure and private way to view '
 'and manage your video footage.\n'
 '- HIGH DEFINITION RESOLUTION: Capture clear and detailed video footage with '
 'the HD resolution feature of this doorbell camera. \n'
 '- ENHANCED NIGHT VISION: The night vision feature allows you to see clearly '
 'even in low light conditions, ensuring the safety of your home day and '
 'night. \n'
 '- WIDE FIELD OF VIEW: With a 223° field of view, this doorbell camera covers '
 'a wider area, allowing you to monitor your surroundings effectively. \n'
 '- TWO-WAY AUDIO: Communicate with visitors at your door using the two-way '
 'audio feature, providing a convenient and interactive experience. \n'
 '- EASY INSTALLATION: The WeMo Smart Video Doorbe

In [41]:
res_2shot['usage']

{'prompt_tokens': 971, 'completion_tokens': 222, 'total_tokens': 1193}