In [2]:
from pinecone import Pinecone, ServerlessSpec
import pandas as pd
import time
import os
from openai import OpenAI
import dotenv
# Load key/value pairs from a local .env file into the process environment so
# the os.getenv lookups in the configuration cell can find them.
dotenv.load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [26]:
# Connection settings are read from the environment; a variable that is not
# set yields None rather than raising.
token, open_ai_base_url, model_name, pinecone_api_key, index_name = (
    os.environ.get(variable)
    for variable in (
        "RUNPOD_TOKEN",
        "RUNPOD_EMBEDDING_URL",
        "MODEL_NAME",
        "PINECONE_API_KEY",
        "PINECONE_INDEX_NAME",
    )
)

In [27]:
# Handle to the Pinecone vector database, authenticated with the API key.
pc = Pinecone(api_key=pinecone_api_key)

# OpenAI-compatible client pointed at the embedding endpoint configured in
# RUNPOD_EMBEDDING_URL, authenticated with the RunPod token.
client = OpenAI(base_url=open_ai_base_url, api_key=token)

# Try out the embeddings

In [None]:
# Smoke-test the embedding endpoint with the configured `model_name` -- the
# same model the documents are embedded with -- so a wrong MODEL_NAME or URL
# surfaces here instead of during indexing.
output = client.embeddings.create(input=["hello world"], model=model_name)
embeddings = output.data[0].embedding

# Show the vector length and a short preview instead of every component.
print(len(embeddings), embeddings[:5])

# WRANGLE DATASET

In [3]:
# Load the product catalogue; the file holds one JSON record per line.
df = pd.read_json(path_or_buf="products/products.jsonl", lines=True)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,name,category,description,ingredients,price,rating,image_path
0,Cappuccino,Coffee,A rich and creamy cappuccino made with freshly...,"[Espresso, Steamed Milk, Milk Foam]",4.5,4.7,cappuccino.jpg
1,Jumbo Savory Scone,Bakery,"Deliciously flaky and buttery, this jumbo savo...","[Flour, Butter, Cheese, Herbs, Baking Powder, ...",3.25,4.3,SavoryScone.webp
2,Latte,Coffee,"Smooth and creamy, our latte combines rich esp...","[Espresso, Steamed Milk, Milk Foam]",4.75,4.8,Latte.jpg
3,Chocolate Chip Biscotti,Bakery,"Crunchy and delightful, this chocolate chip bi...","[Flour, Sugar, Chocolate Chips, Eggs, Almonds,...",2.5,4.6,chocolat_biscotti.jpg
4,Espresso shot,Coffee,"A bold shot of rich espresso, our espresso is ...",[Espresso],2.0,4.9,Espresso_shot.webp


In [6]:
# Flatten each product row into one descriptive string for embedding.
# Non-string columns are cast with astype(str), so the ingredient list appears
# in its Python list representation.
df['text'] = (
    df['name']
    + " : " + df['description']
    + " -- Ingredients: " + df["ingredients"].astype(str)
    + " -- Price: " + df["price"].astype(str)
    + " -- rating: " + df["rating"].astype(str)
)

In [7]:
# Inspect the generated text for the row labelled 0.
df.loc[0, 'text']

"Cappuccino : A rich and creamy cappuccino made with freshly brewed espresso, steamed milk, and a frothy milk cap. This delightful drink offers a perfect balance of bold coffee flavor and smooth milk, making it an ideal companion for relaxing mornings or lively conversations. -- Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam'] -- Price: 4.5 -- rating: 4.7"

In [8]:
# Plain Python list of the product texts; further documents are appended to it.
texts = df['text'].to_list()

In [9]:
# Read the shop's "about us" page. UTF-8 is requested explicitly so decoding
# does not depend on the platform's default locale encoding.
with open("products/Merry's_way_about_us.txt", encoding="utf-8") as f:
    Merry_way_about_section = f.read()

# Prefix a label so the document is self-describing; the text before the first
# ":" is what the upsert step uses as the vector id.
Merry_way_about_section = "Coffee shop Merry's way about section:" + Merry_way_about_section

# Re-running this cell must not add the same document twice.
if Merry_way_about_section not in texts:
    texts.append(Merry_way_about_section)

In [10]:
# Read the plain-text menu listing. UTF-8 is requested explicitly so decoding
# does not depend on the platform's default locale encoding.
with open("products/menu_items_text.txt", encoding="utf-8") as f:
    menu_items_text = f.read()

# Prefix a label so the document is self-describing; the text before the first
# ":" is what the upsert step uses as the vector id.
menu_items_text = "Menu Items: " + menu_items_text

# Re-running this cell must not add the same document twice.
if menu_items_text not in texts:
    texts.append(menu_items_text)

In [12]:
# Display every document queued for embedding: the product texts plus the
# about-us and menu documents appended above.
texts

["Cappuccino : A rich and creamy cappuccino made with freshly brewed espresso, steamed milk, and a frothy milk cap. This delightful drink offers a perfect balance of bold coffee flavor and smooth milk, making it an ideal companion for relaxing mornings or lively conversations. -- Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam'] -- Price: 4.5 -- rating: 4.7",
 "Jumbo Savory Scone : Deliciously flaky and buttery, this jumbo savory scone is filled with herbs and cheese, creating a mouthwatering experience. Perfect for a hearty snack or a light lunch, it pairs beautifully with your favorite coffee or tea. -- Ingredients: ['Flour', 'Butter', 'Cheese', 'Herbs', 'Baking Powder', 'Salt'] -- Price: 3.25 -- rating: 4.3",
 "Latte : Smooth and creamy, our latte combines rich espresso with velvety steamed milk, creating a perfect balance of flavor and texture. Enjoy it as a comforting treat any time of day, whether you're starting your morning or taking a midday break. -- Ingredients: ['Espre

# Generate Embeddings

In [10]:
# Embed all documents in a single request using the configured model.
output = client.embeddings.create(
    model=model_name,
    input=texts,
)

In [18]:
# Embedding records from the response; the upsert step pairs them with `texts`
# by position.
embeddings = output.data

# zip() would silently drop items on a length mismatch (for example when
# `output` was overwritten by a later single-query request), so fail loudly.
assert len(embeddings) == len(texts), (
    f"expected {len(texts)} embeddings, got {len(embeddings)}; "
    "re-run the embedding cell before this one"
)

# Push data to Pinecone (vector database)

You can search for the closest related text. There is no need to compute the cosine similarity by hand; the vector database does this itself.

In [None]:
# Create the serverless index only when it does not exist yet, so re-running
# the notebook does not fail on an already-existing index.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding model's vector length
        # (384 for BAAI/bge-small-en-v1.5).
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [21]:
# Poll once per second until Pinecone reports the index as ready.
while True:
    if pc.describe_index(index_name).status.ready:
        break
    time.sleep(1)

index = pc.Index(index_name)

# One record per document. The id is everything before the first ":" in the
# text, kept verbatim (so product ids retain the space before the colon), and
# the full text is stored as metadata for retrieval.
vectors = [
    {
        "id": document.split(":")[0],
        "values": item.embedding,
        "metadata": {'text': document},
    }
    for document, item in zip(texts, embeddings)
]

index.upsert(vectors=vectors, namespace='ns1')


{'upserted_count': 18}

# Get closest Documents

In [None]:
# Embed the question with the same configured model used for the documents.
output = client.embeddings.create(
    model=model_name,
    input=["Is Cappuccino lactose-free?"],
)
embedding = output.data[0].embedding
embedding

In [24]:
# Fetch the three nearest documents in namespace 'ns1'; return their stored
# metadata but omit the raw vector values.
results = index.query(
    vector=embedding,
    namespace='ns1',
    top_k=3,
    include_metadata=True,
    include_values=False,
)
results

{'matches': [{'id': 'Cappuccino ',
              'metadata': {'text': 'Cappuccino : A rich and creamy cappuccino '
                                   'made with freshly brewed espresso, steamed '
                                   'milk, and a frothy milk cap. This '
                                   'delightful drink offers a perfect balance '
                                   'of bold coffee flavor and smooth milk, '
                                   'making it an ideal companion for relaxing '
                                   'mornings or lively conversations. -- '
                                   "Ingredients: ['Espresso', 'Steamed Milk', "
                                   "'Milk Foam']-- Price: 4.5-- rating: 4.7"},
              'score': 0.734453261,
              'values': []},
             {'id': 'Sugar Free Vanilla syrup ',
              'metadata': {'text': 'Sugar Free Vanilla syrup : Enjoy the sweet '
                                   'flavor of vanilla without the 