In [1]:
# !pip install pandas
# !pip install pinecone
# !pip unstall openai
# !pip install python-dotenv

In [1]:
from pinecone import Pinecone, ServerlessSpec
import os
from openai import OpenAI
import pandas as pd
from time import time
import dotenv
dotenv.load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [None]:
MODEL_NAME= os.getenv("MODEL_NAME") 
MODEL_API_KEY: str = os.getenv("MODEL_API_KEY")
MODEL_BASE_URL: str = os.getenv("MODEL_BASE_URL")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") 
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_INDEX_NAME

'coffeeshop'

In [3]:
pc = Pinecone(api_key=PINECONE_API_KEY)

client = OpenAI( 
    api_key=MODEL_API_KEY, 
    base_url=MODEL_BASE_URL
)

## Try out embeddings

In [4]:
output = client.embeddings.create(input = ["helloo there"],model=MODEL_NAME)
embeddings = output.data[0].embedding
print(embeddings)

[-0.016199292615056038, 0.012460206635296345, 0.0027233045548200607, -0.006484383717179298, -0.0023211718071252108, -0.002991417422890663, -0.011296304874122143, -0.0034219848457723856, -0.009488842450082302, 0.01113991066813469, 0.0052106911316514015, -0.014134650118649006, -0.009222684428095818, -0.010968104936182499, -0.01876971498131752, -0.0003280042146798223, 0.0017325228545814753, 0.019111178815364838, 0.007432062644511461, -0.004237538203597069, -0.008229111321270466, 0.00010997745994245633, -3.650747021310963e-05, 0.016325488686561584, -0.010122595354914665, -0.0004662011924665421, -0.006419879850000143, -0.012511479668319225, -0.012334615923464298, -0.0405137836933136, 0.0007523968815803528, -0.006116293836385012, -0.00032283898326568305, -0.0042363558895885944, -0.008268295787274837, -0.003396575106307864, 0.004570931661874056, 0.010331609286367893, -0.004563603550195694, -0.010918628424406052, -0.004062174819409847, -0.0002865112037397921, -0.011089582927525043, -0.00246431

In [5]:
len(embeddings)

2048

## Wrangle dataset

In [18]:
df=pd.read_json('products/products.jsonl',lines=True)

In [19]:
df.head(2)

Unnamed: 0,name,category,description,ingredients,price,rating,image_path
0,Cappuccino,Coffee,A rich and creamy cappuccino made with freshly...,"[Espresso, Steamed Milk, Milk Foam]",4.5,4.7,cappuccino_01.jpg
1,Jumbo Savory Scone,Bakery,"Deliciously flaky and buttery, this jumbo savo...","[Flour, Butter, Cheese, Herbs, Baking Powder, ...",3.25,4.3,savory_scone_01.jpg


In [20]:
df['text'] =  df['name']+" : "+df['description'] + \
                " -- Ingredients: " + df['ingredients'].astype(str) + \
                " -- Price: " + df['price'].astype(str) + \
                " -- rating: " + df['rating'].astype(str) 

In [21]:
df['text'].head()

0    Cappuccino : A rich and creamy cappuccino made...
1    Jumbo Savory Scone : Deliciously flaky and but...
2    Latte : Smooth and creamy, our latte combines ...
3    Chocolate Chip Biscotti : Crunchy and delightf...
4    Espresso shot : A bold shot of rich espresso, ...
Name: text, dtype: object

In [22]:
texts = df['text'].tolist()

In [24]:
with open('products/milos_about_us.txt') as f:
    milos_about_section = f.read()
    
milos_about_section = "Coffee shop Milos about section: " + milos_about_section
texts.append(milos_about_section)

In [25]:
with open('products/menu_items_text.txt') as f:
    menue_items_text = f.read()
    
menue_items_text = "Menu Items: " + menue_items_text
texts.append(menue_items_text)

## Generate Embeddings

In [26]:
output = client.embeddings.create(input = texts,model=MODEL_NAME)

In [27]:
embeddings = output.data

## Push data to database

In [17]:
pc.create_index(
    name=PINECONE_INDEX_NAME,
    dimension=2048, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [28]:
# Wait for the index to be ready
while not pc.describe_index(PINECONE_INDEX_NAME).status['ready']:
    time.sleep(1)

index = pc.Index(PINECONE_INDEX_NAME)

vectors = []
for text, e in zip(texts, embeddings):
    entry_id = text.split(":")[0].strip()
    vectors.append({
        "id": entry_id,
        "values": e.embedding,
        "metadata": {'text': text}
    })
    
index.upsert(
    vectors=vectors,
    namespace="ns1"
)

{'upserted_count': 21}

## Get Closest documents

In [29]:
output = client.embeddings.create(input = ["Is Cappuccino lactose-free?"],model=MODEL_NAME)
embeding = output.data[0].embedding

In [30]:
results = index.query(
    namespace="ns1",
    vector=embeding,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [{'id': 'Menu Items',
              'metadata': {'text': 'Menu Items: Menu Items\n'
                                   '\n'
                                   'Cappuccino - $4.50\n'
                                   'Jumbo Savory Scone - $3.25\n'
                                   'Latte - $4.75\n'
                                   'Chocolate Chip Biscotti - $2.50\n'
                                   'Espresso shot - $2.00\n'
                                   'Hazelnut Biscotti - $2.75\n'
                                   'Chocolate Croissant - $3.75\n'
                                   'Dark chocolate (Drinking Chocolate) - '
                                   '$5.00\n'
                                   'Cranberry Scone - $3.50\n'
                                   'Croissant - $3.25\n'
                                   'Almond Croissant - $4.00\n'
                                   'Ginger Biscotti - $2.50\n'
                                   'Oatmeal Scone - $3.