In [1]:
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import pandas as pd
import time
import os
import dotenv
dotenv.load_dotenv()

True

In [2]:
token = os.getenv("RUNPOD_TOKEN")
open_ai_base_url = os.getenv("RUNPOD_EMBEDDING_URL")
model_name = os.getenv("MODEL_NAME")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
index_name = os.getenv("PINECONE_INDEX_NAME")

In [3]:
pc = Pinecone(api_key=pinecone_api_key)
client = OpenAI(
    api_key=token,
    base_url=open_ai_base_url
)

# Try out embeddings

In [4]:
output = client.embeddings.create(input=["Hello world"], model=model_name)
embedding = output.data[0].embedding
print(embedding)

[0.015215358696877956, -0.02272770367562771, 0.008572462946176529, -0.07437602430582047, 0.003935400862246752, 0.0027780423406511545, -0.03130016475915909, 0.0446622259914875, 0.04399107024073601, -0.007783094421029091, -0.02524453029036522, -0.033374641090631485, 0.014376416802406311, 0.046340107917785645, 0.00868686381727457, -0.0160466730594635, 0.007504718378186226, -0.019005851820111275, -0.11470626294612885, -0.01813640259206295, 0.1262989193201065, 0.029729057103395462, 0.025229275226593018, -0.0341678224503994, -0.04109290614724159, 0.006604762282222509, 0.010349494405090809, 0.02239212580025196, 0.004431139677762985, -0.12776325643062592, -0.016061928123235703, -0.020348157733678818, 0.04737734794616699, 0.011585026979446411, 0.06827463209629059, 0.007413197308778763, -0.018044881522655487, 0.040970880538225174, -0.010196959599852562, 0.02370392717421055, 0.010410508140921593, -0.02846301719546318, 0.008091977797448635, -0.015253492631018162, 0.03090357594192028, -0.0659561008

In [5]:
len(embedding)

384

# Wrangle Dataset

In [6]:
df = pd.read_json("products/products.jsonl", lines=True)

In [7]:
df.head(2)

Unnamed: 0,name,category,description,ingredients,price,rating,image_path
0,Cappuccino,Coffee,A rich and creamy cappuccino made with freshly...,"[Espresso, Steamed Milk, Milk Foam]",4.5,4.7,cappuccino.jpg
1,Jumbo Savory Scone,Bakery,"Deliciously flaky and buttery, this jumbo savo...","[Flour, Butter, Cheese, Herbs, Baking Powder, ...",3.25,4.3,SavoryScone.webp


In [8]:
df['text'] = df['name'] + " : " + df["description"] +\
    " -- Ingredients: " + df["ingredients"].astype(str) +\
    " -- Price: " + df["price"].astype(str) +\
    " -- rating: " + df["rating"].astype(str)

In [9]:
df['text'].head(2)

0    Cappuccino : A rich and creamy cappuccino made...
1    Jumbo Savory Scone : Deliciously flaky and but...
Name: text, dtype: object

In [10]:
texts = df['text'].tolist()

In [11]:
with open("products/Merry's_way_about_us.txt") as f:
    Merry_way_about_section = f.read()

Merry_way_about_section = "Coffee shop Merry's Way about section: " + Merry_way_about_section
texts.append(Merry_way_about_section)

In [12]:
with open("products/menu_items_text.txt") as f:
    menu_items_text = f.read()

menu_items_text = "Menu Items: " + menu_items_text
texts.append(menu_items_text)

# Generate Embeddings

In [13]:
output = client.embeddings.create(input = texts,model=model_name)

In [14]:
embeddings = output.data

# Push data to database

In [None]:
index_name = "coffeeshop"

pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)


In [16]:
# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

vectors = []
for text, e in zip(texts, embeddings):
    entry_id = text.split(":")[0].strip()
    vectors.append({
        "id": entry_id,
        "values": e.embedding,
        "metadata": {'text': text}
    })

index.upsert(
    vectors=vectors,
    namespace="ns1"
)

{'upserted_count': 20}

# Get Closest documents

In [17]:
output = client.embeddings.create(input = ["Is Cappuccino lactose-free?"],model=model_name)
embedding = output.data[0].embedding

In [18]:
results = index.query(
    namespace="ns1",
    vector=embedding,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [], 'namespace': 'ns1', 'usage': {'read_units': 1}}


In [19]:
results

{'matches': [], 'namespace': 'ns1', 'usage': {'read_units': 1}}