# Import statements

- `import os` to read environment variables (the API keys) and work with files
- `import pandas as pd` for data manipulation
- `import openai` to call OpenAI's embedding models
- `import pinecone` to use Pinecone as the vector database
- `from tqdm.notebook import tqdm` to show progress as a code cell runs

In [1]:
import os
import pandas as pd
import openai
import pinecone
from tqdm.notebook import tqdm



# Setting up API keys

In [2]:
import os

# Load variables from a local .env file when python-dotenv is installed.
# The package is optional: without it, the keys are expected to already be
# present in the process environment.
try:
    from dotenv import load_dotenv
except ImportError:
    # Only a missing package is tolerated. Any other failure should surface
    # instead of being silently swallowed by a bare `except`.
    pass
else:
    load_dotenv()

# Access the API keys (os.getenv returns None when a variable is not set)
openai.api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
# pinecone_environment = os.getenv("PINECONE_ENVIRONMENT")

# Load CSV data

In [3]:
# Read the topics CSV into a DataFrame, then preview its first rows.
df = pd.read_csv(filepath_or_buffer='Ethiopia_39_topics.csv')

df.head()

Unnamed: 0,Topic,URL,Description
0,Identify risk factors,https://rhlaiservice2.blob.core.windows.net/hm...,A woman is at a risk of getting an infection i...
1,Measure temperature,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to determine the baby's temperature...
2,Weigh the baby,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to take the baby's weight? An accur...
3,Examine the baby,https://rhlaiservice2.blob.core.windows.net/hm...,The baby's exam is best done on the first day ...
4,Count a baby's breaths,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to count the baby's breaths. Newbor...


## Combining columns into a single text field for embedding

In [4]:
# Build one string per row ("<Topic> <URL> <Description>") to feed the
# embedding model; the values are joined with single spaces.
df['combined_text'] = [
    f"{topic} {url} {description}"
    for topic, url, description in zip(df['Topic'], df['URL'], df['Description'])
]

df.head()

Unnamed: 0,Topic,URL,Description,combined_text
0,Identify risk factors,https://rhlaiservice2.blob.core.windows.net/hm...,A woman is at a risk of getting an infection i...,Identify risk factors https://rhlaiservice2.bl...
1,Measure temperature,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to determine the baby's temperature...,Measure temperature https://rhlaiservice2.blob...
2,Weigh the baby,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to take the baby's weight? An accur...,Weigh the baby https://rhlaiservice2.blob.core...
3,Examine the baby,https://rhlaiservice2.blob.core.windows.net/hm...,The baby's exam is best done on the first day ...,Examine the baby https://rhlaiservice2.blob.co...
4,Count a baby's breaths,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to count the baby's breaths. Newbor...,Count a baby's breaths https://rhlaiservice2.b...


# Generate embeddings using OpenAI models

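We will use `text-embedding-3-small` as the embedding model. It is one of OpenAI's newer embedding models and the smaller, faster member of the `text-embedding-3` family. By default it returns 1536-dimensional vectors, which is why the Pinecone index below is created with `dimension=1536`.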

In [13]:
def get_embedding(text: str, model: str = "text-embedding-3-small") -> list:
    """Return the embedding vector for a single piece of text.

    Sends one request to the OpenAI embeddings endpoint and returns the
    embedding of the first (and only) item in the response.

    Args:
        text: The text to embed. Must not be an empty string.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The embedding taken from ``response.data[0].embedding``.

    Raises:
        ValueError: If ``text`` is an empty string. The check happens before
            any network request is made.
    """
    # Fail fast with a clear message instead of sending a request that the
    # embeddings endpoint would reject anyway.
    if isinstance(text, str) and not text:
        raise ValueError("text must be a non-empty string")

    response = openai.embeddings.create(
        input=text,
        model=model
    )
    # A single input yields a single entry in `data`.
    return response.data[0].embedding

In [14]:
# Register tqdm's `progress_apply` method on pandas objects.
tqdm.pandas()

# Embed every combined text (get_embedding is called once per row) while
# showing a progress bar.
df['embedding'] = df['combined_text'].progress_apply(get_embedding)

  0%|          | 0/39 [00:00<?, ?it/s]

In [15]:
# Preview the first rows to confirm the 'embedding' column was added.
df.head()

Unnamed: 0,Topic,URL,Description,combined_text,embedding
0,Identify risk factors,https://rhlaiservice2.blob.core.windows.net/hm...,A woman is at a risk of getting an infection i...,Identify risk factors https://rhlaiservice2.bl...,"[0.00830182433128357, 0.04434720054268837, 0.0..."
1,Measure temperature,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to determine the baby's temperature...,Measure temperature https://rhlaiservice2.blob...,"[-0.0020057621877640486, 0.009801158681511879,..."
2,Weigh the baby,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to take the baby's weight? An accur...,Weigh the baby https://rhlaiservice2.blob.core...,"[0.0159198846668005, 0.017715105786919594, -0...."
3,Examine the baby,https://rhlaiservice2.blob.core.windows.net/hm...,The baby's exam is best done on the first day ...,Examine the baby https://rhlaiservice2.blob.co...,"[0.02978324331343174, 0.07284960895776749, 0.0..."
4,Count a baby's breaths,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to count the baby's breaths. Newbor...,Count a baby's breaths https://rhlaiservice2.b...,"[0.006141426507383585, 0.035783570259809494, 0..."


In [16]:
# Create the Pinecone client, authenticated with the API key read from the
# PINECONE_API_KEY environment variable earlier in the notebook.
pc = pinecone.Pinecone(api_key=pinecone_api_key)

In [19]:
index_name = 'rag-project'

# Fetch the existing index names once and reuse them for both the printout
# and the existence check, instead of querying the API twice.
existing_index_names = pc.list_indexes().names()
print(existing_index_names)

# Only create the index when it does not exist yet, so re-running this cell
# does not attempt to create a duplicate.
if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        dimension=1536,  # must equal the length of the embedding vectors stored in this index
        metric='cosine',
        spec=pinecone.ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Obtain the index handle from the configured client instance. The
# module-level `pinecone.Index(index_name)` does not go through `pc`, so it
# would not use the API key the client was created with.
index = pc.Index(index_name)

['rhl-ethiopia', 'wow-app', 'rhl-hmbs', 'rhl-atlanta', 'rag-project']


ForbiddenException: (403)
Reason: Forbidden
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': '973a8b26ecd27b76697b42824a1a1287', 'Date': 'Fri, 27 Sep 2024 17:38:55 GMT', 'Server': 'Google Frontend', 'Content-Length': '196', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"FORBIDDEN","message":"Request failed. You've reached the max serverless indexes allowed in project Default (5). To add more serverless indexes, upgrade your plan."},"status":403}
