# Import statements

- `import os` to read and write files
- `import pandas as pd` for data manipulation
- `import openai` to use OpenAI for embedding models
- `import pinecone` to use Pinecone as the vector database
- `from tqdm.notebook import tqdm` to show a progress bar while a code cell runs

In [1]:
import os
import pandas as pd
import openai
import pinecone
from tqdm.notebook import tqdm



# Setting up API keys

In [2]:
# Load variables from a local .env file when python-dotenv is installed.
# The package is optional, so only its absence is tolerated; any other
# failure (or a Ctrl-C) is no longer silently swallowed by a bare `except`.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    load_dotenv()

# Access the API keys (os.getenv returns None when a variable is not set)
openai.api_key = os.getenv("OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

# Load csv data

In [3]:
# Read the preprocessed dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer='preprocessed_data.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Topic,URL,Description,Combine_text,lemmatized_text
0,Identify risk factors,https://rhlaiservice2.blob.core.windows.net/hm...,A woman is at a risk of getting an infection i...,identify risk factors a woman is at a risk of ...,identify risk factor a woman be at a risk of g...
1,Measure temperature,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to determine the baby's temperature...,measure temperature here's how to determine th...,measure temperature here 's how to determine t...
2,Weigh the baby,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to take the baby's weight? An accur...,weigh the baby here's how to take the baby's w...,weigh the baby here 's how to take the baby 's...
3,Examine the baby,https://rhlaiservice2.blob.core.windows.net/hm...,The baby's exam is best done on the first day ...,examine the baby the baby's exam is best done ...,examine the baby the baby 's exam be best do o...
4,Count a baby's breaths,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to count the baby's breaths. Newbor...,count a baby's breaths here's how to count the...,count a baby 's breath here 's how to count th...


# Generate embeddings using OpenAI models

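We will use `text-embedding-3-small` as the embedding model. It is one of OpenAI's newer embedding models: small, fast, and inexpensive. By default it returns 1536-dimensional vectors, which is the dimension the Pinecone index is created with below.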

In [4]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by the OpenAI embeddings API.

    Args:
        text: Value passed as ``input`` to the embeddings endpoint.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    first_item = openai.embeddings.create(input=text, model=model).data[0]
    return first_item.embedding

In [5]:
# Register tqdm's `progress_apply` on pandas objects
tqdm.pandas()

# Embed each lemmatized text (one get_embedding call per row) with a progress bar
df['embedding'] = df['lemmatized_text'].progress_apply(get_embedding)

  0%|          | 0/39 [00:00<?, ?it/s]

In [6]:
# Preview the first rows, now including the 'embedding' column
df.head()

Unnamed: 0,Topic,URL,Description,Combine_text,lemmatized_text,embedding
0,Identify risk factors,https://rhlaiservice2.blob.core.windows.net/hm...,A woman is at a risk of getting an infection i...,identify risk factors a woman is at a risk of ...,identify risk factor a woman be at a risk of g...,"[0.01685132086277008, 0.056897733360528946, 0...."
1,Measure temperature,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to determine the baby's temperature...,measure temperature here's how to determine th...,measure temperature here 's how to determine t...,"[0.004066101741045713, 0.007667058613151312, -..."
2,Weigh the baby,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to take the baby's weight? An accur...,weigh the baby here's how to take the baby's w...,weigh the baby here 's how to take the baby 's...,"[0.02262173779308796, 0.020612085238099098, -0..."
3,Examine the baby,https://rhlaiservice2.blob.core.windows.net/hm...,The baby's exam is best done on the first day ...,examine the baby the baby's exam is best done ...,examine the baby the baby 's exam be best do o...,"[0.018320854753255844, 0.07097376137971878, 0...."
4,Count a baby's breaths,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to count the baby's breaths. Newbor...,count a baby's breaths here's how to count the...,count a baby 's breath here 's how to count th...,"[0.022354714572429657, 0.0497504286468029, 0.0..."


# Pinecone Initialization

In [7]:
# Create the Pinecone client using the API key read from the environment
pc = pinecone.Pinecone(api_key=pinecone_api_key)

### Setup Pinecone Vector Database

In [8]:
index_name = 'rag-project'

# Create the serverless index only if it does not exist yet, so the cell
# can be re-run without failing on an already-existing index.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = pinecone.ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=serverless_spec,
    )

# Handle used for all subsequent upserts and queries
index = pc.Index(index_name)

### Convert embeddings and get data ready to be upserted

In [9]:
# Convert the DataFrame rows to (id, vector, metadata) tuples for upserting.
# The needed columns are walked in lockstep rather than calling df.iloc[i]
# four times per row (each call materializes a whole row); the resulting
# tuples are identical. The ID is the row's position in the DataFrame.
data = [
    (
        str(position),  # Unique ID for each vector
        embedding,  # Embedding vector
        {
            'Topic': topic,
            'Video URL': url,
            'Description': description,
        },  # Metadata
    )
    for position, (embedding, topic, url, description) in enumerate(
        zip(df['embedding'], df['Topic'], df['URL'], df['Description'])
    )
]

### Upsert Data to Pinecone

In [10]:
# Number of vectors sent per upsert request
batch_size = 100

# Walk `data` in consecutive slices of at most `batch_size` items
for start in tqdm(range(0, len(data), batch_size)):
    index.upsert(vectors=data[start:start + batch_size])

  0%|          | 0/1 [00:00<?, ?it/s]

# Test case

In [11]:
# Example query text
query_text = "Want to know about feeding baby milk"

# Embed the query with the same helper used for the documents
query_embedding = get_embedding(query_text)

# Ask the index for the five closest vectors, including their metadata
results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

# Print score, topic, video URL and description for every match
for hit in results['matches']:
    meta = hit['metadata']
    print(f"Score: {hit['score']}")
    print(f"Topic: {meta['Topic']}")
    print(f"URL: {meta['Video URL']}")
    print(f"Description: {meta['Description']}\n")

Score: 0.609344363
Topic: Feed with cup
URL: https://rhlaiservice2.blob.core.windows.net/hmbs-enc2/Feed_with_Cup_CB.mp4
Description: When the small baby is awakening to a feed every two to 4 hours, try feeding him by cup. He is ready to cup feed if he can swallow milk without coughing, choking or turning blue. Help the mother recognize the baby's feeding signals. Eyes open, looking around, moving his mouth and his tongue. Feed the baby whenever he signals his hunger. Teach the mother to cup feed by first showing her the steps, then guiding her as she feeds her baby. Have her wash her hands, then pour the baby's measured volume of milk into a small cup. Next, have the mother wrap her baby and hold him in a nearly upright position on her lap. Her hand supports his head and neck. The mother rests the cup lightly
on the baby's lower lip. She then tips the cup until the milk reaches the top edge and lets the baby take the milk. A small baby needs to pace himself during cup feeding. Here he 