In [7]:
!pip install -q openai sentence-transformers


import openai
from sentence_transformers import SentenceTransformer
import time
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai
from google.colab import userdata


OPENAI_API_KEY = userdata.get('OPEN_AI_KEY')
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

openai.api_key = OPENAI_API_KEY
genai.configure(api_key=GOOGLE_API_KEY)


embed_model = SentenceTransformer('all-MiniLM-L6-v2')

business_faq = [
    {"question": "What is your return policy?", "answer": "We accept returns within 30 days of purchase with original receipt."},
    {"question": "How do I track my order?", "answer": "You can track your order using the tracking number in your confirmation email."},
    {"question": "What payment methods do you accept?", "answer": "We accept Visa, Mastercard, American Express, and PayPal."},
    {"question": "Do you offer international shipping?", "answer": "Yes, we ship to over 50 countries worldwide."},
    {"question": "How can I contact customer service?", "answer": "Call us at 1-800-123-4567 or email support@example.com."}
]

documents = [f"Q: {item['question']} A: {item['answer']}" for item in business_faq]

def get_gemini_embeddings(texts):
    """Generates Gemini embeddings for a list of texts."""
    embeddings = genai.embed_content(
        model="models/embedding-001",
        content=texts,
        task_type="retrieval_document"
    )
    return embeddings['embedding']

document_embeddings = []
batch_size = 32

for i in range(0, len(documents), batch_size):
    i_end = min(i + batch_size, len(documents))
    batch_texts = documents[i:i_end]
    batch_embeddings = get_gemini_embeddings(batch_texts)
    document_embeddings.extend(batch_embeddings)

document_embeddings_np = np.array(document_embeddings)

def find_similar_documents(query_embedding, document_embeddings_array, top_k=3):
    """Finds the top_k most similar documents to a query embedding."""
    query_embedding_np = np.array(query_embedding).reshape(1, -1)
    similarities = cosine_similarity(query_embedding_np, document_embeddings_array)
    top_k_indices = np.argsort(similarities[0])[-top_k:][::-1]
    return top_k_indices

def retrieve(query, top_k=3):
    """Retrieves top_k most similar documents to a query using Gemini embeddings."""
    query_embedding = get_gemini_embeddings(query)
    top_k_indices = find_similar_documents(query_embedding, document_embeddings_np, top_k=top_k)


    retrieved_documents = [documents[i] for i in top_k_indices]
    return retrieved_documents


def generate_answer(query, context):
    prompt = f"""Answer the question based on the context below. If you don't know the answer, say "I don't know".

Context:
{context}

Question: {query}
Answer:"""

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0,
        max_tokens=150
    )

    return response.choices[0].message.content.strip()

def ask_question(question):
    retrieved = retrieve(question)


    context = "\n\n".join(retrieved)


    answer = generate_answer(question, context)

    return answer

print("\nBusiness QA Bot is ready! Try asking a question:")
print("Example questions:")
print("- What is your return policy?")
print("- How do I track my order?")
print("- What payment methods do you accept?")

while True:
    user_input = input("\nYour question (type 'quit' to exit): ")
    if user_input.lower() == 'quit':
        break
    response = ask_question(user_input)
    print(f"\nAnswer: {response}")



Business QA Bot is ready! Try asking a question:
Example questions:
- What is your return policy?
- How do I track my order?
- What payment methods do you accept?

Your question (type 'quit' to exit): when will the package arrive


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
!pip install --upgrade datasets

# Task
Replace the Pinecone vector database implementation in the provided Python code with a local similarity search using Gemini embeddings. Use the Google API key stored in the "GOOGLE_API_KEY" file for the Gemini embeddings. The code should still perform a RAG (Retrieval Augmented Generation) task, where relevant documents are retrieved based on a query and then used as context to generate an answer using OpenAI.

## Install necessary libraries

### Subtask:
Ensure the Google Generative AI library is installed.


**Reasoning**:
The subtask is to install the `google-generativeai` library. This can be done using pip in a code block.



In [None]:
!pip install google-generativeai

## Initialize gemini embeddings

### Subtask:
Set up the Gemini embeddings model using your Google API key.


**Reasoning**:
Set up the Gemini embeddings model using your Google API key by importing the necessary library, getting the API key from secrets, and configuring the library.



In [None]:
import google.generativeai as genai
from google.colab import userdata

GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

## Embed documents

### Subtask:
Generate embeddings for the documents using the Gemini embeddings model.


**Reasoning**:
Define a function to get Gemini embeddings and then iterate through the documents in batches to generate and store the embeddings.



In [None]:
def get_gemini_embeddings(texts):
    """Generates Gemini embeddings for a list of texts."""
    embeddings = genai.embed_content(
        model="models/embedding-001",
        content=texts,
        task_type="retrieval_query" # Or "retrieval_document" depending on usage later
    )
    return [item['embedding'] for item in embeddings['embeddings']]

document_embeddings = []
batch_size = 32 # Reuse the batch_size from the previous code cell

for i in range(0, len(documents), batch_size):
    i_end = min(i + batch_size, len(documents))
    batch_texts = documents[i:i_end]
    batch_embeddings = get_gemini_embeddings(batch_texts)
    document_embeddings.extend(batch_embeddings)

print(f"Generated embeddings for {len(document_embeddings)} documents")

**Reasoning**:
The previous code failed because the structure of the `embeddings` object returned by `genai.embed_content` was not as expected. I need to inspect the structure of the returned object to correctly extract the embeddings.



In [None]:
def get_gemini_embeddings(texts):
    """Generates Gemini embeddings for a list of texts."""
    embeddings = genai.embed_content(
        model="models/embedding-001",
        content=texts,
        task_type="retrieval_document" # Changed to retrieval_document for document embedding
    )
    # Inspect the structure of the embeddings object
    print(embeddings)
    return [item['embedding'] for item in embeddings['embeddings']]

# Temporarily test with a small batch to inspect the output
test_batch = documents[:2]
test_embeddings_output = get_gemini_embeddings(test_batch)

# Continue with the rest of the code after fixing the extraction
document_embeddings = []
batch_size = 32

for i in range(0, len(documents), batch_size):
    i_end = min(i + batch_size, len(documents))
    batch_texts = documents[i:i_end]
    # Assuming the structure is now understood and the extraction logic is correct
    batch_embeddings = get_gemini_embeddings(batch_texts) # This call will still print the structure
    document_embeddings.extend(batch_embeddings)

print(f"Generated embeddings for {len(document_embeddings)} documents")

**Reasoning**:
The print output shows that the embedding is directly under the key 'embedding', not nested under 'embeddings'. I need to correct the extraction logic in the `get_gemini_embeddings` function.



In [None]:
def get_gemini_embeddings(texts):
    """Generates Gemini embeddings for a list of texts."""
    embeddings = genai.embed_content(
        model="models/embedding-001",
        content=texts,
        task_type="retrieval_document"
    )
    # Corrected extraction: Access the 'embedding' key directly
    return embeddings['embedding']

document_embeddings = []
batch_size = 32

for i in range(0, len(documents), batch_size):
    i_end = min(i + batch_size, len(documents))
    batch_texts = documents[i:i_end]
    batch_embeddings = get_gemini_embeddings(batch_texts)
    document_embeddings.extend(batch_embeddings)

print(f"Generated embeddings for {len(document_embeddings)} documents")

## Implement similarity search

### Subtask:
Replace the Pinecone index search with a method to perform similarity search on the Gemini embeddings. This could involve using libraries like scikit-learn or Faiss for efficient search.


**Reasoning**:
Import cosine_similarity and numpy, convert document_embeddings to a numpy array, and define the find_similar_documents function to calculate cosine similarity and return top_k indices.



In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

document_embeddings_np = np.array(document_embeddings)

def find_similar_documents(query_embedding, document_embeddings_array, top_k=3):
    """Finds the top_k most similar documents to a query embedding."""
    query_embedding_np = np.array(query_embedding).reshape(1, -1)
    similarities = cosine_similarity(query_embedding_np, document_embeddings_array)
    # Get indices of top_k similar documents
    top_k_indices = np.argsort(similarities[0])[-top_k:][::-1]
    return top_k_indices

## Update retrieval function

### Subtask:
Modify the `retrieve` function to use the new similarity search method to find relevant documents based on the query embedding.


**Reasoning**:
Implement the `retrieve` function to use the local similarity search method with Gemini embeddings.



In [None]:
def retrieve(query, top_k=3):
    """Retrieves top_k most similar documents to a query using Gemini embeddings."""
    query_embedding = genai.embed_content(
        model="models/embedding-001",
        content=query,
        task_type="retrieval_query"
    )['embedding'] # Access the 'embedding' key directly

    top_k_indices = find_similar_documents(query_embedding, document_embeddings_np, top_k=top_k)

    # Retrieve the actual text of the top_k documents
    retrieved_documents = [documents[i] for i in top_k_indices]
    return retrieved_documents

print("retrieve function updated to use local similarity search.")

## Update rag query function

### Subtask:
Modify the `rag_query` function to use the updated `retrieve` function.


## Summary:

### Data Analysis Key Findings

*   The `google-generativeai` library was confirmed to be already installed, allowing for the use of Gemini embeddings.
*   The Gemini embeddings model was successfully initialized using the provided Google API key.
*   A function was created to generate Gemini embeddings for document texts in batches, correcting an initial issue with accessing the embedding data.
*   A local similarity search method was implemented using cosine similarity from the scikit-learn library.
*   The `retrieve` function was updated to utilize the local similarity search with Gemini embeddings for query retrieval.
*   No changes were required for the `rag_query` function as it was already compatible with the updated `retrieve` function.

### Insights or Next Steps

*   The implementation successfully replaced the external Pinecone vector database with a local similarity search using Gemini embeddings, providing a self-contained RAG system.
*   For larger datasets, consider implementing more efficient local search methods like Faiss to improve retrieval performance.


## Install necessary libraries

### Subtask:
Ensure the Google Generative AI library is installed.

**Reasoning**:
The subtask is to install the `google-generativeai` library. This can be done using pip in a code block.

In [None]:
!pip install google-generativeai

## Initialize gemini embeddings

### Subtask:
Set up the Gemini embeddings model using your Google API key.

**Reasoning**:
Set up the Gemini embeddings model using your Google API key by importing the necessary library, getting the API key from secrets, and configuring the library.

In [None]:
import google.generativeai as genai
from google.colab import userdata

GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

## Embed documents

### Subtask:
Generate embeddings for the documents using the Gemini embeddings model.

**Reasoning**:
Define a function to get Gemini embeddings and then iterate through the documents in batches to generate and store the embeddings.

In [None]:
def get_gemini_embeddings(texts):
    """Generates Gemini embeddings for a list of texts."""
    embeddings = genai.embed_content(
        model="models/embedding-001",
        content=texts,
        task_type="retrieval_document" # Changed to retrieval_document for document embedding
    )
    # Corrected extraction: Access the 'embedding' key directly
    return embeddings['embedding']

document_embeddings = []
batch_size = 32 # Reuse the batch_size from the previous code cell

for i in range(0, len(documents), batch_size):
    i_end = min(i + batch_size, len(documents))
    batch_texts = documents[i:i_end]
    batch_embeddings = get_gemini_embeddings(batch_texts)
    document_embeddings.extend(batch_embeddings)

print(f"Generated embeddings for {len(document_embeddings)} documents")

## Implement similarity search

### Subtask:
Replace the Pinecone index search with a method to perform similarity search on the Gemini embeddings. This could involve using libraries like scikit-learn or Faiss for efficient search.

**Reasoning**:
Import cosine_similarity and numpy, convert document_embeddings to a numpy array, and define the find_similar_documents function to calculate cosine similarity and return top_k indices.

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

document_embeddings_np = np.array(document_embeddings)

def find_similar_documents(query_embedding, document_embeddings_array, top_k=3):
    """Finds the top_k most similar documents to a query embedding."""
    query_embedding_np = np.array(query_embedding).reshape(1, -1)
    similarities = cosine_similarity(query_embedding_np, document_embeddings_array)
    # Get indices of top_k similar documents
    top_k_indices = np.argsort(similarities[0])[-top_k:][::-1]
    return top_k_indices

## Update retrieval function

### Subtask:
Modify the `retrieve` function to use the new similarity search method to find relevant documents based on the query embedding.

**Reasoning**:
Implement the `retrieve` function to use the local similarity search method with Gemini embeddings.

In [None]:
def retrieve(query, top_k=3):
    """Retrieves top_k most similar documents to a query using Gemini embeddings."""
    query_embedding = genai.embed_content(
        model="models/embedding-001",
        content=query,
        task_type="retrieval_query"
    )['embedding'] # Access the 'embedding' key directly

    top_k_indices = find_similar_documents(query_embedding, document_embeddings_np, top_k=top_k)

    # Retrieve the actual text of the top_k documents
    retrieved_documents = [documents[i] for i in top_k_indices]
    return retrieved_documents

print("retrieve function updated to use local similarity search.")

## Update rag query function

### Subtask:
Modify the `rag_query` function to use the updated `retrieve` function.

## Summary:

### Data Analysis Key Findings

* The `google-generativeai` library was confirmed to be already installed, allowing for the use of Gemini embeddings.
* The Gemini embeddings model was successfully initialized using the provided Google API key.
* A function was created to generate Gemini embeddings for document texts in batches, correcting an initial issue with accessing the embedding data.
* A local similarity search method was implemented using cosine similarity from the scikit-learn library.
* The `retrieve` function was updated to utilize the local similarity search with Gemini embeddings for query retrieval.
* No changes were required for the `rag_query` function as it was already compatible with the updated `retrieve` function.

### Insights or Next Steps

* The implementation successfully replaced the external Pinecone vector database with a local similarity search using Gemini embeddings, providing a self-contained RAG system.
* For larger datasets, consider implementing more efficient local search methods like Faiss to improve retrieval performance.

In [None]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

In [None]:
# Define a sample query
sample_query = "What is the capital of France?" # Or any other query you want to test

# Retrieve documents for the sample query
retrieved_docs = retrieve(sample_query)

print("Retrieved Documents:")
for i, doc in enumerate(retrieved_docs):
    print(f"Document {i+1}:\n{doc}\n---")

# Generate the prompt that will be sent to the Gemini model
context_for_prompt = "\n\n".join(retrieved_docs)
prompt_for_gemini = f"""Answer the question based on the context below. If you don't know the answer, say \"I don't know\".

Context:
{context_for_prompt}

Question: {sample_query}
Answer:"""

print("\nPrompt sent to Gemini:")
print(prompt_for_gemini)

# You can then manually check if the retrieved documents contain the answer and if the prompt is correctly formatted.
# Running the full rag_query(sample_query) again would show the final answer.
# final_answer = rag_query(sample_query)
# print(f"\nFinal Answer from RAG: {final_answer}")

In [None]:
# Display the first few documents to understand the dataset content
print("Sample Documents from the dataset:")
for i, doc in enumerate(documents[:5]): # Displaying the first 5 documents
    print(f"Document {i+1}:\n{doc}\n---")

In [None]:
def retrieve(query, top_k=3):
    """Retrieves top_k most similar documents to a query using Gemini embeddings."""
    query_embedding = get_gemini_embeddings(query) # Use the corrected function
    top_k_indices = find_similar_documents(query_embedding, document_embeddings_np, top_k=top_k)

    # Retrieve the actual text of the top_k documents
    retrieved_documents = [documents[i] for i in top_k_indices]
    return retrieved_documents

print("retrieve function defined.")

Now that the `retrieve` function is defined, you can run the cells sequentially to use the RAG QA Bot.