# RAG Model for QA Bot

## install dependencies

In [1]:
%pip install openai pinecone-client torch transformers datasets
%pip freeze > requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import openai
from pinecone import Pinecone, ServerlessSpec
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import torch
import os
import numpy as np
from dotenv import load_dotenv

load_dotenv()

  from tqdm.autonotebook import tqdm


True

## configurations

In [None]:
'''
your .env file should look like this:

OPENAI_API_KEY=your_openai_api_key
PINECONE_API_KEY=your_pinecone_api_key
'''

In [3]:
openai.api_key = os.getenv('OPENAI_API_KEY')

# Initialize Pinecone
pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

# Define index name
INDEX_NAME = "business-qa-bot"

# Create index if it doesn't exist
if INDEX_NAME not in pinecone_client.list_indexes():
    pinecone_client.create_index(
        name=INDEX_NAME,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"  # Adjust to your Pinecone environment
        )
    )

# Connect to the index
index = pinecone_client.Index(INDEX_NAME)

## load dataset

In [4]:
# Load a business-related dataset
# Replace this with a dataset that contains Q&A pairs for the specific business domain

data = load_dataset("ArunSharmaaaaa/business_for_chatbot", split="train[:1000]")


In [5]:
print(data[0])

{'<s> [INST] business context, what factors influence work-life integration? [/INST] Hsieh explored diverse interests like playing poker that expanded his business perspectives while maintaining a holistic view on achieving happiness professionally and personally.</s>': '<s> [INST] could expanding habitual spaces increase mentions ? [/INST] Associating a product with a strong existing trigger outside its normal habitat, like pairing a candy with coffee breaks, can increase verbal mentions by linking it to something regularly thought of in a new context.</s>'}


In [6]:
# Prepare documents for indexing
documents = [
    {"id": str(i), 
     "context": list(qa.keys())[0].split('[/INST]')[0].replace('<s> [INST] ', '').strip(),  # Extract the question
     "answer": list(qa.values())[0].split('[/INST]')[1].replace(' [/INST]', '').replace('</s>', '').strip()}  # Extract the answer
    for i, qa in enumerate(data)
]

# Extract only unique contexts for vectorization
document_texts = list(set(doc["context"] for doc in documents))


In [7]:
# samples
print(documents[:5])

[{'id': '0', 'context': 'business context, what factors influence work-life integration?', 'answer': 'Associating a product with a strong existing trigger outside its normal habitat, like pairing a candy with coffee breaks, can increase verbal mentions by linking it to something regularly thought of in a new context.'}, {'id': '1', 'context': 'business context, what factors influence work-life integration?', 'answer': 'For contracts that are valued at $500,000 a federal prime contractor must submit a subcontracting plan that includes a plan for the use of woman- owned businesses.'}, {'id': '2', 'context': 'business context, what factors influence work-life integration?', 'answer': 'Set personal goals, make realistic plans, and lean on advisors.'}, {'id': '3', 'context': 'business context, what factors influence work-life integration?', 'answer': 'Consider emotional/ psychological sides of change.'}, {'id': '4', 'context': 'business context, what factors influence work-life integration?

## Pinecone

In [8]:
# Use a pre-trained transformer model to create embeddings

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [9]:
# Function to embed text

def embed_text(text):
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        embeddings = model(**tokens).last_hidden_state.mean(dim=1)
    return embeddings.squeeze().numpy()

In [None]:
# Generate embeddings and upsert to Pinecone
# 1st approach: Embed the 'context' (question) for each document

for doc in documents:
    # Assuming you want to embed the 'context' (question)
    embedding = embed_text(doc["context"])  # Using 'context' for embedding
    index.upsert([(doc["id"], embedding, {"text": doc["context"]})])  # Store 'context' in Pinecone


In [10]:
# 2nd approach: Store Both Context and Answer

# Generate embeddings and upsert to Pinecone
for doc in documents:
    embedding = embed_text(doc["context"])  # Using 'context' for embedding
    index.upsert([
        (
            doc["id"], 
            embedding, 
            {"context": doc["context"], "answer": doc["answer"]}  # Store both context and answer
        )
    ])



In [None]:
# new

# Generate embeddings and batch upsert to Pinecone
batch_size = 100  # Adjust based on your dataset size and system capacity
batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]

for batch in batches:
    upserts = []
    for doc in batch:
        # Ensure 'context' and 'answer' keys exist
        context = doc.get("context", "")
        answer = doc.get("answer", "")
        
        # Embed the context
        embedding = embed_text(context)
        
        # Add to upserts
        upserts.append((doc["id"], embedding, {"context": context, "answer": answer}))
    
    # Perform batch upsert
    index.upsert(upserts)


## Build QA Bot

In [35]:
# for 1st approach: Embed the 'context' (question) for each document

def answer_query(query):
    try:
        # Embed the user query
        query_embedding = embed_text(query)
    
        # Search Pinecone for similar contexts
        search_results = index.query(
            vector=query_embedding, 
            top_k=3, 
            include_metadata=True
        )
    
        # Combine retrieved contexts
        context = "\n".join([result["metadata"]["context"] for result in search_results["matches"]])
    
        # Generate an answer using OpenAI
        response = openai.Completion.create(
            engine="gpt-4", 
            prompt=f"Context: {context}\n\nQuestion: {query}\nAnswer:",
            max_tokens=150
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(f"An error occurred: {e}")
        return "I'm sorry, I couldn't process your request at the moment."


In [11]:
# for 2nd approach: Store Both Context and Answer

def answer_query(query):
    try:
        # Embed the user query
        query_embedding = embed_text(query)
    
        # Search Pinecone for similar contexts
        search_results = index.query(
            vector=query_embedding, 
            top_k=3, 
            include_metadata=True
        )
    
        # Check if answers are directly available from Pinecone
        retrieved_answers = [result["metadata"].get("answer", "") for result in search_results["matches"]]
        retrieved_contexts = [result["metadata"].get("context", "") for result in search_results["matches"]]
        
        # Combine retrieved contexts
        combined_context = "\n".join(retrieved_contexts)
        
        # If retrieved answers are sufficient and relevant, use them directly
        if all(retrieved_answers):
            return "\n".join(retrieved_answers)
        
        # Otherwise, use OpenAI to generate a response based on the context
        response = openai.Completion.create(
            engine="gpt-4", 
            prompt=f"Context: {combined_context}\n\nQuestion: {query}\nAnswer:",
            max_tokens=150
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(f"An error occurred: {e}")
        return "I'm sorry, I couldn't process your request at the moment."
    

In [34]:
def perform_query(query_embedding):
    try:
        # Ensure query_embedding is a 1D list
        if isinstance(query_embedding, np.ndarray):
            query_embedding = query_embedding.tolist()
        elif not isinstance(query_embedding, list):
            raise ValueError("query_embedding must be a list or numpy array")

        # Query Pinecone
        search_results = index.query(
            vector=query_embedding,
            top_k=3,
            include_metadata=True
        )
        # print("Search results:", search_results)
        return search_results
    except Exception as e:
        print(f"Error during Pinecone query: {e}")
        return None


## test QA Bot

In [None]:
# single output

query = "What are the key factors affecting work-life balance?"
query_embedding = embed_text(query)

search_results = perform_query(query_embedding)

if search_results:
    best_match = max(search_results["matches"], key=lambda match: match["score"])
    print("---------------------------------------------")
    print(f"question: {query}")
    print("---------------------------------------------")
    print("response:")
    print(f"context: {best_match['metadata']['context']}")
    print(f"answer: {best_match['metadata']['answer']}")
    print("---------------------------------------------")
    

---------------------------------------------
question: What are the key factors affecting work-life balance?
---------------------------------------------
response:
context: business context, what factors influence work-life integration?
answer: Recognizing achievements through celebrations and promotions motivates employees and fuels innovation.
---------------------------------------------


In [37]:
# get top 3 raw results

query = "What are the key factors affecting work-life balance?"
query_embedding = embed_text(query)

search_results = perform_query(query_embedding)
if search_results:
    for match in search_results["matches"]:
        print(f"ID: {match['id']}, Score: {match['score']}, Metadata: {match['metadata']}")


ID: 96, Score: 0.664621949, Metadata: {'answer': 'Recognizing achievements through celebrations and promotions motivates employees and fuels innovation.', 'context': 'business context, what factors influence work-life integration?'}
ID: 94, Score: 0.664621949, Metadata: {'answer': "C rises force innovation, as Sk ylab's damaged launch did by requiring fast, creative solutions.", 'context': 'business context, what factors influence work-life integration?'}
ID: 95, Score: 0.664621949, Metadata: {'answer': 'When people are mutually accountable for results, there is less room for blame and self-justification.', 'context': 'business context, what factors influence work-life integration?'}


## clean up

In [None]:
# Delete the index if no longer needed

pinecone_client.delete_index(INDEX_NAME)