In [None]:
# Overview

This notebook demonstrates the process of creating a knowledge retrieval system using various tools and libraries. The workflow includes:

1. **Data Preparation**: Loading and preprocessing a dataset to be used for knowledge retrieval.
2. **Embedding Generation**: Using a pre-trained Sentence Transformer model to generate embeddings for the dataset.
3. **Indexing**: Creating and populating a Pinecone index with the generated embeddings.
4. **Querying**: Implementing a retrieval system to search for relevant documents based on user queries.
5. **Custom Agent Tool**: Setting up a custom agent tool using LangChain to interact with the knowledge base and answer questions.

By the end of this notebook, you will have a functional knowledge retrieval system capable of answering queries based on the provided dataset.


In [None]:
from langchain_groq import ChatGroq
from langchain.chains import LLMChain
from langchain.agents import initialize_agent
from langchain.tools import Tool,BaseTool
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
import os
from datasets import load_dataset
import pandas as pd
from sentence_transformers import SentenceTransformer
import warnings
from pinecone import Pinecone,ServerlessSpec

In [2]:
warnings.filterwarnings('ignore')

In [None]:
## Setting up Data for Indexing

### Stanford Question Answering Dataset (SQuAD)

SQuAD 1.1 contains 100,000+ question-answer pairs on 500+ articles.

In [3]:
# Parquet files of the SQuAD dataset on the Hugging Face Hub, keyed by split name.
splits = {
    'train': 'plain_text/train-00000-of-00001.parquet',
    'validation': 'plain_text/validation-00000-of-00001.parquet',
}

# Only the training split is loaded; it is read straight from the Hub via the hf:// filesystem.
train_parquet_uri = f"hf://datasets/rajpurkar/squad/{splits['train']}"
df = pd.read_parquet(train_parquet_uri)

In [None]:
# Preview the first five rows of the raw training split.
df.head(n=5)

In [None]:
# Keep a single row per unique passage: the context text is what gets stored in the vector DB.
df = df.drop_duplicates(subset='context')
df.head()

In [None]:
# Peek at the first ten entries of the answers column.
answers_preview = df['answers'].head(10)
answers_preview

In [None]:
# Setting up the Groq API key and the LLM

In [8]:

# Chat model served through Groq; the API key is read from the GROQ_API_KEY environment variable.
llm = ChatGroq(
    model='mixtral-8x7b-32768',
    temperature=0.0,
    api_key=os.environ.get('GROQ_API_KEY'),
    verbose=True,
)

In [None]:
### Sentence-Transformers: all-MiniLM-L6-v2

`all-MiniLM-L6-v2` is a compact transformer model for generating sentence embeddings, part of the Sentence-Transformers library.

**Technical Details:**
- **Architecture**: MiniLM-L6
- **Layers**: 6
- **Embedding Size**: 384 dimensions

In [None]:
# Load the pre-trained sentence-embedding model used for both documents and queries.
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embeddings = SentenceTransformer(embedding_model_name)
embeddings

In [None]:
import os

### Setting up Pinecone API Key

To use Pinecone for indexing and querying, you need to set up your Pinecone API key. Follow these steps:

1. **Sign Up/Log In to Pinecone**: If you don't have an account, sign up at [Pinecone](https://www.pinecone.io/). If you already have an account, log in.

2. **Get API Key**: Once logged in, navigate to the API keys section in your Pinecone dashboard. Create a new API key if you don't have one, and copy it.

3. **Set API Key in Environment Variables**: Store your API key in an environment variable for security. You can do this by adding the following line to your `.bashrc`, `.zshrc`, or equivalent shell configuration file:
    ```bash
    export PINECONE_API_KEY='your_api_key_here'
    ```
    Replace `'your_api_key_here'` with the actual API key you copied.

4. **Load API Key in Jupyter Notebook**: In your Jupyter Notebook, load the API key using the `os` module:
    ```python
    api_key = os.getenv('PINECONE_API_KEY')
    ```

By following these steps, you will have set up your Pinecone API key and be ready to use Pinecone for indexing and querying.

In [10]:
# Create the Pinecone client. The key is read from the PINECONE_API_KEY environment
# variable (see the setup steps above) rather than being hardcoded in the notebook.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

In [11]:
# Name of the Pinecone index that stores the SQuAD context embeddings.
index_name = "langchain-retrieval-agent"

In [12]:
# Create the index only when it does not exist yet, so this cell survives
# Restart & Run All once the index has already been created.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # embedding size of all-MiniLM-L6-v2
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [None]:
# Handle to the index; used below for stats, upserts and as the vector-store backend.
Index = pc.Index(name=index_name)
Index

In [None]:
# Fetch and display the current statistics of the index.
index_stats = Index.describe_index_stats()
index_stats

**Note**: Adding data to Pinecone can be time-consuming because of the size of the dataset (about 80,000 rows). For that reason, the index is not populated in this example (the upsert cell below was not executed). However, the steps provided are correct and can be followed for an actual implementation.

You can check the stored entries in the Pinecone console.


In [None]:
batch_size = 500  # number of rows embedded and upserted per request

for batch_start in range(0, len(df), batch_size):
    # iloc clamps the upper bound, so the final (shorter) batch needs no special case
    batch = df.iloc[batch_start:batch_start + batch_size]

    # Passages to embed and the record IDs they are stored under
    passages = batch['context'].tolist()
    record_ids = batch['id'].tolist()

    # One metadata dict per row, carrying the title and the passage text
    batch_metadata = [
        {'title': title, 'context': context}
        for title, context in zip(batch['title'], batch['context'])
    ]

    # Embed the whole batch in a single call to the sentence-transformer model
    passage_vectors = embeddings.encode(passages)

    # Upsert (id, vector, metadata) triples into the Pinecone index
    Index.upsert(vectors=zip(record_ids, passage_vectors, batch_metadata))


In [None]:
# Import LangChain's vector-store wrapper under an alias: importing it as `Pinecone`
# would rebind the name of the Pinecone client class imported from the `pinecone`
# package and break a re-run of the client-creation cell.
from langchain.vectorstores import Pinecone as PineconeVectorStore


def embed_query(text):
    """Embed a query string and return the vector as a plain list of floats."""
    return embeddings.encode(text).tolist()


# Arguments are passed positionally (index, embedding callable, text key) because the
# keyword name of the embedding argument differs between LangChain releases.
vector_store = PineconeVectorStore(
    Index,        # Pinecone index instance
    embed_query,  # function that converts a query into an embedding
    "context",    # metadata field that holds the passage text (stored during the upsert)
)

In [None]:
# Example question used to sanity-check retrieval from the vector store
query = "when was the college of engineering in the University of Notre Dame established?"

# Retrieve the three most relevant documents for the question
top_documents = vector_store.similarity_search(query, k=3)
top_documents

### Custom Agent Tool

The following chain will now act as a tool for the agent, enabling it to interact with the knowledge base and answer queries effectively.

In [None]:
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferWindowMemory

# Retrieval QA chain: the "stuff" chain type on top of the vector store's retriever
knowledge_retriever = vector_store.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=knowledge_retriever,
)

# Windowed chat memory handed to the agent later on (window size k=5),
# stored under the 'chat_history' key and returned as message objects
conversational_memory = ConversationBufferWindowMemory(
    return_messages=True,
    k=5,
    memory_key='chat_history',
)

In [None]:
def ask_knowledge_base(question):
    """Answer `question` with the retrieval QA chain and return only the answer text.

    `qa.invoke` returns a dict rather than a string; handing that dict straight to
    the agent would make the tool observation the dict's repr instead of the answer.
    """
    return qa.invoke({"query": question})["result"]


# Expose the retrieval QA chain to the agent as a single tool.
Custom_Tool = Tool(
    name='Knowledge Base',
    func=ask_knowledge_base,
    description='Useful for answering general question answers'
)
tools = [Custom_Tool]

In [None]:
# Conversational ReAct-style chat agent wired to the knowledge-base tool and the windowed memory.
custom_agent = initialize_agent(
    tools,
    llm,
    agent='chat-conversational-react-description',
    memory=conversational_memory,
    max_iterations=3,                  # upper bound on agent iterations per question
    early_stopping_method='generate',
    verbose=True,
)

In [None]:
# Ask the agent a question and display its response.
agent_response = custom_agent("can you tell me some facts about the University of Notre Dame?")
agent_response