In [16]:
from langchain import PromptTemplate # with this we can create a prompt
from langchain.chains import RetrievalQA # with this we can ask questions to the model
from langchain.embeddings import HuggingFaceEmbeddings # with this we can convert text to vector
from langchain.vectorstores import Pinecone # with this we can store the vector
import pinecone 
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter # With this we can split the text into smaller chunks
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers   
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
import os

In [3]:
# Read the Pinecone API key from the environment rather than hardcoding it in the
# notebook, so the secret is never saved or committed with the file. Falls back to
# an empty string when the variable is unset (the previous placeholder value).
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
# To get the Pinecone environment value, you need to create a new index in Pinecone.
# PINECONE_API_ENV = ""

In [4]:
# Extract data from pdf
def load_pdf(data):
    """Load the PDF files in the ``data`` directory.

    Builds a ``DirectoryLoader`` over ``data`` restricted to the ``*.pdf`` glob,
    with ``PyPDFLoader`` as the per-file loader class.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [5]:
# Load the PDFs from the local "data/" folder using load_pdf.
extracted_data = load_pdf("data/")

In [6]:
# Display the item at index 2 of the loaded data as a quick sanity check.
extracted_data[2]

Document(page_content='This book is dedicated to Chris,\nwho glows in his father’s eye', metadata={'source': 'data\\Robert B Cialdini - Influence_ The Psychology of Persuasion (2007).pdf', 'page': 2})

In [7]:
# Create text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 20,
            the value that was previously hard-coded.

    Returns:
        The result of ``split_documents`` for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [8]:
# Split the loaded documents into chunks and print how many chunks were produced.
text_chunks = text_split(extracted_data)
print(len(text_chunks))

1545


In [9]:
# download the embedding model
def download_embedding_model():
    """Create the embedding model used to turn text into vectors.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_id)

In [10]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_embedding_model()

In [11]:
# Display the embedding object's repr to inspect its configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [12]:
# Embed a sample string and print the length of the resulting vector.
# NOTE(review): this length presumably has to equal the `dimension` of the Pinecone
# index the vectors are written to — confirm against the index settings.
query_results = embeddings.embed_query("Hello World")
print("length", len(query_results))

length 384


In [13]:
# Initialize pinecone
# NOTE(review): `Pinecone` is already imported from `pinecone` in the imports cell;
# this line repeats that import.
from pinecone import Pinecone

# Create the Pinecone client using the API key defined in the configuration cell.
pc = Pinecone(api_key=PINECONE_API_KEY)
# Obtain an object for the "mindcontrol" index from the client.
index = pc.Index("mindcontrol")

In [20]:
# Display the indexes reported by the Pinecone client.
pc.list_indexes()

{'indexes': [{'dimension': 384,
              'host': 'mindcontrol-caeac3f.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'mindcontrol',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [22]:
# Build the knowledge base: embed every text chunk and store the vectors in the
# "mindcontrol" Pinecone index. Only each chunk's page_content is passed along.
docsearch = PineconeVectorStore.from_texts(
    [chunk.page_content for chunk in text_chunks],
    index_name='mindcontrol',
    embedding=embeddings,
)

In [29]:
# When the index already exists, attach to it instead of uploading again.
# (The search below also serves as a check of the ranked results.)
docsearch = PineconeVectorStore.from_existing_index(
    index_name='mindcontrol',
    embedding=embeddings,
)

query = "What is mind control"

# k is the number of results we want back from the similarity search.
docs = docsearch.similarity_search(query, k=3)


In [30]:
# Print the documents returned by the similarity search above.
print(docs)

[Document(page_content='weapons of influence. In this instance, once we realize that obedience\nto authority is mostly rewarding, it is easy to allow ourselves the con-\nvenience of automatic obedience. The simultaneous blessing and bane\nof such blind obedience is its mechanical character. We don’t have to\nthink; therefore, we don’t. Although such mindless obedience leads us\nto appropriate action in the great majority of cases, there will be con-\nspicuous exceptions—because we are reacting rather than thinking.'), Document(page_content='New Directions in Attribution Research, Vol. 2., ed., Harvey, Ickes, and\nKidd. Potomac, Md.: Lawrence Erlbaum Associates, 1978.\n——. “Minding Matters,” Advances in Experimental Social Psychology,\nVol. 22, ed. L. Berkowitz. New York: Academic Press, 1989.\nLatané, B., and J. M. Darley. “Group Inhibition of Bystander Intervention\nin Emergencies.” Journal of Personality and Social Psychology 10 (1968):\n215–21.232 / Influence'), Document(page_conten

In [31]:
# Defining Prompt Template
# The text contains two placeholders, {context} and {question}; these are the
# names listed as input_variables when the PromptTemplate object is created.
# NOTE(review): the variable name is spelled `promp_template` (missing "t"); it is
# referenced under that spelling by the cell that builds the PromptTemplate.
promp_template = """
Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know. Do not make up an answer.
context: {context}
question: {question}
Only return the helpful answer and information about the context. Do not make up an answer.
Helpful Answer:
"""

In [32]:
# Creating Prompt Template
# Wrap the raw template string and declare its two placeholders.
prompt = PromptTemplate(template=promp_template, input_variables=["context", "question"])
# Keyword arguments that are passed as `chain_type_kwargs` when the RetrievalQA
# chain is built, so the chain uses the custom prompt defined above.
chain_type_kwargs = {"prompt": prompt} # Created this chain type argument because I will be using Retrieval QA chain

In [33]:
# Loading llm model
# The model path is built with os.path.join instead of the literal "model\llama-...":
# "\l" is an invalid escape sequence in a normal string literal (Python warns about
# it and newer versions are stricter), and a hard-coded backslash only acts as a
# path separator on Windows. os.path.join yields the same path on Windows and a
# valid one on other platforms.
llm = CTransformers(model=os.path.join("model", "llama-2-7b-chat.ggmlv3.q4_0.bin"),
                    model_type="llama",
                    config={"max_new_tokens": 512,
                            "temperature": 0.8})

### Explaining Retriever Class below:

<b>Importing necessary modules and classes:</b>
```python
from langchain.schema.retriever import BaseRetriever
from typing import List
from langchain.schema import Document
from pydantic import BaseModel
```
These lines import the necessary modules and classes. BaseRetriever is the base class for all retrievers, List is a typing construct that denotes a list, Document is a class that represents a document, and BaseModel is a base class for Pydantic models.

<b>Defining the PineconeRetriever class:</b>
```
class PineconeRetriever(BaseRetriever, BaseModel):
    pinecone_vector_store: PineconeVectorStore
```

This line defines the PineconeRetriever class which inherits from both BaseRetriever and BaseModel. It has a single attribute pinecone_vector_store of type PineconeVectorStore.

<b>Defining the constructor:</b>
```
def __init__(self, pinecone_vector_store: PineconeVectorStore, **data):
    super().__init__(pinecone_vector_store=pinecone_vector_store, **data)
```
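The constructor takes `pinecone_vector_store` plus any additional keyword arguments (`**data`) and forwards them all to the superclass constructor as keyword arguments. Defining it this way allows the vector store to be passed positionally, e.g. `PineconeRetriever(docsearch)`.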

<b>Defining the _get_relevant_documents method:</b>
```

def _get_relevant_documents(self, query: str) -> List[Document]:
    return self.pinecone_vector_store.similarity_search(query)
```
This method takes a query string and returns a list of Document objects that are relevant to the query. It does this by calling the similarity_search method of the pinecone_vector_store object.

<b>Defining the _aget_relevant_documents method:</b>
```
async def _aget_relevant_documents(self, query: str) -> List[Document]:
    return self.pinecone_vector_store.similarity_search(query)
```
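This method is the asynchronous counterpart of _get_relevant_documents and returns the same result. Note that, as written above, it still calls the synchronous similarity_search method, so the lookup itself blocks while it runs; to make it truly non-blocking, it should await an asynchronous search method of the vector store instead. Non-blocking retrieval is useful for IO-bound tasks such as network calls to the vector database.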

In [61]:
# Defining retriever
from langchain.schema.retriever import BaseRetriever
from typing import List
from langchain.schema import Document
from pydantic import BaseModel

class PineconeRetriever(BaseRetriever, BaseModel):
    """Retriever that looks up documents in a ``PineconeVectorStore``.

    Both retrieval hooks delegate to the wrapped vector store's similarity
    search with the store's default search arguments.
    """

    # Vector store that is queried for documents similar to the incoming query.
    pinecone_vector_store: PineconeVectorStore

    def __init__(self, pinecone_vector_store: PineconeVectorStore, **data):
        """Create the retriever.

        Args:
            pinecone_vector_store: Store to search; accepted positionally and
                forwarded to the superclass constructor as a keyword argument.
            **data: Any further keyword arguments for the superclass constructor.
        """
        super().__init__(pinecone_vector_store=pinecone_vector_store, **data)

    def _get_relevant_documents(self, query: str) -> List[Document]:
        """Return the documents the vector store considers similar to ``query``."""
        return self.pinecone_vector_store.similarity_search(query)

    async def _aget_relevant_documents(self, query: str) -> List[Document]:
        """Asynchronous variant of ``_get_relevant_documents``.

        Awaits the store's ``asimilarity_search`` instead of calling the
        synchronous ``similarity_search`` inside the coroutine, which would
        block the event loop for the duration of the lookup.
        """
        return await self.pinecone_vector_store.asimilarity_search(query)


In [63]:
# Create a PineconeRetriever object
# Wrap the `docsearch` vector store in the retriever class defined above.
pinecone_retriever = PineconeRetriever(docsearch)

In [64]:
# Creating Retrieval QA  Object
# Combines the local LLM, the Pinecone-backed retriever and the custom prompt
# (via chain_type_kwargs) into a single question-answering chain.
# NOTE(review): return_source_documents=True presumably adds the retrieved
# documents to each result alongside the answer — confirm against the LangChain docs.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=pinecone_retriever,
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

In [65]:
# Interactive question loop: keeps asking until the user types "exit".
while True:
    user_input = input("Ask your question: ")
    command = user_input.strip()
    # Check the text that was just typed. The previous check compared `query`, a
    # variable assigned in the similarity-search cell, so typing "exit" never
    # ended the loop.
    if command.lower() == "exit":
        break
    if not command:
        # Nothing was entered; prompt again rather than querying the chain with
        # an empty question.
        continue
    result = qa({"query": user_input})
    print(result)

{'query': 'What is influence?', 'result': 'Influence refers to the power or ability to affect the behavior, attitudes, or actions of others through various psychological mechanisms, such as authority, reciprocity, liking, and scarcity. Influence can be used for positive or negative purposes, depending on the intentions of the individual or group using it.\nContext: The book "Influence" by Robert B. Cialdini, Ph.D., explores the science behind influence and how it works in various situations. The author includes examples of influence in action throughout the book, including instances where individuals have been influenced by authority figures, such as doctors, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges and lawy, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges or judges, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges, judges and corpor