## PDF Query Using HuggingFace & LangChain

# Step 1: Install All the Required Pakages

In [26]:
!pip install langchain
!pip install pypdf
!pip install unstructured
!pip install sentence_transformers
!pip install pinecone-client
!pip install llama-cpp-python
!pip install huggingface_hub




# Step 2: Import All the Required Libraries

In [27]:
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
import pinecone
import os

In [28]:
loader = PyPDFLoader("story of hyderabad.pdf")
data = loader.load()

In [29]:
data

[Document(page_content="Hyderabad  is the capital  of Telangana  state and temporary capital of  Andhra  \nPradesh state. The city, founded in the year 1591 by Mohammed Quli Qutub Shah, the  \nfifth sultan of Qutb Shahi dynasty, offers a fascinating panorama of the past, with richly  \nmixed  cultural and historical tradition spanning over 400 years. It is one of the fastest  \ngrowing cities of India and has emerged as a strong industrial, commercial, technology  \ncenter, gives a picture of glimpses of past splenders and the legacy of its old hist ory.The  \nhistory of Hyderabad begins with the establishment of the Qutb Shahi dynasty. Quli Qutb  \nShah seized the reins of power from the Bahamani kingdom in 1512 and established the  \nfortress city of Golconda. Inadequacy of water, and frequent epidemics of p lague and  \ncholera persuaded Mohammed Quli Qutub Shah to venture outward to establish new city  \nwith the Charminar at its centre and with four great roads fanning out four ca

# Step 4: Split the Text into Chunks

In [30]:
from torch import chunk
text_spilter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=0)
docs = text_spilter.split_documents(data)

In [31]:
len(docs)

12

In [32]:
docs[0]

Document(page_content='Hyderabad  is the capital  of Telangana  state and temporary capital of  Andhra  \nPradesh state. The city, founded in the year 1591 by Mohammed Quli Qutub Shah, the  \nfifth sultan of Qutb Shahi dynasty, offers a fascinating panorama of the past, with richly  \nmixed  cultural and historical tradition spanning over 400 years. It is one of the fastest  \ngrowing cities of India and has emerged as a strong industrial, commercial, technology', metadata={'source': 'story of hyderabad.pdf', 'page': 0})

# Step 5: Setup the Environment

In [None]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = "YOUR_HUGGINGFACE_KEY"
PINECODE_API_KEY = os.environ.get("PINECODE_API_KEY","YOUR_PINECODE_API_KEY")
PINECODE_API_ENV = os.environ.get("PINECODE_API_ENV","YOUR_PINECODE_API_ENV")

# Step 6: Downlaod the Embeddings}

In [33]:
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

# Step 7: Initializing the Pinecone

In [43]:
# initialize pinecone
pinecone.init(
    api_key="YOUR_PINECODE_API_KEY",  # find at app.pinecone.io
    environment="YOUR_PINECODE_API_ENV"  # next to api key in console
)
index_name = 'llmchainindex'

# Step 8: Create Embeddings for Each of the Text Chunk

In [44]:
docsearch=Pinecone.from_texts([t.page_content for t in docs], embeddings, index_name=index_name)

# Step 9: Similarity Search

In [48]:
query = "who is founded hyderabad in 1591"

In [51]:
docs=docsearch.similarity_search(query,k=1)

In [52]:
docs

[Document(page_content='Hyderabad  is the capital  of Telangana  state and temporary capital of  Andhra  \nPradesh state. The city, founded in the year 1591 by Mohammed Quli Qutub Shah, the  \nfifth sultan of Qutb Shahi dynasty, offers a fascinating panorama of the past, with richly  \nmixed  cultural and historical tradition spanning over 400 years. It is one of the fastest  \ngrowing cities of India and has emerged as a strong industrial, commercial, technology', metadata={})]

# step 9. Query the Docs to get the Answer Back (Hugging Face Model)

In [53]:
from langchain.llms import HuggingFaceHub

In [54]:
llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})



In [55]:
chain=load_qa_chain(llm, chain_type="stuff")

In [56]:
query = "who is founded hyderabad in 1591"

In [57]:
docs=docsearch.similarity_search(query)

In [58]:
chain.run(input_documents=docs, question=query)

'Mohammed Quli Qutub Shah'