In [1]:
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain import HuggingFaceHub
import os
import torch
import textwrap

# Hugging Face Hub credentials.
# Prefer exporting HUGGINGFACEHUB_API_TOKEN in the shell before starting the
# kernel. `setdefault` keeps an already-exported token intact instead of
# overwriting it with the placeholder below; the placeholder is only a
# fallback and must be replaced locally (never commit a real token).
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "<your HF token>")


In [2]:
# Load the document.
# The PDF location can be overridden with the NNLP_PDF_PATH environment
# variable so the notebook is not tied to a single machine; when it is unset
# the original local path is used.
pdf_path = os.environ.get(
    "NNLP_PDF_PATH",
    '/Users/indira/Documents/Indira/Tech/Books/nnlp.pdf',
)
loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [3]:
# Break the loaded documents into chunks on newline boundaries.
# chunk_size / chunk_overlap are passed straight to the splitter.
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=1000,
    chunk_overlap=50,
)
text_chunks = text_splitter.split_documents(documents)

In [4]:
# Sanity check: how many chunks the splitter produced from the document.
len(text_chunks)

235

In [6]:
# Step 1 - embedding model used to vectorise the chunks.
# (Pass model_kwargs={'device': 'cuda'} here to pin the model to a GPU.)
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
)

# Step 2 - index every chunk in a Chroma vector store.
vectorstore = Chroma.from_documents(documents=text_chunks, embedding=embeddings)

# Step 3 - hosted generator model accessed through the Hugging Face Hub.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-large",
    model_kwargs={
        "temperature": 1,
        "max_length": 1000,
    },
)

In [22]:
## RetrievalQA chain
# Uses the "stuff" chain type and asks the chain to return the source
# documents alongside the answer.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    return_source_documents=True,
)


In [30]:
%time
query = "How does the size of sliding window matter?"
result=chain({"query": query}, return_only_outputs=True)
wrapped_text = textwrap.fill(result['result'], width=500)
wrapped_text

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 22.9 µs


'Larger windows tend to produce more topical similarities (i.e. dog", bark" and leash" will be grouped together, as well as walked", run" and walk- ing"), while smaller windows tend to produce more functional and syntactic similarities'

In [None]:
##ignore

In [None]:
## You can either pass a transformers pipeline as the `llm` input to LangChain,
## or load the model through HuggingFaceHub.
## Note: the pipeline approach is not supported for T5.
