In [8]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import GPT4AllEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import GPT4All

In [3]:
# Splitter configuration: chunk_size=500 with no overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)

# Read the VM0007 PDF and split it with the splitter above in one call;
# `pages` holds the resulting chunks that the later cells index.
loader = PyMuPDFLoader("../docs/VM0007.pdf")
pages = loader.load_and_split(text_splitter=text_splitter)

In [4]:
# Embed every chunk in `pages` with GPT4All embeddings and index them in Chroma.
# NOTE(review): no persist_directory is passed, so the index appears to be rebuilt
# on every run of this cell — confirm whether an on-disk cache is wanted.
vectorstore = Chroma.from_documents(
    documents=pages,
    embedding=GPT4AllEmbeddings(),
)

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [9]:
# Load the local GGUF model file through the GPT4All wrapper, with max_tokens=2048.
llm = GPT4All(
    model="../models/mistral-7b-openorca.Q4_0.gguf",
    max_tokens=2048,
)

# Question-answering chain: `llm` answers using whatever the vector store's
# default retriever returns for the query.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())

In [30]:
# The question is handed to the chain under the "query" key; the full
# response mapping is kept so the next cell can pick out the answer.
question = "List all of the geographic boundaries that should be specified."
response = qa_chain(
    {"query": question},
)

In [31]:
# Show only the generated answer, stored under the "result" key of the chain's
# response; print() renders it as plain text rather than a quoted repr.
print(response["result"])

 The geographic boundaries that should be specified include the name of the project area, unique ID for each discrete parcel of land, map(s) of the area in the format required by the VCS Standard, geographic coordinates of each polygon vertex along with their accuracy from a geo-referenced digital map, total land area, and details of landholder and user rights.
