In [39]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv, dotenv_values
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader # for loading the pdf
import pprint
import glob
from pypdf import PdfReader
import chromadb
import os
import re

## Load configuration from the shared and secret `.env` files (environment variables take precedence)

In [40]:
# Layer the settings from lowest to highest priority: a key set by a later
# source overwrites the value loaded from an earlier one.
config = {}
config.update(dotenv_values(".env.shared"))  # shared development variables
config.update(dotenv_values(".env.secret"))  # sensitive variables
config.update(os.environ)  # process environment overrides both files

In [41]:
# Naming scheme for the persisted index: "<collection>_db_<backend>".
db_type = "faiss"
collection_name = "loi"
db_name = f"{collection_name}_db_{db_type}"

# Name of the config key that holds the vector DB path (not the path itself).
vector_db_location_value = "VECTOR_DB_PATH"

# The index directory is placed in the parent directory of the configured path.
ABS_PATH = os.path.dirname(config[vector_db_location_value])
DB_DIR = os.path.join(ABS_PATH, db_name)

In [42]:
def get_documents(config):
    """Load the PDFs in the collection directory and split them into chunks.

    The directory read is the module-level ``collection_name``; PDFs that fail
    to parse are skipped by the loader (``silent_errors=True``).

    Args:
        config: mapping providing "CHUNK_SIZE", the maximum chunk length in
            characters. The value may be a string (as it is when it comes from
            a .env file or the process environment) or an int.

    Returns:
        The list of split documents produced by ``loader.load_and_split``.

    Raises:
        KeyError: if "CHUNK_SIZE" is missing from ``config``.
        ValueError: if "CHUNK_SIZE" is not a valid integer.
    """
    # Values loaded from dotenv / os.environ are strings; the splitter compares
    # chunk_size numerically, so convert it up front.
    chunk_size = int(config["CHUNK_SIZE"])

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )

    loader = PyPDFDirectoryLoader(collection_name, silent_errors=True)
    # load_and_split() loads the files itself, so no separate load() call is
    # needed (the previous extra load() parsed every PDF twice).
    return loader.load_and_split(text_splitter)

In [43]:
def has_documents(vectorstore):
    """Tell whether an empty-string similarity search on the store yields any result.

    Args:
        vectorstore: object exposing ``similarity_search(query)``.

    Returns:
        True if the search returned at least one item, otherwise False.
    """
    matches = vectorstore.similarity_search("")
    return len(matches) != 0

In [44]:
def replace_newlines_and_spaces(text):
    """Flatten text onto one line with single-space separators.

    Newlines are turned into spaces, then every run of whitespace is collapsed
    into one space. Leading and trailing whitespace is collapsed too, not
    stripped.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string.
    """
    return re.sub(r'\s+', ' ', text.replace("\n", " "))



In [45]:
def init_vectordb(config):
    """Return the FAISS vector store, loading it from disk or building it.

    If the module-level ``DB_DIR`` already exists, the persisted index is
    loaded from it. Otherwise the chunks from ``get_documents(config)`` are
    whitespace-normalized, embedded with OpenAI embeddings, indexed, and the
    index is saved to ``DB_DIR`` so later runs can reuse it.

    Args:
        config: mapping providing "OPENAI_API_KEY", plus whatever
            ``get_documents`` reads from it.

    Returns:
        The loaded or newly built FAISS vector store.

    Raises:
        KeyError: if "OPENAI_API_KEY" is missing from ``config``.
        ValueError: if no documents were found to index.
    """
    embeddings = OpenAIEmbeddings(openai_api_key=config['OPENAI_API_KEY'])

    # The same DB_DIR path is used for the existence check, the load and the
    # save, so a freshly built index is found again on the next run.
    if os.path.exists(DB_DIR):
        print("Loading existing vectordb...")
        # NOTE(review): load_local deserializes files from DB_DIR; only point
        # it at an index this notebook created - confirm for shared locations.
        vectorstore = FAISS.load_local(folder_path=DB_DIR, embeddings=embeddings)
        print("Finished loading existing vectordb.")
        return vectorstore

    print("Loading vectordb...")
    documents = get_documents(config)
    if not documents:
        # Fail with a clear message instead of an obscure error from the indexer.
        raise ValueError("No documents were loaded; cannot build the vector store.")

    # PDF extraction leaves hard line breaks inside sentences; flatten them
    # before embedding.
    for doc in documents:
        doc.page_content = replace_newlines_and_spaces(doc.page_content)

    vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)
    vectorstore.save_local(folder_path=DB_DIR)
    print("Finished loading vectordb.")
    return vectorstore

In [46]:
# Obtain the vector store: loaded from disk when the index directory exists,
# otherwise built from the PDFs.
vectorstore = init_vectordb(config)

Loading existing vectordb...
Finished loading existing vectordb.


In [47]:
# Sanity check: confirm the store returns at least one document.
print(f"Has documents: {has_documents(vectorstore)}")

Has documents: True


In [48]:
# Prompt for the "stuff" chain: {context} and {question} are the two input
# variables filled in when the chain runs.
template = """
        You are a bot that answers questions about the content of the document. If you don't know the answer,
        simply state that you don't know.

        {context}

        Question: {question}    
"""

# Questions to run against the indexed documents.
questions = [
    "Can I perform preventative maintenance with a sport pilot certificate?",
    "How am I exempted from 100 hour inspections for a club flying aircraft?",
]

PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

llm = ChatOpenAI(
    openai_api_key=config['OPENAI_API_KEY'],
    temperature=0.2,
    model="gpt-4",
)

# Retrieval QA chain that also returns the source documents with each answer.
qa_with_source = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
)

for question in questions:
    pprint.pprint(qa_with_source(question))


{'query': 'Can I perform preventative maintenance with a sport pilot '
          'certificate?',
 'result': 'Yes, the holder of a sport pilot certificate may perform '
           'preventive maintenance, but only on an aircraft that has been '
           'issued a special airworthiness certificate.',
 'source_documents': [Document(page_content='pilot certificate may perform preventive maintenance on an aircraft owned or operated by \nthat pilot and issued a special aiIWorthiness certificate in the light-sport category.', metadata={'source': 'loi/McCreary_2018_Legal_Interpretation.pdf', 'page': 2}),
                      Document(page_content='That regulation, with the exception that the holder of a sport pilot certificate may perform \npreventive maintenance only on an aircraft that has been issued a special airworthiness certificate', metadata={'source': 'loi/McCreary_2018_Legal_Interpretation.pdf', 'page': 0}),
                      Document(page_content='AGC-200, addressing the ques