In [18]:
import os 
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader, CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from config import CHUNK_SIZE, CHUNK_OVERLAP, MODEL

In [19]:
# Maps extensions to doc loaders
ext2loader = {
    ".csv": (CSVLoader, {}),
    ".docx": (Docx2txtLoader, {}),
    ".pdf": (PyPDFLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
}

Load document

In [20]:
def load_doc(file_path,print_report=False):
    file_extension = os.path.splitext(file_path)[1]

    if file_extension in ext2loader:
            
        loader_type, loader_args = ext2loader[file_extension]
        loader = loader_type(file_path, **loader_args)
        load = loader.load()

        # if print_report:
            # print(f"Number of pages: {len(load)}")
            # print(f"Length of a page: {len(load[1].page_content)}")
            # print("Content of a page:", load[1].page_content)
        
        return load

    raise ValueError(f" '{file_extension}' file type not supported")

Splitter

In [21]:
def splitter(documents,print_report=False):
    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

    chunks = splitter.split_documents(documents)

    if print_report:
        print(f"Number of chunks: {len(chunks)}")
        print(f"Length of a chunk: {len(chunks[1].page_content)}")
        print("Content of a chunk:", chunks[1].page_content)

    return chunks

Convert to Word Embedding and Insert into Vector DB

In [22]:
def word_embeddings(chunks):
    embeddings = OllamaEmbeddings(model=MODEL)
    vectorstore = FAISS.from_documents(chunks, embeddings)

    vectorstore.save_local("faiss_index")
    
    return vectorstore 

Retriever

In [23]:
def retrieve(query,vectorstore):
    retriever = vectorstore.as_retriever()
    return retriever.invoke(query)

In [24]:
document = load_doc("got.txt", print_report=True)
chunks = splitter(document, print_report=True)
vectorstore = word_embeddings(chunks)

Number of chunks: 80
Length of a chunk: 145
Content of a chunk: The story begins when King Robert visits the northern castle Winterfell to ask Ned Stark to be his right-hand assistant, or Hand of the King. The


In [25]:
retrieve(query="where did the eggs come from?",vectorstore=vectorstore)

[Document(metadata={'source': 'got.txt'}, page_content='help. Daenerys receives three dragon eggs as a wedding gift and becomes immediately fascinated by them.'),
 Document(metadata={'source': 'got.txt'}, page_content='Everyone thinks Bran simply fell while climbing around the castle.'),
 Document(metadata={'source': 'got.txt'}, page_content='He knows it has to do with something the Hand learned about King Robert’s children. Through a spy, Robert learns that Daenerys Targaryen is pregnant.'),
 Document(metadata={'source': 'got.txt'}, page_content='she is holding Sansa for her own protection). Tywin Lannister, father to Tyrion, Cersei, and Jaime, wages war with Catelyn and her son, Robb Stark.')]

In [26]:
vectorstore.similarity_search(query="good network", k=2 ,filter={"source":"Networks_notes.pdf"})

[]

## Load the saved Store and try to run the retriever
resource: https://python.langchain.com/docs/integrations/vectorstores/faiss/#saving-and-loading

In [27]:
embeddings = OllamaEmbeddings(model=MODEL)
new_vector_store = FAISS.load_local(
    "faiss_index", embeddings, allow_dangerous_deserialization=True
)
retrieve(query="Principles of Networking",
         vectorstore=new_vector_store
)

[Document(metadata={'source': 'got.txt'}, page_content='He knows it has to do with something the Hand learned about King Robert’s children. Through a spy, Robert learns that Daenerys Targaryen is pregnant.'),
 Document(metadata={'source': 'got.txt'}, page_content='it as belonging to Tyrion Lannister. Ned tells Catelyn he will try to determine who killed the former Hand, Jon Arryn, and who tried to kill Bran.'),
 Document(metadata={'source': 'got.txt'}, page_content='she is holding Sansa for her own protection). Tywin Lannister, father to Tyrion, Cersei, and Jaime, wages war with Catelyn and her son, Robb Stark.'),
 Document(metadata={'source': 'got.txt'}, page_content='Mormont and the Dothrakis find her with three newborn dragons at her breast.')]

In [28]:
new_vector_store.similarity_search(query="good network", k=2 ,filter={"source":"Networks_notes.pdf"})

[]