In [1]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer, util

from tqdm import tqdm
import os
import shutil

data_path = 'nasa_articles/'
faiss_path = "faiss_db"

  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
def load_documents():
    loader = DirectoryLoader(data_path, glob='*.md', show_progress=True, 
                             use_multithreading=True, loader_cls=TextLoader)
    docs = loader.load()
    return docs

docs = load_documents()

100%|██████████| 1000/1000 [00:00<00:00, 3399.75it/s]


In [3]:
# Split the documents into chunks so we can retrieve information more granularly (rather than the entire document)

def split_text(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size= 500,
        chunk_overlap = 250,
        length_function = len,
        add_start_index = True,
    )

    chunks = text_splitter.split_documents(documents)
    print(f'Split {len(documents)} documents into {len(chunks)} chunks')
    
    # Demonstrate what a chunk looks like (not necessary code)
    document = chunks[10]
    print(document.page_content)
    print(document.metadata)

    return chunks

chunks = split_text(docs)

Split 1000 documents into 12854 chunks
“To put it simply, this is the kind of rock we had hoped to find when we decided to investigate Jezero Crater,” said Ken Farley, project scientist for Perseverance at Caltech in Pasadena, California. “Nearly all the minerals in the rock we just sampled were made in water; on Earth, water-deposited minerals are often good at trapping and preserving ancient organic material and biosignatures. The rock can even tell us about Mars climate conditions that were present when it was formed.”
{'source': 'nasa_articles/nasa_article_355.md', 'start_index': 1094}


In [4]:
# Create a DB to later query each chunk. (Uses vector embeddings as the key)
# Create vector embeddings for each chunk
# Embeddings are vectors in an n-dimensonal space. Eg. similar words or chunks will point in similar directions.
# Embeddings from OpenAI for example (but costs per x tokens) -> Alternative Embedding from Huggingface: Opensource & free 
 
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

def save_to_faiss(chunks: list[Document]):
    if os.path.exists(faiss_path):
        # clear previous chroma dbs
        print("Removing previous contents.")
        shutil.rmtree(faiss_path)

    db = None
    with tqdm(total=len(docs), desc="Ingesting documents") as pbar:
        for d in docs:
            if db:
                db.add_documents([d])
            else:
                db = FAISS.from_documents([d], embeddings)
            pbar.update(1) 

    try:
        # print("Creating VectorStore.")
        # db = FAISS.from_documents(docs, embeddings)
        print("Saving DB")
        db.save_local(folder_path="faiss_db", index_name="nasa_index")
        print(f'Saved {len(chunks)} chunks to {faiss_path}.')
    except (ValueError, RuntimeError) as e:
        print("Fiass store failed \n", e)

save_to_faiss(chunks)



Removing previous contents.


Ingesting documents: 100%|██████████| 1000/1000 [06:08<00:00,  2.71it/s]

Saving DB
Saved 12854 chunks to faiss_db.





In [5]:
sentences = ["This is an example sentence", "Each sentence has been converted"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding_1= model.encode(sentences[0], convert_to_tensor=True)
embedding_2 = model.encode(sentences[1], convert_to_tensor=True)

util.pytorch_cos_sim(embedding_1, embedding_2)



tensor([[0.3651]])