In [16]:
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer, util

import os
import shutil

# Only fall back to the placeholder when no token is already present in the
# environment, so a real credential exported by the shell / kernel is never
# overwritten. Supply the real token through the environment rather than by
# editing this notebook.
os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', 'HF_API_KEY')

# Directory that holds the Markdown articles to be indexed.
data_path = 'nasa_articles/'
# Directory in which the Chroma vector store is persisted.
chroma_path = "chroma"

In [11]:
def load_documents():
    """Load every ``*.md`` file found in ``data_path`` as a document.

    Files are read with ``TextLoader`` using multiple threads, and a progress
    bar is shown while loading.

    Returns:
        The list of documents produced by ``DirectoryLoader.load()``.
    """
    loader = DirectoryLoader(
        data_path,
        glob='*.md',
        show_progress=True,
        use_multithreading=True,
        loader_cls=TextLoader,
        # Decode explicitly as UTF-8. Without an encoding TextLoader relies on
        # the platform default, which mangles non-ASCII characters (such as
        # curly apostrophes) on systems whose default is not UTF-8.
        loader_kwargs={'encoding': 'utf-8'},
    )
    return loader.load()

docs = load_documents()

100%|██████████| 1000/1000 [00:00<00:00, 2166.87it/s]


In [12]:
# Split the documents into chunks so we can retrieve information more granularly (rather than the entire document)

def split_text(documents: list[Document], chunk_size: int = 1000,
               chunk_overlap: int = 500, preview_index: int = 10):
    """Split documents into overlapping chunks for finer-grained retrieval.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by consecutive chunks.
        preview_index: Index of one chunk to print as an illustration. The
            preview is skipped when the index is outside the chunk list.

    Returns:
        The list of chunks. Because ``add_start_index`` is enabled, each
        chunk's metadata records a ``start_index`` within its source document.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )

    chunks = text_splitter.split_documents(documents)
    print(f'Split {len(documents)} documents into {len(chunks)} chunks')

    # Demonstrate what a chunk looks like (illustration only). Guard the
    # index so a small corpus with too few chunks does not raise IndexError.
    if 0 <= preview_index < len(chunks):
        document = chunks[preview_index]
        print(document.page_content)
        print(document.metadata)

    return chunks

chunks = split_text(docs)

Split 1000 documents into 6876 chunks
NASA Partnerships Bring 2024 Total Solar Eclipse to Everyone 
 On Monday, April 8, NASA and its partners will celebrate the wonders of the total solar eclipse as it passes over North America, with the path of totality in the United States, from Kerrville, Texas, to Houlton, Maine.

Solar eclipse NASA

Eclipses are an important contribution to NASA’s research into the Sun’s outer atmosphere, or corona, and the part of Earth’s atmosphere where space weather happens. They’re also an inspirational opportunity for the public to get involved, learn, and connect with our place in the universe.

On Monday, April 8, NASA and its partners will celebrate the wonders of the total solar eclipse as it passes over North America, with the path of totality in the United States, from Kerrville, Texas, to Houlton, Maine.
{'source': 'nasa_articles/nasa_article_364.md', 'start_index': 0}


In [13]:
# Create a Chroma DB so that each chunk can be queried (vector embeddings are used as the key).
# Create a vector embedding for each chunk.
# Embeddings are vectors in an n-dimensional space, e.g. similar words or chunks will point in similar directions.
# OpenAI provides embeddings, for example, but charges per x tokens.
# Alternative: embeddings from Hugging Face, which are open source and free.
 
# Embedding model shared by the vector store when indexing chunks.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")

def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma database at ``chroma_path`` from ``chunks``.

    Any directory already present at ``chroma_path`` is removed first, then a
    new store is created from the chunks using the module-level
    ``embeddings`` and written to disk.
    """
    # Clear any previous Chroma DB before building a new one.
    store_already_exists = os.path.exists(chroma_path)
    if store_already_exists:
        shutil.rmtree(chroma_path)

    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=chroma_path,
    )
    # Saving should happen automatically; persist() forces the write.
    vector_store.persist()

    summary = f'Saved {len(chunks)} chunks to {chroma_path}.'
    print(summary)

save_to_chroma(chunks)



ImportError: Could not import chromadb python package. Please install it with `pip install chromadb`.

In [None]:
# Two sample sentences whose similarity is measured below.
sentences = ["This is an example sentence", "Each sentence has been converted"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode each sentence separately into a tensor embedding.
embedding_1, embedding_2 = (
    model.encode(sentence, convert_to_tensor=True) for sentence in sentences
)

# Cosine similarity of the two embeddings; as the last expression of the
# cell, its value is displayed as the cell output.
util.pytorch_cos_sim(embedding_1, embedding_2)

tensor([[0.3651]])