In [29]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores.chroma import Chroma
from langchain_nomic.embeddings import NomicEmbeddings
from langchain.evaluation.embedding_distance.base import PairwiseEmbeddingDistanceEvalChain
import os
import shutil

# Nomic API key is read from the environment; it is None when the variable is unset.
nomic_api_key = os.environ.get("NOMIC_API_KEY")

# Directory that load_documents() scans for markdown articles.
data_path = "nasa_articles/"
# Directory passed to Chroma as its persist_directory.
chroma_path = "chroma"



In [30]:
def load_documents():
    """Load every ``*.md`` file found in ``data_path``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    return DirectoryLoader(data_path, glob='*.md').load()

In [31]:
# Split the documents into chunks so we can retrieve information more granularly (rather than the entire document)
# TODO find a reasonable chunk size for the articles

def split_text(documents: list[Document], chunk_size: int = 1000, chunk_overlap: int = 500):
    """Split documents into overlapping, character-based chunks.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between neighbouring chunks.

    Returns:
        The list of chunks produced by the splitter. ``add_start_index=True``
        asks the splitter to record each chunk's start offset in its metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )

    # The method is `split_documents`; the previous misspelling raised
    # AttributeError on every call.
    chunks = text_splitter.split_documents(documents)
    print(f'Split {len(documents)} documents into {len(chunks)} chunks')

    # Demonstrate what a chunk looks like (not necessary code).
    # Chunk 10 is shown when it exists; small corpora fall back to the last
    # chunk, and the preview is skipped when no chunks were produced.
    if chunks:
        document = chunks[min(10, len(chunks) - 1)]
        print(document.page_content)
        print(document.metadata)

    return chunks

In [32]:
# Create a Chroma DB to query each chunk. (Uses vector embeddings as the key.)
# The usual approach is OpenAIEmbeddings (requires an OpenAI account) to create a vector embedding for each chunk;
# this notebook uses the Nomic alternative described below instead.
# Embeddings are vectors in an n-dimensional space, e.g. similar words or chunks will point in similar directions.

# Alternative Embedding: Opensource & free 
# https://docs.nomic.ai/atlas/capabilities/data-interface
# https://python.langchain.com/v0.1/docs/integrations/text_embedding/nomic/
# Shared embedding model used when building the Chroma store.
embeddings = NomicEmbeddings(
    model="nomic-embed-text-v1.5",
    dimensionality=64,
    nomic_api_key=nomic_api_key,
)
 
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at ``chroma_path`` from ``chunks``.

    Any existing directory at ``chroma_path`` is deleted first, so each call
    starts from an empty store. The chunks are embedded with the module-level
    ``embeddings`` object.

    Args:
        chunks: Documents to embed and store.
    """
    # Clear any previous Chroma DB before writing the new one.
    store_exists = os.path.exists(chroma_path)
    if store_exists:
        shutil.rmtree(chroma_path)

    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=chroma_path,
    )
    # Should save automatically, but persist() forces the write.
    vector_store.persist()
    print(f'Saved {len(chunks)} chunks to {chroma_path}.')

In [None]:
# Distance between 2 vectors (compare embedding distance).
# PairwiseEmbeddingDistanceEvalChain falls back to OpenAI embeddings when none
# are supplied, so pass the notebook's Nomic `embeddings` explicitly. This keeps
# the evaluation on the same embedding model as the Chroma store and avoids
# needing an OpenAI key.
# (Alternatively I could use my own distance functions from ML_from_scratch.)
chain = PairwiseEmbeddingDistanceEvalChain(embeddings=embeddings)
# example evaluation
x = chain.evaluate_string_pairs(prediction='apple', prediction_b='pie')
print(x)