In [27]:
import os
import shutil

import torch
from langchain import HuggingFaceHub, LLMChain
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from sentence_transformers import SentenceTransformer, util

# Hugging Face Hub credentials.
# setdefault keeps a real token that is already exported in the environment;
# the placeholder below is only used when nothing is set. Never commit a real
# token here -- export HUGGINGFACEHUB_API_TOKEN before starting the kernel.
os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', 'HF_API_KEY')

# Directory holding the source articles that get loaded and chunked.
data_path = 'nasa_articles/'
# Directory where the Chroma vector store is persisted.
chroma_path = "chroma"

# Alternative: use Ollama, which manages and runs LLMs locally on your computer.

In [28]:
# Set up the Hugging Face Hub LLM (google/flan-t5-xl).
# The temperature is pinned just above zero -- presumably to get
# near-deterministic output without passing exactly 0 (TODO confirm).
hub_llm = HuggingFaceHub(repo_id='google/flan-t5-xl', model_kwargs=dict(temperature=1e-10))

In [29]:
def load_documents():
    """Load the files in ``data_path`` that match the glob ``*.md``.

    Returns:
        The result of ``DirectoryLoader.load()`` for those files.
    """
    return DirectoryLoader(data_path, glob='*.md').load()

In [30]:
# Split the documents into chunks so we can retrieve information more granularly (rather than the entire document)
# TODO find a reasonable chunk size for the articles

def split_text(documents: list[Document]):
    """Split documents into overlapping chunks for finer-grained retrieval.

    Args:
        documents: The documents to split.

    Returns:
        The list of chunks produced by the splitter. The splitter is
        configured with ``add_start_index=True`` so each chunk's metadata
        records where it starts in its source document.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        length_function=len,
        add_start_index=True,
    )

    # The method is `split_documents`; the earlier misspelling raised
    # AttributeError on every call.
    chunks = text_splitter.split_documents(documents)
    print(f'Split {len(documents)} documents into {len(chunks)} chunks')

    # Demonstrate what a chunk looks like (not necessary code).
    # Clamp the sample index so a corpus with fewer than 11 chunks (or none)
    # does not raise IndexError.
    if chunks:
        document = chunks[min(10, len(chunks) - 1)]
        print(document.page_content)
        print(document.metadata)

    return chunks

In [31]:
# Create a Chroma DB to query each chunk. (Uses vector embeddings as the key.)
# Embeddings are vectors in an n-dimensional space, e.g. similar words or chunks will point in similar directions.
# A free, open-source Hugging Face sentence-transformers model creates the vector
# embedding for each chunk, so no OpenAI account is needed.

# Alternative Embedding: Opensource & free
# https://docs.nomic.ai/atlas/capabilities/data-interface
# https://python.langchain.com/v0.1/docs/integrations/text_embedding/nomic/
# embeddings = NomicEmbeddings(nomic_api_key=nomic_api_key, dimensionality=64, model="nomic-embed-text-v1.5")

# Chroma.from_documents needs an embeddings object; the previous `[]`
# placeholder could not embed anything.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma database at ``chroma_path`` from the given chunks.

    Any existing directory at ``chroma_path`` is deleted first, then a new
    store is created from ``chunks`` using the module-level ``embeddings``.

    Args:
        chunks: The document chunks to embed and store.
    """
    # Start from a clean slate: remove the database left by a previous run.
    if os.path.exists(chroma_path):
        shutil.rmtree(chroma_path)

    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=chroma_path,
    )
    # Saving should happen automatically, but persist() forces it.
    vector_store.persist()
    print(f'Saved {len(chunks)} chunks to {chroma_path}.')

In [32]:
# Sanity check: embed two sentences and compare them with cosine similarity.
sentences = [
    "This is an example sentence",
    "Each sentence has been converted",
]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode each sentence on its own and keep the results as tensors.
embedding_1, embedding_2 = (
    model.encode(text, convert_to_tensor=True) for text in sentences
)

util.pytorch_cos_sim(embedding_1, embedding_2)



tensor([[0.3651]])