In [34]:
import os
import shutil

import torch
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import pipeline

# Hugging Face Hub credentials.
# Use setdefault so a real token already exported in the environment is never
# overwritten by the placeholder below. Do not commit a real token here; export
# HUGGINGFACEHUB_API_TOKEN in the shell (or a .env file) before starting the kernel.
os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', 'HF_API_KEY')

# Directory holding the source articles; load_documents() reads '*.md' files from it.
data_path = 'nasa_articles/'
# Directory where the Chroma vector store is persisted.
# NOTE: save_to_chroma() deletes this directory before rebuilding the store.
chroma_path = "chroma"

In [35]:
# Initialize the Hub LLM (google/flan-t5-xl accessed through LangChain's HuggingFaceHub wrapper).
# HuggingFaceHub must be imported from langchain.llms in the imports cell; without
# that import this cell raises NameError on a fresh kernel.
# NOTE(review): the wrapper is expected to authenticate with the
# HUGGINGFACEHUB_API_TOKEN environment variable set in the config cell — confirm.
hub_llm = HuggingFaceHub(
    repo_id='google/flan-t5-xl',
    # Near-zero temperature; presumably chosen to make generations as
    # deterministic as possible without passing exactly 0 — TODO confirm.
    model_kwargs={'temperature': 1e-10},
)

In [36]:
def load_documents():
    """Load the article files from ``data_path``.

    Builds a ``DirectoryLoader`` over ``data_path`` restricted to files matching
    ``*.md`` and returns the result of its ``load()`` call.
    """
    return DirectoryLoader(data_path, glob='*.md').load()

In [37]:
# Split the documents into chunks so we can retrieve information more granularly (rather than the entire document)
# TODO find a reasonable chunk size for the articles

def split_text(
    documents: list[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 500,
):
    """Split documents into overlapping chunks for finer-grained retrieval.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared between consecutive chunks.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Record where each chunk starts inside its source document (stored in metadata).
        add_start_index=True,
    )

    # Fixed typo: the method is `split_documents` (was `split_documemts`, an AttributeError).
    chunks = text_splitter.split_documents(documents)
    print(f'Split {len(documents)} documents into {len(chunks)} chunks')

    # Demonstrate what a chunk looks like (not necessary code).
    # Guarded so a small corpus (fewer than 11 chunks, or none) no longer raises IndexError:
    # the 11th chunk is shown when it exists, otherwise the last one.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks

In [38]:
# The Chroma DB is queried by vector embedding: every chunk is stored under its
# embedding, which acts as the lookup key.
# An embedding is a vector in an n-dimensional space; similar words or chunks
# point in similar directions.
# OpenAI offers embeddings too, but they are billed per token.
# The Hugging Face model used here is an open-source, free alternative.

embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
)

def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma vector store at ``chroma_path`` from ``chunks``.

    Any existing directory at ``chroma_path`` is removed first, then the chunks
    are embedded with the module-level ``embeddings`` object and written to a
    fresh store. Prints a one-line summary; returns nothing.
    """
    # Clear any previous Chroma DB so stale chunks never mix with the new ones.
    store_already_exists = os.path.exists(chroma_path)
    if store_already_exists:
        shutil.rmtree(chroma_path)

    vector_store = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=chroma_path,
    )
    # The store should save automatically; persist() forces the write.
    vector_store.persist()

    print(f'Saved {len(chunks)} chunks to {chroma_path}.')



In [39]:
# Small demo: embed two sentences and compare them by cosine similarity.
sentences = ["This is an example sentence", "Each sentence has been converted"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode each sentence separately, in order, as a tensor.
embedding_1, embedding_2 = (
    model.encode(sentence, convert_to_tensor=True) for sentence in sentences
)

# Left as the last expression so the notebook displays the similarity tensor.
util.pytorch_cos_sim(embedding_1, embedding_2)

tensor([[0.3651]])

In [None]:
def get_retriever(db, k: int = 4):
    """Return a similarity-search retriever over ``db``.

    Args:
        db: Object exposing ``as_retriever`` (e.g. the Chroma vector store).
        k: Number of most similar chunks the retriever should return per query.
            Defaults to 4, the value previously hard-coded.

    Returns:
        Whatever ``db.as_retriever`` returns for a "similarity" search with top-``k``.
    """
    return db.as_retriever(search_type="similarity", search_kwargs={"k": k})


In [None]:
# Load a quantized ("optimized") copy of the model, intended to make inference faster.
model_name = "HuggingFaceH4/zephyr-7b-beta"

# Quantization settings: 4-bit weights, NF4 quantization type, double
# quantization enabled, bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Pretrained causal-language-modeling checkpoint, loaded with the quantization config.
# NOTE: this rebinds `model`, which an earlier cell used for the SentenceTransformer.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)
# Matching tokenizer, built from the same checkpoint's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# First, create a text_generation pipeline using the loaded model and its tokenizer.

# Next, create a prompt template - this should follow the format of the model, so if you substitute the model checkpoint, make sure to use the appropriate formatting.