In [1]:
# %pip install chromadb tiktoken

In [2]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

from transformers import AutoModel
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

from gdoc_sync.integrations.udemy import RedundantFilterRetriever

# Retrieve from Chroma

- Define the LLM that handles the retrieved documents
- Define the model that produces the embeddings
- Instantiate the vector store to retrieve data from
- Build the chain that performs the retrieval

In [3]:
# Endpoint the chat client sends its requests to.
# A previous assignment to "http://10.0.0.56:7860/" was immediately overwritten
# by the line below, so it is kept here only as a note of the alternative.
base_url = "http://172.21.240.1:1234/v1"

# Chat model used by the retrieval chain; temperature=0 is passed through and the
# API key is a placeholder string.
# NOTE(review): presumably a local OpenAI-compatible server that ignores the key — confirm.
chat = ChatOpenAI(temperature=0, base_url=base_url, openai_api_key="not_needed")

In [4]:
# Hugging Face model id shared by the direct load and the embeddings wrapper.
model_name = "jinaai/jina-embeddings-v2-base-en"

# Direct transformers load with remote code enabled.
# NOTE(review): `model` is not referenced again in the visible cells — confirm
# whether this load is still required (e.g. for its download side effect).
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

# Embedding function handed to the vector store and the retriever.
embeddings = HuggingFaceEmbeddings(model_name=model_name)

In [8]:
# Open the Chroma store kept under ./storage, using the same embedding function
# for queries.
db = Chroma(
    embedding_function=embeddings,
    persist_directory="./storage",
)

# Project-local retriever built from the embeddings and the Chroma store.
# The stock alternative would be `db.as_retriever()`.
# NOTE(review): presumably filters redundant results, judging by its name — confirm
# against gdoc_sync.integrations.udemy.
retriever = RedundantFilterRetriever(
    embeddings=embeddings,
    chroma=db,
)

In [9]:
# Question-answering chain: the retriever supplies documents and the chat model
# produces the answer. verbose=True makes the chain log when it starts and finishes.
# Other chain_type values tried here: "map_reduce", "map_rerank", "refine".
chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=chat,
    retriever=retriever,
    verbose=True,
)

In [10]:
# Run a single query through the chain and keep the returned answer.
result = chain.run(
    "what is an interesting fact about the english language?"
)
# Bare expression so the notebook displays the answer as the cell output.
result



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


'Dreamt is the only English word that ends with the letters "mt."'