In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from bs4 import BeautifulSoup as Soup

# https://docs.trychroma.com/troubleshooting#sqlite
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [2]:
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

url = "https://lilianweng.github.io/"
loader = RecursiveUrlLoader(url=url, max_depth=2, extractor=lambda x: Soup(x, "html.parser").text)
docs = loader.load()
print(f'Total {len(docs)} pages found.')

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(docs)
print(f'Total {len(docs)} chunked documents.')

  from .autonotebook import tqdm as notebook_tqdm
Created a chunk of size 1010, which is longer than the specified 1000
Created a chunk of size 1153, which is longer than the specified 1000
Created a chunk of size 1702, which is longer than the specified 1000
Created a chunk of size 2254, which is longer than the specified 1000
Created a chunk of size 2944, which is longer than the specified 1000
Created a chunk of size 2078, which is longer than the specified 1000
Created a chunk of size 1421, which is longer than the specified 1000
Created a chunk of size 1207, which is longer than the specified 1000
Created a chunk of size 1116, which is longer than the specified 1000
Created a chunk of size 1572, which is longer than the specified 1000
Created a chunk of size 1866, which is longer than the specified 1000
Created a chunk of size 1023, which is longer than the specified 1000
Created a chunk of size 1007, which is longer than the specified 1000
Created a chunk of size 3606, which is l

Total 28 pages found.
Total 1042 chunked documents.


In [12]:
import chromadb
import uuid

chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [13]:
collection = chroma_client.create_collection("ml_blog")
for doc in docs:
    collection.add(
        ids=[str(uuid.uuid1())], metadatas=doc.metadata, documents=doc.page_content
    )
db = Chroma(client=chroma_client, collection_name="ml_blog", embedding_function=embedding_function)

/home/codespace/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:08<00:00, 9.47MiB/s]


In [18]:
query = "ways to prompt large language models?"
docs = db.similarity_search(query)
print(docs[0].page_content)

Or
@article{weng2023prompt,
  title   = "LLM-powered Autonomous Agents"",
  author  = "Weng, Lilian",
  journal = "lilianweng.github.io",
  year    = "2023",
  month   = "Jun",
  url     = "https://lilianweng.github.io/posts/2023-06-23-agent/"
}
References#
[1] Wei et al. “Chain of thought prompting elicits reasoning in large language models." NeurIPS 2022
[2] Yao et al. “Tree of Thoughts: Dliberate Problem Solving with Large Language Models." arXiv preprint arXiv:2305.10601 (2023).
[3] Liu et al. “Chain of Hindsight Aligns Language Models with Feedback
“ arXiv preprint arXiv:2302.02676 (2023).
[4] Liu et al. “LLM+P: Empowering Large Language Models with Optimal Planning Proficiency” arXiv preprint arXiv:2304.11477 (2023).
[5] Yao et al. “ReAct: Synergizing reasoning and acting in language models." ICLR 2023.
[6] Google Blog. “Announcing ScaNN: Efficient Vector Similarity Search” July 28, 2020.
[7] https://chat.openai.com/share/46ff149e-a4c7-4dd7-a800-fc4a642ea389
[8] Shinn & Labash. “