In [9]:
!pip install langchain chromadb sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [5]:
# packages
import re
from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from uuid import uuid4

In [6]:
# max_length
def max_token_length(txt_list:list):
    max_length = 0
    for txt in txt_list:
        token_count = len(re.findall(r'\w+', txt))
        if token_count > max_length:
            max_length = token_count
    return f"Max Token Length: {max_length} tokens"

In [37]:
# Sentence splitter
# chroma default sentence model "all-MiniLM-L6-v2"
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# max input length: 256 characters
model_max_chunk_length = 256
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=model_max_chunk_length,
    model_name="all-MiniLM-L6-v2",
    chunk_overlap=0
)

In [33]:
# Real Implemetation
text_path = "./bible.txt"
with open(text_path, "r", encoding="utf-8") as f:
    text_raw = f.read()

In [34]:
text_raw[:1000]


'\ufeffThe Project Gutenberg eBook of The King James Version of the Bible\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: The King James Version of the Bible\n\nRelease date: August 1, 1989 [eBook #10]\n                Most recently updated: April 6, 2024\n\nLanguage: English\n\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK THE KING JAMES VERSION OF THE BIBLE ***\nThe Old Testament of the King James Version of the Bible\nThe First Book of Moses: Called Genesis\nThe Second Book of Moses: Called Exodus\nThe Third Book of Moses: Called Leviticus\nThe Fourth Book of Moses: Called 

In [38]:
# Character splitter
character_splitter = RecursiveCharacterTextSplitter(
    separators=['\n    \n', '\n\n', '\n', '. '],
    chunk_size=1000,
    chunk_overlap=0,
)
text_splitted = character_splitter.split_text(text_raw)
print(f"Total number of splitted chunks: {len(text_splitted)}")

print(max_token_length(text_splitted))

text_tokens = []
for text in text_splitted:
    text_tokens.extend(token_splitter.split_text(text))
print(f"Total number of tokens: {len(text_tokens)}")

Total number of splitted chunks: 4986
Max Token Length: 219 tokens
Total number of tokens: 5363


In [40]:
# Check the token length
# reference: model card "By default, input text longer than 256 word pieces is truncated."
max_token_length(text_tokens)

'Max Token Length: 216 tokens'

In [41]:
embedding_fn = SentenceTransformerEmbeddingFunction()
chroma_db = chromadb.Client()
chroma_collection = chroma_db.create_collection("bible", embedding_function=embedding_fn)

In [42]:
# add all tokens to collection
ids = [str(uuid4()) for _ in range(len(text_tokens))]
chroma_collection.add(documents=text_tokens, ids=ids)

In [43]:
# Save the chroma collection
# Run a Query
res = chroma_collection.query(query_texts=["what did noah do?"], n_results=10)
res['documents']

[['7 : 11 in the six hundredth year of noah ’ s life, in the second month, the seventeenth day of the month, the same day were all the fountains of the great deep broken up, and the windows of heaven were opened. 7 : 12 and the rain was upon the earth forty days and forty nights. 7 : 13 in the selfsame day entered noah, and shem, and ham, and japheth, the sons of noah, and noah ’ s wife, and the three wives of his sons with them, into the ark ; 7 : 14 they, and every beast after his kind, and all the cattle after their kind, and every creeping thing that creepeth upon the earth after his kind, and every fowl after his kind, every bird of every sort. 7 : 15 and they went in unto noah into the ark, two and two of all flesh, wherein is the breath of life. 7 : 16 and they that went in, went in male and female of all flesh, as god had commanded him : and the lord shut him in.',
  '7 : 24 and the waters prevailed upon the earth an hundred and fifty days. 8 : 1 and god remembered noah, and ev