In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from sentence_transformers import SentenceTransformer
from chromadb import Client, Settings
import uuid
import time

In [1]:

# инициализируем векторную БД
chroma_db = Client(Settings(is_persistent=True,persist_directory='./.chroma'))

# получаем или создаем коллекцию
collection = chroma_db.create_collection(name="knowledge_base", get_or_create=True)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=50,
    length_function=len
)

knowledge_base_path = os.path.join('knowledge_base')

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

start_time = time.time()

# проходим по всем файлам из базы знаний
for filename in os.listdir(knowledge_base_path):

    filepath = os.path.join(knowledge_base_path, filename)
    allowed_ext = '.txt'
    if os.path.isfile(filepath) and allowed_ext in filename:
        chunks = []

        with open(filepath, 'r') as file:
            text_content = file.read()

            chunks.extend(text_splitter.split_text(text_content))

        tag = filename.replace(allowed_ext, '').replace('_', ' ').lower()

        embeddings = []
        ids = []
        metadatas=[]
        for idx, chunk in enumerate(chunks):
            metadatas.append({"source": filename, "chunk_id": idx, "tag": tag})

            embeddings.append(model.encode(chunk))

            ids.append(str(uuid.uuid1()))

        collection.add(
            documents=chunks,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids
        )

    else:
        continue

end_time = time.time()

print(f'Execution time is {end_time-start_time}')

print(f'Chunk count in collection = {collection.count()}')

test = collection.query(query_embeddings=model.encode("After high school"))

print('test query result')
print(test)



Execution time is 187.84753441810608
Chunk count in collection = 4328
test query result
{'ids': [['c01f48c4-d04b-11f0-8105-0242ac1c000c', 'e11e58ee-d04b-11f0-8105-0242ac1c000c', 'c22a5df2-d04b-11f0-8105-0242ac1c000c', 'c24bc3f2-d04b-11f0-8105-0242ac1c000c', 'c246c758-d04b-11f0-8105-0242ac1c000c', 'ef91fe9e-d04b-11f0-8105-0242ac1c000c', 'ddeabb5e-d04b-11f0-8105-0242ac1c000c', '9e24499a-d04b-11f0-8105-0242ac1c000c', 'b1c0fbba-d04b-11f0-8105-0242ac1c000c', 'c296d4aa-d04b-11f0-8105-0242ac1c000c']], 'embeddings': None, 'documents': [['says, "12 years after high school, and I’m still at the nerd table." This fact implies he was born in 1980, the same year as Eshe, provided the following assumption that he graduated at the usual age of 18-years-old is supported by him having had a', 'school through to graduate school. The closest thing she had to a friend in high school was the janitor whom she regularly had lunch with until his wife found out, called Afowh a "puta" (Spanish for "whore") and 