In [1]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
import warnings
from transformers import AutoTokenizer
warnings.filterwarnings('ignore')

In [2]:
# Root folder that is searched (recursively, for "*.md" files) when loading the source documents.
# NOTE(review): the vector store below is persisted under 'demo_bot/data/vectors'
# (slash, not underscore) — confirm that 'demo_bot_data/...' is the intended path here.
directory = 'demo_bot_data/ubuntu-docs'

In [4]:
def pretty_print_docs(docs):
    """Print every document's text and metadata, separated by a 100-dash rule.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``
            attributes. Documents are numbered starting from 1.
    """
    divider = "\n" + "-" * 100 + "\n"
    rendered = []
    for position, document in enumerate(docs, start=1):
        rendered.append(
            f"Document {position}:\n\n{document.page_content}\nMetadata: {document.metadata}"
        )
    print(divider.join(rendered))

### Load the source of knowledge

In [3]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import MarkdownTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredMarkdownLoader

In [5]:
# Collect every Markdown file below `directory` (the "**/*.md" glob recurses into
# sub-folders) and parse each one with UnstructuredMarkdownLoader. Files are read
# with multithreading enabled and a progress bar is shown while loading.
loader = DirectoryLoader(
    directory,
    glob="**/*.md",
    loader_cls=UnstructuredMarkdownLoader,
    use_multithreading=True,
    show_progress=True,
)
docs = loader.load()

  0%|          | 0/54 [00:00<?, ?it/s]

100%|██████████| 54/54 [00:13<00:00,  4.04it/s]


### Split the documents into chunks

In [6]:
def get_title_from_content(doc):
    """Return the value that follows the first ``title: `` marker in a text.

    Args:
        doc: The text to search.

    Returns:
        The characters between the first ``title: `` marker and the end of
        that line, with every double quote removed. An empty string is
        returned when the marker does not occur in ``doc``.
    """
    title_key = "title: "
    start = doc.find(title_key)
    if start == -1:
        return ''

    end = doc.find("\n", start)
    if end == -1:
        # The title is on the last line and the text has no trailing newline.
        # Read up to the end of the text; slicing with -1 here would silently
        # drop the final character of the title.
        end = len(doc)

    return doc[start + len(title_key):end].replace('"', '')

In [7]:
# Record on each loaded document the title parsed from its own text, so the
# title travels with the metadata of every chunk produced later.
for doc in docs:
    doc.metadata['title'] = get_title_from_content(doc.page_content)

# Parallel lists (same order as `docs`): raw texts and their metadata dicts.
page_content = [doc.page_content for doc in docs]
metadata = [doc.metadata for doc in docs]

In [8]:
# Upper bound on the size of each chunk; passed as `chunk_size` to MarkdownTextSplitter.
# NOTE(review): presumably measured in characters (the splitter's default length
# function) rather than tokens — confirm.
chunk_size = 1024

In [9]:
# Markdown-aware splitter: chunks are capped at `chunk_size`, consecutive chunks
# overlap by 150, surrounding whitespace is stripped, and `add_start_index=True`
# stores each chunk's start offset within its source text in the chunk metadata.
splitter = MarkdownTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=150,
    add_start_index=True,
    strip_whitespace=True,
)

# `page_content[i]` is split into chunks that carry `metadata[i]`.
ubuntun_markdown_docs = splitter.create_documents(page_content, metadata)

# Give every chunk a sequential integer id (its position in the chunk list).
for idx, ubuntun_markdown_doc in enumerate(ubuntun_markdown_docs):
    ubuntun_markdown_doc.metadata["id"] = idx

### Set up the vector store

In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import Chroma
from chromadb.utils import embedding_functions

In [5]:
# Model identifier handed to the embeddings client as `model_name`.
EMBEDDING_MODEL_NAME = "avsolatorio/GIST-Embedding-v0"

In [6]:
# Embedding client that calls the Hugging Face Inference API with the model
# configured in EMBEDDING_MODEL_NAME.
# NOTE(review): the note that used to sit here described “all-mpnet-base-v2”
# (chosen for its speed/quality balance on the Massive Text Embedding Benchmark),
# which does not match the value of EMBEDDING_MODEL_NAME — confirm which model is intended.
import os

from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# The access token is read from the environment so it never lands in the
# notebook or its version history. A token was previously hardcoded in this
# cell and must be treated as leaked: revoke it and issue a new one.
_hf_api_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or os.environ.get("HF_TOKEN")
if not _hf_api_token:
    raise RuntimeError(
        "No Hugging Face token found: set the HUGGINGFACEHUB_API_TOKEN "
        "(or HF_TOKEN) environment variable before running this cell."
    )

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=_hf_api_token, model_name=EMBEDDING_MODEL_NAME
)

In [14]:
# Embed every chunk and store it in the "ubuntu_docs" Chroma collection on disk.
# Each chunk's sequential metadata["id"] is reused as its vector-store id, so
# re-running this cell writes to the same records again instead of appending a
# second copy of the whole corpus under freshly generated random ids.
vectorstore_1 = Chroma.from_documents(
    documents=ubuntun_markdown_docs,
    embedding=embeddings,
    ids=[str(chunk.metadata["id"]) for chunk in ubuntun_markdown_docs],
    persist_directory='demo_bot/data/vectors',
    collection_name="ubuntu_docs",
)

# persisting the db to the disk
# NOTE(review): the recorded output of this cell shows a deprecation warning,
# presumably raised by persist() — confirm against the installed Chroma version
# before removing the call.
vectorstore_1.persist()
# Drop the reference; the persisted collection is reopened in a later cell.
vectorstore_1 = None

  warn_deprecated(


In [7]:
# Reopen the "ubuntu_docs" collection from its on-disk location, using the same
# embedding client for queries that was used when the collection was built.
vectorstore = Chroma(
    collection_name="ubuntu_docs",
    persist_directory='demo_bot/data/vectors',
    embedding_function=embeddings,
)