#  Document Chunking

In [1]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset

pd.set_option(
    "display.max_colwidth", None
)  # This will be helpful when visualizing retriever outputs

  from .autonotebook import tqdm as notebook_tqdm


### Loading Files

See: https://python.langchain.com/v0.2/docs/how_to/document_loader_markdown/

In [2]:
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader, TextLoader

In [3]:
import os
from typing import List
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.schema import Document

def validate_directory(directory_path: str) -> None:
    """Validate the directory path."""
    if not isinstance(directory_path, str):
        raise ValueError("The directory path must be a string.")
    if not os.path.isdir(directory_path):
        raise FileNotFoundError(f"The specified directory '{directory_path}' does not exist.")
    if not os.access(directory_path, os.R_OK):
        raise PermissionError(f"Cannot read from the directory: {directory_path}")

def load_markdown_documents(directory_path: str) -> List[Document]:
    """Load all markdown documents from a directory."""
    validate_directory(directory_path)
    
    try:
        loader = DirectoryLoader(
            directory_path, 
            glob="**/*.md", 
            loader_cls=TextLoader
        )
        
        documents = loader.load()
        
        if not documents:
            raise ValueError("No documents were loaded from the specified directory.")
        return documents
    
    except Exception as e:
        raise RuntimeError(f"An error occurred while loading documents: {str(e)}")

In [4]:
# Example usage
directory_path = "../data/raw/"
documents = load_markdown_documents(directory_path)
print(f"Loaded {len(documents)} documents")

Loaded 3 documents


In [5]:
print("First Document Metadata:", documents[0].metadata)
print("First Document Content:", documents[0].page_content[:100])

print("Second Document Metadata:", documents[1].metadata)
print("Second Document Content:", documents[1].page_content[:100])

print("Third Document Metadata:", documents[2].metadata)
print("Third Document Content:", documents[2].page_content[:100])

First Document Metadata: {'source': '../data/raw/conda-tutorial.md'}
First Document Content: CHAPTER 1 User guide

## 1.1 **Overview**

This page provides an overview of how to use conda. For a
Second Document Metadata: {'source': '../data/raw/regex-tutorial.md'}
Second Document Content: 
## 1. Regular Expression Tutorial

In this tutorial, I will teach you all you need to know to be ab
Third Document Metadata: {'source': '../data/raw/git-tutorial.md'}
Third Document Content: 
## Git Basics

If you can read only one chapter to get going with Git, this is it. This chapter cov


In [6]:
import re
import os
from typing import List
from langchain.schema import Document

def clean_string(text: str) -> str:
    """Remove special characters and return the cleaned string."""
    return re.sub(r'[^a-zA-Z]', '', text)

def clean_source(path: str) -> str:
    """Extract the filename from the path, remove the extension, replace non-alpha with spaces, and clean it."""
    filename = os.path.splitext(os.path.basename(path))[0]  # Extract filename without extension
    cleaned_filename = re.sub(r'[^a-zA-Z]', ' ', filename)  # Replace non-alpha characters with space
    cleaned_filename = re.sub(r'\s+', ' ', cleaned_filename).strip()  # Reduce multiple spaces to one, strip spaces
    return cleaned_filename

def split_single_document(document: Document, header: str = "##") -> List[Document]:
    """Split a single document by a given header."""
    split_docs = []
    content = document.page_content
    sections = content.split(header)[1:]  # Skip the content before the first '##'
    
    for section in sections:
        try:
            # Split the section into title and body
            title, *body = section.split('\n', 1)  # Split once by the first newline
            body_content = body[0].lstrip() if body else ''  # Handle empty bodies
            
            # Preserve metadata and add cleaned fields
            new_metadata = {
                **document.metadata,
                "title": title.strip(),
                "cleaned_title": clean_string(title.strip()),
                "cleaned_source": clean_source(document.metadata.get("source", "")),
            }
            
            # Create a new document
            new_doc = document.copy(update={"page_content": body_content, "metadata": new_metadata})
            split_docs.append(new_doc)
        except Exception as e:
            print(f"Error processing section: {e}")
        
    return split_docs


def split_documents_by_header(documents, header="##"):
    """ Split a list of documents by a given header """
    return [doc for document in documents for doc in split_single_document(document, header)]

# Split the loaded documents
split_documents = split_documents_by_header(documents)

print(f"Split into {len(split_documents)} smaller documents")
print(f"First document title: {split_documents[0].metadata['title']}")
print(f"First document title: {split_documents[0].metadata}")
print(f"First document content:\n{split_documents[0].page_content[:200]}")

Split into 369 smaller documents
First document title: 1.1 **Overview**
First document title: {'source': '../data/raw/conda-tutorial.md', 'title': '1.1 **Overview**', 'cleaned_title': 'Overview', 'cleaned_source': 'conda tutorial'}
First document content:
This page provides an overview of how to use conda. For an overview of what conda is and what it does, please see the *front page*.

The quickest way to start using conda is to go through the 20-minut


## Split Documents

In [7]:
from transformers import BertTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from typing import List

def split_documents_by_token_count(documents: List[Document], chunk_size: int = 256, chunk_overlap: int = 192) -> List[Document]:
    """Splits documents using BERT token count."""
    
    # Load BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", clean_up_tokenization_spaces=True)
    
    # Custom function to calculate the number of tokens in a text
    def get_length(text: str) -> int:
        tokens = tokenizer.tokenize(text)
        return len(tokens)
    
    # Custom separators
    custom_separators = ["\n\n", "\n", ".", " ", ""]
    
    # Use token count instead of character count in the RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,  
        length_function=get_length,
        is_separator_regex=False,
        separators=custom_separators
    )
    
    # Split the documents using token count
    return text_splitter.split_documents(documents)

In [8]:
# Split the documents using token count
documents_chunked = split_documents_by_token_count(split_documents)
print("Number of chunks:", len(documents_chunked))

Number of chunks: 925


In [9]:
print(documents_chunked[6])

page_content='The conda command searches a default set of channels, and packages are automatically downloaded and updated from http://repo.continuum.io/pkgs/. You can modify what remote channels are automatically searched. You might want to do this to maintain a private or internal channel. For details, see *Channel locations (channels)*. See also Managing packages.

The conda package format is identical across platforms and operating systems.

To install conda packages, in the Terminal or an Anaconda Prompt, run:
conda install [packagename]
NOTE: Replace [packagename] with the desired package name.

A conda package includes a link to a tarball or bzipped tar archive, with the extension ".tar.bz2", which contains metadata under the info/ directory and a collection of files that are installed directly into an install prefix.

During the install process, files are extracted into the install prefix, except for files in the info/ directory. Installing the files of a conda package into an e

#  Document Embeddings

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings

In [11]:
model_name = "thenlper/gte-small"
model_kwargs = {'device': 'mps'}
encode_kwargs = {'normalize_embeddings': True}
embedding_model = HuggingFaceEmbeddings(
    multi_process=True,
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [21]:
from langchain_chroma import Chroma

KNOWLEDGE_VECTOR_DATABASE = Chroma(
    collection_name="documentation",
    embedding_function=embedding_model,
    persist_directory="../data/vectorstore", 
)

In [22]:
_ = KNOWLEDGE_VECTOR_DATABASE.add_documents(documents_chunked)

['b80b31c4-9209-459c-b947-cd1f2c796b39',
 'c6fc1b5f-37ee-47e6-8aba-276a8ab6429c',
 'ffda1749-8807-4b6d-826e-760465123f61',
 '984a6b51-b0c4-4fa7-baa4-954b5a5bf0cb',
 'e9ad86d5-3648-4f9d-b948-094f042293b3',
 'bc41ca94-b8ea-4fd0-a45a-b4454e28e1d9',
 '28c404e9-02e4-48d7-8a0e-6b2e8bbf4cfc',
 '76507201-37ef-4c74-ab1f-2f9cb86e440b',
 'a7d4272e-c10e-45b4-8d07-5e384f3554cd',
 '66594960-1bc2-4052-b903-6bdcf2c4f895',
 'fd0ee3e9-8985-44b1-a764-12ca6a223696',
 'bbe4a183-489c-4dce-abe8-52503aaea7b7',
 '2c87856d-2355-41ae-9db6-4cfd1908039f',
 '6f02d897-0671-4c42-b548-870286e7629b',
 '0f921cbd-d0b2-4e80-8126-5c020d4a2703',
 'cf37e76c-2a66-4750-a9b3-ef4d200bbd1c',
 '205a3d4c-3326-4f98-9508-523997dd833c',
 '69bd2366-4519-4435-9be1-38d2e1d5a725',
 '065129fb-4bf2-4d3b-9f0f-643974eca72c',
 '2e19e82a-38cb-4d4f-ac69-2d9b0fa223c9',
 '644ee177-a688-42e5-aa96-11ad62b743cf',
 'cb5d4a56-8b32-43b6-9a31-0f7bac7fe15d',
 'f6570f56-e6d3-49d1-9936-0a4ada4ad0f6',
 'c3d0da6c-4a9a-494c-aa25-47533d52975d',
 '07bcd923-3439-

In [23]:
# Embed a user query in the same space
user_query = "How to start conda?"
query_vector = embedding_model.embed_query(user_query)

In [24]:
print(f"\nStarting retrieval for {user_query=}...")
results = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=5)

for result_id, result in enumerate(results):
    print(f"\n============================== Document {result_id+1} ==============================")
    print(result.metadata)
    print(result.page_content)


Starting retrieval for user_query='How to start conda?'...



{'cleaned_source': 'conda tutorial', 'cleaned_title': 'GettingStartedWithConda', 'source': '../data/raw/conda-tutorial.md', 'title': '1.3 **Getting Started With Conda**'}
Conda is a powerful package manager and environment manager that you use with command line commands at the Anaconda Prompt for Windows, or in a Terminal window for macOS or Linux.

This 20-minute guide to getting started with conda lets you try out the major features of conda. You should understand how conda works when you finish this guide.

SEE ALSO: Getting started with Anaconda Navigator, a graphical user interface that lets you use conda in a weblike interface without having to enter manual commands. Compare the Getting started guides for each to see which program you prefer.

{'cleaned_source': 'conda tutorial', 'cleaned_title': 'GettingStartedWithConda', 'source': '../data/raw/conda-tutorial.md', 'title': '1.3 **Getting Started With Conda**'}
Conda is a powerful package manager and environment manager that you