#  Document Chunking

In [1]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
from datasets import Dataset

pd.set_option(
    "display.max_colwidth", None
)  # This will be helpful when visualizing retriever outputs

  from .autonotebook import tqdm as notebook_tqdm


### Loading Files

See: https://python.langchain.com/v0.2/docs/how_to/document_loader_markdown/

In [2]:
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader, TextLoader

In [3]:
import os
from typing import List
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.schema import Document

def validate_directory(directory_path: str) -> None:
    """Validate the directory path."""
    if not isinstance(directory_path, str):
        raise ValueError("The directory path must be a string.")
    if not os.path.isdir(directory_path):
        raise FileNotFoundError(f"The specified directory '{directory_path}' does not exist.")
    if not os.access(directory_path, os.R_OK):
        raise PermissionError(f"Cannot read from the directory: {directory_path}")

def load_markdown_documents(directory_path: str) -> List[Document]:
    """Load all markdown documents from a directory."""
    validate_directory(directory_path)
    
    try:
        loader = DirectoryLoader(
            directory_path, 
            glob="**/*.md", 
            loader_cls=TextLoader
        )
        
        documents = loader.load()
        
        if not documents:
            raise ValueError("No documents were loaded from the specified directory.")
        return documents
    
    except Exception as e:
        raise RuntimeError(f"An error occurred while loading documents: {str(e)}")

In [4]:
# Example usage
directory_path = "../data/raw/"
documents = load_markdown_documents(directory_path)
print(f"Loaded {len(documents)} documents")

Loaded 3 documents


In [5]:
print("First Document Metadata:", documents[0].metadata)
print("First Document Content:", documents[0].page_content[:100])

print("Second Document Metadata:", documents[1].metadata)
print("Second Document Content:", documents[1].page_content[:100])

print("Third Document Metadata:", documents[2].metadata)
print("Third Document Content:", documents[2].page_content[:100])

First Document Metadata: {'source': '../data/raw/conda-tutorial.md'}
First Document Content: CHAPTER 1 User guide

## 1.1 **Overview**

This page provides an overview of how to use conda. For a
Second Document Metadata: {'source': '../data/raw/regex-tutorial.md'}
Second Document Content: 
## 1. Regular Expression Tutorial

In this tutorial, I will teach you all you need to know to be ab
Third Document Metadata: {'source': '../data/raw/git-tutorial.md'}
Third Document Content: 
## Git Basics

If you can read only one chapter to get going with Git, this is it. This chapter cov


In [6]:
import re
import os
from typing import List
from langchain.schema import Document

def clean_string(text: str) -> str:
    """Remove special characters and return the cleaned string."""
    text = re.sub(r'\s+', ' ', text).strip()
    return re.sub(r'[^a-zA-Z]', '', text)

def clean_source(path: str) -> str:
    """Extract the filename from the path, remove the extension, replace non-alpha with spaces, and clean it."""
    filename = os.path.splitext(os.path.basename(path))[0]  # Extract filename without extension
    cleaned_filename = re.sub(r'[^a-zA-Z]', ' ', filename)  # Replace non-alpha characters with space
    cleaned_filename = re.sub(r'\s+', ' ', cleaned_filename).strip()
    return cleaned_filename

def split_single_document(document: Document, header: str = "##") -> List[Document]:
    """Split a single document by a given header."""
    split_docs = []
    content = document.page_content
    sections = content.split(header)[1:]  # Skip the content before the first '##'
    
    for section in sections:
        try:
            # Split the section into title and body
            title, *body = section.split('\n', 1)  # Split once by the first newline
            body_content = body[0].lstrip() if body else ''  # Handle empty bodies
            
            # Preserve metadata and add cleaned fields
            new_metadata = {
                **document.metadata,
                "title": title.strip(),
                "cleaned_title": clean_string(title.strip()),
                "cleaned_source": clean_source(document.metadata.get("source", "")),
            }
            
            # Create a new document
            new_doc = document.copy(update={"page_content": body_content, "metadata": new_metadata})
            split_docs.append(new_doc)
        except Exception as e:
            print(f"Error processing section: {e}")
        
    return split_docs


def split_documents_by_header(documents, header="##"):
    """ Split a list of documents by a given header """
    return [doc for document in documents for doc in split_single_document(document, header)]

# Split the loaded documents
split_documents = split_documents_by_header(documents)

print(f"Split into {len(split_documents)} smaller documents")
print(f"First document title: {split_documents[0].metadata['title']}")
print(f"First document title: {split_documents[0].metadata}")
print(f"First document content:\n{split_documents[0].page_content[:200]}")

Split into 369 smaller documents
First document title: 1.1 **Overview**
First document title: {'source': '../data/raw/conda-tutorial.md', 'title': '1.1 **Overview**', 'cleaned_title': 'Overview', 'cleaned_source': 'conda tutorial'}
First document content:
This page provides an overview of how to use conda. For an overview of what conda is and what it does, please see the *front page*.

The quickest way to start using conda is to go through the 20-minut


## Split Documents

In [7]:
from transformers import BertTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from typing import List

def split_documents_by_token_count(documents: List[Document], chunk_size: int = 256, chunk_overlap: int = 192) -> List[Document]:
    """Splits documents using BERT token count."""
    
    # Load BERT tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", clean_up_tokenization_spaces=True)
    
    # Custom function to calculate the number of tokens in a text
    def get_length(text: str) -> int:
        tokens = tokenizer.tokenize(text)
        return len(tokens)
    
    # Custom separators
    custom_separators = ["\n\n", "\n", ".", " ", ""]
    
    # Use token count instead of character count in the RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,  
        length_function=get_length,
        is_separator_regex=False,
        separators=custom_separators
    )
    
    # Split the documents using token count
    return text_splitter.split_documents(documents)

In [8]:
# Split the documents using token count
documents_chunked = split_documents_by_token_count(split_documents)
print("Number of chunks:", len(documents_chunked))

Number of chunks: 925


In [9]:
print(documents_chunked[6])

page_content='The conda command searches a default set of channels, and packages are automatically downloaded and updated from http://repo.continuum.io/pkgs/. You can modify what remote channels are automatically searched. You might want to do this to maintain a private or internal channel. For details, see *Channel locations (channels)*. See also Managing packages.

The conda package format is identical across platforms and operating systems.

To install conda packages, in the Terminal or an Anaconda Prompt, run:
conda install [packagename]
NOTE: Replace [packagename] with the desired package name.

A conda package includes a link to a tarball or bzipped tar archive, with the extension ".tar.bz2", which contains metadata under the info/ directory and a collection of files that are installed directly into an install prefix.

During the install process, files are extracted into the install prefix, except for files in the info/ directory. Installing the files of a conda package into an e

#  Document Embeddings

In [10]:
from langchain_huggingface import HuggingFaceEmbeddings

In [11]:
model_name = "thenlper/gte-small"
model_kwargs = {'device': 'mps'}
encode_kwargs = {'normalize_embeddings': True}
embedding_model = HuggingFaceEmbeddings(
    multi_process=True,
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [12]:
from langchain_chroma import Chroma

KNOWLEDGE_VECTOR_DATABASE = Chroma(
    collection_name="documentation",
    embedding_function=embedding_model,
    persist_directory="../data/vectorstore", 
)

In [13]:
_ = KNOWLEDGE_VECTOR_DATABASE.add_documents(documents_chunked)

In [14]:
# Embed a user query in the same space
user_query = "How to start conda?"
query_vector = embedding_model.embed_query(user_query)

In [15]:
print(f"\nStarting retrieval for {user_query=}...")
results = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=5)

for result_id, result in enumerate(results):
    print(f"\n============================== Document {result_id+1} ==============================")
    print(result.metadata)
    print(result.page_content)


Starting retrieval for user_query='How to start conda?'...

{'cleaned_source': 'conda tutorial', 'cleaned_title': 'GettingStartedWithConda', 'source': '../data/raw/conda-tutorial.md', 'title': '1.3 **Getting Started With Conda**'}
Conda is a powerful package manager and environment manager that you use with command line commands at the Anaconda Prompt for Windows, or in a Terminal window for macOS or Linux.

This 20-minute guide to getting started with conda lets you try out the major features of conda. You should understand how conda works when you finish this guide.

SEE ALSO: Getting started with Anaconda Navigator, a graphical user interface that lets you use conda in a weblike interface without having to enter manual commands. Compare the Getting started guides for each to see which program you prefer.

{'cleaned_source': 'conda tutorial', 'cleaned_title': 'GettingStartedWithConda', 'source': '../data/raw/conda-tutorial.md', 'title': '1.3 **Getting Started With Conda**'}
Conda is

## Chat Model

**Load Model:**

In [16]:
from langchain_huggingface import HuggingFacePipeline
from langchain_huggingface import ChatHuggingFace
from langchain_huggingface import HuggingFaceEndpoint
from typing import Dict

In [17]:
def load_huggingface_model(model_name: str, task: str, generation_params: Dict, device: int = 0):
    """Load a HuggingFace model with the specified task, generation parameters, and device."""
    
    # Load the model using HuggingFacePipeline
    llm = HuggingFacePipeline.from_model_id(
        model_id=model_name,
        task=task,
        pipeline_kwargs=dict(
            **generation_params  # Pass the text generation parameters
        ),
        #device=device  # Set the device for inference (GPU or CPU)
    )
    return ChatHuggingFace(llm=llm)

In [18]:
# # Example usage
# model_name = "HuggingFaceTB/SmolLM-1.7B-Instruct"
# task = "text-generation"
# generation_params = {
#     "temperature": 0.7,
#     "max_length": 512,
#     "top_p": 0.9,
#     "repetition_penalty": 1.2,
# }
# device = 0  # GPU (use -1 for CPU)

# llm = load_huggingface_model(model_name=model_name, task=task, generation_params=generation_params, device=device)

In [19]:
# Replace these values with your actual parameters
repo_id = "HuggingFaceH4/zephyr-7b-beta"  # Model ID from Hugging Face
task = "text-generation"  # Task type

# Parameters for generation (you can adjust these as needed)
generation_params = {
    "temperature": 0.7,
    "max_length": 512,
    "top_p": 0.9,
    "repetition_penalty": 1.2,
}


# Create the Hugging Face Endpoint using the specified parameters
endpoint = HuggingFaceEndpoint(
    repo_id=repo_id,
    task=task,
    **generation_params,  # Pass the generation parameters,
    huggingfacehub_api_token = ""
)

# Return the LangChain HuggingFacePipeline object with the endpoint
llm = ChatHuggingFace(llm=endpoint)


                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/julianschelb/.cache/huggingface/token
Login successful


In [20]:
llm.invoke("What is conda in the context of Python?")

AIMessage(content='conda is a package and environment management system for Python. It allows you to easily manage and distribute Python packages, as well as create and manage multiple environments with different sets of dependencies. Conda helps to ensure that your Python projects are reproducible and can be run on any machine with the same environment and dependencies. Additionally, conda includes a wide variety of pre-compiled packages for Python, R, and other languages, making it easy to install and manage software packages.', additional_kwargs={}, response_metadata={'token_usage': ChatCompletionOutputUsage(completion_tokens=99, prompt_tokens=33, total_tokens=132), 'model': '', 'finish_reason': 'eos_token'}, id='run-9fd2af1c-e705-4397-8fb1-544e57bea999-0')

**Define Prompt:**

In [21]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema import SystemMessage, HumanMessage

# Define the messages in the chat prompt format
messages = [
    SystemMessage( role="system", 
        content="""Using the only information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
If the answer cannot be deduced from the context, do not give an answer."""
    ),
    HumanMessage(role="user",
        content="""Context:
{context}
---
Now here is the question you need to answer.

Question: {question}"""
    ),
]

# Create the ChatPromptTemplate object
prompt_template = ChatPromptTemplate.from_messages(
    [(msg.role, msg.content) for msg in messages]
)

# Example usage of prompt_template
print(prompt_template)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='Using the only information contained in the context,\ngive a comprehensive answer to the question.\nRespond only to the question asked, response should be concise and relevant to the question.\nIf the answer cannot be deduced from the context, do not give an answer.'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Context:\n{context}\n---\nNow here is the question you need to answer.\n\nQuestion: {question}'), additional_kwargs={})]


In [22]:
chain = prompt_template | llm

In [23]:
input_dict = {"context" : "To start conda turn the key three times!", "question": "How to start conda?"}

In [24]:
prompt_template.invoke(input_dict)

ChatPromptValue(messages=[SystemMessage(content='Using the only information contained in the context,\ngive a comprehensive answer to the question.\nRespond only to the question asked, response should be concise and relevant to the question.\nIf the answer cannot be deduced from the context, do not give an answer.', additional_kwargs={}, response_metadata={}), HumanMessage(content='Context:\nTo start conda turn the key three times!\n---\nNow here is the question you need to answer.\n\nQuestion: How to start conda?', additional_kwargs={}, response_metadata={})])

In [25]:
answer = chain.invoke(input_dict)
print(answer)

content='Answer: To start conda, follow these steps as described in the context: 1. Turn the key three times (this is likely a metaphorical instruction and not an actual physical action you need to take). 2. After turning the key three times, you can assume that conda has started and is ready to use. Since we cannot verify if this is a literal or figurative instruction, we may assume it is just a unique way of starting conda and follow the' additional_kwargs={} response_metadata={'token_usage': ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=129, total_tokens=229), 'model': '', 'finish_reason': 'length'} id='run-9a0bf5c1-3fd1-4cce-a3ff-04778c44de37-0'


In [26]:
retriever = KNOWLEDGE_VECTOR_DATABASE.as_retriever()
prompt = prompt_template
model = llm

In [27]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [28]:
retrieval_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [29]:
answer = retrieval_chain.invoke("How to start conda?")

In [30]:
llm.invoke("Provide 5 rephrased versions of the question, each focusing on different aspects: 'What is conda in Python?'")

AIMessage(content="1. How would you explain conda in the context of Python? (Focuses on defining conda in relation to Python)\n2. What is conda, and how does it relate to Python package management? (Focuses on clarifying conda's role in Python package management)\n3. Can you summarize what conda is in terms of Python environments and package handling? (Focuses on highlighting conda's functionalities in managing Python environments and handling packages)", additional_kwargs={}, response_metadata={'token_usage': ChatCompletionOutputUsage(completion_tokens=100, prompt_tokens=50, total_tokens=150), 'model': '', 'finish_reason': 'length'}, id='run-6a666e99-0b94-4ca8-b84f-e70ded9c9fbe-0')

In [31]:
llm.invoke("""Here is a short document:

"To start Conda, first install it by downloading the Anaconda or Miniconda distribution. After installation, open a terminal or command prompt and type 'conda' to ensure the installation was successful. To create a new environment, run 'conda create --name myenv'. Once the environment is created, activate it using 'conda activate myenv'. You can then install packages with 'conda install package_name'."

Based on this document, generate 5 questions that users might have about how this document.
""")

AIMessage(content="1. What is Conda and why do I need to install it?\n2. How can I verify that the Conda installation was successful?\n3. How do I create a new environment using Conda?\n4. How do I activate a Conda environment that I've created?\n5. How do I install packages using Conda? Is there a specific syntax I need to follow?", additional_kwargs={}, response_metadata={'token_usage': ChatCompletionOutputUsage(completion_tokens=84, prompt_tokens=143, total_tokens=227), 'model': '', 'finish_reason': 'eos_token'}, id='run-0a293a4f-2ac8-427d-bae2-1761331b50d9-0')

## Rephrase Question

In [32]:
# Define the prompt template to paraphrase the question
messages = [
    SystemMessage(
        role="system",
        content="Generate 5 paraphrased versions of the question provided by the user. "
                "The results should be formatted as a list, each paraphrase separated by a newline."
    ),
    HumanMessage(
        role="user",
        content="Generate 5 paraphrased versions of the following question: {question}"
    ),
]

# Create the ChatPromptTemplate from the message
prompt_template_parapharse = ChatPromptTemplate.from_messages(
    [(msg.role, msg.content) for msg in messages]
)

In [33]:
# Define a simple output parser to split the result by newlines
def parse_output(result):
    """Parses the result and splits it into a list of questions based on newlines."""
    result = result.content
    return [line.strip() for line in result.split('\n') if line.strip()]

In [34]:
# Create the chain using the LLM and the prompt template
paraphrasing_chain = prompt_template_parapharse | llm | parse_output

In [35]:
paraphrasing_chain.invoke({"question": "What is conda in Python?"})

['1. What is conda in the context of Python programming language?',
 '2. What is conda in the realm of Python software development?',
 '3. How would you define conda in relation to Python programming language?',
 '4. Can you explain what conda is within the context of Python programming?',
 '5. What does the term "conda" signify in the context of Python programming language?']

## Generate Hypothetical Questions

In [36]:
# Define the prompt template to generate hypothetical questions about the text
messages_hypothetical = [
    SystemMessage(
        role="system",
        content="Generate 5 hypothetical questions based on the following text. "
                "The results should be formatted as a list, with each question separated by a newline."
    ),
    HumanMessage(
        role="user",
        content="Here is the text: {text}\n"
                "Generate 5 hypothetical questions about the above text."
    ),
]

# Create the ChatPromptTemplate from the message
prompt_template_hypothetical = ChatPromptTemplate.from_messages(
    [(msg.role, msg.content) for msg in messages_hypothetical]
)

# Define a simple output parser to split the result by newlines
def parse_output(result):
    """Parses the result and splits it into a list of questions based on newlines."""
    result = result.content
    return [line.strip() for line in result.split('\n') if line.strip()]

# Create the chain using the LLM and the prompt template for generating hypothetical questions
hypothetical_question_chain = prompt_template_hypothetical | llm | parse_output

# Example text input to generate hypothetical questions about
text_input = {
    "text": "Conda is an open-source package management system and environment management system that runs on Windows, macOS, and Linux. Conda quickly installs, runs, and updates packages and their dependencies."
}

# Invoke the chain to get the hypothetical questions
hypothetical_questions = hypothetical_question_chain.invoke(text_input)
print(hypothetical_questions)

['1. How does Conda differ from traditional package managers in terms of functionality?', "2. What are the advantages of using Conda's environment management system compared to other environment management tools?", '3. In what operating systems is Conda compatible with?', "4. How does Conda's package installation process differ from that of traditional package managers in terms of speed and ease of use?", '5. What steps does Conda take to update packages and their dependencies,']
