In [2]:
import re

from sklearn.feature_extraction.text import TfidfVectorizer
# Relative path to the markdown document that the chunking cells in this
# notebook read.
# NOTE(review): the saved cell outputs report '../no_sale_countries.md' as the
# source, so they appear to predate this path — re-run the notebook to refresh.
FILE_PATH = "../data/no_sale_countries.md"

# Document Chunking Strategies

## Simple Chunking

In [11]:
# Read the document explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(FILE_PATH, 'r', encoding='utf-8') as file:
    document = file.read()

# Naive strategy: every blank-line-separated paragraph becomes one chunk.
# Whitespace-only pieces (e.g. produced by a trailing blank line or three
# consecutive newlines) are dropped: they carry no text to show or vectorize.
sections = [sec for sec in document.split('\n\n') if sec.strip()]

#TODO: instead of this OpenAI Embeddings or any other embeddings can be used
# TF-IDF stands in for a real embedding model; the sparse result is converted
# to a dense array with one row per section.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(sections).toarray()

for sec in sections:
    print("--------")
    print(sec)
    print("--------")

--------
# No Sale Countries
As part of our commitment to ethical business practices and compliance with international regulations, [Your Company Name] has identified certain countries where we will not conduct sales. This decision is based on a combination of factors including but not limited to legal restrictions, ethical concerns, and market conditions.
--------
--------
The following countries are on our no sale list, along with the specific reasons for each designation:
--------
--------
1. Spain
- Reason: Compliance with Local Regulations
  - Spain has recently implemented stringent regulations on the sale of specific categories of products that we manufacture. Our current product lines do not meet the new regulatory requirements, and bringing them into compliance would require significant changes to our production process and supply chain, resulting in unsustainable costs.
--------
--------
2. Italy
- Reason: Unstable Economic Environment
  - The ongoing economic instability in 

## Langchain Character chunking

In [12]:
import re

from langchain_text_splitters import CharacterTextSplitter

# Read the document explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(FILE_PATH, 'r', encoding='utf-8') as file:
    markdown = file.read()

# Split on blank lines only. Paragraphs longer than chunk_size are kept whole
# (the splitter just warns), which is why the recorded output contains chunks
# well above 100 characters.
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

chunks_of_text = text_splitter.split_text(markdown)

# A chunk counts as "titled" only when its first line *starts* like a markdown
# heading ("# Title") or a numbered list item ("1. Spain"). Checking for a "#"
# or a digit anywhere in the line would also tag ordinary sentences that merely
# mention a number or contain a hash character.
TITLE_LINE = re.compile(r'\s*(?:#{1,6}\s+\S|\d+[.)]\s+\S)')

metadata = []
for chunk in chunks_of_text:
    header = chunk.split("\n", 1)[0]
    title = header if TITLE_LINE.match(header) else "No header"
    metadata.append({"document": FILE_PATH, "title": title})

# One metadata dict per chunk; create_documents pairs them up by position.
documents = text_splitter.create_documents(texts=chunks_of_text, metadatas=metadata)
for doc in documents:
    print("--------")
    print(doc)
    print("--------")

Created a chunk of size 349, which is longer than the specified 100
Created a chunk of size 102, which is longer than the specified 100
Created a chunk of size 394, which is longer than the specified 100
Created a chunk of size 410, which is longer than the specified 100
Created a chunk of size 378, which is longer than the specified 100
Created a chunk of size 470, which is longer than the specified 100
Created a chunk of size 297, which is longer than the specified 100


--------
page_content='# No Sale Countries
As part of our commitment to ethical business practices and compliance with international regulations, [Your Company Name] has identified certain countries where we will not conduct sales. This decision is based on a combination of factors including but not limited to legal restrictions, ethical concerns, and market conditions.' metadata={'document': '../no_sale_countries.md', 'title': '# No Sale Countries'}
--------
--------
page_content='The following countries are on our no sale list, along with the specific reasons for each designation:' metadata={'document': '../no_sale_countries.md', 'title': 'No header'}
--------
--------
page_content='1. Spain
- Reason: Compliance with Local Regulations
  - Spain has recently implemented stringent regulations on the sale of specific categories of products that we manufacture. Our current product lines do not meet the new regulatory requirements, and bringing them into compliance would require significa

## Langchain Recursive chunking

In [13]:

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Read the document explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(FILE_PATH, 'r', encoding='utf-8') as file:
    markdown = file.read()

# Recursive splitting falls back to progressively finer separators until each
# chunk fits into chunk_size; no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)

# create_documents already splits every text it receives, so the raw markdown
# is passed in directly instead of being pre-split and then split a second time.
documents = text_splitter.create_documents([markdown])
for doc in documents:
    print("--------")
    print(doc)
    print("--------")

--------
page_content='# No Sale Countries'
--------
--------
page_content='As part of our commitment to ethical business practices and compliance with international'
--------
--------
page_content='regulations, [Your Company Name] has identified certain countries where we will not conduct sales.'
--------
--------
page_content='This decision is based on a combination of factors including but not limited to legal restrictions,'
--------
--------
page_content='ethical concerns, and market conditions.'
--------
--------
page_content='The following countries are on our no sale list, along with the specific reasons for each'
--------
--------
page_content='designation:'
--------
--------
page_content='1. Spain
- Reason: Compliance with Local Regulations'
--------
--------
page_content='- Spain has recently implemented stringent regulations on the sale of specific categories of'
--------
--------
page_content='products that we manufacture. Our current product lines do not meet the new regul

## LangChain UnstructuredMarkdownLoader (element-based chunking)

In [17]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# Load the markdown file in "elements" mode. In the recorded output the title
# and each paragraph arrive as separate Documents, each tagged with a
# `category` in its metadata.
loader = UnstructuredMarkdownLoader(FILE_PATH, mode="elements")
documents = loader.load()

# Print every loaded element framed by separator rules.
for sec in documents:
    print("--------", sec, "--------", sep="\n")

--------
page_content='No Sale Countries' metadata={'source': '../no_sale_countries.md', 'category_depth': 0, 'last_modified': '2024-07-27T12:52:04', 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': '..', 'filename': 'no_sale_countries.md', 'category': 'Title'}
--------
--------
page_content='As part of our commitment to ethical business practices and compliance with international regulations, [Your Company Name] has identified certain countries where we will not conduct sales. This decision is based on a combination of factors including but not limited to legal restrictions, ethical concerns, and market conditions.' metadata={'source': '../no_sale_countries.md', 'last_modified': '2024-07-27T12:52:04', 'languages': ['eng'], 'parent_id': 'f61d476babc2c078e3029a4bebacbb3f', 'filetype': 'text/markdown', 'file_directory': '..', 'filename': 'no_sale_countries.md', 'category': 'NarrativeText'}
--------
--------
page_content='The following countries are on our no sale list

## Langchain MarkdownHeaderTextSplitter (chunking)

In [21]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Read the document explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(FILE_PATH, 'r', encoding='utf-8') as file:
    markdown_document = file.read()

# (markdown prefix, metadata key) pairs: the heading levels to split on and the
# name passed to the splitter for each level.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)

for chunk in md_header_splits:
    print("--------")
    print(chunk)
    print("--------")

--------
page_content='As part of our commitment to ethical business practices and compliance with international regulations, [Your Company Name] has identified certain countries where we will not conduct sales. This decision is based on a combination of factors including but not limited to legal restrictions, ethical concerns, and market conditions.  
The following countries are on our no sale list, along with the specific reasons for each designation:  
1. Spain
- Reason: Compliance with Local Regulations
- Spain has recently implemented stringent regulations on the sale of specific categories of products that we manufacture. Our current product lines do not meet the new regulatory requirements, and bringing them into compliance would require significant changes to our production process and supply chain, resulting in unsustainable costs.  
2. Italy
- Reason: Unstable Economic Environment
- The ongoing economic instability in Italy poses a high risk for business operations. The fluct