# LangChain data chunking example

This notebook uses LangChain's recursive character text splitter to chunk text. Source files are large PDFs loaded using PyPDFLoader.

The notebook complements the [Chunking large documents for vector search solutions](https://learn.microsoft.com/azure/search/vector-search-how-to-chunk-document) article in the Azure AI Search documentation.


### Install packages

In [None]:
# Install the packages listed in requirements.txt; the %pip magic (rather than !pip) targets the running kernel's environment.
%pip install --quiet -r requirements.txt

### Load .env file (Copy .env-sample to .env and update accordingly)

In [None]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

load_dotenv() # take environment variables from .env

# Required settings: a missing variable raises KeyError right here, naming the variable.
# variables not used here do not need to be updated in your .env file
search_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_embedding_deployment_id = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYED_MODEL"]
recursivetextsplitter_searchindex = 'chunkingsample-recursivetextsplitter_langchain'

# Optional keys: an absent variable is treated the same as an empty one, so the
# keyless fallbacks below work even when the key is not defined at all.
search_admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY", "")
search_credential = AzureKeyCredential(search_admin_key) if search_admin_key else DefaultAzureCredential()
# None signals "no API key" to the cells that build the Azure OpenAI client and vectorizer.
azure_openai_key = os.environ.get("AZURE_OPENAI_API_KEY") or None

### Setup sample resources for embedding chunks

In [None]:
from openai import AzureOpenAI
# get_bearer_token_provider is needed by the keyless branch below; leaving it
# un-imported makes that branch fail with NameError.
from azure.identity import DefaultAzureCredential, get_bearer_token_provider

# Build the embeddings client: use the API key when one was configured,
# otherwise authenticate with a Microsoft Entra ID bearer token.
azure_openai_client = None
if azure_openai_key:
    azure_openai_client = AzureOpenAI(
        api_key=azure_openai_key,
        api_version="2023-05-15",
        azure_deployment=azure_openai_embedding_deployment_id,
        azure_endpoint=azure_openai_endpoint)
else:
    azure_openai_client = AzureOpenAI(
        azure_ad_token_provider=get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"),
        api_version="2023-05-15",
        azure_deployment=azure_openai_embedding_deployment_id,
        azure_endpoint=azure_openai_endpoint)

### Setup sample resources for recursive text splitter chunking

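The cells below rely on the preview release of the `azure-search-documents` package. If a different version is installed, replace it by running the following commands in a code cell:

```
! pip uninstall -y azure-search-documents
! pip install azure-search-documents==11.6.0b2
```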

In [None]:
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchIndexer,
    SearchIndexerDataSourceConnection,
    SearchIndexerDataContainer,
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    AzureOpenAIEmbeddingSkill,
    SplitSkill,
    VectorSearchProfile
)
# Required to use the preview SDK
from azure.search.documents.indexes._generated.models import (
    SearchIndexerSkillset,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    InputFieldMappingEntry,
    OutputFieldMappingEntry
)

import tiktoken
import matplotlib.pyplot as plt
import math
import numpy as np

In [None]:
# Field definitions for the chunk index: one document per chunk.
search_field = [
    # Document key; the "keyword" analyzer keeps the whole id as a single token.
    SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
        hidden=False,
        filterable=True,
        sortable=True,
        facetable=False,
        searchable=True,
        analyzer_name="keyword"
    ),
    # Identifier of the document the chunk came from.
    SearchField(
        name="parent_id",
        type=SearchFieldDataType.String,
        hidden=False,
        filterable=True,
        sortable=True,
        facetable=False,
        searchable=True
    ),
    # Chunk text (full-text searchable only).
    SearchField(
        name="chunk",
        type=SearchFieldDataType.String,
        hidden=False,
        filterable=False,
        sortable=False,
        facetable=False,
        searchable=True
    ),
    # Title of the source document.
    SearchField(
        name="title",
        type=SearchFieldDataType.String,
        hidden=False,
        filterable=False,
        sortable=False,
        facetable=False,
        searchable=True
    ),
    # Embedding of the chunk text.
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        hidden=False,
        filterable=False,
        sortable=False,
        facetable=False,
        searchable=True,
        # Must equal the length of the vectors produced by the embedding
        # deployment (assumed to be a 1536-dimension model - TODO confirm).
        vector_search_dimensions=1536,
        # SearchField reads this keyword as `vector_search_profile_name`; the
        # older spelling `vector_search_profile` is not recognised, which would
        # leave the vector field without a profile. "profile" must match the
        # name of the VectorSearchProfile defined for the index.
        vector_search_profile_name="profile"
    )
]

In [None]:
# Names shared by the profile and the components it refers to. Keeping each
# name in one place prevents the profile from pointing at an algorithm or
# vectorizer that is not defined (the profile previously referenced
# "hnsw-algorithm" while the algorithm was registered as "myHnsw").
hnsw_algorithm_name = "hnsw-algorithm"
openai_vectorizer_name = "azure-openai-vectorizer"

vector_search = VectorSearch(
    # The profile ties a vector field to an algorithm configuration and a vectorizer.
    profiles=[
        VectorSearchProfile(
            name="profile",
            algorithm_configuration_name=hnsw_algorithm_name,
            vectorizer=openai_vectorizer_name
        )
    ],
    algorithms=[
        HnswAlgorithmConfiguration(
            name=hnsw_algorithm_name,
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        )
    ],
    # Query-time vectorizer backed by the same Azure OpenAI embedding deployment.
    vectorizers=[
        AzureOpenAIVectorizer(
            name=openai_vectorizer_name,
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=azure_openai_endpoint,
                deployment_id=azure_openai_embedding_deployment_id,
                api_key=azure_openai_key # Optional if using RBAC authentication
            )
        )
    ]
)

In [None]:
from azure.search.documents.indexes import SearchIndexClient

# Client used for index management against the configured search service.
search_index_client = SearchIndexClient(
    endpoint=search_endpoint,
    credential=search_credential,
)

# Index definition: the fields plus the vector search configuration built in earlier cells.
index_definition = {
    "name": recursivetextsplitter_searchindex,
    "fields": search_field,
    "vector_search": vector_search,
}
rts_searchindex = SearchIndex(**index_definition)

# Send the definition to the service.
search_index_client.create_or_update_index(rts_searchindex)

print("Created recursive text splitter index")


### Load PDF

In [None]:
from langchain_community.document_loaders import PyPDFLoader
import os

# Location of the sample PDF, relative to the notebook's working directory.
pdf_path = os.path.join("data", "roa-barista.pdf")

# Load the PDF into a list of LangChain documents.
loader = PyPDFLoader(pdf_path)
pages = loader.load()


In [None]:
# Number of documents returned by loader.load() (PyPDFLoader is expected to yield one per PDF page); shown as the cell output.
len(pages)

### Generate histogram of token and character lengths per page

In [None]:
def plot_chunk_histogram(chunks, length_fn, title, xlabel, ylabel="Chunk Count"):
    """Draw a 25-bin histogram of ``length_fn(chunk)`` for every item in ``chunks``.

    The x-axis ticks and limits are snapped to multiples of a tick step (at
    least 100) and the y-axis is capped at the tallest bar.

    Args:
        chunks: Iterable of items to measure (for example page or chunk texts).
        length_fn: Callable returning a numeric length for one item.
        title: Plot title.
        xlabel: Label for the x-axis.
        ylabel: Label for the y-axis.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    def round_to_lowest_multiple(number, multiple):
        return (number // multiple) * multiple

    def round_to_highest_multiple(number, multiple):
        return math.ceil(number / multiple) * multiple

    ys = [length_fn(chunk) for chunk in chunks]
    if not ys:
        raise ValueError("plot_chunk_histogram requires at least one chunk")
    min_y = min(ys)
    max_y = max(ys)
    bins = 25
    n, _, _ = plt.hist(ys, edgecolor="black", bins=bins)
    # Set y-axis limits to remove the gap at the top
    max_freq = max(n)
    plt.ylim(0, max_freq)

    # Spacing for ticks on x-axis and x-axis limits to remove gaps
    tick_step = max(int(round_to_lowest_multiple((max_y - min_y) / 5, 100)), 100)
    min_xtick = round_to_lowest_multiple(min_y, tick_step)
    max_xtick = round_to_highest_multiple(max_y, tick_step)
    if max_xtick <= min_xtick:
        # All lengths sit on the same multiple of tick_step: widen the range by
        # one step so the axis is not zero-width and the tick list is not empty.
        max_xtick = min_xtick + tick_step
    xticks = list(np.arange(start=min_xtick, stop=max_xtick, step=tick_step))
    # np.arange excludes its stop value, so add the closing tick explicitly.
    if not xticks or xticks[-1] != max_xtick:
        xticks.append(max_xtick)
    plt.xticks(xticks)
    plt.xlim(min_xtick, max_xtick)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()
    
def get_token_length(text, model="gpt-3.5-turbo"):
    """Return how many tokens ``text`` encodes to with the tiktoken encoding for ``model``."""
    encoding = tiktoken.encoding_for_model(model)
    token_ids = encoding.encode(text)
    return len(token_ids)

In [None]:
%matplotlib inline
page_content = [page.page_content for page in pages]

plot_chunk_histogram(
    chunks=page_content,
    length_fn=len,
    title="Distribution of page character lengths",
    xlabel="Page character length",
    ylabel="Page count")

plot_chunk_histogram(
    chunks=page_content,
    length_fn=get_token_length,
    title="Distribution of page token lengths",
    xlabel="Page token length",
    ylabel="Page count")

### Chunk PDF using Recursive text splitter

We use the output of the above histograms to guide us in selecting a 600 token chunk length with a 125 token overlap.

In [None]:
def get_encoding_name(model="gpt-3.5-turbo"):
    """Return the name of the tiktoken encoding that tiktoken associates with ``model``."""
    encoding = tiktoken.encoding_for_model(model)
    return encoding.name

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# from_tiktoken_encoder measures chunk size and overlap in tokens rather than characters.
splitter_options = {
    "encoding_name": get_encoding_name(),
    "chunk_size": 600,
    "chunk_overlap": 125,
}
recursive_text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_options)

# Split every loaded page into chunks.
recursive_text_splitter_chunks = recursive_text_splitter.split_documents(pages)

In [None]:
# for x, y in zip([pages for pages in page_content], [page_r.page_content for page_r in recursive_text_splitter_chunks]):
#     print(f"Original: {x} |||||||||||||||| Recursive Text Splitter: {y}")
#     print()

### Generate histogram of chunk character and token lengths

In [None]:
chunk_content = [document.page_content for document in recursive_text_splitter_chunks]

# One histogram per way of measuring a chunk: characters first, then tokens.
chunk_histograms = [
    (len, "Distribution of chunk character lengths", "Chunk character length"),
    (get_token_length, "Distribution of chunk token lengths", "Chunk token length"),
]
for measure, heading, axis_label in chunk_histograms:
    plot_chunk_histogram(
        chunks=chunk_content,
        length_fn=measure,
        title=heading,
        xlabel=axis_label)

### Embed Recursive text splitter chunks

In [None]:
# Embed the chunks in batches instead of a single request: a large PDF can
# produce more chunks than the embeddings endpoint accepts per call.
# NOTE(review): the per-request input cap depends on the model and API
# version; 16 is a conservative value - raise it if your deployment allows.
EMBEDDING_BATCH_SIZE = 16

# One embedding vector per entry of chunk_content, in the same order.
recursive_text_splitter_embeddings = []
for batch_start in range(0, len(chunk_content), EMBEDDING_BATCH_SIZE):
    chunk_batch = chunk_content[batch_start:batch_start + EMBEDDING_BATCH_SIZE]
    embedding_response = azure_openai_client.embeddings.create(
        input=chunk_batch,
        model=azure_openai_embedding_deployment_id)
    recursive_text_splitter_embeddings.extend(
        result.embedding for result in embedding_response.data)

### Upload chunks to search index

In [None]:
import os
import re

recursive_search_client = search_index_client.get_search_client(recursivetextsplitter_searchindex)

# Every chunk needs exactly one embedding; fail early rather than mis-pair them.
if len(recursive_text_splitter_chunks) != len(recursive_text_splitter_embeddings):
    raise ValueError(
        f"Expected one embedding per chunk, got {len(recursive_text_splitter_embeddings)} embeddings "
        f"for {len(recursive_text_splitter_chunks)} chunks")

docs = []
for i, (chunk, embedding) in enumerate(zip(recursive_text_splitter_chunks, recursive_text_splitter_embeddings)):
    # Take the title from the file the chunk was actually loaded from instead of
    # a hard-coded name (presumably the loader records the path under the
    # "source" metadata key - verify for loaders other than PyPDFLoader).
    source_name = os.path.basename(chunk.metadata.get("source", "")) or "unknown-source"
    # Document keys may only contain letters, digits, dashes, underscores and
    # equal signs, so replace anything else (such as the "." in ".pdf").
    key_prefix = re.sub(r"[^A-Za-z0-9_\-=]", "-", source_name)
    docs.append(
        {
            "parent_id": "0",
            "chunk_id": f"{key_prefix}_0_0_{i}",
            "chunk": chunk.page_content,
            "title": source_name,
            "vector": embedding
        }
    )

upload_results = recursive_search_client.upload_documents(docs)

# upload_documents reports per-document outcomes; surface failures instead of
# printing success unconditionally.
failed_keys = [result.key for result in upload_results if not result.succeeded]
if failed_keys:
    raise RuntimeError(f"Failed to upload {len(failed_keys)} of {len(docs)} chunks: {failed_keys}")

print("Uploaded chunks and embeddings for recursive text splitter")