In [1]:
import pickle
import re
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [2]:
# Load the saved documents
with open("langchain_docs.pkl", "rb") as f:
    docs = pickle.load(f)

print(f"Loaded {len(docs)} documents")
print(f"Sample Document:\n{docs[0].page_content[:500]}")

Loaded 403 documents
Sample Document:
hide_table_of_contents: true

import People from "@theme/People";

People

There are some incredible humans from all over the world who have been instrumental in helping the LangChain community flourish 🌐!

This page highlights a few of those folks who have dedicated their time to the open-source repo in the form of direct contributions and reviews.

Top reviewers

As LangChain has grown, the amount of surface area that maintainers cover has grown as well.

Thank you to the following folks who h


In [3]:
#to clean the text
def clean_text(text):
    """Removes JSX components, HTML tags, special characters, and extra spaces."""
    
    # Remove HTML & JSX-like components
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()
    
    # Remove Markdown-style headers & links
    text = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", text)  # Remove links [text](url)
    text = re.sub(r"#{1,6}\s*", "", text)  # Remove markdown headers
    
    # Remove code blocks & inline code
    text = re.sub(r"`{1,3}.*?`{1,3}", "", text, flags=re.DOTALL)  
    
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    return text

# Apply cleaning
for doc in docs:
    doc.page_content = clean_text(doc.page_content)

print("Cleaned text successfully!")
print(f"Sample Cleaned Document:\n{docs[0].page_content[:500]}")


Cleaned text successfully!
Sample Cleaned Document:
hide_table_of_contents: true import People from "@theme/People"; People There are some incredible humans from all over the world who have been instrumental in helping the LangChain community flourish 🌐! This page highlights a few of those folks who have dedicated their time to the open-source repo in the form of direct contributions and reviews. Top reviewers As LangChain has grown, the amount of surface area that maintainers cover has grown as well. Thank you to the following folks who have gon


In [4]:
#Split Long Documents into Chunks. LangChain needs smaller chunks for efficient retrieval.
# Define chunking parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Each chunk is 1000 characters
    chunk_overlap=200,  # Overlap to maintain context
    separators=["\n\n", "\n", " ", ""]
)

# Split the documents
split_docs = text_splitter.split_documents(docs)

print(f"Split into {len(split_docs)} chunks!")
print(f"Sample Chunk:\n{split_docs[0].page_content[:500]}")


Split into 1116 chunks!
Sample Chunk:
hide_table_of_contents: true import People from "@theme/People"; People There are some incredible humans from all over the world who have been instrumental in helping the LangChain community flourish 🌐! This page highlights a few of those folks who have dedicated their time to the open-source repo in the form of direct contributions and reviews. Top reviewers As LangChain has grown, the amount of surface area that maintainers cover has grown as well. Thank you to the following folks who have gon


In [5]:
# Save preprocessed documents
with open("preprocessed_docs.pkl", "wb") as f:
    pickle.dump(split_docs, f)

print("Preprocessed documents saved successfully!")


Preprocessed documents saved successfully!
