In [1]:
from langchain.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from pathlib import Path
from bs4 import BeautifulSoup

In [2]:
# Strip HTML-like markup from .mdx text by running it through an HTML parser
def clean_mdx_text(text):
    """Return the text content of ``text`` with HTML-like tags stripped.

    The input is parsed with BeautifulSoup using the ``html.parser`` backend
    and the parsed tree's text is returned via ``get_text()``. Tag markup is
    dropped; text that sits between or inside tags is kept.

    Args:
        text: Raw string to clean (here: the page content of a loaded .mdx file).

    Returns:
        The text extracted from the parsed document, as a ``str``.
    """
    return BeautifulSoup(text, "html.parser").get_text()

In [3]:
# Root directory of the locally cloned LangChain docs; it is searched
# recursively for .mdx files when the documents are loaded.
docs_path = "./langchain/docs"  # Update if needed

In [4]:
# Load every .mdx file under docs_path with UnstructuredMarkdownLoader
mdx_files = [*Path(docs_path).rglob("*.mdx")]  # recursive search, materialised as a list

docs_mdx = []
for file_path in mdx_files:
    # The loader is given a plain string path
    loader = UnstructuredMarkdownLoader(f"{file_path}")
    # Keep the first document the loader returns for this file
    doc = loader.load()[0]
    # Replace the content in place with its tag-stripped version
    doc.page_content = clean_mdx_text(doc.page_content)
    docs_mdx.append(doc)

In [5]:
# Only the .mdx documents are loaded, so they make up the whole collection
docs = docs_mdx

print(f"Successfully loaded {len(docs)} documents!")
if docs:
    # Clamp the preview index so that a collection with fewer than 41
    # documents no longer raises IndexError. The path is taken from the first
    # document and the preview from the document at preview_index; these are
    # different documents whenever more than one was loaded.
    preview_index = min(40, len(docs) - 1)
    print(f"Example document path: {docs[0].metadata['source']}")  # Shows file path
    print(f"Cleaned Content Preview: {docs[preview_index].page_content[:500]}")  # First 500 chars
else:
    # Nothing was found under docs_path, so there is nothing to index into
    print("No documents to preview.")

Successfully loaded 403 documents!
Example document path: langchain/docs/docs/people.mdx
Cleaned Content Preview: MODEL_NOT_FOUND

The model name you have specified is not acknowledged by your provider.

Troubleshooting

The following may help resolve this error:

Double check the model string you are passing in.

If you are using a proxy or other alternative host with a model wrapper, confirm that the permitted model names are not restricted or altered.


In [6]:
import pickle

# Serialise the document list and write it to langchain_docs.pkl
# (relative path, so the file lands in the current working directory)
with open("langchain_docs.pkl", mode="wb") as f:
    f.write(pickle.dumps(docs))

print("Documents saved successfully!")


Documents saved successfully!
