diff --git a/src/docs.json b/src/docs.json
index 12c82e9a6..a2ae2cbef 100644
--- a/src/docs.json
+++ b/src/docs.json
@@ -1273,8 +1273,8 @@
"group": "OpenAI",
"pages": [
"oss/javascript/integrations/providers/openai",
- "/oss/javascript/integrations/chat/openai",
- "/oss/javascript/integrations/text_embedding/openai"
+ "oss/javascript/integrations/chat/openai",
+ "oss/javascript/integrations/text_embedding/openai"
]
},
{
@@ -1288,24 +1288,24 @@
"group": "Google",
"pages": [
"oss/javascript/integrations/providers/google",
- "/oss/javascript/integrations/chat/google_generative_ai",
- "/oss/javascript/integrations/text_embedding/google_generativeai"
+ "oss/javascript/integrations/chat/google_generative_ai",
+ "oss/javascript/integrations/text_embedding/google_generativeai"
]
},
{
"group": "AWS",
"pages": [
"oss/javascript/integrations/providers/aws",
- "/oss/javascript/integrations/chat/bedrock",
- "/oss/javascript/integrations/text_embedding/bedrock"
+ "oss/javascript/integrations/chat/bedrock",
+ "oss/javascript/integrations/text_embedding/bedrock"
]
},
{
"group": "Microsoft",
"pages": [
"oss/javascript/integrations/providers/microsoft",
- "/oss/javascript/integrations/chat/azure",
- "/oss/javascript/integrations/text_embedding/azure_openai"
+ "oss/javascript/integrations/chat/azure",
+ "oss/javascript/integrations/text_embedding/azure_openai"
]
},
"oss/javascript/integrations/providers/all_providers"
@@ -1329,16 +1329,11 @@
"icon": "database",
"pages": [
"oss/javascript/integrations/retrievers/index",
+ "oss/javascript/integrations/splitters/index",
"oss/javascript/integrations/text_embedding/index",
"oss/javascript/integrations/vectorstores/index",
- {
- "group": "Document loaders",
- "pages": [
- "oss/javascript/integrations/document_loaders/index",
- "oss/javascript/integrations/document_loaders/file_loaders/index",
- "oss/javascript/integrations/document_loaders/web_loaders/index"
- ]
- }
+ "oss/javascript/integrations/document_loaders/index",
+ "oss/javascript/integrations/stores/index"
]
}
]
diff --git a/src/oss/integrations/splitters/character_text_splitter.mdx b/src/oss/integrations/splitters/character_text_splitter.mdx
new file mode 100644
index 000000000..c79f79546
--- /dev/null
+++ b/src/oss/integrations/splitters/character_text_splitter.mdx
@@ -0,0 +1,145 @@
+---
+title: Splitting by character
+---
+
+Character-based splitting is the simplest approach to text splitting. It divides text using a specified character sequence (default: `"\n\n"`), with chunk length measured by the number of characters.
+
+**Key points**:
+1. **How text is split**: by a given character separator.
+2. **How chunk size is measured**: by character count.
+
+:::python
+You can choose between:
+- `.split_text` — returns plain string chunks.
+- `.create_documents` — returns LangChain @[Document] objects, useful when metadata needs to be preserved for downstream tasks.
+:::
+:::js
+You can choose between:
+- `.splitText` — returns plain string chunks.
+- `.createDocuments` — returns LangChain @[Document] objects, useful when metadata needs to be preserved for downstream tasks.
+:::
+
+:::python
+```python
+%pip install -qU langchain-text-splitters
+```
+:::
+:::js
+
+```bash npm
+npm install @langchain/textsplitters
+```
+
+```bash pnpm
+pnpm install @langchain/textsplitters
+```
+
+```bash yarn
+yarn add @langchain/textsplitters
+```
+
+```bash bun
+bun add @langchain/textsplitters
+```
+
+:::
+
+:::python
+```python
+from langchain_text_splitters import CharacterTextSplitter
+
+# Load an example document
+with open("state_of_the_union.txt") as f:
+ state_of_the_union = f.read()
+
+text_splitter = CharacterTextSplitter(
+ separator="\n\n",
+ chunk_size=1000,
+ chunk_overlap=200,
+ length_function=len,
+ is_separator_regex=False,
+)
+texts = text_splitter.create_documents([state_of_the_union])
+print(texts[0])
+```
+```output
+page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'
+```
+:::
+:::js
+```ts
+import { CharacterTextSplitter } from "@langchain/textsplitters";
+import { readFileSync } from "fs";
+
+// Example: read a long document
+const stateOfTheUnion = readFileSync("state_of_the_union.txt", "utf8");
+
+const splitter = new CharacterTextSplitter({
+ separator: "\n\n",
+ chunkSize: 1000,
+ chunkOverlap: 200,
+});
+const texts = await splitter.createDocuments([stateOfTheUnion]);
+console.log(texts[0]);
+```
+```output
+Document {
+ pageContent: 'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'
+}
+```
+:::
+
+:::python
+Use `.create_documents` to propagate metadata associated with each document to the output chunks:
+
+```python
+metadatas = [{"document": 1}, {"document": 2}]
+documents = text_splitter.create_documents(
+ [state_of_the_union, state_of_the_union], metadatas=metadatas
+)
+print(documents[0])
+```
+```output
+page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.' metadata={'document': 1}
+```
+:::
+:::js
+Use `.createDocuments` to propagate metadata associated with each document to the output chunks:
+
+```ts
+const metadatas = [{"document": 1}, {"document": 2}]
+const documents = splitter.createDocuments(
+ [{ pageContent: stateOfTheUnion }, { pageContent: stateOfTheUnion }],
+ { metadatas: metadatas }
+);
+console.log(documents[0]);
+```
+```output
+Document {
+ pageContent: 'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.',
+ metadata: {'document': 1}
+}
+```
+:::
+
+:::python
+Use `.split_text` to obtain the string content directly:
+
+```python
+text_splitter.split_text(state_of_the_union)[0]
+```
+```output
+'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'
+```
+
+:::
+:::js
+Use `.splitText` to obtain the string content directly:
+
+```ts
+(await splitter.splitText(stateOfTheUnion))[0]
+```
+```output
+'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'
+```
+:::
diff --git a/src/oss/integrations/splitters/code_splitter.mdx b/src/oss/integrations/splitters/code_splitter.mdx
new file mode 100644
index 000000000..67faa207f
--- /dev/null
+++ b/src/oss/integrations/splitters/code_splitter.mdx
@@ -0,0 +1,1035 @@
+---
+title: Splitting code
+---
+
+@[RecursiveCharacterTextSplitter] includes pre-built lists of separators that are useful for [splitting text](/oss/integrations/splitters/) in a specific programming language.
+
+:::python
+Supported languages are stored in the `langchain_text_splitters.Language` enum. They include:
+
+```
+"cpp",
+"go",
+"java",
+"kotlin",
+"js",
+"ts",
+"php",
+"proto",
+"python",
+"rst",
+"ruby",
+"rust",
+"scala",
+"swift",
+"markdown",
+"latex",
+"html",
+"sol",
+"csharp",
+"cobol",
+"c",
+"lua",
+"perl",
+"haskell"
+```
+
+To view the list of separators for a given language, pass a value from this enum into
+```python
+RecursiveCharacterTextSplitter.get_separators_for_language
+```
+
+To instantiate a splitter that is tailored for a specific language, pass a value from the enum into
+```python
+RecursiveCharacterTextSplitter.from_language
+```
+:::
+
+:::js
+Supported languages are kept in the `SupportedTextSplitterLanguages` type. They include:
+
+```
+"cpp",
+"go",
+"java",
+"js",
+"php",
+"proto",
+"python",
+"rst",
+"ruby",
+"rust",
+"scala",
+"swift",
+"markdown",
+"latex",
+"html",
+"sol",
+```
+
+To view the list of separators for a given language, pass a value from this list into
+```ts
+RecursiveCharacterTextSplitter.getSeparatorsForLanguage()
+```
+
+To instantiate a splitter that is tailored for a specific language, pass a value from the list into
+```ts
+RecursiveCharacterTextSplitter.fromLanguage()
+```
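+
+For example, a minimal sketch (this assumes `SupportedTextSplitterLanguages` is also exported as a runtime array from `@langchain/textsplitters`):
+
+```ts
+import {
+  RecursiveCharacterTextSplitter,
+  SupportedTextSplitterLanguages,
+} from "@langchain/textsplitters";
+
+// View the full list of supported languages
+console.log(SupportedTextSplitterLanguages);
+
+// View the separators used for a given language
+console.log(RecursiveCharacterTextSplitter.getSeparatorsForLanguage("python"));
+```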
+:::
+
+Below we demonstrate examples for the various languages.
+
+:::python
+```python
+%pip install -qU langchain-text-splitters
+```
+:::
+:::js
+
+```bash npm
+npm install @langchain/textsplitters
+```
+
+```bash pnpm
+pnpm install @langchain/textsplitters
+```
+
+```bash yarn
+yarn add @langchain/textsplitters
+```
+
+```bash bun
+bun add @langchain/textsplitters
+```
+
+:::
+
+:::python
+```python
+from langchain_text_splitters import (
+ Language,
+ RecursiveCharacterTextSplitter,
+)
+```
+
+To view the full list of supported languages:
+
+
+```python
+[e.value for e in Language]
+```
+
+
+
+```output
+['cpp',
+ 'go',
+ 'java',
+ 'kotlin',
+ 'js',
+ 'ts',
+ 'php',
+ 'proto',
+ 'python',
+ 'rst',
+ 'ruby',
+ 'rust',
+ 'scala',
+ 'swift',
+ 'markdown',
+ 'latex',
+ 'html',
+ 'sol',
+ 'csharp',
+ 'cobol',
+ 'c',
+ 'lua',
+ 'perl',
+ 'haskell',
+ 'elixir',
+ 'powershell',
+ 'visualbasic6']
+```
+
+
+You can also see the separators used for a given language:
+
+
+```python
+RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)
+```
+
+
+
+```output
+['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']
+```
+:::
+
+## Python
+:::python
+
+Here's an example using the Python text splitter:
+
+```python
+PYTHON_CODE = """
+def hello_world():
+ print("Hello, World!")
+
+# Call the function
+hello_world()
+"""
+python_splitter = RecursiveCharacterTextSplitter.from_language(
+ language=Language.PYTHON, chunk_size=50, chunk_overlap=0
+)
+python_docs = python_splitter.create_documents([PYTHON_CODE])
+python_docs
+```
+
+
+
+```output
+[Document(metadata={}, page_content='def hello_world():\n print("Hello, World!")'),
+ Document(metadata={}, page_content='# Call the function\nhello_world()')]
+```
+:::
+:::js
+Here's an example using the Python text splitter:
+
+```ts
+import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
+
+const PYTHON_CODE = `
+def hello_world():
+    print("Hello, World!")
+
+# Call the function
+hello_world()
+`;
+
+const pythonSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+  "python",
+  { chunkSize: 50, chunkOverlap: 0 }
+);
+const pythonDocs = await pythonSplitter.createDocuments([PYTHON_CODE]);
+console.log(pythonDocs);
+```
+```output
+[
+ Document { metadata: {}, pageContent: 'def hello_world():\n print("Hello, World!")' },
+ Document { metadata: {}, pageContent: '# Call the function\nhello_world()' }
+]
+```
+:::
+
+## JS
+:::python
+Here's an example using the JS text splitter:
+
+```python
+JS_CODE = """
+function helloWorld() {
+ console.log("Hello, World!");
+}
+
+// Call the function
+helloWorld();
+"""
+
+js_splitter = RecursiveCharacterTextSplitter.from_language(
+ language=Language.JS, chunk_size=60, chunk_overlap=0
+)
+js_docs = js_splitter.create_documents([JS_CODE])
+js_docs
+```
+
+```output
+[Document(metadata={}, page_content='function helloWorld() {\n console.log("Hello, World!");\n}'),
+ Document(metadata={}, page_content='// Call the function\nhelloWorld();')]
+```
+:::
+:::js
+Here's an example using the JS text splitter:
+
+```ts
+const JS_CODE = `
+function helloWorld() {
+ console.log("Hello, World!");
+}
+
+// Call the function
+helloWorld();
+`;
+
+const jsSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+ "js",
+ { chunkSize: 60, chunkOverlap: 0 }
+);
+const jsDocs = await jsSplitter.createDocuments([JS_CODE]);
+console.log(jsDocs);
+```
+```output
+[
+ Document { metadata: {}, pageContent: 'function helloWorld() {\n console.log("Hello, World!");\n}' },
+ Document { metadata: {}, pageContent: '// Call the function\nhelloWorld()' }
+]
+```
+:::
+
+## TS
+:::python
+Here's an example using the TypeScript text splitter:
+
+```python
+TS_CODE = """
+function helloWorld(): void {
+ console.log("Hello, World!");
+}
+
+// Call the function
+helloWorld();
+"""
+
+ts_splitter = RecursiveCharacterTextSplitter.from_language(
+ language=Language.TS, chunk_size=60, chunk_overlap=0
+)
+ts_docs = ts_splitter.create_documents([TS_CODE])
+ts_docs
+```
+
+```output
+[Document(metadata={}, page_content='function helloWorld(): void {'),
+ Document(metadata={}, page_content='console.log("Hello, World!");\n}'),
+ Document(metadata={}, page_content='// Call the function\nhelloWorld();')]
+```
+:::
+:::js
+Here's an example using the TypeScript text splitter:
+
+```ts
+const TS_CODE = `
+function helloWorld(): void {
+ console.log("Hello, World!");
+}
+
+// Call the function
+helloWorld();
+`;
+
+const tsSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+ "ts",
+ { chunkSize: 60, chunkOverlap: 0 }
+);
+const tsDocs = await tsSplitter.createDocuments([TS_CODE]);
+console.log(tsDocs);
+```
+```output
+[
+ Document { metadata: {}, pageContent: 'function helloWorld(): void {' },
+ Document { metadata: {}, pageContent: 'console.log("Hello, World!");\n}' },
+ Document { metadata: {}, pageContent: '// Call the function\nhelloWorld()' }
+]
+```
+:::
+
+## Markdown
+:::python
+Here's an example using the Markdown text splitter:
+
+```python
+markdown_text = """
+# 🦜️🔗 LangChain
+
+⚡ Building applications with LLMs through composability ⚡
+
+## What is LangChain?
+
+# Hopefully this code block isn't split
+LangChain is a framework for...
+
+As an open-source project in a rapidly developing field, we are extremely open to contributions.
+"""
+```
+
+```python
+md_splitter = RecursiveCharacterTextSplitter.from_language(
+ language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0
+)
+md_docs = md_splitter.create_documents([markdown_text])
+md_docs
+```
+
+```output
+[Document(metadata={}, page_content='# 🦜️🔗 LangChain'),
+ Document(metadata={}, page_content='⚡ Building applications with LLMs through composability ⚡'),
+ Document(metadata={}, page_content='## What is LangChain?'),
+ Document(metadata={}, page_content="# Hopefully this code block isn't split"),
+ Document(metadata={}, page_content='LangChain is a framework for...'),
+ Document(metadata={}, page_content='As an open-source project in a rapidly developing field, we'),
+ Document(metadata={}, page_content='are extremely open to contributions.')]
+```
+:::
+:::js
+Here's an example using the Markdown text splitter:
+
+```ts
+const markdownText = `
+# 🦜️🔗 LangChain
+
+⚡ Building applications with LLMs through composability ⚡
+
+## What is LangChain?
+
+# Hopefully this code block isn't split
+LangChain is a framework for...
+
+As an open-source project in a rapidly developing field, we are extremely open to contributions.
+`;
+
+const mdSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+ "markdown",
+ { chunkSize: 60, chunkOverlap: 0 }
+);
+const mdDocs = await mdSplitter.createDocuments([markdownText]);
+console.log(mdDocs);
+```
+```output
+[
+ Document { metadata: {}, pageContent: '# 🦜️🔗 LangChain' },
+ Document { metadata: {}, pageContent: '⚡ Building applications with LLMs through composability ⚡' },
+ Document { metadata: {}, pageContent: '## What is LangChain?' },
+ Document { metadata: {}, pageContent: "# Hopefully this code block isn't split" },
+ Document { metadata: {}, pageContent: 'LangChain is a framework for...' },
+ Document { metadata: {}, pageContent: 'As an open-source project in a rapidly developing field, we' },
+ Document { metadata: {}, pageContent: 'are extremely open to contributions.' }
+]
+```
+:::
+
+## LaTeX
+:::python
+Here's an example using LaTeX text:
+
+```python
+latex_text = """
+\documentclass{article}
+
+\begin{document}
+
+\maketitle
+
+\section{Introduction}
+Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.
+
+\subsection{History of LLMs}
+The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.
+
+\subsection{Applications of LLMs}
+LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.
+
+\end{document}
+"""
+```
+
+```python
+latex_splitter = RecursiveCharacterTextSplitter.from_language(
+    language=Language.LATEX, chunk_size=60, chunk_overlap=0
+)
+latex_docs = latex_splitter.create_documents([latex_text])
+latex_docs
+```
+
+```output
+[Document(metadata={}, page_content='\\documentclass{article}\n\n\\begin{document}\n\n\\maketitle'),
+ Document(metadata={}, page_content='\\section{Introduction}'),
+ Document(metadata={}, page_content='Large language models (LLMs) are a type of machine learning'),
+ Document(metadata={}, page_content='model that can be trained on vast amounts of text data to'),
+ Document(metadata={}, page_content='generate human-like language. In recent years, LLMs have'),
+ Document(metadata={}, page_content='made significant advances in a variety of natural language'),
+ Document(metadata={}, page_content='processing tasks, including language translation, text'),
+ Document(metadata={}, page_content='generation, and sentiment analysis.'),
+ Document(metadata={}, page_content='\\subsection{History of LLMs}'),
+ Document(metadata={}, page_content='The earliest LLMs were developed in the 1980s and 1990s,'),
+ Document(metadata={}, page_content='but they were limited by the amount of data that could be'),
+ Document(metadata={}, page_content='processed and the computational power available at the'),
+ Document(metadata={}, page_content='time. In the past decade, however, advances in hardware and'),
+ Document(metadata={}, page_content='software have made it possible to train LLMs on massive'),
+ Document(metadata={}, page_content='datasets, leading to significant improvements in'),
+ Document(metadata={}, page_content='performance.'),
+ Document(metadata={}, page_content='\\subsection{Applications of LLMs}'),
+ Document(metadata={}, page_content='LLMs have many applications in industry, including'),
+ Document(metadata={}, page_content='chatbots, content creation, and virtual assistants. They'),
+ Document(metadata={}, page_content='can also be used in academia for research in linguistics,'),
+ Document(metadata={}, page_content='psychology, and computational linguistics.'),
+ Document(metadata={}, page_content='\\end{document}')]
+```
+:::
+:::js
+Here's an example using LaTeX text:
+
+```ts
+const latexText = `
+\\documentclass{article}
+
+\\begin{document}
+
+\\maketitle
+
+\\section{Introduction}
+Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.
+
+\\subsection{History of LLMs}
+The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.
+
+\\subsection{Applications of LLMs}
+LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.
+
+\\end{document}
+`;
+
+const latexSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+ "latex",
+ { chunkSize: 60, chunkOverlap: 0 }
+);
+const latexDocs = await latexSplitter.createDocuments([latexText]);
+console.log(latexDocs);
+```
+```output
+[
+ Document { metadata: {}, pageContent: '\\documentclass{article}\n\n\\begin{document}\n\n\\maketitle' },
+ Document { metadata: {}, pageContent: '\\section{Introduction}' },
+ Document { metadata: {}, pageContent: 'Large language models (LLMs) are a type of machine learning' },
+ Document { metadata: {}, pageContent: 'model that can be trained on vast amounts of text data to' },
+ Document { metadata: {}, pageContent: 'generate human-like language. In recent years, LLMs have' },
+ Document { metadata: {}, pageContent: 'made significant advances in a variety of natural language' },
+ Document { metadata: {}, pageContent: 'processing tasks, including language translation, text' },
+ Document { metadata: {}, pageContent: 'generation, and sentiment analysis.' },
+ Document { metadata: {}, pageContent: '\\subsection{History of LLMs}' },
+ Document { metadata: {}, pageContent: 'The earliest LLMs were developed in the 1980s and 1990s,' },
+ Document { metadata: {}, pageContent: 'but they were limited by the amount of data that could be' },
+ Document { metadata: {}, pageContent: 'processed and the computational power available at the' },
+ Document { metadata: {}, pageContent: 'time. In the past decade, however, advances in hardware and' },
+ Document { metadata: {}, pageContent: 'software have made it possible to train LLMs on massive' },
+ Document { metadata: {}, pageContent: 'datasets, leading to significant improvements in' },
+ Document { metadata: {}, pageContent: 'performance.' },
+ Document { metadata: {}, pageContent: '\\subsection{Applications of LLMs}' },
+ Document { metadata: {}, pageContent: 'LLMs have many applications in industry, including' },
+ Document { metadata: {}, pageContent: 'chatbots, content creation, and virtual assistants. They' },
+ Document { metadata: {}, pageContent: 'can also be used in academia for research in linguistics,' },
+ Document { metadata: {}, pageContent: 'psychology, and computational linguistics.' },
+ Document { metadata: {}, pageContent: '\\end{document}' }
+]
+```
+:::
+
+## HTML
+:::python
+Here's an example using an HTML text splitter:
+
+```python
+html_text = """
+
+
+
+ 🦜️🔗 LangChain
+
+
+
+
+
🦜️🔗 LangChain
+
⚡ Building applications with LLMs through composability ⚡
+
+
+ As an open-source project in a rapidly developing field, we are extremely open to contributions.
+
+
+
+"""
+```
+
+```python
+html_splitter = RecursiveCharacterTextSplitter.from_language(
+ language=Language.HTML, chunk_size=60, chunk_overlap=0
+)
+html_docs = html_splitter.create_documents([html_text])
+html_docs
+```
+
+```output
+[Document(metadata={}, page_content='<!DOCTYPE html>\n<html>'),
+ Document(metadata={}, page_content='<head>\n        <title>🦜️🔗 LangChain</title>\n    </head>'),
+ Document(metadata={}, page_content='<body>\n        <div>'),
+ Document(metadata={}, page_content='<h1>🦜️🔗 LangChain</h1>'),
+ Document(metadata={}, page_content='<p>⚡ Building applications with LLMs through composability ⚡'),
+ Document(metadata={}, page_content='</p>\n        </div>\n        <div>'),
+ Document(metadata={}, page_content='As an open-source project in a rapidly dev'),
+ Document(metadata={}, page_content='eloping field, we are extremely open to contributions.'),
+ Document(metadata={}, page_content='</div>\n    </body>\n</html>')]
+```
+:::
+:::js
+Here's an example using an HTML text splitter:
+
+```ts
+const htmlText = `
+<!DOCTYPE html>
+<html>
+    <head>
+        <title>🦜️🔗 LangChain</title>
+    </head>
+    <body>
+        <div>
+            <h1>🦜️🔗 LangChain</h1>
+            <p>⚡ Building applications with LLMs through composability ⚡</p>
+        </div>
+        <div>
+            As an open-source project in a rapidly developing field, we are extremely open to contributions.
+        </div>
+    </body>
+</html>
+`;
+
+const htmlSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+ "html",
+ { chunkSize: 60, chunkOverlap: 0 }
+);
+const htmlDocs = await htmlSplitter.createDocuments([htmlText]);
+console.log(htmlDocs);
+```
+```output
+[
+  Document { metadata: {}, pageContent: '<!DOCTYPE html>\n<html>' },
+  Document { metadata: {}, pageContent: '<head>\n        <title>🦜️🔗 LangChain</title>\n    </head>' },
+  Document { metadata: {}, pageContent: '<body>\n        <div>' },
+  Document { metadata: {}, pageContent: '<h1>🦜️🔗 LangChain</h1>' },
+  Document { metadata: {}, pageContent: '<p>⚡ Building applications with LLMs through composability ⚡' },
+  Document { metadata: {}, pageContent: '</p>\n        </div>\n        <div>' },
+  Document { metadata: {}, pageContent: 'As an open-source project in a rapidly dev' },
+  Document { metadata: {}, pageContent: 'eloping field, we are extremely open to contributions.' },
+  Document { metadata: {}, pageContent: '</div>\n    </body>\n</html>' }
+]
+```
+:::
+
+## Solidity
+:::python
+Here's an example using the Solidity text splitter:
+
+```python
+SOL_CODE = """
+pragma solidity ^0.8.20;
+contract HelloWorld {
+ function add(uint a, uint b) pure public returns(uint) {
+ return a + b;
+ }
+}
+"""
+
+sol_splitter = RecursiveCharacterTextSplitter.from_language(
+ language=Language.SOL, chunk_size=128, chunk_overlap=0
+)
+sol_docs = sol_splitter.create_documents([SOL_CODE])
+sol_docs
+```
+
+```output
+[Document(metadata={}, page_content='pragma solidity ^0.8.20;'),
+ Document(metadata={}, page_content='contract HelloWorld {\n function add(uint a, uint b) pure public returns(uint) {\n return a + b;\n }\n}')]
+```
+:::
+:::js
+Here's an example using the Solidity text splitter:
+
+```ts
+const SOL_CODE = `
+pragma solidity ^0.8.20;
+contract HelloWorld {
+ function add(uint a, uint b) pure public returns(uint) {
+ return a + b;
+ }
+}
+`;
+
+const solSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+ "sol",
+ { chunkSize: 128, chunkOverlap: 0 }
+);
+const solDocs = await solSplitter.createDocuments([SOL_CODE]);
+console.log(solDocs);
+```
+```output
+[
+ Document { metadata: {}, pageContent: 'pragma solidity ^0.8.20;' },
+ Document { metadata: {}, pageContent: 'contract HelloWorld {\n function add(uint a, uint b) pure public returns(uint) {\n return a + b;\n }\n}' }
+]
+```
+:::
+
+## C#
+:::python
+Here's an example using the C# text splitter:
+
+```python
+C_CODE = """
+using System;
+class Program
+{
+ static void Main()
+ {
+ int age = 30; // Change the age value as needed
+
+ // Categorize the age without any console output
+ if (age < 18)
+ {
+ // Age is under 18
+ }
+ else if (age >= 18 && age < 65)
+ {
+ // Age is an adult
+ }
+ else
+ {
+ // Age is a senior citizen
+ }
+ }
+}
+"""
+c_splitter = RecursiveCharacterTextSplitter.from_language(
+ language=Language.CSHARP, chunk_size=128, chunk_overlap=0
+)
+c_docs = c_splitter.create_documents([C_CODE])
+c_docs
+```
+
+```output
+[Document(metadata={}, page_content='using System;'),
+ Document(metadata={}, page_content='class Program\n{\n static void Main()\n {\n int age = 30; // Change the age value as needed'),
+ Document(metadata={}, page_content='// Categorize the age without any console output\n if (age < 18)\n {\n // Age is under 18'),
+ Document(metadata={}, page_content='}\n else if (age >= 18 && age < 65)\n {\n // Age is an adult\n }\n else\n {'),
+ Document(metadata={}, page_content='// Age is a senior citizen\n }\n }\n}')]
+```
+:::
+:::js
+Here's an example using the C# text splitter:
+
+```ts
+const C_CODE = `
+using System;
+class Program
+{
+ static void Main()
+ {
+ int age = 30; // Change the age value as needed
+
+ // Categorize the age without any console output
+ if (age < 18)
+ {
+ // Age is under 18
+ }
+ else if (age >= 18 && age < 65)
+ {
+ // Age is an adult
+ }
+ else
+ {
+ // Age is a senior citizen
+ }
+ }
+}
+`;
+
+const csharpSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+ "csharp",
+ { chunkSize: 128, chunkOverlap: 0 }
+);
+const csharpDocs = await csharpSplitter.createDocuments([C_CODE]);
+console.log(csharpDocs);
+```
+```output
+[
+ Document { metadata: {}, pageContent: 'using System;' },
+ Document { metadata: {}, pageContent: 'class Program\n{\n static void Main()\n {\n int age = 30; // Change the age value as needed' },
+ Document { metadata: {}, pageContent: '// Categorize the age without any console output\n if (age < 18)\n {\n // Age is under 18' },
+ Document { metadata: {}, pageContent: '}\n else if (age >= 18 && age < 65)\n {\n // Age is an adult\n }\n else\n {' },
+ Document { metadata: {}, pageContent: '// Age is a senior citizen\n }\n }\n}' }
+]
+```
+:::
+
+## Haskell
+:::python
+Here's an example using the Haskell text splitter:
+
+```python
+HASKELL_CODE = """
+main :: IO ()
+main = do
+ putStrLn "Hello, World!"
+-- Some sample functions
+add :: Int -> Int -> Int
+add x y = x + y
+"""
+haskell_splitter = RecursiveCharacterTextSplitter.from_language(
+ language=Language.HASKELL, chunk_size=50, chunk_overlap=0
+)
+haskell_docs = haskell_splitter.create_documents([HASKELL_CODE])
+haskell_docs
+```
+
+```output
+[Document(metadata={}, page_content='main :: IO ()'),
+ Document(metadata={}, page_content='main = do\n putStrLn "Hello, World!"\n-- Some'),
+ Document(metadata={}, page_content='sample functions\nadd :: Int -> Int -> Int\nadd x y'),
+ Document(metadata={}, page_content='= x + y')]
+```
+:::
+:::js
+Here's an example using the Haskell text splitter:
+
+```ts
+const HASKELL_CODE = `
+main :: IO ()
+main = do
+ putStrLn "Hello, World!"
+-- Some sample functions
+add :: Int -> Int -> Int
+add x y = x + y
+`;
+
+const haskellSplitter = RecursiveCharacterTextSplitter.fromLanguage(
+ "haskell",
+ { chunkSize: 50, chunkOverlap: 0 }
+);
+const haskellDocs = await haskellSplitter.createDocuments([HASKELL_CODE]);
+console.log(haskellDocs);
+```
+```output
+[
+ Document { metadata: {}, pageContent: 'main :: IO ()' },
+ Document { metadata: {}, pageContent: 'main = do\n putStrLn "Hello, World!"\n-- Some' },
+ Document { metadata: {}, pageContent: 'sample functions\nadd :: Int -> Int -> Int\nadd x y' },
+ Document { metadata: {}, pageContent: '= x + y' }
+]
+```
+:::
+
+## PHP
+:::python
+Here's an example using the PHP text splitter:
+
+```python
+PHP_CODE = """
## Text structure-based
+
Text is naturally organized into hierarchical units such as paragraphs, sentences, and words. We can leverage this inherent structure to inform our splitting strategy, creating splits that maintain natural language flow, preserve semantic coherence within each split, and adapt to varying levels of text granularity. LangChain's `RecursiveCharacterTextSplitter` implements this concept:
- The [RecursiveCharacterTextSplitter](/oss/integrations/splitters/recursive_text_splitter) attempts to keep larger units (e.g., paragraphs) intact.
@@ -19,12 +20,22 @@ Text is naturally organized into hierarchical units such as paragraphs, sentence
Example usage:
+:::python
```python
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_text(document)
```
+:::
+:::js
+```ts
+import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
+
+const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 100, chunkOverlap: 0 })
+const texts = await splitter.splitText(document)
+```
+:::
**Available text splitters**:
- [Recursively split text](/oss/integrations/splitters/recursive_text_splitter)
@@ -44,6 +55,7 @@ Types of length-based splitting:
Example implementation using LangChain's CharacterTextSplitter with token-based splitting:
+:::python
```python
from langchain_text_splitters import CharacterTextSplitter
@@ -52,6 +64,15 @@ text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
)
texts = text_splitter.split_text(document)
```
+:::
+:::js
+```ts
+import { TokenTextSplitter } from "@langchain/textsplitters";
+
+const splitter = new TokenTextSplitter({ encodingName: "cl100k_base", chunkSize: 100, chunkOverlap: 0 })
+const texts = await splitter.splitText(document)
+```
+:::
**Available text splitters**:
- [Split by tokens](/oss/integrations/splitters/split_by_token)
@@ -65,6 +86,7 @@ Some documents have an inherent structure, such as HTML, Markdown, or JSON files
- Maintains context within each chunk
- Can be more effective for downstream tasks like retrieval or summarization
+:::python
Examples of structure-based splitting:
- Markdown: Split based on headers (e.g., #, ##, ###)
@@ -77,9 +99,16 @@ Examples of structure-based splitting:
- [Split JSON](/oss/integrations/splitters/recursive_json_splitter)
- [Split code](/oss/integrations/splitters/code_splitter)
- [Split HTML](/oss/integrations/splitters/split_html)
+:::
+:::js
+**Available text splitters**:
+- [Split code](/oss/integrations/splitters/code_splitter)
+:::
+:::python
## Provider-specific
+:::
diff --git a/src/oss/python/integrations/splitters/recursive_text_splitter.mdx b/src/oss/integrations/splitters/recursive_text_splitter.mdx
similarity index 70%
rename from src/oss/python/integrations/splitters/recursive_text_splitter.mdx
rename to src/oss/integrations/splitters/recursive_text_splitter.mdx
index b7ffbfc44..c25891a3e 100644
--- a/src/oss/python/integrations/splitters/recursive_text_splitter.mdx
+++ b/src/oss/integrations/splitters/recursive_text_splitter.mdx
@@ -9,15 +9,35 @@ This [text splitter](/oss/integrations/splitters/) is the recommended one for ge
Below we show example usage.
-To obtain the string content directly, use `.split_text`.
+:::python
+```shell
+pip install -qU langchain-text-splitters
+```
+:::
+:::js
+
+```bash npm
+npm install @langchain/textsplitters
+```
-To create LangChain [Document](https://python.langchain.com/api_reference/core/documents/langchain_core.documents.base.Document.html) objects (e.g., for use in downstream tasks), use `.create_documents`.
+```bash pnpm
+pnpm install @langchain/textsplitters
+```
+```bash yarn
+yarn add @langchain/textsplitters
+```
-```shell
-pip install -qU langchain-text-splitters
+```bash bun
+bun add @langchain/textsplitters
```
+
+:::
+:::python
+To obtain the string content directly, use `.split_text`.
+
+To create LangChain @[Document] objects (e.g., for use in downstream tasks), use `.create_documents`.
```python
from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -43,22 +63,46 @@ page_content='of Congress and the Cabinet. Justices of the Supreme Court. My fel
```
```python
-text_splitter.split_text(state_of_the_union)[:2]
+print(text_splitter.split_text(state_of_the_union)[:2])
```
-
-
```output
['Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and',
'of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.']
```
+:::
+
+:::js
+To obtain the string content directly, use `.splitText`.
+
+To create LangChain @[Document] objects (e.g., for use in downstream tasks), use `.createDocuments`.
+```ts
+import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
+
+const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 100, chunkOverlap: 0 })
+const texts = await splitter.createDocuments(["..."])
+```
+```output
+[
+ { pageContent: "...", metadata: {} },
+]
+```
+:::
Let's go through the parameters set above for `RecursiveCharacterTextSplitter`:
+
+:::python
- `chunk_size`: The maximum size of a chunk, where size is determined by the `length_function`.
- `chunk_overlap`: Target overlap between chunks. Overlapping chunks helps to mitigate loss of information when context is divided between chunks.
- `length_function`: Function determining the chunk size.
- `is_separator_regex`: Whether the separator list (defaulting to `["\n\n", "\n", " ", ""]`) should be interpreted as regex.
+:::
+
+:::js
+- `chunkSize`: The maximum size of a chunk, where size is determined by the `lengthFunction`.
+- `chunkOverlap`: Target overlap between chunks. Overlapping chunks helps to mitigate loss of information when context is divided between chunks.
+:::
## Splitting text from languages without word boundaries
@@ -68,7 +112,7 @@ Some writing systems do not have [word boundaries](https://en.wikipedia.org/wiki
* Add [Zero-width space](https://en.wikipedia.org/wiki/Zero-width_space) used in Thai, Myanmar, Khmer, and Japanese.
* Add ASCII comma "`,`", Unicode fullwidth comma "`,`", and Unicode ideographic comma "`、`"
-
+:::python
```python
text_splitter = RecursiveCharacterTextSplitter(
separators=[
@@ -87,3 +131,23 @@ text_splitter = RecursiveCharacterTextSplitter(
# Existing args
)
```
+:::
+:::js
+```ts
+const splitter = new RecursiveCharacterTextSplitter({
+ separators: [
+ "\n\n",
+ "\n",
+ " ",
+ ".",
+ ",",
+ "\u200b", // Zero-width space
+ "\uff0c", // Fullwidth comma
+ "\u3001", // Ideographic comma
+ "\uff0e", // Fullwidth full stop
+ "\u3002", // Ideographic stop
+ "",
+ ],
+});
+```
+:::
diff --git a/src/oss/python/integrations/splitters/split_by_token.mdx b/src/oss/integrations/splitters/split_by_token.mdx
similarity index 85%
rename from src/oss/python/integrations/splitters/split_by_token.mdx
rename to src/oss/integrations/splitters/split_by_token.mdx
index fc18bc46d..2d59c2e46 100644
--- a/src/oss/python/integrations/splitters/split_by_token.mdx
+++ b/src/oss/integrations/splitters/split_by_token.mdx
@@ -4,6 +4,7 @@ title: Splitting by token
Language models have a token limit. You should not exceed the token limit. When you [split your text](/oss/integrations/splitters/) into chunks it is therefore a good idea to count the number of tokens. There are many tokenizers. When you count tokens in your text you should use the same tokenizer as used in the language model.
+:::python
## tiktoken
@@ -16,7 +17,7 @@ We can use `tiktoken` to estimate tokens used. It will probably be more accurate
1. How the text is split: by character passed in.
2. How the chunk size is measured: by `tiktoken` tokenizer.
-[CharacterTextSplitter](https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.CharacterTextSplitter.html), [RecursiveCharacterTextSplitter](https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.RecursiveCharacterTextSplitter.html), and [TokenTextSplitter](https://python.langchain.com/api_reference/text_splitters/base/langchain_text_splitters.base.TokenTextSplitter.html) can be used with `tiktoken` directly.
+@[CharacterTextSplitter], @[RecursiveCharacterTextSplitter], and @[TokenTextSplitter] can be used with `tiktoken` directly.
```python
@@ -32,7 +33,7 @@ with open("state_of_the_union.txt") as f:
state_of_the_union = f.read()
```
-To split with a [CharacterTextSplitter](https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.CharacterTextSplitter.html) and then merge chunks with `tiktoken`, use its `.from_tiktoken_encoder()` method. Note that splits from this method can be larger than the chunk size measured by the `tiktoken` tokenizer.
+To split with a @[CharacterTextSplitter] and then merge chunks with `tiktoken`, use its `.from_tiktoken_encoder()` method. Note that splits from this method can be larger than the chunk size measured by the `tiktoken` tokenizer.
The `.from_tiktoken_encoder()` method takes either `encoding_name` as an argument (e.g. `cl100k_base`), or the `model_name` (e.g. `gpt-4`). All additional arguments like `chunk_size`, `chunk_overlap`, and `separators` are used to instantiate `CharacterTextSplitter`:
@@ -85,7 +86,69 @@ print(texts[0])
Madam Speaker, Madam Vice President, our
```
Some written languages (e.g. Chinese and Japanese) have characters which encode to 2 or more tokens. Using the `TokenTextSplitter` directly can split the tokens for a character between two chunks causing malformed Unicode characters. Use `RecursiveCharacterTextSplitter.from_tiktoken_encoder` or `CharacterTextSplitter.from_tiktoken_encoder` to ensure chunks contain valid Unicode strings.
+:::
+:::js
+## js-tiktoken
+
+
+**[js-tiktoken](https://github.com/dqbd/tiktoken) is a JavaScript version of the `BPE` tokenizer created by `OpenAI`.**
+
+
+We can use `js-tiktoken` to estimate the tokens used via @[TokenTextSplitter]. It will probably be more accurate for OpenAI models.
+
+1. How the text is split: by character passed in.
+2. How the chunk size is measured: by `tiktoken` tokenizer.
+
+
+```bash npm
+npm install @langchain/textsplitters
+```
+
+```bash pnpm
+pnpm install @langchain/textsplitters
+```
+
+```bash yarn
+yarn add @langchain/textsplitters
+```
+
+```bash bun
+bun add @langchain/textsplitters
+```
+
+
+```ts
+import { TokenTextSplitter } from "@langchain/textsplitters";
+import { readFileSync } from "fs";
+
+// Example: read a long document
+const stateOfTheUnion = readFileSync("state_of_the_union.txt", "utf8");
+```
+
+To split text with a @[TokenTextSplitter], pass in an `encodingName` (e.g. `cl100k_base`) when initializing it. Note that splits from this method can be larger than the chunk size measured by the `js-tiktoken` tokenizer.
+
+```ts
+import { TokenTextSplitter } from "@langchain/textsplitters";
+
+// Example: use cl100k_base encoding
+const splitter = new TokenTextSplitter({ encodingName: "cl100k_base", chunkSize: 10, chunkOverlap: 0 });
+
+const texts = await splitter.splitText(stateOfTheUnion);
+console.log(texts[0]);
+```
+```output
+Madam Speaker, Madam Vice President, our
+```
+:::
+
+:::python
## spaCy
@@ -398,3 +461,4 @@ Tonight, we meet as Democrats Republicans and Independents. But most importantly
With a duty to one another to the American people to the Constitution.
```
+:::
diff --git a/src/oss/javascript/integrations/document_loaders/index.mdx b/src/oss/javascript/integrations/document_loaders/index.mdx
index 0243c2837..9bc82992a 100644
--- a/src/oss/javascript/integrations/document_loaders/index.mdx
+++ b/src/oss/javascript/integrations/document_loaders/index.mdx
@@ -1,16 +1,188 @@
---
-title: Overview
+sidebar_position: 0
+sidebarTitle: "Document loaders"
---
-[Document loaders](/oss/integrations/document_loaders) load data into LangChain's expected format for use-cases such as [retrieval-augmented generation (RAG)](/oss/langchain/rag).
+Document loaders provide a **standard interface** for reading data from different sources (such as Slack, Notion, or Google Drive) into LangChain's @[Document] format.
+This ensures that data can be handled consistently regardless of the source.
+
+All document loaders implement the @[BaseLoader] interface.
+
+## Interface
+
+Each document loader may define its own parameters, but they share a common API:
+
+- `.load()`: Loads all documents at once.
+- `.loadAndSplit()`: Loads all documents at once and splits them into smaller documents.
+
+```typescript
+import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
+
+const loader = new CSVLoader(
+ ... // <-- Integration specific parameters here
+);
+const data = await loader.load();
+```
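+
+For example, a minimal sketch with the CSV loader (the `data.csv` path here is a hypothetical local file):
+
+```typescript
+import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
+
+// Each row of the CSV becomes its own document
+const loader = new CSVLoader("data.csv");
+const docs = await loader.load();
+
+console.log(docs[0].pageContent);
+console.log(docs[0].metadata); // e.g. source file and line number
+```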
+
+## By category
LangChain.js categorizes document loaders in two different ways:
- [File loaders](/oss/integrations/document_loaders/file_loaders/), which load data into LangChain formats from your local filesystem.
- [Web loaders](/oss/integrations/document_loaders/web_loaders/), which load data from remote sources.
-See the individual pages for more on each category.
+### File loaders
If you'd like to contribute an integration, see [Contributing integrations](/oss/contributing#add-a-new-integration).
+
+#### PDFs
+
+| Document Loader | Description | Package/API |
+|----------------|-------------|-------------|
+| [PDFLoader](/oss/integrations/document_loaders/file_loaders/pdf) | Load and parse PDF files using pdf-parse | Package |
+
+#### Common File Types
+
+| Document Loader | Description | Package/API |
+|----------------|-------------|-------------|
+| [CSV](/oss/integrations/document_loaders/file_loaders/csv) | Load data from CSV files with configurable column extraction | Package |
+| [JSON](/oss/integrations/document_loaders/file_loaders/json) | Load JSON files using JSON pointer to target specific keys | Package |
+| [JSONLines](/oss/integrations/document_loaders/file_loaders/jsonlines) | Load data from JSONLines/JSONL files | Package |
+| [Text](/oss/integrations/document_loaders/file_loaders/text) | Load plain text files | Package |
+| [DOCX](/oss/integrations/document_loaders/file_loaders/docx) | Load Microsoft Word documents (.docx and .doc formats) | Package |
+| [EPUB](/oss/integrations/document_loaders/file_loaders/epub) | Load EPUB files with optional chapter splitting | Package |
+| [PPTX](/oss/integrations/document_loaders/file_loaders/pptx) | Load PowerPoint presentations | Package |
+| [Subtitles](/oss/integrations/document_loaders/file_loaders/subtitles) | Load subtitle files (.srt format) | Package |
+
+#### Specialized File Loaders
+
+| Document Loader | Description | Package/API |
+|----------------|-------------|-------------|
+| [DirectoryLoader](/oss/integrations/document_loaders/file_loaders/directory) | Load all files from a directory with custom loader mappings | Package |
+| [UnstructuredLoader](/oss/integrations/document_loaders/file_loaders/unstructured) | Load multiple file types using Unstructured API | API |
+| [MultiFileLoader](/oss/integrations/document_loaders/file_loaders/multi_file) | Load data from multiple individual file paths | Package |
+| [ChatGPT](/oss/integrations/document_loaders/file_loaders/chatgpt) | Load ChatGPT conversation exports | Package |
+| [Notion Markdown](/oss/integrations/document_loaders/file_loaders/notion_markdown) | Load Notion pages exported as Markdown | Package |
+| [OpenAI Whisper Audio](/oss/integrations/document_loaders/file_loaders/openai_whisper_audio) | Transcribe audio files using OpenAI Whisper API | API |
+
+### Web loaders
+
+#### Webpages
+
+| Document Loader | Description | Web Support | Package/API |
+|----------------|-------------|:-----------:|-------------|
+| [Cheerio](/oss/integrations/document_loaders/web_loaders/web_cheerio) | Load webpages using Cheerio (lightweight, no JavaScript execution) | ✅ | Package |
+| [Playwright](/oss/integrations/document_loaders/web_loaders/web_playwright) | Load dynamic webpages using Playwright (supports JavaScript rendering) | ❌ | Package |
+| [Puppeteer](/oss/integrations/document_loaders/web_loaders/web_puppeteer) | Load dynamic webpages using Puppeteer (headless Chrome) | ❌ | Package |
+| [FireCrawl](/oss/integrations/document_loaders/web_loaders/firecrawl) | Crawl and convert websites into LLM-ready markdown | ✅ | API |
+| [Spider](/oss/integrations/document_loaders/web_loaders/spider) | Fast crawler that converts websites into HTML, markdown, or text | ✅ | API |
+| [RecursiveUrlLoader](/oss/integrations/document_loaders/web_loaders/recursive_url_loader) | Recursively load webpages following links | ❌ | Package |
+| [Sitemap](/oss/integrations/document_loaders/web_loaders/sitemap) | Load all pages from a sitemap.xml | ✅ | Package |
+| [Browserbase](/oss/integrations/document_loaders/web_loaders/browserbase) | Load webpages using managed headless browsers with stealth mode | ✅ | API |
+| [WebPDFLoader](/oss/integrations/document_loaders/web_loaders/pdf) | Load PDF files in web environments | ✅ | Package |
+
+#### Cloud Providers
+
+| Document Loader | Description | Web Support | Package/API |
+|----------------|-------------|:-----------:|-------------|
+| [S3](/oss/integrations/document_loaders/web_loaders/s3) | Load files from AWS S3 buckets | ❌ | Package |
+| [Azure Blob Storage Container](/oss/integrations/document_loaders/web_loaders/azure_blob_storage_container) | Load all files from Azure Blob Storage container | ❌ | Package |
+| [Azure Blob Storage File](/oss/integrations/document_loaders/web_loaders/azure_blob_storage_file) | Load individual files from Azure Blob Storage | ❌ | Package |
+| [Google Cloud Storage](/oss/integrations/document_loaders/web_loaders/google_cloud_storage) | Load files from Google Cloud Storage buckets | ❌ | Package |
+| [Google Cloud SQL for PostgreSQL](/oss/integrations/document_loaders/web_loaders/google_cloudsql_pg) | Load documents from Cloud SQL PostgreSQL databases | ✅ | Package |
+
+#### Productivity Tools
+
+| Document Loader | Description | Web Support | Package/API |
+|----------------|-------------|:-----------:|-------------|
+| [Notion API](/oss/integrations/document_loaders/web_loaders/notionapi) | Load Notion pages and databases via API | ✅ | API |
+| [Figma](/oss/integrations/document_loaders/web_loaders/figma) | Load Figma file data | ✅ | API |
+| [Confluence](/oss/integrations/document_loaders/web_loaders/confluence) | Load pages from Confluence spaces | ❌ | API |
+| [GitHub](/oss/integrations/document_loaders/web_loaders/github) | Load files from GitHub repositories | ✅ | API |
+| [GitBook](/oss/integrations/document_loaders/web_loaders/gitbook) | Load GitBook documentation pages | ✅ | Package |
+| [Jira](/oss/integrations/document_loaders/web_loaders/jira) | Load issues from Jira projects | ❌ | API |
+| [Airtable](/oss/integrations/document_loaders/web_loaders/airtable) | Load records from Airtable bases | ✅ | API |
+| [Taskade](/oss/integrations/document_loaders/web_loaders/taskade) | Load Taskade project data | ✅ | API |
+
+#### Search & Data APIs
+
+| Document Loader | Description | Web Support | Package/API |
+|----------------|-------------|:-----------:|-------------|
+| [SearchAPI](/oss/integrations/document_loaders/web_loaders/searchapi) | Load web search results from SearchAPI (Google, YouTube, etc.) | ✅ | API |
+| [SerpAPI](/oss/integrations/document_loaders/web_loaders/serpapi) | Load web search results from SerpAPI | ✅ | API |
+| [Apify Dataset](/oss/integrations/document_loaders/web_loaders/apify_dataset) | Load scraped data from Apify platform | ✅ | API |
+
+#### Audio & Video
+
+| Document Loader | Description | Web Support | Package/API |
+|----------------|-------------|:-----------:|-------------|
+| [YouTube](/oss/integrations/document_loaders/web_loaders/youtube) | Load YouTube video transcripts | ✅ | Package |
+| [AssemblyAI](/oss/integrations/document_loaders/web_loaders/assemblyai_audio_transcription) | Transcribe audio and video files using AssemblyAI API | ✅ | API |
+| [Sonix](/oss/integrations/document_loaders/web_loaders/sonix_audio_transcription) | Transcribe audio files using Sonix API | ❌ | API |
+
+#### Other
+
+| Document Loader | Description | Web Support | Package/API |
+|----------------|-------------|:-----------:|-------------|
+| [Couchbase](/oss/integrations/document_loaders/web_loaders/couchbase) | Load documents from Couchbase database using SQL++ queries | ✅ | Package |
+| [LangSmith](/oss/integrations/document_loaders/web_loaders/langsmith) | Load datasets and traces from LangSmith | ✅ | API |
+| [Hacker News](/oss/integrations/document_loaders/web_loaders/hn) | Load Hacker News threads and comments | ✅ | Package |
+| [IMSDB](/oss/integrations/document_loaders/web_loaders/imsdb) | Load movie scripts from Internet Movie Script Database | ✅ | Package |
+| [College Confidential](/oss/integrations/document_loaders/web_loaders/college_confidential) | Load college information from College Confidential | ✅ | Package |
+| [Blockchain Data](/oss/integrations/document_loaders/web_loaders/sort_xyz_blockchain) | Load blockchain data (NFTs, transactions) via Sort.xyz API | ✅ | API |
+
+## All document loaders
+
diff --git a/src/oss/javascript/integrations/stores/index.mdx b/src/oss/javascript/integrations/stores/index.mdx
index 58cdc16d4..37641783e 100644
--- a/src/oss/javascript/integrations/stores/index.mdx
+++ b/src/oss/javascript/integrations/stores/index.mdx
@@ -1,10 +1,39 @@
---
-title: Key-value stores
+title: "Key-value stores"
---
-[Key-value stores](/oss/integrations/stores) are used by other LangChain components to store and retrieve data.
+## Overview
-## All key-value stores
+LangChain provides a key-value store interface for storing and retrieving data by key. The key-value store interface in LangChain is primarily used for caching [embeddings](/oss/integrations/text_embedding).
+
+## Interface
+
+All [`BaseStores`](https://api.js.langchain.com/classes/langchain_core.stores.BaseStore.html) are **generic** and support the following interface, where `K` represents the key type and `V` represents the value type:
+
+- `mget(keys: K[]): Promise<(V | undefined)[]>`: get the values for multiple keys, returning `undefined` if a key does not exist
+- `mset(keyValuePairs: [K, V][]): Promise<void>`: set the values for multiple keys
+- `mdelete(keys: K[]): Promise<void>`: delete multiple keys
+- `yieldKeys(prefix?: string): AsyncGenerator<K | string>`: asynchronously yield all keys in the store, optionally filtering by a prefix
+
+The generic nature of the interface allows you to use different types for keys and values. For example, `BaseStore<string, Message>` would store messages with string keys, while `BaseStore<string, number[]>` would store arrays of numbers.
+
+
+Base stores are designed to work with **multiple** key-value pairs at once for efficiency. This saves on network round-trips and may allow for more efficient batch operations in the underlying store.
+
+
+## Built-in stores for local development
+
+The built-in `InMemoryStore` from `@langchain/core/stores` keeps key-value pairs in memory, which makes it a convenient default for local development and testing.
+
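+A minimal sketch of the interface above, using `InMemoryStore` (keys are strings; the value type is supplied as a generic parameter):
+
+```typescript
+import { InMemoryStore } from "@langchain/core/stores";
+
+const store = new InMemoryStore<number[]>();
+
+await store.mset([
+  ["doc:1", [0.1, 0.2]],
+  ["doc:2", [0.3, 0.4]],
+]);
+
+// Missing keys come back as undefined rather than throwing
+const [first, missing] = await store.mget(["doc:1", "doc:3"]);
+console.log(first);   // [0.1, 0.2]
+console.log(missing); // undefined
+
+// Iterate keys, optionally filtered by prefix
+for await (const key of store.yieldKeys("doc:")) {
+  console.log(key); // "doc:1", "doc:2"
+}
+
+await store.mdelete(["doc:1"]);
+```
+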
+## Custom stores
+
+You can also implement your own custom store by extending the `BaseStore` class. See the [store interface documentation](https://api.js.langchain.com/classes/langchain_core.stores.BaseStore.html) for more details.
+
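+As an illustration, a minimal Map-backed store might look like the following sketch (the `MapStore` name is hypothetical; `lc_namespace` is required because stores extend `Serializable`):
+
+```typescript
+import { BaseStore } from "@langchain/core/stores";
+
+// A simple custom store with string keys, backed by a Map.
+class MapStore<V> extends BaseStore<string, V> {
+  lc_namespace = ["custom", "storage"];
+
+  private map = new Map<string, V>();
+
+  async mget(keys: string[]): Promise<(V | undefined)[]> {
+    return keys.map((key) => this.map.get(key));
+  }
+
+  async mset(keyValuePairs: [string, V][]): Promise<void> {
+    for (const [key, value] of keyValuePairs) {
+      this.map.set(key, value);
+    }
+  }
+
+  async mdelete(keys: string[]): Promise<void> {
+    for (const key of keys) {
+      this.map.delete(key);
+    }
+  }
+
+  async *yieldKeys(prefix?: string): AsyncGenerator<string> {
+    for (const key of this.map.keys()) {
+      if (!prefix || key.startsWith(prefix)) {
+        yield key;
+      }
+    }
+  }
+}
+```
+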
+## All integrations
diff --git a/src/oss/javascript/integrations/text_embedding/index.mdx b/src/oss/javascript/integrations/text_embedding/index.mdx
index b052787ed..025fa9886 100644
--- a/src/oss/javascript/integrations/text_embedding/index.mdx
+++ b/src/oss/javascript/integrations/text_embedding/index.mdx
@@ -1,10 +1,42 @@
---
-title: Embeddings
+title: "Embedding models"
---
-[Embedding models](/oss/integrations/text_embedding) create a vector representation of a piece of text.
+## Overview
-This page documents integrations with various model providers that allow you to use embeddings in LangChain.
+
+This overview covers **text-based embedding models**. LangChain does not currently support multimodal embeddings.
+
+
+Embedding models transform raw text—such as a sentence, paragraph, or tweet—into a fixed-length vector of numbers that captures its **semantic meaning**. These vectors allow machines to compare and search text based on meaning rather than exact words.
+
+In practice, this means that texts with similar ideas are placed close together in the vector space. For example, instead of matching only the phrase *"machine learning"*, embeddings can surface documents that discuss related concepts even when different wording is used.
+
+### How it works
+
+1. **Vectorization** — The model encodes each input string as a high-dimensional vector.
+2. **Similarity scoring** — Vectors are compared using mathematical metrics to measure how closely related the underlying texts are.
+
+### Similarity metrics
+
+Several metrics are commonly used to compare embeddings; a short code sketch of cosine similarity follows the list:
+
+* **Cosine similarity** — measures the angle between two vectors.
+* **Euclidean distance** — measures the straight-line distance between points.
+* **Dot product** — measures how much one vector projects onto another.
+
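+For instance, cosine similarity can be computed directly from two embedding vectors (a minimal sketch):
+
+```typescript
+// Cosine similarity between two equal-length embedding vectors.
+// Values near 1 indicate semantically similar texts.
+function cosineSimilarity(a: number[], b: number[]): number {
+  let dot = 0;
+  let normA = 0;
+  let normB = 0;
+  for (let i = 0; i < a.length; i++) {
+    dot += a[i] * b[i];
+    normA += a[i] * a[i];
+    normB += b[i] * b[i];
+  }
+  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
+}
+```
+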
+## Interface
+
+LangChain provides a standard interface for text embedding models (e.g., OpenAI, Cohere, Hugging Face) via the @[Embeddings] interface.
+
+Two main methods are available:
+
+* `embedDocuments(documents: string[]) → Promise<number[][]>`: Embeds a list of documents.
+* `embedQuery(text: string) → Promise<number[]>`: Embeds a single query.
+
+
+The interface allows queries and documents to be embedded with different strategies, though most providers handle them the same way in practice.
+
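+For example, a minimal sketch (assuming an `embeddings` instance like the ones instantiated below):
+
+```typescript
+// Embed multiple documents at once
+const vectors = await embeddings.embedDocuments([
+  "LangChain supports many embedding providers.",
+  "Embeddings map text to fixed-length vectors.",
+]);
+console.log(vectors.length);    // 2
+console.log(vectors[0].length); // embedding dimension, e.g. 1536
+
+// Embed a single query string
+const queryVector = await embeddings.embedQuery("Which providers are supported?");
+```
+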
## Install and use
@@ -18,11 +50,9 @@ Install dependencies:
```bash npm
npm i @langchain/openai
```
-
```bash yarn
yarn add @langchain/openai
```
-
```bash pnpm
pnpm add @langchain/openai
```
@@ -42,19 +72,18 @@ import { OpenAIEmbeddings } from "@langchain/openai";
const embeddings = new OpenAIEmbeddings({
model: "text-embedding-3-large"
});
-
-await embeddings.embedQuery("Hello, world!");
```
-Install dependencies:
+Install dependencies
+
+
```bash npm
npm i @langchain/openai
```
-
```bash yarn
yarn add @langchain/openai
```
@@ -62,6 +91,7 @@ yarn add @langchain/openai
```bash pnpm
pnpm add @langchain/openai
```
+
Add environment variables:
@@ -79,16 +109,16 @@ import { AzureOpenAIEmbeddings } from "@langchain/openai";
const embeddings = new AzureOpenAIEmbeddings({
azureOpenAIApiEmbeddingsDeploymentName: "text-embedding-ada-002"
});
-
-await embeddings.embedQuery("Hello, world!");
```
+
Install dependencies:
+
```bash npm
npm i @langchain/aws
```
@@ -100,6 +130,7 @@ yarn add @langchain/aws
```bash pnpm
pnpm add @langchain/aws
```
+
Add environment variables:
@@ -116,13 +147,45 @@ import { BedrockEmbeddings } from "@langchain/aws";
const embeddings = new BedrockEmbeddings({
model: "amazon.titan-embed-text-v1"
});
-
-await embeddings.embedQuery("Hello, world!");
```
+
+
+Install dependencies:
+
+
+```bash npm
+npm i @langchain/google-genai
+```
+
+```bash yarn
+yarn add @langchain/google-genai
+```
+
+```bash pnpm
+pnpm add @langchain/google-genai
+```
+
+
+Add environment variables:
+
+```bash
+GOOGLE_API_KEY=your-api-key
+```
-
+Instantiate the model:
+
+```typescript
+import { GoogleGenerativeAIEmbeddings } from "@langchain/google-genai";
+
+const embeddings = new GoogleGenerativeAIEmbeddings({
+ model: "text-embedding-004"
+});
+```
+
+
+
Install dependencies:
@@ -155,8 +218,6 @@ import { VertexAIEmbeddings } from "@langchain/google-vertexai";
const embeddings = new VertexAIEmbeddings({
model: "gemini-embedding-001"
});
-
-await embeddings.embedQuery("Hello, world!");
```
@@ -165,7 +226,6 @@ await embeddings.embedQuery("Hello, world!");
Install dependencies:
-
```bash npm
npm i @langchain/mistralai
```
@@ -177,7 +237,6 @@ yarn add @langchain/mistralai
```bash pnpm
pnpm add @langchain/mistralai
```
-
Add environment variables:
@@ -194,8 +253,6 @@ import { MistralAIEmbeddings } from "@langchain/mistralai";
const embeddings = new MistralAIEmbeddings({
model: "mistral-embed"
});
-
-await embeddings.embedQuery("Hello, world!");
```
@@ -211,6 +268,7 @@ npm i @langchain/cohere
```bash yarn
yarn add @langchain/cohere
```
+
```bash pnpm
pnpm add @langchain/cohere
```
@@ -231,15 +289,88 @@ import { CohereEmbeddings } from "@langchain/cohere";
const embeddings = new CohereEmbeddings({
model: "embed-english-v3.0"
});
+```
-await embeddings.embedQuery("Hello, world!");
+
+
+
+Install dependencies:
+
+
+```bash npm
+npm i @langchain/ollama
+```
+
+```bash yarn
+yarn add @langchain/ollama
+```
+
+```bash pnpm
+pnpm add @langchain/ollama
+```
+
+
+Instantiate the model:
+
+```typescript
+import { OllamaEmbeddings } from "@langchain/ollama";
+
+const embeddings = new OllamaEmbeddings({
+ model: "llama2",
+ baseUrl: "http://localhost:11434", // Default value
+});
```
+## Caching
+
+Embeddings can be stored or temporarily cached to avoid needing to recompute them.
+
+Caching embeddings can be done using a `CacheBackedEmbeddings`. This wrapper stores embeddings in a key-value store, where the text is hashed and the hash is used as the key in the cache.
+
+The main supported way to initialize a `CacheBackedEmbeddings` is `fromBytesStore`. It takes the following parameters:
+
+- **underlyingEmbeddings**: The embedder to use for embedding.
+- **documentEmbeddingStore**: Any [`BaseStore`](/oss/integrations/stores/) for caching document embeddings.
+- **options.namespace**: (optional, defaults to `""`) The namespace to use for the document cache. Helps avoid collisions (e.g., set it to the embedding model name).
+
+
+- Always set the `namespace` parameter to avoid collisions when using different embedding models.
+- `CacheBackedEmbeddings` does not cache query embeddings by default. To enable this, specify a `query_embedding_store`.
+
+
+```typescript
+import { OpenAIEmbeddings } from "@langchain/openai";
+import { CacheBackedEmbeddings } from "langchain/embeddings/cache_backed";
+import { InMemoryStore } from "@langchain/core/stores";
+
+const underlyingEmbeddings = new OpenAIEmbeddings();
+
+const inMemoryStore = new InMemoryStore();
+
+const cacheBackedEmbeddings = CacheBackedEmbeddings.fromBytesStore(
+ underlyingEmbeddings,
+ inMemoryStore,
+ {
+ namespace: underlyingEmbeddings.model,
+ }
+);
+
+// Embed a document (computed and written to the cache)
+let tic = Date.now();
+const documentEmbeddings = await cacheBackedEmbeddings.embedDocuments(["Hello, world!"]);
+console.log(`First call took: ${Date.now() - tic}ms`);
+
+// Embedding the same document again is served from the cache
+tic = Date.now();
+const cachedEmbeddings = await cacheBackedEmbeddings.embedDocuments(["Hello, world!"]);
+console.log(`Cached call took: ${Date.now() - tic}ms`);
+```
+
+In production, you would typically use a more robust persistent store, such as a database or cloud storage. Please see [stores integrations](/oss/integrations/stores/) for options.
-## All embedding models
+## All integrations
+```mermaid
+flowchart LR
+  subgraph "📥 Indexing phase"
+    A[📄 Source text] --> B[🔢 Embedding model]
+ B --> C[🔘 Embedding vectors]
+ C --> D[(Vector store)]
+ end
+
+ subgraph "📤 Query phase (retrieval)"
+ E[❓ Query text] --> F[🔢 Embedding model]
+ F --> G[🔘 Query vector]
+ G --> H[🔍 Similarity search]
+ H --> D
+ D --> I[📄 Top-k results]
+ end
+```
+
+### Interface
+
+LangChain provides a unified interface for vector stores, allowing you to:
+
+- `addDocuments` - Add documents to the store.
+- `delete` - Remove stored documents by ID.
+- `similaritySearch` - Query for semantically similar documents.
+
+This abstraction lets you switch between different implementations without altering your application logic.
+
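+For instance, a helper written against the base `VectorStore` class works with any implementation (a minimal sketch; `indexAndSearch` is a hypothetical name):
+
+```typescript
+import { VectorStore } from "@langchain/core/vectorstores";
+import { Document } from "@langchain/core/documents";
+
+// Works unchanged with MemoryVectorStore, Chroma, PGVectorStore, etc.
+async function indexAndSearch(store: VectorStore) {
+  await store.addDocuments([
+    new Document({ pageContent: "Hello world", metadata: {} }),
+  ]);
+  return store.similaritySearch("Hello", 1);
+}
+```
+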
+### Initialization
+
+Most vector stores in LangChain accept an embedding model as an argument when they are initialized.
+
+```typescript
+import { OpenAIEmbeddings } from "@langchain/openai";
+import { MemoryVectorStore } from "langchain/vectorstores/memory";
+
+const embeddings = new OpenAIEmbeddings({
+ model: "text-embedding-3-small",
+});
+const vectorStore = new MemoryVectorStore(embeddings);
+```
+
+### Adding documents
+
+You can add documents to the vector store by using the `addDocuments` function.
+
+```typescript
+import { Document } from "@langchain/core/documents";
+const document = new Document({
+ pageContent: "Hello world",
+});
+await vectorStore.addDocuments([document]);
+```
+
+### Deleting documents
+
+You can remove documents from the vector store by ID using the `delete` function (the exact parameters vary by implementation):
+
+```typescript
+await vectorStore.delete({ ids: ["document-id-1"] });
+```
+
+### Similarity search
+
+Issue a semantic query using `similaritySearch`, which returns the closest embedded documents:
+
+```typescript
+const results = await vectorStore.similaritySearch("Hello world", 10);
+```
+
+
+Many vector stores support parameters like:
+
+* `k` — number of results to return
+* `filter` — conditional filtering based on metadata
+
+### Similarity metrics & indexing
+
+Embedding similarity may be computed using:
+
+* **Cosine similarity**
+* **Euclidean distance**
+* **Dot product**
+
+Efficient search often employs indexing methods such as HNSW (Hierarchical Navigable Small World), though specifics depend on the vector store.
+
+### Metadata filtering
+
+Filtering by metadata (e.g., source, date) can refine search results:
+
+```typescript
+await vectorStore.similaritySearch("query", 2, { source: "tweets" });
+```
+
+
+Support for metadata-based filtering varies between implementations.
+Check the documentation of your chosen vector store for details.
+
+
+## Top integrations
+
+**Select embedding model:**
+
@@ -76,6 +185,7 @@ const embeddings = new AzureOpenAIEmbeddings({
```
+
Install dependencies:
@@ -113,7 +223,42 @@ const embeddings = new BedrockEmbeddings({
```
-
+
+
+Install dependencies:
+
+
+```bash npm
+npm i @langchain/google-genai
+```
+
+```bash yarn
+yarn add @langchain/google-genai
+```
+
+```bash pnpm
+pnpm add @langchain/google-genai
+```
+
+
+Add environment variables:
+
+```bash
+GOOGLE_API_KEY=your-api-key
+```
+
+Instantiate the model:
+
+```typescript
+import { GoogleGenerativeAIEmbeddings } from "@langchain/google-genai";
+
+const embeddings = new GoogleGenerativeAIEmbeddings({
+ model: "text-embedding-004"
+});
+```
+
+
+
Install dependencies:
@@ -220,11 +365,43 @@ const embeddings = new CohereEmbeddings({
```
-
-
+
Install dependencies:
+
+```bash npm
+npm i @langchain/ollama
+```
+
+```bash yarn
+yarn add @langchain/ollama
+```
+
+```bash pnpm
+pnpm add @langchain/ollama
+```
+
+
+Instantiate the model:
+
+```typescript
+import { OllamaEmbeddings } from "@langchain/ollama";
+
+const embeddings = new OllamaEmbeddings({
+ model: "llama2",
+ baseUrl: "http://localhost:11434", // Default value
+});
+```
+
+
+
+
+**Select vector store:**
+
+
+
+
```bash
npm i langchain
@@ -239,8 +416,6 @@ pnpm add langchain
```
-Instantiate the model:
-
```typescript
import { MemoryVectorStore } from "langchain/vectorstores/memory";
@@ -250,8 +425,6 @@ const vectorStore = new MemoryVectorStore(embeddings);
-Install dependencies:
-
```bash npm
@@ -268,8 +441,6 @@ pnpm add @langchain/community
-Instantiate the model:
-
```typescript
import { Chroma } from "@langchain/community/vectorstores/chroma";
@@ -281,8 +452,6 @@ const vectorStore = new Chroma(embeddings, {
-Install dependencies:
-
```bash npm
@@ -299,8 +468,6 @@ pnpm add @langchain/community
-Instantiate the model:
-
```typescript
import { FaissStore } from "@langchain/community/vectorstores/faiss";
@@ -310,8 +477,6 @@ const vectorStore = new FaissStore(embeddings, {});
-Install dependencies:
-
```bash npm
@@ -328,8 +493,6 @@ pnpm add @langchain/mongodb
-Instantiate the model:
-
```typescript
import { MongoDBAtlasVectorSearch } from "@langchain/mongodb"
import { MongoClient } from "mongodb";
@@ -350,8 +513,6 @@ const vectorStore = new MongoDBAtlasVectorSearch(embeddings, {
-Install dependencies:
-
```bash npm
@@ -368,8 +529,6 @@ pnpm add @langchain/community
-Instantiate the model:
-
```typescript
import { PGVectorStore } from "@langchain/community/vectorstores/pgvector";
@@ -379,8 +538,6 @@ const vectorStore = await PGVectorStore.initialize(embeddings, {});
-Install dependencies:
-
```bash npm
@@ -397,8 +554,6 @@ pnpm add @langchain/pinecone
-Instantiate the model:
-
```typescript
import { PineconeStore } from "@langchain/pinecone";
import { Pinecone as PineconeClient } from "@pinecone-database/pinecone";
@@ -413,8 +568,6 @@ const vectorStore = new PineconeStore(embeddings, {
-Install dependencies:
-
```bash npm
npm i @langchain/qdrant
@@ -430,8 +583,6 @@ pnpm add @langchain/qdrant
-Instantiate the model:
-
```typescript
import { QdrantVectorStore } from "@langchain/qdrant";
@@ -455,406 +606,355 @@ LangChain.js integrates with a variety of vector stores. You can check out a ful
href="/oss/integrations/vectorstores/analyticdb"
arrow="true"
cta="View guide"
- >
-
+ />
diff --git a/src/oss/langchain/knowledge-base.mdx b/src/oss/langchain/knowledge-base.mdx
index 2c32a870e..bb7722318 100644
--- a/src/oss/langchain/knowledge-base.mdx
+++ b/src/oss/langchain/knowledge-base.mdx
@@ -89,11 +89,18 @@ os.environ["LANGSMITH_API_KEY"] = getpass.getpass()
## Documents and Document Loaders
-LangChain implements a [Document](https://python.langchain.com/api_reference/core/documents/langchain_core.documents.base.Document.html) abstraction, which is intended to represent a unit of text and associated metadata. It has three attributes:
+LangChain implements a @[Document] abstraction, which is intended to represent a unit of text and associated metadata. It has three attributes:
+:::python
- `page_content`: a string representing the content;
- `metadata`: a dict containing arbitrary metadata;
- `id`: (optional) a string identifier for the document.
+:::
+:::js
+- `pageContent`: a string representing the content;
+- `metadata`: an object containing arbitrary metadata;
+- `id`: (optional) a string identifier for the document.
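+
+For example, a `Document` can be constructed directly:
+
+```typescript
+import { Document } from "@langchain/core/documents";
+
+const doc = new Document({
+  pageContent: "Hello, world!",
+  metadata: { source: "https://example.com" },
+});
+```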
+:::
The `metadata` attribute can capture information about the source of the document, its relationship to other documents, and other information. Note that an individual `Document` object often represents a chunk of a larger document.
@@ -150,6 +157,14 @@ docs = loader.load()
print(len(docs))
```
+```output
+107
+```
+
+`PyPDFLoader` loads one `Document` object per PDF page. For each, we can easily access:
+
+- The string content of the page;
+- Metadata containing the file name and page number.
:::
:::js
```typescript
@@ -160,15 +175,15 @@ const loader = new PDFLoader("../../data/nke-10k-2023.pdf");
const docs = await loader.load();
console.log(docs.length);
```
-:::
```output
107
```
-`PyPDFLoader` loads one `Document` object per PDF page. For each, we can easily access:
+`PDFLoader` loads one `Document` object per PDF page. For each, we can easily access:
- The string content of the page;
- Metadata containing the file name and page number.
+:::
:::python
```python
@@ -190,7 +205,7 @@ FO
:::
:::js
```typescript
-docs[0].pageContent.slice(0, 200);
+console.log(docs[0].pageContent.slice(0, 200));
```
```output
Table of Contents
@@ -203,7 +218,7 @@ FORM 10-K
FO
```
```typescript
-docs[0].metadata;
+console.log(docs[0].metadata);
```
```output
{
@@ -230,6 +245,7 @@ docs[0].metadata;
}
```
:::
+
### Splitting
For both information retrieval and downstream question-answering purposes, a page may be too coarse a representation. Our goal in the end will be to retrieve `Document` objects that answer an input query, and further splitting our PDF will help ensure that the meanings of relevant portions of the document are not "washed out" by surrounding text.
@@ -243,11 +259,11 @@ which will recursively split the document using common separators like
new lines until each chunk is the appropriate size. This is the
recommended text splitter for generic text use cases.
+:::python
We set `add_start_index=True` so that the character index where each
split Document starts within the initial Document is preserved as
metadata attribute “start_index”.
-:::python
```python
from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -256,7 +272,7 @@ text_splitter = RecursiveCharacterTextSplitter(
)
all_splits = text_splitter.split_documents(docs)
-len(all_splits)
+print(len(all_splits))
```
:::
:::js
@@ -270,7 +286,7 @@ const textSplitter = new RecursiveCharacterTextSplitter({
const allSplits = await textSplitter.splitDocuments(docs);
-allSplits.length;
+console.log(allSplits.length);
```
:::
@@ -278,7 +294,6 @@ allSplits.length;
514
```
-
## Embeddings
Vector search is a common way to store and search over unstructured data (such as unstructured text). The idea is to store numeric vectors that are associated with the text. Given a query, we can [embed](/oss/langchain/retrieval#embedding_models) it as a vector of the same dimension and use vector similarity metrics (such as cosine similarity) to identify related text.
@@ -287,10 +302,6 @@ LangChain supports embeddings from [dozens of providers](/oss/integrations/text_
:::python
-:::
-:::js
-
-:::
```python
vector_1 = embeddings.embed_query(all_splits[0].page_content)
@@ -300,6 +311,20 @@ assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])
```
+:::
+:::js
+
+
+```typescript
+const vector1 = await embeddings.embedQuery(allSplits[0].pageContent);
+const vector2 = await embeddings.embedQuery(allSplits[1].pageContent);
+
+console.assert(vector1.length === vector2.length);
+console.log(`Generated vectors of length ${vector1.length}\n`);
+console.log(vector1.slice(0, 10));
+```
+:::
+
```output
Generated vectors of length 1536
@@ -309,7 +334,7 @@ Armed with a model for generating text embeddings, we can next store them in a s
## Vector stores
-LangChain [VectorStore](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.base.VectorStore.html) objects contain methods for adding text and `Document` objects to the store, and querying them using various similarity metrics. They are often initialized with [embedding](/oss/langchain/retrieval#embedding_models) models, which determine how text data is translated to numeric vectors.
+LangChain @[VectorStore] objects contain methods for adding text and `Document` objects to the store, and querying them using various similarity metrics. They are often initialized with [embedding](/oss/langchain/retrieval#embedding_models) models, which determine how text data is translated to numeric vectors.
LangChain includes a suite of [integrations](/oss/integrations/vectorstores) with different vector store technologies. Some vector stores are hosted by a provider (e.g., various cloud providers) and require specific credentials to use; some (such as [Postgres](/oss/integrations/vectorstores/pgvector)) run in separate infrastructure that can be run locally or via a third-party; others can run in-memory for lightweight workloads. Let's select a vector store:
@@ -335,13 +360,13 @@ await vectorStore.addDocuments(allSplits);
Note that most vector store implementations will allow you to connect to an existing vector store-- e.g., by providing a client, index name, or other information. See the documentation for a specific [integration](/oss/integrations/vectorstores) for more detail.
-Once we've instantiated a `VectorStore` that contains documents, we can query it. [VectorStore](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.base.VectorStore.html) includes methods for querying:
+Once we've instantiated a `VectorStore` that contains documents, we can query it. @[VectorStore] includes methods for querying:
- Synchronously and asynchronously;
- By string query and by vector;
- With and without returning similarity scores;
-- By similarity and [maximum marginal relevance](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.base.VectorStore.html#langchain_core.vectorstores.base.VectorStore.max_marginal_relevance_search) (to balance similarity with query to diversity in retrieved results).
+- By similarity and @[maximum marginal relevance][VectorStore.max_marginal_relevance_search] (to balance similarity with query to diversity in retrieved results).
-The methods will generally include a list of [Document](https://python.langchain.com/api_reference/core/documents/langchain_core.documents.base.Document.html#langchain_core.documents.base.Document) objects in their outputs.
+The methods will generally include a list of @[Document] objects in their outputs.
### Usage
@@ -357,16 +382,6 @@ results = vector_store.similarity_search(
print(results[0])
```
-:::
-:::js
-```typescript
-const results1 = await vectorStore.similaritySearch(
- "When was Nike incorporated?"
-);
-
-results1[0];
-```
-:::
```output
page_content='direct to consumer operations sell products through the following number of retail stores in the United States:
U.S. RETAIL STORES NUMBER
@@ -377,6 +392,23 @@ TOTAL 369
In the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.
2023 FORM 10-K 2' metadata={'page': 4, 'source': '../example_data/nke-10k-2023.pdf', 'start_index': 3125}
```
+:::
+:::js
+```typescript
+const results1 = await vectorStore.similaritySearch(
+ "When was Nike incorporated?"
+);
+
+console.log(results1[0]);
+```
+```output
+Document {
+  pageContent: 'direct to consumer operations sell products...',
+  metadata: { page: 4, source: '../example_data/nke-10k-2023.pdf', start_index: 3125 }
+}
+```
+:::
+
:::python
Async query:
@@ -410,16 +442,6 @@ doc, score = results[0]
print(f"Score: {score}\n")
print(doc)
```
-:::
-:::js
-```typescript
-const results2 = await vectorStore.similaritySearchWithScore(
- "What was Nike's revenue in 2023?"
-);
-
-results2[0];
-```
-:::
```output
Score: 0.23699893057346344
@@ -434,6 +456,25 @@ The increase was due to higher revenues in North America, Europe, Middle East &
increase was primarily due to higher revenues in Men's, the Jordan Brand, Women's and Kids' which grew 17%, 35%,11% and 10%, respectively, on a wholesale
equivalent basis.' metadata={'page': 35, 'source': '../example_data/nke-10k-2023.pdf', 'start_index': 0}
```
+:::
+:::js
+```typescript
+const results2 = await vectorStore.similaritySearchWithScore(
+ "What was Nike's revenue in 2023?"
+);
+
+console.log(results2[0]);
+```
+```output
+Score: 0.23699893057346344
+
+Document {
+  pageContent: 'Table of Contents...',
+  metadata: { page: 35, source: '../example_data/nke-10k-2023.pdf', start_index: 0 }
+}
+```
+:::
+
Return documents based on similarity to an embedded query:
:::python
@@ -443,21 +484,6 @@ embedding = embeddings.embed_query("How were Nike's margins impacted in 2023?")
results = vector_store.similarity_search_by_vector(embedding)
print(results[0])
```
-:::
-:::js
-```typescript
-const embedding = await embeddings.embedQuery(
- "How were Nike's margins impacted in 2023?"
-);
-
-const results3 = await vectorStore.similaritySearchVectorWithScore(
- embedding,
- 1
-);
-
-results3[0];
-```
-:::
```output
page_content='Table of Contents
GROSS MARGIN
@@ -474,14 +500,40 @@ the prior period resulting from lower available inventory supply;
•Lower off-price margin, on a wholesale equivalent basis.
This was partially offset by:' metadata={'page': 36, 'source': '../example_data/nke-10k-2023.pdf', 'start_index': 0}
```
+:::
+:::js
+```typescript
+const embedding = await embeddings.embedQuery(
+ "How were Nike's margins impacted in 2023?"
+);
+
+const results3 = await vectorStore.similaritySearchVectorWithScore(
+ embedding,
+ 1
+);
+
+console.log(results3[0]);
+```
+```output
+Document {
+  pageContent: 'FISCAL 2023 COMPARED TO FISCAL 2022...',
+  metadata: {
+    page: 36,
+    source: '../example_data/nke-10k-2023.pdf',
+    start_index: 0
+  }
+}
+```
+:::
+
Learn more:
-- [API reference](https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.base.VectorStore.html)
+- @[API Reference][VectorStore]
- [Integration-specific docs](/oss/integrations/vectorstores)
## Retrievers
-LangChain `VectorStore` objects do not subclass [Runnable](https://python.langchain.com/api_reference/core/index.html#langchain-core-runnables). LangChain [Retrievers](https://python.langchain.com/api_reference/core/index.html#langchain-core-retrievers) are Runnables, so they implement a standard set of methods (e.g., synchronous and asynchronous `invoke` and `batch` operations). Although we can construct retrievers from vector stores, retrievers can interface with non-vector store sources of data, as well (such as external APIs).
+LangChain `VectorStore` objects do not subclass @[Runnable]. LangChain @[Retrievers] are Runnables, so they implement a standard set of methods (e.g., synchronous and asynchronous `invoke` and `batch` operations). Although we can construct retrievers from vector stores, retrievers can interface with non-vector store sources of data, as well (such as external APIs).
:::python
We can create a simple version of this ourselves, without subclassing `Retriever`. If we choose what method we wish to use to retrieve documents, we can create a runnable easily. Below we will build one around the `similarity_search` method:
@@ -531,6 +583,12 @@ retriever.batch(
],
)
```
+```output
+[[Document(metadata={'page': 4, 'source': '../example_data/nke-10k-2023.pdf', 'start_index': 3125}, page_content='direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.\n2023 FORM 10-K 2')],
+ [Document(metadata={'page': 3, 'source': '../example_data/nke-10k-2023.pdf', 'start_index': 0}, page_content='Table of Contents\nPART I\nITEM 1. BUSINESS\nGENERAL\nNIKE, Inc. was incorporated in 1967 under the laws of the State of Oregon. As used in this Annual Report on Form 10-K (this "Annual Report"), the terms "we," "us," "our,"\n"NIKE" and the "Company" refer to NIKE, Inc. and its predecessors, subsidiaries and affiliates, collectively, unless the context indicates otherwise.\nOur principal business activity is the design, development and worldwide marketing and selling of athletic footwear, apparel, equipment, accessories and services. NIKE is\nthe largest seller of athletic footwear and apparel in the world. We sell our products through NIKE Direct operations, which are comprised of both NIKE-owned retail stores\nand sales through our digital platforms (also referred to as "NIKE Brand Digital"), to retail accounts and to a mix of independent distributors, licensees and sales')]]
+```
+
+`VectorStoreRetriever` supports search types of `"similarity"` (default), `"mmr"` (maximum marginal relevance, described above), and `"similarity_score_threshold"`. We can use the latter to threshold documents output by the retriever by similarity score.
:::
:::js
```typescript
@@ -546,14 +604,19 @@ await retriever.batch([
"What was Nike's revenue in 2023?",
]);
```
-:::
```output
-[[Document(metadata={'page': 4, 'source': '../example_data/nke-10k-2023.pdf', 'start_index': 3125}, page_content='direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.\n2023 FORM 10-K 2')],
- [Document(metadata={'page': 3, 'source': '../example_data/nke-10k-2023.pdf', 'start_index': 0}, page_content='Table of Contents\nPART I\nITEM 1. BUSINESS\nGENERAL\nNIKE, Inc. was incorporated in 1967 under the laws of the State of Oregon. As used in this Annual Report on Form 10-K (this "Annual Report"), the terms "we," "us," "our,"\n"NIKE" and the "Company" refer to NIKE, Inc. and its predecessors, subsidiaries and affiliates, collectively, unless the context indicates otherwise.\nOur principal business activity is the design, development and worldwide marketing and selling of athletic footwear, apparel, equipment, accessories and services. NIKE is\nthe largest seller of athletic footwear and apparel in the world. We sell our products through NIKE Direct operations, which are comprised of both NIKE-owned retail stores\nand sales through our digital platforms (also referred to as "NIKE Brand Digital"), to retail accounts and to a mix of independent distributors, licensees and sales')]]
+[
+  [Document {
+    metadata: { page: 4, source: '../example_data/nke-10k-2023.pdf', start_index: 3125 },
+    pageContent: 'direct to consumer operations sell products...',
+  }],
+  [Document {
+    metadata: { page: 3, source: '../example_data/nke-10k-2023.pdf', start_index: 0 },
+    pageContent: 'Table of Contents...',
+  }],
+]
```
-
-
-`VectorStoreRetriever` supports search types of `"similarity"` (default), `"mmr"` (maximum marginal relevance, described above), and `"similarity_score_threshold"`. We can use the latter to threshold documents output by the retriever by similarity score.
+:::
Retrievers can easily be incorporated into more complex applications, such as [retrieval-augmented generation (RAG)](/oss/langchain/retrieval) applications that combine a given question with retrieved context into a prompt for a LLM. To learn more about building such an application, check out the [RAG tutorial](/oss/langchain/rag) tutorial.
diff --git a/src/oss/langchain/rag.mdx b/src/oss/langchain/rag.mdx
index b030ae629..92be43015 100644
--- a/src/oss/langchain/rag.mdx
+++ b/src/oss/langchain/rag.mdx
@@ -217,13 +217,11 @@ Task decomposition refers to...
:::js
```typescript
import "cheerio";
-import { createAgent } from "langchain";
+import { createAgent, tool } from "langchain";
import { CheerioWebBaseLoader } from "@langchain/community/document_loaders/web/cheerio";
-import { tool } from "@langchain/core/tools";
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import { z } from "zod";
-
// Load and chunk contents of blog
const pTagSelector = "p";
const cheerioLoader = new CheerioWebBaseLoader(
@@ -236,11 +234,11 @@ const cheerioLoader = new CheerioWebBaseLoader(
const docs = await cheerioLoader.load();
const splitter = new RecursiveCharacterTextSplitter({
- chunkSize: 1000, chunkOverlap: 200
+ chunkSize: 1000,
+ chunkOverlap: 200
});
const allSplits = await splitter.splitDocuments(docs);
-
// Index chunks
await vectorStore.addDocuments(allSplits)
@@ -265,7 +263,7 @@ const retrieve = tool(
}
);
-const agent = createAgent({ llm: llm, tools: [retrieve] });
+const agent = createAgent({ model: "openai:gpt-5", tools: [retrieve] });
```
```typescript
let inputMessage = `What is Task Decomposition?`;
@@ -317,10 +315,7 @@ Indexing commonly works as follows:
We need to first load the blog post contents. We can use
[DocumentLoaders](/oss/langchain/retrieval#document_loaders)
for this, which are objects that load in data from a source and return a
-list of
-[Document](https://python.langchain.com/api_reference/core/documents/langchain_core.documents.base.Document.html)
-objects.
-
+list of @[Document] objects.
:::python
In this case we'll use the
@@ -400,7 +395,7 @@ Building agents with LLM (large language model) as its core controller is...
- [Integrations](/oss/integrations/document_loaders/): 160+
integrations to choose from.
-- [Interface](https://python.langchain.com/api_reference/core/document_loaders/langchain_core.document_loaders.base.BaseLoader.html):
+- @[Interface][BaseLoader]:
API reference for the base interface.
### Splitting documents
@@ -546,6 +541,15 @@ def retrieve_context(query: str):
)
return serialized, retrieved_docs
```
+
+
+
+Here we use the @[tool decorator][tool]
+to configure the tool to attach raw documents as [artifacts](/oss/langchain/messages#param-artifact) to
+each [ToolMessage](/oss/langchain/messages#tool-message). This will let us access document metadata in our application,
+separate from the stringified representation that is sent to the model.
+
+
:::
:::js
```typescript
@@ -572,21 +576,19 @@ const retrieve = tool(
}
);
```
-:::
-
-Here we use the [tool decorator](https://python.langchain.com/api_reference/core/tools/langchain_core.tools.convert.tool.html)
-to configure the tool to attach raw documents as [artifacts](/oss/langchain/messages#param-artifact) to
-each [ToolMessage](/oss/langchain/messages#tool-message). This will let us access document metadata in our application,
+Here we set `responseFormat` to `content_and_artifact` to configure the tool to attach raw documents as [artifacts](/oss/langchain/messages#param-artifact)
+to each [ToolMessage](/oss/langchain/messages#tool-message). This will let us access document metadata in our application,
separate from the stringified representation that is sent to the model.
+:::
:::python
Retrieval tools are not limited to a single string `query` argument, as in the above example. You can
-force the LLM to specify additional search parameters by adding arguments— for example, a category:
+force the LLM to specify additional search parameters by adding arguments— for example, a category:
```python
from typing import Literal
@@ -612,10 +614,19 @@ agent = create_agent(llm, tools, system_prompt=prompt)
```
:::
:::js
+
+Given our tool, we can construct the agent:
+
```typescript
import { createAgent } from "langchain";
-const agent = createAgent({ llm: llm, tools: [retrieve] });
+import { SystemMessage } from "@langchain/core/messages";
+
+const tools = [retrieve];
+const systemPrompt = new SystemMessage(
+  "You have access to a tool that retrieves context from a blog post. " +
+  "Use the tool to help answer user queries."
+);
+
+const agent = createAgent({ model: "openai:gpt-5", tools, systemPrompt });
```
:::
@@ -680,11 +691,12 @@ Once you get the answer, look up common extensions of that method.`;
let agentInputs = { messages: [{ role: "user", content: inputMessage }] };
-for await (const step of await agent.stream(agentInputs, {
+const stream = await agent.stream(agentInputs, {
streamMode: "values",
-})) {
+});
+for await (const step of stream) {
const lastMessage = step.messages[step.messages.length - 1];
- prettyPrint(lastMessage);
+ console.log(`[${lastMessage.role}]: ${lastMessage.content}`);
console.log("-----\n");
}
```
@@ -787,29 +799,31 @@ agent = create_agent(llm, tools=[], system_prompt=prompt_with_context)
:::
:::js
```typescript
-import { createAgent } from "langchain";
+import { createAgent, dynamicSystemPromptMiddleware } from "langchain";
import { SystemMessage } from "@langchain/core/messages";
const agent = createAgent({
model,
tools: [],
- prompt: async (state) => {
- const lastQuery = state.messages[state.messages.length - 1].content;
+ middleware: [
+ dynamicSystemPromptMiddleware(async (state) => {
+ const lastQuery = state.messages[state.messages.length - 1].content;
- const retrievedDocs = await vectorStore.similaritySearch(lastQuery, 2);
+ const retrievedDocs = await vectorStore.similaritySearch(lastQuery, 2);
- const docsContent = retrievedDocs
- .map((doc) => doc.pageContent)
- .join("\n\n");
+ const docsContent = retrievedDocs
+ .map((doc) => doc.pageContent)
+ .join("\n\n");
- // Build system message
- const systemMessage = new SystemMessage(
- `You are a helpful assistant. Use the following context in your response:\n\n${docsContent}`
- );
-    // Return system + existing messages
-    return [systemMessage, ...state.messages];
-  },
+      // Return the system prompt string for this run; the middleware
+      // injects it ahead of the conversation messages.
+      return `You are a helpful assistant. Use the following context in your response:\n\n${docsContent}`;
+    })
+  ]
});
```
:::
@@ -839,9 +853,10 @@ let inputMessage = `What is Task Decomposition?`;
let chainInputs = { messages: [{ role: "user", content: inputMessage }] };
-for await (const step of await agent.stream(chainInputs, {
+const stream = await agent.stream(chainInputs, {
streamMode: "values",
-})) {
+})
+for await (const step of stream) {
const lastMessage = step.messages[step.messages.length - 1];
prettyPrint(lastMessage);
console.log("-----\n");
@@ -855,7 +870,6 @@ This is a fast and effective method for simple queries in constrained settings,
we typically do want to run user queries through semantic search to pull additional
context.
-:::python
The above [RAG chain](#rag-chains) incorporates retrieved context into a single system
@@ -869,47 +883,88 @@ do this for the two-step chain case by:
2. Adding a new node via a [pre-model hook](/oss/langchain/agents#pre-model-hook) to
populate that key (as well as inject the context).
+:::python
```python
from langchain_core.documents import Document
+from langchain.agents.middleware import AgentMiddleware, AgentState
+from langchain_core.messages import AIMessage
-def retrieve_documents(state: AgentState):
- """Inject context into state messages."""
- last_message = state["messages"][-1]
- retrieved_docs = vector_store.similarity_search(last_message.text)
+class State(AgentState):
+ context: list[Document]
- docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
- # Below we augment each input message with context, but we could also
- # modify just the system message, as before.
- augmented_message_content = (
- f"{last_message.text}\n\n"
- "Use the following context to answer the query:\n"
- f"{docs_content}"
- )
- return {
- "messages": [
- last_message.model_copy(
- update={"content": augmented_message_content}
- )
- ],
- "context": retrieved_docs,
- }
+class RetrieveDocumentsMiddleware(AgentMiddleware[State]):
+ def before_model(self, state: AgentState) -> dict[str, Any] | None:
+ last_message = state["messages"][-1]
+ retrieved_docs = vector_store.similarity_search(last_message.text)
+ docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
-class State(AgentState):
- context: list[Document]
+ augmented_message_content = (
+ f"{last_message.text}\n\n"
+ "Use the following context to answer the query:\n"
+ f"{docs_content}"
+ )
+ return {
+ "messages": [last_message.model_copy(update={"content": augmented_message_content})],
+ "context": retrieved_docs,
+ }
agent = create_agent(
llm,
tools=[],
- pre_model_hook=retrieve_documents,
+ middleware=[RetrieveDocumentsMiddleware()],
state_schema=State,
)
```
-
:::
+:::js
+```typescript
+import { z } from "zod";
+import { createMiddleware, createAgent } from "langchain";
+import { Document } from "@langchain/core/documents";
+import { MessagesZodSchema } from "@langchain/langgraph";
+
+const StateSchema = z.object({
+  messages: MessagesZodSchema,
+  context: z.array(z.custom<Document>()),
+});
+
+const retrieveDocumentsMiddleware = createMiddleware({
+  stateSchema: StateSchema,
+  beforeModel: async (state) => {
+    const lastMessage = state.messages[state.messages.length - 1];
+    const retrievedDocs = await vectorStore.similaritySearch(
+      lastMessage.content as string,
+      2
+    );
+
+    const docsContent = retrievedDocs
+      .map((doc) => doc.pageContent)
+      .join("\n\n");
+
+    // Below we augment each input message with context, but we could also
+    // modify just the system message, as before.
+    const augmentedMessageContent =
+      `${lastMessage.content}\n\n` +
+      `Use the following context to answer the query:\n${docsContent}`;
+
+    return {
+      messages: [{
+        ...lastMessage,
+        content: augmentedMessageContent,
+      }],
+      context: retrievedDocs,
+    };
+  },
+});
+
+const agent = createAgent({
+ model,
+ tools: [],
+ middleware: [retrieveDocumentsMiddleware],
+});
+```
+:::
+
## Next steps
diff --git a/src/oss/langchain/retrieval.mdx b/src/oss/langchain/retrieval.mdx
index 566455f0a..ae61f5f21 100644
--- a/src/oss/langchain/retrieval.mdx
+++ b/src/oss/langchain/retrieval.mdx
@@ -161,6 +161,7 @@ graph LR
class C,H decision
```
+:::python
```python
import requests
from langchain_core.tools import tool
@@ -178,22 +179,43 @@ system_prompt = """\
Use fetch_url when you need to fetch information from a web-page; quote relevant snippets.
"""
-agent = create_react_agent(
- model=init_chat_model("claude-sonnet-4-0"),
+agent = create_agent(
+ model="claude-sonnet-4-0",
tools=[fetch_url], # A tool for retrieval [!code highlight]
prompt=system_prompt,
)
```
+:::
+
+:::js
+```typescript
+import { tool, createAgent } from "langchain";
+
+const systemPrompt = `\
+Use fetch_url when you need to fetch information from a web-page; quote relevant snippets.
+`;
+
+const fetchUrl = tool(
+  async (url: string) => {
+    const response = await fetch(url);
+    return await response.text();
+  },
+  { name: "fetch_url", description: "Fetch text content from a URL" }
+);
+
+const agent = createAgent({
+  model: "claude-sonnet-4-0",
+  tools: [fetchUrl], // A tool for retrieval [!code highlight]
+  prompt: systemPrompt,
+});
+```
+:::
This example implements an **Agentic RAG system** to assist users in querying LangGraph documentation. The agent begins by loading [llms.txt](https://llmstxt.org/), which lists available documentation URLs, and can then dynamically use a `fetch_documentation` tool to retrieve and process the relevant content based on the user’s question.
+:::python
```python
import requests
-from langchain.chat_models import init_chat_model
+from langchain.agents import create_agent
+from langchain_core.messages import HumanMessage
from langchain_core.tools import tool
-from langgraph.prebuilt import create_react_agent
from markdownify import markdownify
ALLOWED_DOMAINS = ["https://langchain-ai.github.io/"]
@@ -244,7 +266,7 @@ tools = [fetch_documentation]
model = init_chat_model("claude-sonnet-4-0", max_tokens=32_000)
-agent = create_react_agent(
+agent = create_agent(
model=model,
tools=tools, # [!code highlight]
prompt=system_prompt, # [!code highlight]
@@ -252,18 +274,93 @@ agent = create_react_agent(
)
response = agent.invoke({
- 'messages': [{
- 'role': 'user',
- 'content': (
+ 'messages': [
+ HumanMessage(content=(
"Write a short example of a langgraph agent using the "
"prebuilt create react agent. the agent should be able "
"to look up stock pricing information."
- )
- }]
+ ))
+ ]
})
print(response['messages'][-1].content)
```
+:::
+:::js
+```typescript
+import { tool, createAgent, initChatModel, HumanMessage } from "langchain";
+import { z } from "zod";
+
+const ALLOWED_DOMAINS = ["https://langchain-ai.github.io/"];
+const LLMS_TXT = "https://langchain-ai.github.io/langgraph/llms.txt";
+
+const fetchDocumentation = tool(
+ async (input) => { // [!code highlight]
+ if (!ALLOWED_DOMAINS.some((domain) => input.url.startsWith(domain))) {
+ return `Error: URL not allowed. Must start with one of: ${ALLOWED_DOMAINS.join(", ")}`;
+ }
+ const response = await fetch(input.url);
+ if (!response.ok) {
+ throw new Error(`HTTP error! status: ${response.status}`);
+ }
+ return response.text();
+ },
+ {
+ name: "fetch_documentation",
+ description: "Fetch and convert documentation from a URL",
+ schema: z.object({
+ url: z.string().describe("The URL of the documentation to fetch"),
+ }),
+ }
+);
+
+const llmsTxtResponse = await fetch(LLMS_TXT);
+const llmsTxtContent = await llmsTxtResponse.text();
+
+const systemPrompt = `
+You are an expert TypeScript developer and technical assistant.
+Your primary role is to help users with questions about LangGraph and related tools.
+
+Instructions:
+
+1. If a user asks a question you're unsure about — or one that likely involves API usage,
+ behavior, or configuration — you MUST use the \`fetch_documentation\` tool to consult the relevant docs.
+2. When citing documentation, summarize clearly and include relevant context from the content.
+3. Do not use any URLs outside of the allowed domain.
+4. If a documentation fetch fails, tell the user and proceed with your best expert understanding.
+
+You can access official documentation from the following approved sources:
+
+${llmsTxtContent}
+
+You MUST consult the documentation to get up to date documentation
+before answering a user's question about LangGraph.
+
+Your answers should be clear, concise, and technically accurate.
+`;
+
+const tools = [fetchDocumentation];
+
+const agent = createAgent({
+  model: "claude-sonnet-4-0",
+  tools, // [!code highlight]
+  prompt: systemPrompt, // [!code highlight]
+  name: "Agentic RAG",
+});
+
+const response = await agent.invoke({
+ messages: [
+ new HumanMessage(
+ "Write a short example of a langgraph agent using the " +
+ "prebuilt create react agent. the agent should be able " +
+ "to look up stock pricing information."
+ ),
+ ],
+});
+
+console.log(response.messages.at(-1)?.content);
+```
+:::
+```bash npm
+npm install @langchain/langgraph @langchain/openai @langchain/community @langchain/textsplitters
+```
+
+```bash pnpm
+pnpm install @langchain/langgraph @langchain/openai @langchain/community @langchain/textsplitters
+```
+
+```bash yarn
+yarn add @langchain/langgraph @langchain/openai @langchain/community @langchain/textsplitters
+```
+
+```bash bun
+bun add @langchain/langgraph @langchain/openai @langchain/community @langchain/textsplitters
+```
+
+
+:::
Sign up for LangSmith to quickly spot issues and improve the performance of your LangGraph projects. [LangSmith](https://docs.smith.langchain.com) lets you use trace data to debug, test, and monitor your LLM apps built with LangGraph.
@@ -43,6 +64,7 @@ _set_env("OPENAI_API_KEY")
## 1. Preprocess documents
+:::python
1. Fetch documents to use in our RAG system. We will use three of the most recent pages from [Lilian Weng's excellent blog](https://lilianweng.github.io/). We'll start by fetching the content of the pages using `WebBaseLoader` utility:
```python
from langchain_community.document_loaders import WebBaseLoader
@@ -72,11 +94,42 @@ _set_env("OPENAI_API_KEY")
```python
doc_splits[0].page_content.strip()
```
+:::
+
+:::js
+1. Fetch documents to use in our RAG system. We will use three of the most recent pages from [Lilian Weng's excellent blog](https://lilianweng.github.io/). We'll start by fetching the content of the pages using `CheerioWebBaseLoader`:
+ ```typescript
+ import { CheerioWebBaseLoader } from "@langchain/community/document_loaders/web/cheerio";
+
+ const urls = [
+ "https://lilianweng.github.io/posts/2023-06-23-agent/",
+ "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
+ "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
+ ];
+
+ const docs = await Promise.all(
+ urls.map((url) => new CheerioWebBaseLoader(url).load()),
+ );
+ ```
+2. Split the fetched documents into smaller chunks for indexing into our vectorstore:
+ ```typescript
+ import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
+
+ const docsList = docs.flat();
+
+ const textSplitter = new RecursiveCharacterTextSplitter({
+ chunkSize: 500,
+ chunkOverlap: 50,
+ });
+ const docSplits = await textSplitter.splitDocuments(docsList);
+ ```
+:::
## 2. Create a retriever tool
Now that we have our split documents, we can index them into a vector store that we'll use for semantic search.
+:::python
1. Use an in-memory vector store and OpenAI embeddings:
```python
from langchain_core.vectorstores import InMemoryVectorStore
@@ -101,11 +154,42 @@ Now that we have our split documents, we can index them into a vector store that
```python
retriever_tool.invoke({"query": "types of reward hacking"})
```
+:::
+
+:::js
+1. Use an in-memory vector store and OpenAI embeddings:
+ ```typescript
+ import { MemoryVectorStore } from "@langchain/classic/vectorstores/memory";
+ import { OpenAIEmbeddings } from "@langchain/openai";
+
+ const vectorStore = await MemoryVectorStore.fromDocuments(
+ docSplits,
+ new OpenAIEmbeddings(),
+ );
+
+ const retriever = vectorStore.asRetriever();
+ ```
+2. Create a retriever tool using LangChain's prebuilt `createRetrieverTool`:
+ ```typescript
+ import { createRetrieverTool } from "@langchain/classic/tools/retriever";
+
+ const tool = createRetrieverTool(
+ retriever,
+ {
+ name: "retrieve_blog_posts",
+ description:
+ "Search and return information about Lilian Weng blog posts on LLM agents, prompt engineering, and adversarial attacks on LLMs.",
+ },
+ );
+ const tools = [tool];
+ ```
+:::
## 3. Generate query
Now we will start building components ([nodes](/oss/langgraph/graph-api#nodes) and [edges](/oss/langgraph/graph-api#edges)) for our agentic RAG graph.
+:::python
Note that the components will operate on the [`MessagesState`](/oss/langgraph/graph-api#messagesstate) — graph state that contains a `messages` key with a list of [chat messages](https://python.langchain.com/docs/concepts/messages/).
1. Build a `generate_query_or_respond` node. It will call an LLM to generate a response based on the current graph state (list of messages). Given the input messages, it will decide to retrieve using the retriever tool, or respond directly to the user. Note that we're giving the chat model access to the `retriever_tool` we created earlier via `.bind_tools`:
@@ -113,7 +197,7 @@ Note that the components will operate on the [`MessagesState`](/oss/langgraph/gr
from langgraph.graph import MessagesState
from langchain.chat_models import init_chat_model
- response_model = init_chat_model("openai:gpt-4.1", temperature=0)
+ response_model = init_chat_model("openai:gpt-4o", temperature=0)
def generate_query_or_respond(state: MessagesState):
@@ -158,9 +242,70 @@ Note that the components will operate on the [`MessagesState`](/oss/langgraph/gr
Args:
query: types of reward hacking
```
+:::
+
+:::js
+1. Build a `generateQueryOrRespond` node. It will call an LLM to generate a response based on the current graph state (list of messages). Given the input messages, it will decide to retrieve using the retriever tool, or respond directly to the user. Note that we're giving the chat model access to the `tools` we created earlier via `.bindTools`:
+ ```typescript
+ import { ChatOpenAI } from "@langchain/openai";
+
+ async function generateQueryOrRespond(state) {
+ const { messages } = state;
+ const model = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ }).bindTools(tools); // [!code highlight]
+
+ const response = await model.invoke(messages);
+ return {
+ messages: [response],
+ };
+ }
+ ```
+2. Try it on a random input:
+ ```typescript
+ import { HumanMessage } from "@langchain/core/messages";
+
+ const input = { messages: [new HumanMessage("hello!")] };
+ const result = await generateQueryOrRespond(input);
+ console.log(result.messages[0]);
+ ```
+ **Output:**
+ ```
+ AIMessage {
+ content: "Hello! How can I help you today?",
+ tool_calls: []
+ }
+ ```
+3. Ask a question that requires semantic search:
+ ```typescript
+ const input = {
+ messages: [
+ new HumanMessage("What does Lilian Weng say about types of reward hacking?")
+ ]
+ };
+ const result = await generateQueryOrRespond(input);
+ console.log(result.messages[0]);
+ ```
+ **Output:**
+ ```
+ AIMessage {
+ content: "",
+ tool_calls: [
+ {
+ name: "retrieve_blog_posts",
+ args: { query: "types of reward hacking" },
+ id: "call_...",
+ type: "tool_call"
+ }
+ ]
+ }
+ ```
+:::
## 4. Grade documents
+:::python
1. Add a [conditional edge](/oss/langgraph/graph-api#conditional-edges) — `grade_documents` — to determine whether the retrieved documents are relevant to the question. We will use a model with a structured output schema `GradeDocuments` for document grading. The `grade_documents` function will return the name of the node to go to based on the grading decision (`generate_answer` or `rewrite_question`):
```python
from pydantic import BaseModel, Field
@@ -183,7 +328,7 @@ Note that the components will operate on the [`MessagesState`](/oss/langgraph/gr
)
- grader_model = init_chat_model("openai:gpt-4.1", temperature=0)
+ grader_model = init_chat_model("openai:gpt-4o", temperature=0)
def grade_documents(
@@ -265,9 +410,103 @@ Note that the components will operate on the [`MessagesState`](/oss/langgraph/gr
}
grade_documents(input)
```
+:::
+
+:::js
+1. Add a node — `gradeDocuments` — to determine whether the retrieved documents are relevant to the question. We will use a model with structured output using Zod for document grading. We'll also add a [conditional edge](/oss/langgraph/graph-api#conditional-edges) — `checkRelevance` — that checks the grading result and returns the name of the node to go to (`generate` or `rewrite`):
+ ```typescript
+ import { z } from "zod";
+ import { ChatPromptTemplate } from "@langchain/core/prompts";
+ import { ChatOpenAI } from "@langchain/openai";
+    import { AIMessage, HumanMessage, ToolMessage } from "@langchain/core/messages";
+
+ const prompt = ChatPromptTemplate.fromTemplate(
+ `You are a grader assessing relevance of retrieved docs to a user question.
+ Here are the retrieved docs:
+ \n ------- \n
+ {context}
+ \n ------- \n
+ Here is the user question: {question}
+    If the content of the docs is relevant to the user's question, score them as relevant.
+    Give a binary score 'yes' or 'no' to indicate whether the docs are relevant to the question.
+ Yes: The docs are relevant to the question.
+ No: The docs are not relevant to the question.`,
+ );
+
+    const gradeDocumentsSchema = z.object({
+      binaryScore: z.string().describe("Relevance score 'yes' or 'no'"), // [!code highlight]
+    });
+
+    async function gradeDocuments(state) {
+      const { messages } = state;
+
+      const model = new ChatOpenAI({
+        model: "gpt-4o",
+        temperature: 0,
+      }).withStructuredOutput(gradeDocumentsSchema);
+
+      const chain = prompt.pipe(model);
+
+      const score = await chain.invoke({
+        question: messages.at(0)?.content,
+        context: messages.at(-1)?.content,
+      });
+
+      if (score.binaryScore === "yes") {
+        return "generate";
+      }
+      return "rewrite";
+    }
+ ```
+2. Run this with irrelevant documents in the tool response:
+ ```typescript
+ const input = {
+ messages: [
+ new HumanMessage("What does Lilian Weng say about types of reward hacking?"),
+      new AIMessage({
+        content: "",
+        tool_calls: [
+          {
+            type: "tool_call",
+            name: "retrieve_blog_posts",
+            args: { query: "types of reward hacking" },
+            id: "1",
+          }
+        ]
+      }),
+      new ToolMessage({
+        content: "meow",
+        tool_call_id: "1",
+      })
+    ]
+  };
+ const result = await gradeDocuments(input);
+ ```
+3. Confirm that the relevant documents are classified as such:
+ ```typescript
+ const input = {
+ messages: [
+ new HumanMessage("What does Lilian Weng say about types of reward hacking?"),
+      new AIMessage({
+        content: "",
+        tool_calls: [
+          {
+            type: "tool_call",
+            name: "retrieve_blog_posts",
+            args: { query: "types of reward hacking" },
+            id: "1",
+          }
+        ]
+      }),
+      new ToolMessage({
+        content: "reward hacking can be categorized into two types: environment or goal misspecification, and reward tampering",
+        tool_call_id: "1",
+      })
+    ]
+  };
+   const result = await gradeDocuments(input); // expected to return "generate"
+ ```
+:::
## 5. Rewrite question
+:::python
1. Build the `rewrite_question` node. The retriever tool can return potentially irrelevant documents, which indicates a need to improve the original user question. To do so, we will call the `rewrite_question` node:
```python
REWRITE_PROMPT = (
@@ -320,9 +559,72 @@ Note that the components will operate on the [`MessagesState`](/oss/langgraph/gr
```
What are the different types of reward hacking described by Lilian Weng, and how does she explain them?
```
+:::
+
+:::js
+1. Build the `rewrite` node. The retriever tool can return potentially irrelevant documents, which indicates a need to improve the original user question. To do so, we will call the `rewrite` node:
+ ```typescript
+ import { ChatPromptTemplate } from "@langchain/core/prompts";
+ import { ChatOpenAI } from "@langchain/openai";
+
+ const rewritePrompt = ChatPromptTemplate.fromTemplate(
+ `Look at the input and try to reason about the underlying semantic intent / meaning. \n
+ Here is the initial question:
+ \n ------- \n
+ {question}
+ \n ------- \n
+ Formulate an improved question:`,
+ );
+
+ async function rewrite(state) {
+ const { messages } = state;
+ const question = messages.at(0)?.content;
+
+ const model = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ });
+
+ const response = await rewritePrompt.pipe(model).invoke({ question });
+ return {
+ messages: [response],
+ };
+ }
+ ```
+2. Try it out:
+ ```typescript
+ import { HumanMessage, AIMessage, ToolMessage } from "@langchain/core/messages";
+
+ const input = {
+ messages: [
+ new HumanMessage("What does Lilian Weng say about types of reward hacking?"),
+ new AIMessage({
+ content: "",
+ tool_calls: [
+ {
+ id: "1",
+ name: "retrieve_blog_posts",
+ args: { query: "types of reward hacking" },
+ type: "tool_call"
+ }
+ ]
+ }),
+ new ToolMessage({ content: "meow", tool_call_id: "1" })
+ ]
+ };
+
+ const response = await rewrite(input);
+ console.log(response.messages[0].content);
+ ```
+ **Output:**
+ ```
+ What are the different types of reward hacking described by Lilian Weng, and how does she explain them?
+ ```
+:::
## 6. Generate an answer
+:::python
1. Build `generate_answer` node: if we pass the grader checks, we can generate the final answer based on the original question and the retrieved context:
```python
GENERATE_PROMPT = (
@@ -381,9 +683,84 @@ Note that the components will operate on the [`MessagesState`](/oss/langgraph/gr
Lilian Weng categorizes reward hacking into two types: environment or goal misspecification, and reward tampering. She considers reward hacking as a broad concept that includes both of these categories. Reward hacking occurs when an agent exploits flaws or ambiguities in the reward function to achieve high rewards without performing the intended behaviors.
```
+:::
+
+:::js
+1. Build the `generate` node: if we pass the grader checks, we can generate the final answer based on the original question and the retrieved context:
+ ```typescript
+ import { ChatPromptTemplate } from "@langchain/core/prompts";
+ import { ChatOpenAI } from "@langchain/openai";
+
+ async function generate(state) {
+ const { messages } = state;
+ const question = messages.at(0)?.content;
+ const context = messages.at(-1)?.content;
+
+ const prompt = ChatPromptTemplate.fromTemplate(
+ `You are an assistant for question-answering tasks.
+ Use the following pieces of retrieved context to answer the question.
+ If you don't know the answer, just say that you don't know.
+ Use three sentences maximum and keep the answer concise.
+ Question: {question}
+ Context: {context}`
+ );
+
+ const llm = new ChatOpenAI({
+ model: "gpt-4o",
+ temperature: 0,
+ });
+
+ const ragChain = prompt.pipe(llm);
+
+ const response = await ragChain.invoke({
+ context,
+ question,
+ });
+
+ return {
+ messages: [response],
+ };
+ }
+ ```
+2. Try it:
+ ```typescript
+ import { HumanMessage, AIMessage, ToolMessage } from "@langchain/core/messages";
+
+ const input = {
+ messages: [
+ new HumanMessage("What does Lilian Weng say about types of reward hacking?"),
+ new AIMessage({
+ content: "",
+ tool_calls: [
+ {
+ id: "1",
+ name: "retrieve_blog_posts",
+ args: { query: "types of reward hacking" },
+ type: "tool_call"
+ }
+ ]
+ }),
+ new ToolMessage({
+ content: "reward hacking can be categorized into two types: environment or goal misspecification, and reward tampering",
+ tool_call_id: "1"
+ })
+ ]
+ };
+
+ const response = await generate(input);
+ console.log(response.messages[0].content);
+ ```
+ **Output:**
+ ```
+ Lilian Weng categorizes reward hacking into two types: environment or goal misspecification, and reward tampering. She considers reward hacking as a broad concept that includes both of these categories. Reward hacking occurs when an agent exploits flaws or ambiguities in the reward function to achieve high rewards without performing the intended behaviors.
+ ```
+:::
## 7. Assemble the graph
+Now we'll assemble all the nodes and edges into a complete graph:
+
+:::python
* Start with a `generate_query_or_respond` and determine if we need to call `retriever_tool`
* Route to next step using `tools_condition`:
* If `generate_query_or_respond` returned `tool_calls`, call `retriever_tool` to retrieve context
@@ -441,9 +818,71 @@ display(Image(graph.get_graph().draw_mermaid_png()))
```

+:::
+
+:::js
+* Start with `generateQueryOrRespond` and determine whether we need to call the retriever tool
+* Route to next step using a conditional edge:
+ * If `generateQueryOrRespond` returned `tool_calls`, call the retriever tool to retrieve context
+ * Otherwise, respond directly to the user
+* Grade retrieved document content for relevance to the question (`gradeDocuments`) and route to next step:
+ * If not relevant, rewrite the question using `rewrite` and then call `generateQueryOrRespond` again
+  * If relevant, proceed to `generate` and produce the final response using the `ToolMessage` with the retrieved document context
+
+```typescript
+import { StateGraph, START, END } from "@langchain/langgraph";
+import { ToolNode } from "@langchain/langgraph/prebuilt";
+import { isAIMessage } from "@langchain/core/messages";
+
+// Create a ToolNode for the retriever
+const toolNode = new ToolNode(tools);
+
+// Helper function to determine if we should retrieve
+function shouldRetrieve(state) {
+  const { messages } = state;
+  const lastMessage = messages.at(-1);
+
+  if (isAIMessage(lastMessage) && lastMessage.tool_calls?.length) {
+    return "retrieve";
+  }
+  return END;
+}
+
+// Define the graph
+const builder = new StateGraph(GraphState)
+  .addNode("generateQueryOrRespond", generateQueryOrRespond)
+  .addNode("retrieve", toolNode)
+  .addNode("rewrite", rewrite)
+  .addNode("generate", generate)
+  // Add edges
+  .addEdge(START, "generateQueryOrRespond")
+  // Decide whether to retrieve
+  .addConditionalEdges("generateQueryOrRespond", shouldRetrieve)
+  // Grade retrieved documents; `gradeDocuments` returns the name of the
+  // next node to visit ("generate" or "rewrite")
+  .addConditionalEdges("retrieve", gradeDocuments)
+  .addEdge("generate", END)
+  .addEdge("rewrite", "generateQueryOrRespond");
+
+// Compile
+const graph = builder.compile();
+```
+:::
## 8. Run the agentic RAG
+Now let's test the complete graph by running it with a question:
+
+:::python
```python
for chunk in graph.stream(
{
@@ -495,434 +934,44 @@ Update from node generate_answer
Lilian Weng categorizes reward hacking into two types: environment or goal misspecification, and reward tampering. She considers reward hacking as a broad concept that includes both of these categories. Reward hacking occurs when an agent exploits flaws or ambiguities in the reward function to achieve high rewards without performing the intended behaviors.
```
-
-
:::
:::js
-We can implement
-[Retrieval Agents](https://js.langchain.com/docs/use_cases/question_answering/conversational_retrieval_agents)
-in [LangGraph](https://js.langchain.com/docs/langgraph).
-
-## Setup
-
-### Load env vars
-
-Add a `.env` variable in the root of the `./examples` folder with your
-variables.
-
-```typescript
-// import dotenv from 'dotenv';
-
-// dotenv.config();
-```
-
-### Install dependencies
-
-```bash
-npm install cheerio zod zod-to-json-schema langchain @langchain/openai @langchain/core @langchain/community @langchain/textsplitters
-```
-
-## Retriever
-
-```typescript
-import { CheerioWebBaseLoader } from "@langchain/community/document_loaders/web/cheerio";
-import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
-import { MemoryVectorStore } from "langchain/vectorstores/memory";
-import { OpenAIEmbeddings } from "@langchain/openai";
-
-const urls = [
- "https://lilianweng.github.io/posts/2023-06-23-agent/",
- "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
- "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
-];
-
-const docs = await Promise.all(
- urls.map((url) => new CheerioWebBaseLoader(url).load()),
-);
-const docsList = docs.flat();
-
-const textSplitter = new RecursiveCharacterTextSplitter({
- chunkSize: 500,
- chunkOverlap: 50,
-});
-const docSplits = await textSplitter.splitDocuments(docsList);
-
-// Add to vectorDB
-const vectorStore = await MemoryVectorStore.fromDocuments(
- docSplits,
- new OpenAIEmbeddings(),
-);
-
-const retriever = vectorStore.asRetriever();
-```
-
-## Agent state
-
-We will define a graph.
-
-You may pass a custom `state` object to the graph, or use a simple list of
-`messages`.
-
-Our state will be a list of `messages`.
-
-Each node in our graph will append to it.
-
-```typescript
-import { Annotation } from "@langchain/langgraph";
-import { BaseMessage } from "@langchain/core/messages";
-
-const GraphState = Annotation.Root({
-  messages: Annotation<BaseMessage[]>({
- reducer: (x, y) => x.concat(y),
- default: () => [],
- })
-})
-```
-
-```typescript
-import { createRetrieverTool } from "langchain/tools/retriever";
-import { ToolNode } from "@langchain/langgraph/prebuilt";
-
-const tool = createRetrieverTool(
- retriever,
- {
- name: "retrieve_blog_posts",
- description:
- "Search and return information about Lilian Weng blog posts on LLM agents, prompt engineering, and adversarial attacks on LLMs.",
- },
-);
-const tools = [tool];
-
-const toolNode = new ToolNode(tools);
-```
-
-## Nodes and Edges
-
-Each node will -
-
-1/ Either be a function or a runnable.
-
-2/ Modify the `state`.
-
-The edges choose which node to call next.
-
-We can lay out an agentic RAG graph like this:
-
-
-
-### Edges
-
-```typescript
-import { END } from "@langchain/langgraph";
-import { pull } from "langchain/hub";
-import { z } from "zod";
-import { ChatPromptTemplate } from "@langchain/core/prompts";
-import { ChatOpenAI } from "@langchain/openai";
-import { AIMessage, BaseMessage } from "@langchain/core/messages";
-
-/**
- * Decides whether the agent should retrieve more information or end the process.
- * This function checks the last message in the state for a function call. If a tool call is
- * present, the process continues to retrieve information. Otherwise, it ends the process.
- * @param {typeof GraphState.State} state - The current state of the agent, including all messages.
- * @returns {string} - A decision to either "continue" the retrieval process or "end" it.
- */
-function shouldRetrieve(state: typeof GraphState.State): string {
- const { messages } = state;
- console.log("---DECIDE TO RETRIEVE---");
- const lastMessage = messages[messages.length - 1];
-
- if ("tool_calls" in lastMessage && Array.isArray(lastMessage.tool_calls) && lastMessage.tool_calls.length) {
- console.log("---DECISION: RETRIEVE---");
- return "retrieve";
- }
- // If there are no tool calls then we finish.
- return END;
-}
-
-/**
- * Determines whether the Agent should continue based on the relevance of retrieved documents.
- * This function checks if the last message in the conversation is of type FunctionMessage, indicating
- * that document retrieval has been performed. It then evaluates the relevance of these documents to the user's
- * initial question using a predefined model and output parser. If the documents are relevant, the conversation
- * is considered complete. Otherwise, the retrieval process is continued.
- * @param {typeof GraphState.State} state - The current state of the agent, including all messages.
- * @returns {Promise<Partial<typeof GraphState.State>>} - The updated state with the new message added to the list of messages.
- */
-async function gradeDocuments(state: typeof GraphState.State): Promise<Partial<typeof GraphState.State>> {
- console.log("---GET RELEVANCE---");
-
- const { messages } = state;
- const tool = {
- name: "give_relevance_score",
- description: "Give a relevance score to the retrieved documents.",
- schema: z.object({
- binaryScore: z.string().describe("Relevance score 'yes' or 'no'"),
- })
- }
-
- const prompt = ChatPromptTemplate.fromTemplate(
- `You are a grader assessing relevance of retrieved docs to a user question.
- Here are the retrieved docs:
- \n ------- \n
- {context}
- \n ------- \n
- Here is the user question: {question}
- If the content of the docs are relevant to the users question, score them as relevant.
- Give a binary score 'yes' or 'no' score to indicate whether the docs are relevant to the question.
- Yes: The docs are relevant to the question.
- No: The docs are not relevant to the question.`,
- );
-
- const model = new ChatOpenAI({
- model: "gpt-4o",
- temperature: 0,
- }).bindTools([tool], {
- tool_choice: tool.name,
- });
-
- const chain = prompt.pipe(model);
-
- const lastMessage = messages[messages.length - 1];
-
- const score = await chain.invoke({
- question: messages[0].content as string,
- context: lastMessage.content as string,
- });
-
- return {
- messages: [score]
- };
-}
-
-/**
- * Check the relevance of the previous LLM tool call.
- *
- * @param {typeof GraphState.State} state - The current state of the agent, including all messages.
- * @returns {string} - A directive to either "yes" or "no" based on the relevance of the documents.
- */
-function checkRelevance(state: typeof GraphState.State): string {
- console.log("---CHECK RELEVANCE---");
-
- const { messages } = state;
- const lastMessage = messages[messages.length - 1];
- if (!("tool_calls" in lastMessage)) {
- throw new Error("The 'checkRelevance' node requires the most recent message to contain tool calls.")
- }
- const toolCalls = (lastMessage as AIMessage).tool_calls;
- if (!toolCalls || !toolCalls.length) {
- throw new Error("Last message was not a function message");
- }
-
- if (toolCalls[0].args.binaryScore === "yes") {
- console.log("---DECISION: DOCS RELEVANT---");
- return "yes";
- }
- console.log("---DECISION: DOCS NOT RELEVANT---");
- return "no";
-}
-
-// Nodes
-
-/**
- * Invokes the agent model to generate a response based on the current state.
- * This function calls the agent model to generate a response to the current conversation state.
- * The response is added to the state's messages.
- * @param {typeof GraphState.State} state - The current state of the agent, including all messages.
- * @returns {Promise<Partial<typeof GraphState.State>>} - The updated state with the new message added to the list of messages.
- */
-async function agent(state: typeof GraphState.State): Promise<Partial<typeof GraphState.State>> {
- console.log("---CALL AGENT---");
-
- const { messages } = state;
- // Find the AIMessage which contains the `give_relevance_score` tool call,
- // and remove it if it exists. This is because the agent does not need to know
- // the relevance score.
- const filteredMessages = messages.filter((message) => {
- if ("tool_calls" in message && Array.isArray(message.tool_calls) && message.tool_calls.length > 0) {
- return message.tool_calls[0].name !== "give_relevance_score";
- }
- return true;
- });
-
- const model = new ChatOpenAI({
- model: "gpt-4o",
- temperature: 0,
- streaming: true,
- }).bindTools(tools);
-
- const response = await model.invoke(filteredMessages);
- return {
- messages: [response],
- };
-}
-
-/**
- * Transform the query to produce a better question.
- * @param {typeof GraphState.State} state - The current state of the agent, including all messages.
- * @returns {Promise<Partial<typeof GraphState.State>>} - The updated state with the new message added to the list of messages.
- */
-async function rewrite(state: typeof GraphState.State): Promise<Partial<typeof GraphState.State>> {
- console.log("---TRANSFORM QUERY---");
-
- const { messages } = state;
- const question = messages[0].content as string;
- const prompt = ChatPromptTemplate.fromTemplate(
- `Look at the input and try to reason about the underlying semantic intent / meaning. \n
-Here is the initial question:
-\n ------- \n
-{question}
-\n ------- \n
-Formulate an improved question:`,
- );
-
- // Grader
- const model = new ChatOpenAI({
- model: "gpt-4o",
- temperature: 0,
- streaming: true,
- });
- const response = await prompt.pipe(model).invoke({ question });
- return {
- messages: [response],
- };
-}
-
-/**
- * Generate answer
- * @param {typeof GraphState.State} state - The current state of the agent, including all messages.
- * @returns {Promise<Partial<typeof GraphState.State>>} - The updated state with the new message added to the list of messages.
- */
-async function generate(state: typeof GraphState.State): Promise<Partial<typeof GraphState.State>> {
- console.log("---GENERATE---");
-
- const { messages } = state;
- const question = messages[0].content as string;
- // Extract the most recent ToolMessage
- const lastToolMessage = messages.slice().reverse().find((msg) => msg._getType() === "tool");
- if (!lastToolMessage) {
- throw new Error("No tool message found in the conversation history");
- }
-
- const docs = lastToolMessage.content as string;
-
- const prompt = await pull<ChatPromptTemplate>("rlm/rag-prompt");
-
- const llm = new ChatOpenAI({
- model: "gpt-4o",
- temperature: 0,
- streaming: true,
- });
-
- const ragChain = prompt.pipe(llm);
-
- const response = await ragChain.invoke({
- context: docs,
- question,
- });
-
- return {
- messages: [response],
- };
-}
-```
-
-## Graph
-
-* Start with an agent, `callModel`
-* Agent make a decision to call a function
-* If so, then `action` to call tool (retriever)
-* Then call agent with the tool output added to messages (`state`)
-
-```typescript
-import { StateGraph } from "@langchain/langgraph";
-
-// Define the graph
-const workflow = new StateGraph(GraphState)
- // Define the nodes which we'll cycle between.
- .addNode("agent", agent)
- .addNode("retrieve", toolNode)
- .addNode("gradeDocuments", gradeDocuments)
- .addNode("rewrite", rewrite)
- .addNode("generate", generate);
-```
-
-```typescript
-import { START } from "@langchain/langgraph";
-
-// Call agent node to decide to retrieve or not
-workflow.addEdge(START, "agent");
-
-// Decide whether to retrieve
-workflow.addConditionalEdges(
- "agent",
- // Assess agent decision
- shouldRetrieve,
-);
-
-workflow.addEdge("retrieve", "gradeDocuments");
-
-// Edges taken after the `action` node is called.
-workflow.addConditionalEdges(
- "gradeDocuments",
- // Assess agent decision
- checkRelevance,
- {
- // Call tool node
- yes: "generate",
- no: "rewrite", // placeholder
- },
-);
-
-workflow.addEdge("generate", END);
-workflow.addEdge("rewrite", "agent");
-
-// Compile
-const app = workflow.compile();
-```
-
```typescript
import { HumanMessage } from "@langchain/core/messages";
const inputs = {
messages: [
- new HumanMessage(
- "What are the types of agent memory based on Lilian Weng's blog post?",
- ),
- ],
+ new HumanMessage("What does Lilian Weng say about types of reward hacking?")
+ ]
};
-let finalState;
-for await (const output of await app.stream(inputs)) {
+
+for await (const output of await graph.stream(inputs)) {
for (const [key, value] of Object.entries(output)) {
const lastMsg = output[key].messages[output[key].messages.length - 1];
console.log(`Output from node: '${key}'`);
- console.dir({
+ console.log({
type: lastMsg._getType(),
content: lastMsg.content,
tool_calls: lastMsg.tool_calls,
- }, { depth: null });
+ });
console.log("---\n");
- finalState = value;
}
}
-
-console.log(JSON.stringify(finalState, null, 2));
```
-```output
----CALL AGENT---
----DECIDE TO RETRIEVE---
----DECISION: RETRIEVE---
-Output from node: 'agent'
+**Output:**
+
+```
+Output from node: 'generateQueryOrRespond'
{
type: 'ai',
content: '',
tool_calls: [
{
name: 'retrieve_blog_posts',
- args: { query: 'types of agent memory' },
- id: 'call_adLYkV7T2ry1EZFboT0jPuwn',
+ args: { query: 'types of reward hacking' },
+ id: 'call_...',
type: 'tool_call'
}
]
@@ -932,128 +981,19 @@ Output from node: 'agent'
Output from node: 'retrieve'
{
type: 'tool',
- content: 'Agent System Overview\n' +
- ' \n' +
- ' Component One: Planning\n' +
- ' \n' +
- ' \n' +
- ' Task Decomposition\n' +
- ' \n' +
- ' Self-Reflection\n' +
- ' \n' +
- ' \n' +
- ' Component Two: Memory\n' +
- ' \n' +
- ' \n' +
- ' Types of Memory\n' +
- ' \n' +
- ' Maximum Inner Product Search (MIPS)\n' +
- '\n' +
- 'Memory stream: is a long-term memory module (external database) that records a comprehensive list of agents’ experience in natural language.\n' +
- '\n' +
- 'Each element is an observation, an event directly provided by the agent.\n' +
- '- Inter-agent communication can trigger new natural language statements.\n' +
- '\n' +
- '\n' +
- 'Retrieval model: surfaces the context to inform the agent’s behavior, according to relevance, recency and importance.\n' +
- '\n' +
- 'Planning\n' +
- '\n' +
- 'Subgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\n' +
- 'Reflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final results.\n' +
- '\n' +
- '\n' +
- 'Memory\n' +
- '\n' +
- 'The design of generative agents combines LLM with memory, planning and reflection mechanisms to enable agents to behave conditioned on past experience, as well as to interact with other agents.',
+ content: '(Note: Some work defines reward tampering as a distinct category...\n' +
+ 'At a high level, reward hacking can be categorized into two types: environment or goal misspecification, and reward tampering.\n' +
+ '...',
tool_calls: undefined
}
---
----GET RELEVANCE---
----CHECK RELEVANCE---
----DECISION: DOCS NOT RELEVANT---
-Output from node: 'gradeDocuments'
+Output from node: 'generate'
{
type: 'ai',
- content: '',
- tool_calls: [
- {
- name: 'give_relevance_score',
- args: { binaryScore: 'no' },
- type: 'tool_call',
- id: 'call_AGE7gORVFubExfJWcjb0C2nV'
- }
- ]
-}
----
-
----TRANSFORM QUERY---
-Output from node: 'rewrite'
-{
- type: 'ai',
- content: "What are the different types of agent memory described in Lilian Weng's blog post?",
+ content: 'Lilian Weng categorizes reward hacking into two types: environment or goal misspecification, and reward tampering. She considers reward hacking as a broad concept that includes both of these categories. Reward hacking occurs when an agent exploits flaws or ambiguities in the reward function to achieve high rewards without performing the intended behaviors.',
tool_calls: []
}
---
-
----CALL AGENT---
----DECIDE TO RETRIEVE---
-Output from node: 'agent'
-{
- type: 'ai',
- content: "Lilian Weng's blog post describes the following types of agent memory:\n" +
- '\n' +
- '1. **Memory Stream**:\n' +
- ' - This is a long-term memory module (external database) that records a comprehensive list of agents’ experiences in natural language.\n' +
- ' - Each element in the memory stream is an observation or an event directly provided by the agent.\n' +
- ' - Inter-agent communication can trigger new natural language statements to be added to the memory stream.\n' +
- '\n' +
- '2. **Retrieval Model**:\n' +
- ' - This model surfaces the context to inform the agent’s behavior based on relevance, recency, and importance.\n' +
- '\n' +
- 'These memory types are part of a broader design that combines generative agents with memory, planning, and reflection mechanisms to enable agents to behave based on past experiences and interact with other agents.',
- tool_calls: []
-}
----
-
-{
- "messages": [
- {
- "lc": 1,
- "type": "constructor",
- "id": [
- "langchain_core",
- "messages",
- "AIMessageChunk"
- ],
- "kwargs": {
- "content": "Lilian Weng's blog post describes the following types of agent memory:\n\n1. **Memory Stream**:\n - This is a long-term memory module (external database) that records a comprehensive list of agents’ experiences in natural language.\n - Each element in the memory stream is an observation or an event directly provided by the agent.\n - Inter-agent communication can trigger new natural language statements to be added to the memory stream.\n\n2. **Retrieval Model**:\n - This model surfaces the context to inform the agent’s behavior based on relevance, recency, and importance.\n\nThese memory types are part of a broader design that combines generative agents with memory, planning, and reflection mechanisms to enable agents to behave based on past experiences and interact with other agents.",
- "additional_kwargs": {},
- "response_metadata": {
- "estimatedTokenUsage": {
- "promptTokens": 280,
- "completionTokens": 155,
- "totalTokens": 435
- },
- "prompt": 0,
- "completion": 0,
- "finish_reason": "stop",
- "system_fingerprint": "fp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3bfp_3cd8b62c3b"
- },
- "tool_call_chunks": [],
- "id": "chatcmpl-9zAaVQGmTLiCaFvtbxUK60qMFsSmU",
- "usage_metadata": {
- "input_tokens": 363,
- "output_tokens": 156,
- "total_tokens": 519
- },
- "tool_calls": [],
- "invalid_tool_calls": []
- }
- }
- ]
-}
```
-
:::
diff --git a/src/oss/python/integrations/splitters/character_text_splitter.mdx b/src/oss/python/integrations/splitters/character_text_splitter.mdx
deleted file mode 100644
index a1d8bc226..000000000
--- a/src/oss/python/integrations/splitters/character_text_splitter.mdx
+++ /dev/null
@@ -1,62 +0,0 @@
----
-title: Splitting by character
----
-
-Character-based splitting is the simplest approach to text splitting. It divides text using a specified character sequence (default: `"\n\n"`), with chunk length measured by the number of characters.
-
-**Key points**:
-1. **How text is split**: by a given character separator.
-2. **How chunk size is measured**: by character count.
-
-You can choose between:
-- `.split_text` — returns plain string chunks.
-- `.create_documents` — returns LangChain [Document](https://python.langchain.com/api_reference/core/documents/langchain_core.documents.base.Document.html) objects, useful when metadata needs to be preserved for downstream tasks.
-
-```python
-%pip install -qU langchain-text-splitters
-```
-
-
-```python
-from langchain_text_splitters import CharacterTextSplitter
-
-# Load an example document
-with open("state_of_the_union.txt") as f:
- state_of_the_union = f.read()
-
-text_splitter = CharacterTextSplitter(
- separator="\n\n",
- chunk_size=1000,
- chunk_overlap=200,
- length_function=len,
- is_separator_regex=False,
-)
-texts = text_splitter.create_documents([state_of_the_union])
-print(texts[0])
-```
-```output
-page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'
-```
-Use `.create_documents` to propagate metadata associated with each document to the output chunks:
-
-
-```python
-metadatas = [{"document": 1}, {"document": 2}]
-documents = text_splitter.create_documents(
- [state_of_the_union, state_of_the_union], metadatas=metadatas
-)
-print(documents[0])
-```
-```output
-page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.' metadata={'document': 1}
-```
-Use `.split_text` to obtain the string content directly:
-
-
-```python
-text_splitter.split_text(state_of_the_union)[0]
-```
-
-```output
-'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.'
-```
diff --git a/src/oss/python/integrations/splitters/code_splitter.mdx b/src/oss/python/integrations/splitters/code_splitter.mdx
deleted file mode 100644
index 413f09ec5..000000000
--- a/src/oss/python/integrations/splitters/code_splitter.mdx
+++ /dev/null
@@ -1,587 +0,0 @@
----
-title: Splitting code
----
-
-[RecursiveCharacterTextSplitter](https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.RecursiveCharacterTextSplitter.html) includes pre-built lists of separators that are useful for [splitting text](/oss/integrations/splitters/) in a specific programming language.
-
-Supported languages are stored in the `langchain_text_splitters.Language` enum. They include:
-
-```
-"cpp",
-"go",
-"java",
-"kotlin",
-"js",
-"ts",
-"php",
-"proto",
-"python",
-"rst",
-"ruby",
-"rust",
-"scala",
-"swift",
-"markdown",
-"latex",
-"html",
-"sol",
-"csharp",
-"cobol",
-"c",
-"lua",
-"perl",
-"haskell"
-```
-
-To view the list of separators for a given language, pass a value from this enum into
-```python
-RecursiveCharacterTextSplitter.get_separators_for_language
-```
-
-To instantiate a splitter that is tailored for a specific language, pass a value from the enum into
-```python
-RecursiveCharacterTextSplitter.from_language
-```
-
-Below we demonstrate examples for the various languages.
-
-
-```python
-%pip install -qU langchain-text-splitters
-```
-
-
-```python
-from langchain_text_splitters import (
- Language,
- RecursiveCharacterTextSplitter,
-)
-```
-
-To view the full list of supported languages:
-
-
-```python
-[e.value for e in Language]
-```
-
-
-
-```output
-['cpp',
- 'go',
- 'java',
- 'kotlin',
- 'js',
- 'ts',
- 'php',
- 'proto',
- 'python',
- 'rst',
- 'ruby',
- 'rust',
- 'scala',
- 'swift',
- 'markdown',
- 'latex',
- 'html',
- 'sol',
- 'csharp',
- 'cobol',
- 'c',
- 'lua',
- 'perl',
- 'haskell',
- 'elixir',
- 'powershell',
- 'visualbasic6']
-```
-
-
-You can also see the separators used for a given language:
-
-
-```python
-RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)
-```
-
-
-
-```output
-['\nclass ', '\ndef ', '\n\tdef ', '\n\n', '\n', ' ', '']
-```
-
-
-## Python
-
-Here's an example using the PythonTextSplitter:
-
-
-
-
-```python
-PYTHON_CODE = """
-def hello_world():
- print("Hello, World!")
-
-# Call the function
-hello_world()
-"""
-python_splitter = RecursiveCharacterTextSplitter.from_language(
- language=Language.PYTHON, chunk_size=50, chunk_overlap=0
-)
-python_docs = python_splitter.create_documents([PYTHON_CODE])
-python_docs
-```
-
-
-
-```output
-[Document(metadata={}, page_content='def hello_world():\n print("Hello, World!")'),
- Document(metadata={}, page_content='# Call the function\nhello_world()')]
-```
-
-
-## JS
-Here's an example using the JS text splitter:
-
-
-```python
-JS_CODE = """
-function helloWorld() {
- console.log("Hello, World!");
-}
-
-// Call the function
-helloWorld();
-"""
-
-js_splitter = RecursiveCharacterTextSplitter.from_language(
- language=Language.JS, chunk_size=60, chunk_overlap=0
-)
-js_docs = js_splitter.create_documents([JS_CODE])
-js_docs
-```
-
-
-
-```output
-[Document(metadata={}, page_content='function helloWorld() {\n console.log("Hello, World!");\n}'),
- Document(metadata={}, page_content='// Call the function\nhelloWorld();')]
-```
-
-
-## TS
-Here's an example using the TS text splitter:
-
-
-```python
-TS_CODE = """
-function helloWorld(): void {
- console.log("Hello, World!");
-}
-
-// Call the function
-helloWorld();
-"""
-
-ts_splitter = RecursiveCharacterTextSplitter.from_language(
- language=Language.TS, chunk_size=60, chunk_overlap=0
-)
-ts_docs = ts_splitter.create_documents([TS_CODE])
-ts_docs
-```
-
-
-
-```output
-[Document(metadata={}, page_content='function helloWorld(): void {'),
- Document(metadata={}, page_content='console.log("Hello, World!");\n}'),
- Document(metadata={}, page_content='// Call the function\nhelloWorld();')]
-```
-
-
-## Markdown
-
-Here's an example using the Markdown text splitter:
-
-
-
-```python
-markdown_text = """
-# 🦜️🔗 LangChain
-
-⚡ Building applications with LLMs through composability ⚡
-
-## What is LangChain?
-
-# Hopefully this code block isn't split
-LangChain is a framework for...
-
-As an open-source project in a rapidly developing field, we are extremely open to contributions.
-"""
-```
-
-
-```python
-md_splitter = RecursiveCharacterTextSplitter.from_language(
- language=Language.MARKDOWN, chunk_size=60, chunk_overlap=0
-)
-md_docs = md_splitter.create_documents([markdown_text])
-md_docs
-```
-
-
-
-```output
-[Document(metadata={}, page_content='# 🦜️🔗 LangChain'),
- Document(metadata={}, page_content='⚡ Building applications with LLMs through composability ⚡'),
- Document(metadata={}, page_content='## What is LangChain?'),
- Document(metadata={}, page_content="# Hopefully this code block isn't split"),
- Document(metadata={}, page_content='LangChain is a framework for...'),
- Document(metadata={}, page_content='As an open-source project in a rapidly developing field, we'),
- Document(metadata={}, page_content='are extremely open to contributions.')]
-```
-
-
-## Latex
-
-Here's an example on Latex text:
-
-
-
-```python
-latex_text = """
-\documentclass{article}
-
-\begin{document}
-
-\maketitle
-
-\section{Introduction}
-Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in a variety of natural language processing tasks, including language translation, text generation, and sentiment analysis.
-
-\subsection{History of LLMs}
-The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.
-
-\subsection{Applications of LLMs}
-LLMs have many applications in industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.
-
-\end{document}
-"""
-```
-
-
-```python
-latex_splitter = RecursiveCharacterTextSplitter.from_language(
-    language=Language.LATEX, chunk_size=60, chunk_overlap=0
-)
-latex_docs = latex_splitter.create_documents([latex_text])
-latex_docs
-```
-
-
-
-```output
-[Document(metadata={}, page_content='\\documentclass{article}\n\n\x08egin{document}\n\n\\maketitle'),
- Document(metadata={}, page_content='\\section{Introduction}'),
- Document(metadata={}, page_content='Large language models (LLMs) are a type of machine learning'),
- Document(metadata={}, page_content='model that can be trained on vast amounts of text data to'),
- Document(metadata={}, page_content='generate human-like language. In recent years, LLMs have'),
- Document(metadata={}, page_content='made significant advances in a variety of natural language'),
- Document(metadata={}, page_content='processing tasks, including language translation, text'),
- Document(metadata={}, page_content='generation, and sentiment analysis.'),
- Document(metadata={}, page_content='\\subsection{History of LLMs}'),
- Document(metadata={}, page_content='The earliest LLMs were developed in the 1980s and 1990s,'),
- Document(metadata={}, page_content='but they were limited by the amount of data that could be'),
- Document(metadata={}, page_content='processed and the computational power available at the'),
- Document(metadata={}, page_content='time. In the past decade, however, advances in hardware and'),
- Document(metadata={}, page_content='software have made it possible to train LLMs on massive'),
- Document(metadata={}, page_content='datasets, leading to significant improvements in'),
- Document(metadata={}, page_content='performance.'),
- Document(metadata={}, page_content='\\subsection{Applications of LLMs}'),
- Document(metadata={}, page_content='LLMs have many applications in industry, including'),
- Document(metadata={}, page_content='chatbots, content creation, and virtual assistants. They'),
- Document(metadata={}, page_content='can also be used in academia for research in linguistics,'),
- Document(metadata={}, page_content='psychology, and computational linguistics.'),
- Document(metadata={}, page_content='\\end{document}')]
-```
-
-
-## HTML
-
-Here's an example using an HTML text splitter:
-
-
-
-```python
-html_text = """
-<!DOCTYPE html>
-<html>
-    <head>
-        <title>🦜️🔗 LangChain</title>
-    </head>
-    <body>
-        <div>
-            <h1>🦜️🔗 LangChain</h1>
-            <p>⚡ Building applications with LLMs through composability ⚡</p>
-        </div>
-        <div>
-            As an open-source project in a rapidly developing field, we are extremely open to contributions.
-        </div>
-    </body>
-</html>
-"""
-```
-
-
-```python
-html_splitter = RecursiveCharacterTextSplitter.from_language(
- language=Language.HTML, chunk_size=60, chunk_overlap=0
-)
-html_docs = html_splitter.create_documents([html_text])
-html_docs
-```
-
-
-
-```output
-[Document(metadata={}, page_content='<!DOCTYPE html>\n<html>'),
- Document(metadata={}, page_content='<head>\n        <title>🦜️🔗 LangChain</title>'),
- Document(metadata={}, page_content='</head>\n    <body>'),
- Document(metadata={}, page_content='<div>'),
- Document(metadata={}, page_content='<h1>🦜️🔗 LangChain</h1>'),
- Document(metadata={}, page_content='<p>⚡ Building applications with LLMs through composability ⚡'),
- Document(metadata={}, page_content='</p>\n        </div>\n        <div>'),
- Document(metadata={}, page_content='As an open-source project in a rapidly dev'),
- Document(metadata={}, page_content='eloping field, we are extremely open to contributions.'),
- Document(metadata={}, page_content='</div>\n    </body>\n</html>')]
-```
-
-
-## Solidity
-Here's an example using the Solidity text splitter:
-
-
-```python
-SOL_CODE = """
-pragma solidity ^0.8.20;
-contract HelloWorld {
- function add(uint a, uint b) pure public returns(uint) {
- return a + b;
- }
-}
-"""
-
-sol_splitter = RecursiveCharacterTextSplitter.from_language(
- language=Language.SOL, chunk_size=128, chunk_overlap=0
-)
-sol_docs = sol_splitter.create_documents([SOL_CODE])
-sol_docs
-```
-
-
-
-```output
-[Document(metadata={}, page_content='pragma solidity ^0.8.20;'),
- Document(metadata={}, page_content='contract HelloWorld {\n function add(uint a, uint b) pure public returns(uint) {\n return a + b;\n }\n}')]
-```
-
-
-## C#
-Here's an example using the C# text splitter:
-
-
-
-```python
-C_CODE = """
-using System;
-class Program
-{
- static void Main()
- {
- int age = 30; // Change the age value as needed
-
- // Categorize the age without any console output
- if (age < 18)
- {
- // Age is under 18
- }
- else if (age >= 18 && age < 65)
- {
- // Age is an adult
- }
- else
- {
- // Age is a senior citizen
- }
- }
-}
-"""
-c_splitter = RecursiveCharacterTextSplitter.from_language(
- language=Language.CSHARP, chunk_size=128, chunk_overlap=0
-)
-c_docs = c_splitter.create_documents([C_CODE])
-c_docs
-```
-
-
-
-```output
-[Document(metadata={}, page_content='using System;'),
- Document(metadata={}, page_content='class Program\n{\n static void Main()\n {\n int age = 30; // Change the age value as needed'),
- Document(metadata={}, page_content='// Categorize the age without any console output\n if (age < 18)\n {\n // Age is under 18'),
- Document(metadata={}, page_content='}\n else if (age >= 18 && age < 65)\n {\n // Age is an adult\n }\n else\n {'),
- Document(metadata={}, page_content='// Age is a senior citizen\n }\n }\n}')]
-```
-
-
-## Haskell
-Here's an example using the Haskell text splitter:
-
-
-```python
-HASKELL_CODE = """
-main :: IO ()
-main = do
- putStrLn "Hello, World!"
--- Some sample functions
-add :: Int -> Int -> Int
-add x y = x + y
-"""
-haskell_splitter = RecursiveCharacterTextSplitter.from_language(
- language=Language.HASKELL, chunk_size=50, chunk_overlap=0
-)
-haskell_docs = haskell_splitter.create_documents([HASKELL_CODE])
-haskell_docs
-```
-
-
-
-```output
-[Document(metadata={}, page_content='main :: IO ()'),
- Document(metadata={}, page_content='main = do\n putStrLn "Hello, World!"\n-- Some'),
- Document(metadata={}, page_content='sample functions\nadd :: Int -> Int -> Int\nadd x y'),
- Document(metadata={}, page_content='= x + y')]
-```
-
-
-## PHP
-Here's an example using the PHP text splitter:
-
-
-```python
-PHP_CODE = """
-# Caching
+## Caching
Embeddings can be stored or temporarily cached to avoid needing to recompute them.
diff --git a/src/oss/python/integrations/vectorstores/index.mdx b/src/oss/python/integrations/vectorstores/index.mdx
index 775f85a6e..0e8f75cef 100644
--- a/src/oss/python/integrations/vectorstores/index.mdx
+++ b/src/oss/python/integrations/vectorstores/index.mdx
@@ -366,8 +366,8 @@ model = init_chat_model("deepseek-chat", model_provider="deepseek")
**Select vector store:**
-
-
+
+
```bash pip
@@ -383,8 +383,8 @@ from langchain_core.vectorstores import InMemoryVectorStore
vector_store = InMemoryVectorStore(embeddings)
```
-
-
+
+
```bash pip
@@ -406,8 +406,8 @@ vector_store = AstraDBVectorStore(
namespace=ASTRA_DB_NAMESPACE,
)
```
-
-
+
+
```bash pip
@@ -427,8 +427,8 @@ vector_store = Chroma(
persist_directory="./chroma_langchain_db", # Where to save data locally, remove if not necessary
)
```
-
-
+
+
```bash
pip install -qU langchain-community
```
@@ -447,8 +447,8 @@ vector_store = FAISS(
index_to_docstore_id={},
)
```
-
-
+
+
```bash pip
@@ -470,8 +470,8 @@ vector_store = Milvus(
index_params={"index_type": "FLAT", "metric_type": "L2"},
)
```
-
-
+
+
```bash
pip install -qU langchain-mongodb
```
@@ -485,8 +485,8 @@ vector_store = MongoDBAtlasVectorSearch(
relevance_score_fn="cosine",
)
```
-
-
+
+
```bash pip
@@ -506,8 +506,8 @@ vector_store = PGVector(
connection="postgresql+psycopg://..."
)
```
-
-
+
+
```bash pip
@@ -531,8 +531,8 @@ vector_store = PGVectorStore.create_sync(
embedding_service=embedding
)
```
-
-
+
+
```bash pip
@@ -552,8 +552,8 @@ index = pc.Index(index_name)
vector_store = PineconeVectorStore(embedding=embeddings, index=index)
```
-
-
+
+
```bash pip
@@ -584,8 +584,8 @@ vector_store = QdrantVectorStore(
embedding=embeddings,
)
```
-
-
+
+
| Vectorstore | Delete by ID | Filtering | Search by Vector | Search with score | Async | Passes Standard Tests | Multi Tenancy | IDs in add Documents |
|------------|-------------|-----------|-----------------|------------------|--------|---------------------|---------------|-------------------|
diff --git a/src/snippets/vectorstore-tabs-js.mdx b/src/snippets/vectorstore-tabs-js.mdx
index 5083db019..0e2c4e093 100644
--- a/src/snippets/vectorstore-tabs-js.mdx
+++ b/src/snippets/vectorstore-tabs-js.mdx
@@ -2,17 +2,17 @@
```bash npm
- npm i langchain
+ npm i @langchain/classic
```
```bash yarn
- yarn add langchain
+ yarn add @langchain/classic
```
```bash pnpm
- pnpm add langchain
+ pnpm add @langchain/classic
```
```typescript
- import { MemoryVectorStore } from "langchain/vectorstores/memory";
+ import { MemoryVectorStore } from "@langchain/classic/vectorstores/memory";
const vectorStore = new MemoryVectorStore(embeddings);
```