In [32]:
from langchain.document_loaders import PyPDFLoader, WebBaseLoader, TextLoader, JSONLoader, UnstructuredXMLLoader, ArxivLoader, WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, RecursiveJsonSplitter
from langchain_core.documents import Document
import bs4
import requests

In [6]:
# Text Loader and Splitter
text_loader = TextLoader("data/speech.txt")
text_docs = text_loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
text_split_docs = text_splitter.split_documents(text_docs)
text_split_docs

[Document(metadata={'source': 'data/speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.'),
 Document(metadata={'source': 'data/speech.txt'}, page_content='Just because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…'),
 Document(metadata={'source': 'data/speech

In [8]:
# PDF Loader and Splitter
pdf_loader = PyPDFLoader("data/syllabus.pdf")
pdf_docs = pdf_loader.load()
pdf_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
pdf_split_docs = pdf_splitter.split_documents(pdf_docs)
pdf_split_docs

[Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-01-30T20:27:03+00:00', 'title': 'Ultimate Data Science & GenAI Bootcamp', 'moddate': '2025-01-30T20:26:59+00:00', 'keywords': 'DAGdmhcqnYw,BAEmsmap8Lg,0', 'author': 'monal singh', 'containsaigeneratedcontent': 'Yes', 'source': 'data/syllabus.pdf', 'total_pages': 34, 'page': 0, 'page_label': '1'}, page_content='MACHINE\nLEARNING\nDEEP\nLEARNING\nPYTHON +\nSTATS\nCOMPUTER VISIONNATURAL LANGUAGE PROCESSING\nGENERATIVE AI\nRETRIEVAL AUGUMENT GENERATION\nVECTOR DB'),
 Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-01-30T20:27:03+00:00', 'title': 'Ultimate Data Science & GenAI Bootcamp', 'moddate': '2025-01-30T20:26:59+00:00', 'keywords': 'DAGdmhcqnYw,BAEmsmap8Lg,0', 'author': 'monal singh', 'containsaigeneratedcontent': 'Yes', 'source': 'data/syllabus.pdf', 'total_pages': 34, 'page': 1, 'page_label': '2'}, page_content='This course is designed for aspiring data scientists, m

In [17]:
# XML Loader and Splitter
xml_loader = UnstructuredXMLLoader("data/records.xml")
xml_docs = xml_loader.load()
xml_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
xml_split_docs = xml_splitter.split_documents(xml_docs)
xml_split_docs

[Document(metadata={'source': 'data/records.xml'}, page_content='1\n\nName_1\n\nValue_1\n\n2\n\nName_2\n\nValue_2\n\n3\n\nName_3\n\nValue_3\n\n4\n\nName_4\n\nValue_4\n\n5\n\nName_5\n\nValue_5\n\n6\n\nName_6\n\nValue_6\n\n7\n\nName_7\n\nValue_7\n\n8\n\nName_8\n\nValue_8\n\n9\n\nName_9\n\nValue_9\n\n10\n\nName_10\n\nValue_10')]

In [23]:
# ArXiv Loader and Splitter
arxiv_loader = ArxivLoader("1706.03762", load_max_docs=5)
arxiv_docs = arxiv_loader.load()
arxiv_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
arxiv_split_docs = arxiv_splitter.split_documents(arxiv_docs)
arxiv_split_docs

[Document(metadata={'Published': '2023-08-02', 'Title': 'Attention Is All You Need', 'Authors': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin', 'Summary': 'The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-French\ntr

In [24]:
# Wikipedia Loader and Splitter
wiki_loader = WikipediaLoader("https://python.langchain.com/docs/integrations/tools/wikipedia/", load_max_docs=4)
wiki_docs = wiki_loader.load()
wiki_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
wiki_split_docs = wiki_splitter.split_documents(wiki_docs)
wiki_split_docs

[Document(metadata={'title': 'LangChain', 'summary': "LangChain is a software framework that helps facilitate the integration of large language models (LLMs) into applications. As a language model integration framework, LangChain's use-cases largely overlap with those of language models in general, including document analysis and summarization, chatbots, and code analysis.", 'source': 'https://en.wikipedia.org/wiki/LangChain'}, page_content="LangChain is a software framework that helps facilitate the integration of large language models (LLMs) into applications. As a language model integration framework, LangChain's use-cases largely overlap with those of language models in general, including document analysis and summarization, chatbots, and code analysis."),
 Document(metadata={'title': 'LangChain', 'summary': "LangChain is a software framework that helps facilitate the integration of large language models (LLMs) into applications. As a language model integration framework, LangChain

In [26]:
# Web Loader and Splitter
loader=WebBaseLoader(web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
                     bs_kwargs=dict(parse_only=bs4.SoupStrainer(
                         class_=("post-title","post-content","post-header")
                     ))
                     )

doc=loader.load()
doc

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/'}, page_content='\n\n      LLM Powered Autonomous Agents\n    \nDate: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng\n\n\nBuilding agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview#\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistake

## Character Text Splitter

In [27]:
text_character_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100, separator="\n\n")
text_split_docs_character = text_character_splitter.split_documents(text_docs)
text_split_docs_character

Created a chunk of size 668, which is longer than the specified 500
Created a chunk of size 982, which is longer than the specified 500
Created a chunk of size 789, which is longer than the specified 500


[Document(metadata={'source': 'data/speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.'),
 Document(metadata={'source': 'data/speech.txt'}, page_content='Just because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…'),
 Document(metadata={'source': 'data/speech

## JSON Splitter

In [37]:
json_data=requests.get("https://api.smith.langchain.com/openapi.json").json()

json_splitter = RecursiveJsonSplitter(max_chunk_size=500)
json_split_docs = json_splitter.split_json(json_data)
json_split_docs


[{'openapi': '3.1.0',
  'info': {'title': 'LangSmith', 'version': '0.1.0'},
  'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'tags': ['tracer-sessions'],
     'summary': 'Get Tracing Project Prebuilt Dashboard',
     'description': 'Get a prebuilt dashboard for a tracing project.',
     'operationId': 'get_tracing_project_prebuilt_dashboard_api_v1_sessions__session_id__dashboard_post',
     'security': [{'API Key': []}, {'Tenant ID': []}, {'Bearer Auth': []}]}}}},
 {'paths': {'/api/v1/sessions/{session_id}/dashboard': {'post': {'parameters': [{'name': 'session_id',
       'in': 'path',
       'required': True,
       'schema': {'type': 'string', 'format': 'uuid', 'title': 'Session Id'}},
      {'name': 'accept',
       'in': 'header',
       'required': False,
       'schema': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
        'title': 'Accept'}}],
     'requestBody': {'required': True,
      'content': {'application/json': {'schema': {'$ref': '#/components/schem