# Integrating document loaders

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import HNLoader

from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredHTMLLoader

## PDF document loaders

In [None]:
# Point a PyPDFLoader at documents/attention-is-all-you-need.pdf
loader = PyPDFLoader(file_path="documents/attention-is-all-you-need.pdf")

# Run the loader, then show the first entry of what it returned
data = loader.load()
first_document = data[0]
print(first_document)

## CSV document loaders

In [None]:
# Point a CSVLoader at documents/fifa_countries_audience.csv
loader = CSVLoader("documents/fifa_countries_audience.csv")

# Run the loader, then show the first entry of what it returned
data = loader.load()
first_document = data[0]
print(first_document)

## Third-party document loaders

In [None]:
# Build an HNLoader aimed at the Hacker News front page
hacker_news_url = "https://news.ycombinator.com"
loader = HNLoader(hacker_news_url)

# Run the loader
data = loader.load()

# Show the first loaded document, followed by its metadata
first_document = data[0]
print(first_document)
print(first_document.metadata)

# Splitting external data for retrieval

## Splitting by character

In [None]:
quote = 'One machine can do the work of fifty ordinary humans. No machine can do the work of one extraordinary human.'
chunk_size = 24
chunk_overlap = 3

# Create an instance of the splitter class.
# CharacterTextSplitter only cuts the text at its separator. The default
# separator is "\n\n", which never occurs in this single-line quote, so without
# an explicit separator the whole quote comes back as one unsplit chunk and
# chunk_size / chunk_overlap have no effect. Split on '.' so each sentence
# becomes its own piece.
# NOTE: both sentences are still longer than chunk_size; this splitter does not
# cut inside a piece, so expect over-size chunks here. Use
# RecursiveCharacterTextSplitter when chunks must respect chunk_size.
splitter = CharacterTextSplitter(
    separator='.',
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap)

# Split the document and print the chunks
docs = splitter.split_text(quote)
print(docs)

## Recursively splitting by character

In [None]:
# Three-line lyric; the lines are joined by newline characters
quote = (
    'Words are flowing out like endless rain into a paper cup,\n'
    'they slither while they pass,\n'
    'they slip away across the universe.'
)
chunk_size = 24
chunk_overlap = 10

# Configure the recursive splitter with the size and overlap defined above
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Break the quote into chunks and display them
docs = splitter.split_text(quote)
print(docs)

## Splitting HTML

In [None]:
# Read the HTML file through UnstructuredHTMLLoader
loader = UnstructuredHTMLLoader(
    "documents/white_house_executive_order_nov_2023.html"
)
data = loader.load()

# Chunking parameters
chunk_size = 300
chunk_overlap = 100

# Configure the recursive splitter, with '.' as its only separator
splitter_options = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "separators": ["."],
}
splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents and display the resulting chunks
docs = splitter.split_documents(data)
print(docs)