## All about text extraction!

In [11]:
%pip install llama-index-readers-file llama-index-readers-web llama-index-core llama-index-readers-json llama-index-readers-database docx2txt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
from llama_index.readers.file import PDFReader, DocxReader
from llama_index.readers.web import SimpleWebPageReader
import pathlib

# --- PDF: local sample report ---
pdf_path = pathlib.Path("../samples/pdf-report.pdf")
pdf_reader = PDFReader()
pdf_docs = pdf_reader.load_data(file=pdf_path)

# --- DOCX: the same report in Word format ---
docx_path = pathlib.Path("../samples/docx-report.docx")
docx_reader = DocxReader()
docx_docs = docx_reader.load_data(file=docx_path)

# --- Web: a public page fetched by URL ---
page_urls = ["https://example.com"]
web_reader = SimpleWebPageReader()
web_docs = web_reader.load_data(urls=page_urls)

# Preview the first 200 characters of the first document from each source
print(f"PDF extract: {pdf_docs[0].text[:200]}...")
print(f"DOCX extract: {docx_docs[0].text[:200]}...")
print(f"Web extract: {web_docs[0].text[:200]}...")

Ignoring wrong pointing object 6 0 (offset 0)


PDF extract: Renewable Energy Market Trends: A 2025 Overview Executive Summary This report examines the current state of renewable energy markets globally, highlighting key trends, challenges, and opportunities. I...
DOCX extract: Renewable Energy Market Trends: A 2025 Overview

Executive Summary

This report examines the current state of renewable energy markets globally, highlighting key trends, challenges, and opportunities....
Web extract: <!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" conten...


In [2]:
from llama_index.readers.file import CSVReader, MarkdownReader
from llama_index.readers.json import JSONReader
from llama_index.readers.database import DatabaseReader
import pathlib

# --- CSV (path handed over as a pathlib.Path) ---
csv_path = pathlib.Path("../samples/csv-data.csv")
csv_reader = CSVReader()
csv_docs = csv_reader.load_data(file=csv_path)

# --- JSON (path handed over as a plain string via input_file) ---
json_path = "../samples/json-data.json"
json_reader = JSONReader()
json_docs = json_reader.load_data(input_file=json_path)

# --- Markdown (path handed over as a plain string) ---
md_path = "../samples/README.md"
md_reader = MarkdownReader()
md_docs = md_reader.load_data(file=md_path)

# --- Database (SQLite file addressed by URI, rows selected with SQL) ---
db_uri = "sqlite:///../samples/database.db"
orders_query = "SELECT * FROM orders"
db_reader = DatabaseReader(uri=db_uri)
db_docs = db_reader.load_data(query=orders_query)

# Preview the first 200 characters of the first document from each source
print(f"CSV extract: {csv_docs[0].text[:200]}...")
print(f"JSON extract: {json_docs[0].text[:200]}...")
print(f"Markdown extract: {md_docs[0].text[:200]}...")
print(f"DB extract: {db_docs[0].text[:200]}...")

CSV extract: OrderID, OrderDate, Customer, Product, Quantity, OrderStatus, Misc
4321, 1/30/2010, BX30550, ABQ008, 163, Complete, 54
4352, 1/15/2010, DY55760, ABQ008, 107, Complete, 36
4353, 1/29/2010, BC13961, ABQ...
JSON extract: "OrderID": 4321,
"OrderDate": "1/30/2010",
"Customer": "BX30550",
"Product": "ABQ008",
"Quantity": 163,
"OrderStatus": "Complete",
"Misc": 54
"OrderID": 4352,
"OrderDate": "1/15/2010",
"Customer": "DY...
Markdown extract: 

Sample Data Files Files Overview
This directory contains sample data files in various formats for testing and demonstration purposes.
- **csv-data.csv**: Tabular data in CSV format
- **json-data.jso...
DB extract: OrderID: 4321, OrderDate: 1/30/2010, Customer: BX30550, Product: ABQ008, Quantity: 163, OrderStatus: Complete, Misc: 54...


In [3]:
import re
from llama_index.core.schema import Document

# Get our raw text from a document: the text of the first Document produced
# by the PDF reader is the input for the cleaning steps that follow.
# NOTE(review): only pdf_docs[0] is read here — if the reader returns one
# Document per page, later pages are not part of raw_text; confirm for
# multi-page PDFs.
raw_text = pdf_docs[0].text

def clean_text(text):
    """Normalize extracted text for downstream processing.

    Steps, in order:
      1. Repair known OCR misreads.
      2. Collapse every run of whitespace (newlines and tabs included)
         into a single space.
      3. Drop every character that is not a word character, whitespace,
         or one of the punctuation marks listed in the pattern below.

    Args:
        text: Raw text extracted from a document.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Fix common OCR errors (example). This must run BEFORE special
    # characters are stripped: the misread contains '<', which step 3
    # removes, so applying the fix afterwards could never match. The whole
    # misread token is matched so "l<eyword" becomes "keyword" (not "keywordd").
    text = text.replace('l<eyword', 'keyword')

    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)

    # Remove special characters but keep word characters, whitespace and
    # basic punctuation
    text = re.sub(r'[^\w\s\.\,\;\:\-\(\)\[\]\{\}\"\'\n\t]', '', text)

    return text.strip()


# Let's clean our text
cleaned_text = clean_text(raw_text)

# Show a 100-character preview and the total length of the text before and
# after cleaning, so the effect of clean_text is visible at a glance.
print(f"Original first 100 chars: {raw_text[:100]}")
print(f"Cleaned first 100 chars: {cleaned_text[:100]}")
print(f"Original length: {len(raw_text)} characters")
print(f"Cleaned length: {len(cleaned_text)} characters")

Original first 100 chars: Renewable Energy Market Trends: A 2025 Overview Executive Summary This report examines the current s
Cleaned first 100 chars: Renewable Energy Market Trends: A 2025 Overview Executive Summary This report examines the current s
Original length: 2180 characters
Cleaned length: 2173 characters


In [4]:
# Function to extract basic metadata
def extract_metadata(text, filename):
    """Build a small metadata dict from a document's text and file name.

    Args:
        text: Document text to scan for a title and a date.
        filename: Path of the source file (str or path-like).

    Returns:
        A dict with:
          - "source": the file name as a string;
          - "file_type": the extension without its leading dot
            ("" when the file name has no extension);
          - "title": the first non-blank line, present only when that line
            is shorter than 100 characters;
          - "date": the first substring shaped like a numeric date
            (e.g. 1/30/2010), present only when one is found.
    """
    source = str(filename)
    metadata = {
        "source": source,
        # Path.suffix inspects only the final path component, so a dot in a
        # directory part ("../samples/README") is not mistaken for an
        # extension the way a plain split('.') would.
        "file_type": pathlib.Path(source).suffix.lstrip('.'),
    }

    # Extract title (assume the first non-blank line might be the title)
    first_line = next(
        (line.strip() for line in text.split('\n') if line.strip()),
        "",
    )
    if first_line and len(first_line) < 100:  # Simple heuristic for title
        metadata["title"] = first_line

    # Try to extract date with regex (simple example)
    date_match = re.search(r'\d{1,2}[\/\-\.]\d{1,2}[\/\-\.]\d{2,4}', text)
    if date_match:
        metadata["date"] = date_match.group(0)

    return metadata


# Extract metadata from our document. The raw (uncleaned) text is passed in,
# together with the path the PDF was loaded from.
metadata = extract_metadata(raw_text, "../samples/pdf-report.pdf")

# Create a new document with cleaned text and metadata
processed_doc = Document(
    text=cleaned_text,
    metadata=metadata
)

# Show which metadata keys were actually populated for this file
print(f"Extracted metadata: {metadata}")

Extracted metadata: {'source': '../samples/pdf-report.pdf', 'file_type': 'pdf'}


In [5]:
def process_document(file_path):
    """Process a document with appropriate reader and cleaning.

    The reader is picked from the file extension (case-insensitive). PDF,
    DOCX/DOC and local HTML files are loaded through a LlamaIndex reader,
    the text of all returned documents is joined, cleaned with
    ``clean_text`` and annotated with ``extract_metadata``. Any other
    extension is read as UTF-8 plain text and returned unmodified with only
    a ``source`` metadata entry.

    Args:
        file_path: Path to a local file (str or path-like).

    Returns:
        A ``Document``, or ``None`` when the reader produced no documents.
    """
    path = pathlib.Path(file_path)
    source = str(file_path)

    # Determine file type from the final path component only, so dots in
    # directory names are never mistaken for an extension.
    file_type = path.suffix.lstrip('.').lower()

    # Select appropriate reader
    if file_type == 'pdf':
        reader = PDFReader()
    elif file_type in ['docx', 'doc']:
        # NOTE(review): legacy binary .doc files are routed to the DOCX
        # reader as well — confirm that reader can actually parse them.
        reader = DocxReader()
    elif file_type in ['html', 'htm']:
        # Local HTML file: SimpleWebPageReader fetches URLs (load_data takes
        # `urls`, not `file`), so a file-based HTML reader is needed here.
        # Imported locally so the rest of the function does not depend on it.
        from llama_index.readers.file import HTMLTagReader
        reader = HTMLTagReader(tag="body")
    else:
        # Default to simple text reading; the encoding is explicit so the
        # result does not depend on the platform's locale.
        with open(path, 'r', encoding='utf-8') as f:
            return Document(text=f.read(), metadata={"source": source})

    # Load and extract text
    docs = reader.load_data(file=path)

    if not docs:
        return None

    # A reader may return several documents for one file (e.g. one per
    # page); keep all of them instead of only the first.
    raw_text = "\n".join(doc.text for doc in docs)

    # Metadata is extracted from the raw text, since cleaning collapses the
    # line breaks the title heuristic relies on.
    return Document(
        text=clean_text(raw_text),
        metadata=extract_metadata(raw_text, source),
    )

# Example usage
# NOTE: process_document returns None when the reader yields no documents;
# the prints below assume a Document came back. This also rebinds the
# processed_doc name that an earlier cell assigned.
processed_doc = process_document("../samples/pdf-report.pdf")
print(f"Processed document: {len(processed_doc.text)} characters")
print(f"Metadata: {processed_doc.metadata}")

Ignoring wrong pointing object 6 0 (offset 0)


Processed document: 2173 characters
Metadata: {'source': '../samples/pdf-report.pdf', 'file_type': 'pdf'}
