# Why Metadata Matters:
- It provides context about our documents (author, date, source, topic)
- It enables filtering and sorting (find all documents from a specific date range)
- It improves search relevance (identifying which documents are most important)
- It helps with organization (grouping related documents together)

In [1]:
from llama_index.core import Document
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode
import os
from datetime import datetime

# A small markdown report used as the running example; its title, section
# headings and body text all contain details worth capturing as metadata.
sample_doc = (
    "\n"
    "# Annual Report 2023\n"
    "## Financial Performance\n"
    "Our company achieved record profits in 2023, with revenue increasing 15% compared to 2022.\n"
    "\n"
    "## Product Launches\n"
    "The new X1000 product line was launched in March 2023 and has exceeded sales expectations.\n"
    "\n"
    "## Future Outlook\n"
    "We expect continued growth in 2024, driven by expansion into European markets.\n"
)

# Pretend the report lives on disk so that file-level metadata can be derived
filename = "annual_report_2023.md"
file_path = "/documents/" + filename

# Extract basic metadata from filename and content
def extract_basic_metadata(content, filename, file_path):
    """Extract basic metadata from document content and file information.

    Args:
        content: Full text of the document.
        filename: File name including its extension, e.g. "report_2023.md".
        file_path: Location of the file. Stored as given; never opened.

    Returns:
        A dict that always contains:
            "file_name": the given filename.
            "file_path": the given file path.
            "file_type": the extension without the dot ("" if there is none).
            "file_size": number of characters in ``content``.
            "extracted_date": today's local date formatted as YYYY-MM-DD.
        and, when they can be detected:
            "year": first standalone four-digit year (1900-2099) found in
                the content, falling back to the filename; stored as a string.
            "document_type": "annual_report" when the content or the filename
                mentions "annual report".
    """
    # Local import keeps this function runnable on its own in a notebook cell.
    import re

    # Get file information
    file_metadata = {
        "file_name": filename,
        "file_path": file_path,
        # Extension without the dot
        "file_type": os.path.splitext(filename)[1][1:],
        "file_size": len(content),  # Simple size in characters
        "extracted_date": datetime.now().strftime("%Y-%m-%d"),
    }

    # Try to extract a year from the content first, then from the filename.
    # Digit lookarounds are used instead of \b because "_" counts as a word
    # character, so \b would miss the year in names like "report_2023.md";
    # they also stop longer digit runs (e.g. "120235") from matching.
    year_pattern = r"(?<!\d)(?:19|20)\d{2}(?!\d)"
    year_match = re.search(year_pattern, content) or re.search(
        year_pattern, filename
    )
    if year_match:
        file_metadata["year"] = year_match.group()

    # Try to extract document type. Underscores and hyphens in the filename
    # are treated as spaces so "annual_report_2023.md" is recognised as well.
    normalized_name = re.sub(r"[_\-]+", " ", filename.lower())
    if "annual report" in content.lower() or "annual report" in normalized_name:
        file_metadata["document_type"] = "annual_report"

    return file_metadata


# Derive the metadata dictionary for the sample report
basic_metadata = extract_basic_metadata(
    content=sample_doc,
    filename=filename,
    file_path=file_path,
)

# Bundle the text and its metadata into a single Document
doc = Document(text=sample_doc, metadata=basic_metadata)

# Preview the start of the text, then list every metadata field
print("Document Text (first 50 chars):", doc.text[:50])
print("\nDocument Metadata:")
for key, value in doc.metadata.items():
    print(f"  {key}: {value}")

# Split the document into nodes using the parser's default settings
parser = SimpleNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents([doc])

# Inspect the first node to confirm it carries the document's metadata
print("\nNodes created:", len(nodes))
print("First node metadata:")
first_node = nodes[0]
for key, value in first_node.metadata.items():
    print(f"  {key}: {value}")

Document Text (first 50 chars): 
# Annual Report 2023
## Financial Performance
Our

Document Metadata:
  file_name: annual_report_2023.md
  file_path: /documents/annual_report_2023.md
  file_type: md
  file_size: 348
  extracted_date: 2025-03-29
  year: 2023
  document_type: annual_report

Nodes created: 1
First node metadata:
  file_name: annual_report_2023.md
  file_path: /documents/annual_report_2023.md
  file_type: md
  file_size: 348
  extracted_date: 2025-03-29
  year: 2023
  document_type: annual_report
