In [9]:
import sys
import os 
from extracthero import FilterHero, WhatToRetain

# Simple Filtering

In [17]:
# Sample input for the simple examples: a short storefront text in which the
# product details (name, price, features) sit between unrelated lines
# (a greeting, a contact address and a terms notice).
content = """
    Welcome to our store!
    
    Product: Professional Laptop
    Price: $1,299
    Features:
    - Intel Core i7 processor
    - 16GB RAM
    - 512GB SSD
    
    Contact us at support@example.com
    Terms and conditions apply.
    """


# Build the filter and a one-item spec describing what should be kept.
filter_hero = FilterHero()
specs = [
    WhatToRetain(
        name="product",
        desc="product information including name, price, and features",
    )
]

# No filter_mode is passed here, so the run uses the library's default mode.
result = filter_hero.run(text=content, extraction_spec=specs)

In [18]:
# Display the filtered text returned by the default-mode run.
result.content



'- Product: Professional Laptop  \n  Price: $1,299  \n  Features:  \n  - Intel Core i7 processor  \n  - 16GB RAM  \n  - 512GB SSD  \n\n---\n\n- Contact us at support@example.com  \n- Terms and conditions apply.'

In [19]:
# Display the usage record (token counts and cost figures) attached to the run's generation result.
result.generation_result.usage

{'input_tokens': 198,
 'output_tokens': 55,
 'total_tokens': 253,
 'input_cost': 2.97e-05,
 'output_cost': 3.2999999999999996e-05,
 'total_cost': 6.269999999999999e-05}

# Subtractive Filtering

In [None]:

# Same input text and spec as the simple example, but the run is asked to use
# the "subtractive" filter mode.
filter_hero = FilterHero()
specs = [
    WhatToRetain(
        name="product",
        desc="product information including name, price, and features",
    )
]
result = filter_hero.run(
    text=content,
    extraction_spec=specs,
    filter_mode="subtractive",
)




In [13]:
# Display the text returned by the subtractive run.
result.content

'\n\n    Product: Professional Laptop\n    Price: $1,299\n    Features:\n    - Intel Core i7 processor\n    - 16GB RAM\n    - 512GB SSD\n\n    '

In [14]:
# Display the deletions recorded for the subtractive run: each entry carries a
# start line, an end line and a reason.
result.deletions_applied

[{'start_line': 2, 'end_line': 2, 'reason': 'greeting'},
 {'start_line': 11,
  'end_line': 12,
  'reason': 'contact information and legal terms'}]

In [15]:
# Display the usage record (token counts and cost figures) for the subtractive run.
result.generation_result.usage

{'input_tokens': 243,
 'output_tokens': 58,
 'total_tokens': 301,
 'input_cost': 3.645e-05,
 'output_cost': 3.48e-05,
 'total_cost': 7.125e-05}

# Real Example

In [20]:
# Markdown sample used as a realistic input (per its filename, a capture of the
# developer.adobe.com Firefly API guides page).
sample_md_path = "samples/page_001_developer.adobe.com_firefly-services_docs_firefly-api_guides_.md"

# Load the whole document into memory as UTF-8 text.
with open(sample_md_path, mode='r', encoding='utf-8') as file:
    sample_md = file.read()

# Report the document size, then preview its first 500 characters between two rulers.
print(
    f"✅ Loaded {len(sample_md)} characters from {sample_md_path}",
    f"✅ Number of lines: {len(sample_md.splitlines())}",
    "\n📄 First 500 characters:",
    "-" * 50,
    sample_md[:500],
    "-" * 50,
    sep="\n",
)

✅ Loaded 26444 characters from samples/page_001_developer.adobe.com_firefly-services_docs_firefly-api_guides_.md
✅ Number of lines: 980

📄 First 500 characters:
--------------------------------------------------
# Source: https://developer.adobe.com/firefly-services/docs/firefly-api/guides/

[**Adobe Developer**](/)

[Products](/apis/)

[Products](/apis/)

[ All Firefly Services ](/firefly-services/docs/guides/)[ Firefly API ](/firefly-services/docs/firefly-api/)[ Photoshop API ](https://developer.adobe.com/firefly-services/docs/photoshop/?aio%5Finternal)[ Lightroom API ](/firefly-services/docs/lightroom/)[ Audio/Video API ](https://developer.adobe.com/audio-video-firefly-services/?aio%5Finternal)[ InDe
--------------------------------------------------


In [21]:

# Fresh filter instance for the real-document example.
filter_hero = FilterHero()

# Spec for the content to retain: `desc` describes what API documentation
# looks like, and `text_rules` lists additional instructions (ignore deprecated
# methods, keep every code sample in every language).
spec = WhatToRetain(
    name="api_documentation",
    desc="""Structured reference material and usage guide that explains how to interact with API, usually includes:
                    - Overview: Purpose of the API, supported protocols
                    - Endpoints – List of available routes or methods,
                    - Request Details – Required parameters, query strings, headers, body formats, and example requests.
                    - Response Details – Example outputs, including success and error responses with status codes (e.g., 200 OK, 404 Not Found, 500 Internal Server Error).
                    - Data Models – Definitions of objects and fields (schemas), often provided in JSON or XML.
                    - Authentication & Authorization – How to obtain and use tokens, API keys, or OAuth credentials.
                    - Error Handling – Common error messages, reasons, and suggested remedies.
                    -  Code Samples – Example snippets in different programming languages for practical use.
            """,
    text_rules=[
        "we dont care about clearly deprecated methods",
        "for Code Samples or similar keywords: Extract ALL code examples in ALL programming languages present",
        " When multiple code examples exist (e.g., Python, JavaScript, Java), include them ALL under appropriate subheadings",
    ],
)

In [None]:
# Run the filter on the full sample document with the API-documentation spec.
# No filter_mode is passed, so the library's default mode applies.
filter_op_basic = filter_hero.run(text=sample_md, extraction_spec=[spec])

In [24]:
# Display the full result object returned by the basic run.
filter_op_basic

FilterOp(success=True, content='# API Overview and Usage Guide for Adobe Firefly Services\n\n---\n\n## Overview\n\nThe Adobe Firefly Services API allows developers to generate images through programmatic requests. This API supports multiple programming languages and integrates securely with Adobe\'s authentication mechanisms.\n\n---\n\n## Endpoints\n\n### 1. **Generate Image API**\n- **Endpoint:** `https://firefly-api.adobe.io/v3/images/generate`\n  \n### 2. **Retrieve Access Token**\n- **Token URL:** `https://ims-na1.adobelogin.com/ims/token/v3`\n\n---\n\n## Request Details\n\n### Generating an Access Token\n\n#### Required Parameters\n- `grant_type`: Set to `client_credentials`\n- `client_id`: Your Firefly API Client ID\n- `client_secret`: Your Firefly API Client Secret\n- `scope`: A list of required scopes, e.g., `openid,AdobeID,session,additional_info,read_organizations,firefly_api,ff_apis`\n\n#### Example Request (cURL)\n```bash\ncurl --location \'https://ims-na1.adobelogin.com/im

In [23]:
# Run the filter on the same document and spec, this time in "subtractive" mode.
filter_op_subtractive = filter_hero.run(
    text=sample_md,
    extraction_spec=[spec],
    filter_mode="subtractive",
)

In [25]:
# Compare Basic vs Subtractive Filtering Results
#
# Prints size, token usage, cost and timing for each run, followed by a
# side-by-side summary when both runs succeeded.


def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0 when ``whole`` is not positive."""
    return (part / whole * 100) if whole > 0 else 0


def _fmt_count(value):
    """Format a number with thousands separators; render anything else with str().

    The ',' format option is only valid for numeric types, so applying it to a
    missing-key fallback such as 'N/A' would raise ValueError.
    """
    if isinstance(value, (int, float)):
        return f"{value:,}"
    return str(value)


def _print_usage(usage):
    """Print token counts and total cost from a usage mapping; do nothing if it is empty or None."""
    if not usage:
        return
    print(f"   • Input Tokens: {_fmt_count(usage.get('input_tokens', 'N/A'))}")
    print(f"   • Output Tokens: {_fmt_count(usage.get('output_tokens', 'N/A'))}")
    print(f"   • Total Cost: ${usage.get('total_cost', 0):.6f}")


print("=" * 80)
print("COMPARISON: Basic (Extractive) vs Subtractive Filtering")
print("=" * 80)

# Original content info
original_chars = len(sample_md)
original_lines = len(sample_md.splitlines())
print("\n📄 ORIGINAL CONTENT:")
print(f"   • Characters: {original_chars:,}")
print(f"   • Lines: {original_lines:,}")

# Basic/Extractive mode results
print("\n🔵 BASIC/EXTRACTIVE MODE:")
if filter_op_basic.success:
    basic_chars = len(filter_op_basic.content) if filter_op_basic.content else 0
    basic_lines = len(filter_op_basic.content.splitlines()) if filter_op_basic.content else 0
    basic_retention = _pct(basic_chars, original_chars)

    print(f"   • Output Characters: {basic_chars:,} ({basic_retention:.1f}% retained)")
    print(f"   • Output Lines: {basic_lines:,}")
    print(f"   • Elapsed Time: {filter_op_basic.elapsed_time:.2f} seconds")

    _print_usage(filter_op_basic.usage)
else:
    print(f"   ❌ Failed: {filter_op_basic.error}")

# Subtractive mode results
print("\n🔴 SUBTRACTIVE MODE:")
if filter_op_subtractive.success:
    sub_chars = len(filter_op_subtractive.content) if filter_op_subtractive.content else 0
    sub_lines = len(filter_op_subtractive.content.splitlines()) if filter_op_subtractive.content else 0
    sub_retention = _pct(sub_chars, original_chars)

    print(f"   • Output Characters: {sub_chars:,} ({sub_retention:.1f}% retained)")
    print(f"   • Output Lines: {sub_lines:,}")
    print(f"   • Lines Removed: {filter_op_subtractive.lines_removed:,}")
    print(f"   • Elapsed Time: {filter_op_subtractive.elapsed_time:.2f} seconds")

    _print_usage(filter_op_subtractive.usage)

    # Show deletion details if available (first 5 ranges only, to keep the output short)
    if filter_op_subtractive.deletions_applied:
        print(f"\n   📋 Deletion Ranges ({len(filter_op_subtractive.deletions_applied)} total):")
        for i, deletion in enumerate(filter_op_subtractive.deletions_applied[:5], 1):
            print(f"      {i}. Lines {deletion['start_line']}-{deletion['end_line']}: {deletion.get('reason', 'N/A')}")
        if len(filter_op_subtractive.deletions_applied) > 5:
            print(f"      ... and {len(filter_op_subtractive.deletions_applied) - 5} more")
else:
    print(f"   ❌ Failed: {filter_op_subtractive.error}")

# Comparison Summary
print("\n" + "=" * 80)
print("SUMMARY COMPARISON:")
print("=" * 80)

# The per-mode numbers (basic_chars, sub_chars, ...) only exist when the
# corresponding run succeeded, so the summary needs both.
if filter_op_basic.success and filter_op_subtractive.success:
    # Character comparison: "% more" is measured against the smaller output,
    # whichever mode produced it (reported as 0 when the smaller output is empty).
    char_diff = basic_chars - sub_chars
    char_diff_pct = _pct(abs(char_diff), min(basic_chars, sub_chars))

    # Cost comparison: percentages are relative to the basic run's cost.
    basic_cost = filter_op_basic.usage.get('total_cost', 0) if filter_op_basic.usage else 0
    sub_cost = filter_op_subtractive.usage.get('total_cost', 0) if filter_op_subtractive.usage else 0
    cost_diff = basic_cost - sub_cost
    cost_savings = _pct(cost_diff, basic_cost)

    # Time comparison: the improvement is measured against the slower run.
    time_diff = filter_op_basic.elapsed_time - filter_op_subtractive.elapsed_time
    time_savings = _pct(
        abs(time_diff),
        max(filter_op_basic.elapsed_time, filter_op_subtractive.elapsed_time),
    )

    print("📊 Output Size:")
    print(f"   • Basic/Extractive: {basic_chars:,} chars")
    print(f"   • Subtractive: {sub_chars:,} chars")
    if char_diff > 0:
        print(f"   • Difference: Basic has {char_diff:,} more chars ({char_diff_pct:.1f}% more)")
    else:
        print(f"   • Difference: Subtractive has {-char_diff:,} more chars ({char_diff_pct:.1f}% more)")

    print("\n💰 Cost Comparison:")
    print(f"   • Basic/Extractive: ${basic_cost:.6f}")
    print(f"   • Subtractive: ${sub_cost:.6f}")
    if cost_diff > 0:
        print(f"   • Savings: ${cost_diff:.6f} ({cost_savings:.1f}% cheaper with subtractive)")
    else:
        # abs() avoids printing "-0.0" when the two costs are equal.
        print(f"   • Subtractive costs ${abs(cost_diff):.6f} more ({abs(cost_savings):.1f}% more expensive)")

    print("\n⏱️ Performance:")
    print(f"   • Basic/Extractive: {filter_op_basic.elapsed_time:.2f}s")
    print(f"   • Subtractive: {filter_op_subtractive.elapsed_time:.2f}s")
    if time_diff > 0:
        print(f"   • Subtractive is {time_diff:.2f}s faster ({time_savings:.1f}% improvement)")
    else:
        print(f"   • Basic is {abs(time_diff):.2f}s faster ({time_savings:.1f}% improvement)")

    print("\n🎯 Efficiency Metrics:")
    # _pct guards against an empty original document (zero lines).
    lines_removed_pct = _pct(filter_op_subtractive.lines_removed, original_lines)
    print(f"   • Subtractive removed {filter_op_subtractive.lines_removed:,} lines ({lines_removed_pct:.1f}% of original)")
    print(f"   • Basic retained {basic_retention:.1f}% of original content")
    print(f"   • Subtractive retained {sub_retention:.1f}% of original content")

COMPARISON: Basic (Extractive) vs Subtractive Filtering

📄 ORIGINAL CONTENT:
   • Characters: 26,444
   • Lines: 980

🔵 BASIC/EXTRACTIVE MODE:
   • Output Characters: 5,414 (20.5% retained)
   • Output Lines: 190
   • Elapsed Time: 19.18 seconds
   • Input Tokens: 6,904
   • Output Tokens: 1,300
   • Total Cost: $0.001816

🔴 SUBTRACTIVE MODE:
   • Output Characters: 23,838 (90.1% retained)
   • Output Lines: 919
   • Lines Removed: 61
   • Elapsed Time: 3.35 seconds
   • Input Tokens: 9,971
   • Output Tokens: 165
   • Total Cost: $0.001595

   📋 Deletion Ranges (7 total):
      1. Lines 3-17: navigation_menu
      2. Lines 67-67: navigation_menu
      3. Lines 904-908: advertisement
      4. Lines 910-911: navigation_menu
      5. Lines 933-942: navigation_menu
      ... and 2 more

SUMMARY COMPARISON:
📊 Output Size:
   • Basic/Extractive: 5,414 chars
   • Subtractive: 23,838 chars
   • Difference: Subtractive has 18,424 more chars (340.3% more)

💰 Cost Comparison:
   • Basic/Extracti