In [10]:
from extracthero import FilterHero, WhatToRetain
from extracthero.utils import read_md

# One FilterHero instance, reused by both filter_hero.run(...) calls further down.
filter_hero = FilterHero()

In [11]:
# Read the sample markdown page that both filtering runs below take as input,
# then report its size as a quick sanity check.
sample_md_path = "samples/page_001_firefly.md"
sample_md = read_md(sample_md_path)

print(
    f"✅ Loaded {len(sample_md)} characters from {sample_md_path}",
    f"✅ Number of lines: {len(sample_md.splitlines())}",
    sep="\n",
)


✅ Loaded 26444 characters from samples/page_001_firefly.md
✅ Number of lines: 980


In [3]:
# Free-text description of the material to keep.
# NOTE(review): no later cell visible here reads `what_to_retain` (the runs pass `spec`),
# and "enpoints" looks like a typo for "endpoints" — confirm before reusing this string.
what_to_retain=" actionable technical API sections: all information regarding enpoints, data types, all usage examples, authentications,"


In [12]:

# Define what to extract.
# `spec` is the WhatToRetain object passed as `extraction_spec` to both
# filter_hero.run(...) calls below. Only `name` and `desc` are set here; the
# `text_rules` argument is left commented out.
spec = WhatToRetain(
        name="core_api_documentation",
        desc="""Structured reference material and usage guide that explains how to interact with API, usually includes:
                    - Overview: Purpose of the API, supported protocols
                    - Endpoints – List of available routes or methods,
                    - Request Details – Required parameters, query strings, headers, body formats, and example requests.
                    - Response Details – Example outputs, including success and error responses with status codes (e.g., 200 OK, 404 Not Found, 500 Internal Server Error).
                    - Data Models – Definitions of objects and fields (schemas), often provided in JSON or XML.
                    - Authentication & Authorization – How to obtain and use tokens, API keys, or OAuth credentials.
                    - Error Handling – Common error messages, reasons, and suggested remedies.
                    -  Code Samples – Example snippets in different programming languages for practical use.
            """
        # Disabled rules, kept for reference (re-enabling needs a comma after the desc string):
        # text_rules=[
        #     "we dont care about clearly deprecated methods",
        #     "for Code Samples or similar keywords: Extract ALL code examples in ALL programming languages present",
        #     "Focus on actual API content, not website navigation structure"
        # ]
    )


# # Define what to extract
# spec = WhatToRetain(
#         name="api_documentation",
#         desc="""Structured reference material and usage guide that explains how to interact with API, usually includes:
#                     - Overview: Purpose of the API, supported protocols
#                     - Endpoints – List of available routes or methods,
#                     - Request Details – Required parameters, query strings, headers, body formats, and example requests.
#                     - Response Details – Example outputs, including success and error responses with status codes (e.g., 200 OK, 404 Not Found, 500 Internal Server Error).
#                     - Data Models – Definitions of objects and fields (schemas), often provided in JSON or XML.
#                     - Authentication & Authorization – How to obtain and use tokens, API keys, or OAuth credentials.
#                     - Error Handling – Common error messages, reasons, and suggested remedies.
#                     -  Code Samples – Example snippets in different programming languages for practical use.
#             """,
#         text_rules=[
#             "we dont care about clearly deprecated methods", 
#             "for Code Samples or similar keywords: Extract ALL code examples in ALL programming languages present",
#             "When multiple code examples exist (e.g., Python, JavaScript, Java), include them ALL under appropriate subheadings",
#             "EXCLUDE navigation menus, breadcrumbs, and website UI elements",
#             "EXCLUDE links that just point to other pages like [Console](/console) or [Edit Profile]",
#             "EXCLUDE sidebar navigation lists with links to other documentation pages",
#             "EXCLUDE 'Sign in', 'Sign out', 'Edit in GitHub', 'Log an issue' and similar UI actions",
#             "EXCLUDE lists that are just navigation links starting with * or brackets like [Overview] or [Concepts]",
#             "Focus on actual API content, not website navigation structure"
#         ]
#     )



## Extractive

In [16]:
# First run: no filter_mode is passed, so the library default applies
# (presumably the extractive mode named in the heading above — confirm).
# The model is pinned explicitly for this run.
extractive_filter_op = filter_hero.run(
    text=sample_md,
    extraction_spec=spec,
    model_name="gpt-4.1-mini",
)


In [19]:
# Headline numbers for the extractive run: line counts, wall time, model used.
for field in ("original_line_count", "filtered_line_count", "elapsed_time"):
    print(f"{field}:", getattr(extractive_filter_op, field))
print("model:", extractive_filter_op.generation_result.model)



original_line_count: 980
filtered_line_count: 419
elapsed_time: 40.861748933792114
model: gpt-4.1-mini


## Subtractive

In [20]:
# Second run: same document and spec, but with filter_mode="subtractive".
# No model_name is passed here, so whatever default the library picks is used.
filter_op_subtractive = filter_hero.run(
    text=sample_md,
    extraction_spec=spec,
    filter_mode="subtractive",
)

# Same headline numbers as printed for the extractive run, for side-by-side reading.
for field in ("original_line_count", "filtered_line_count", "elapsed_time"):
    print(f"{field}:", getattr(filter_op_subtractive, field))
print("model:", filter_op_subtractive.generation_result.model)


original_line_count: 980
filtered_line_count: 840
elapsed_time: 37.21497869491577
model: gpt-4.1-mini


In [15]:
# Print the `SSM` attribute of the subtractive result. In the recorded output it is a
# JSON-like structure with a "sections" list (name, category, start/end line, flags).
# NOTE(review): what "SSM" stands for is not visible in this notebook — confirm.
toc_output=filter_op_subtractive.SSM

print(toc_output)

{
  "sections": [
    {
      "name": "Header and Navigation Links",
      "category": "navigation",
      "start_line": 1,
      "end_line": 66,
      "is_content": false,
      "is_navigation": true
    },
    {
      "name": "Edit on GitHub and Issue Links",
      "category": "navigation",
      "start_line": 67,
      "end_line": 68,
      "is_content": false,
      "is_navigation": true
    },
    {
      "name": "Quickstart Guide Introduction and Image",
      "category": "content",
      "start_line": 69,
      "end_line": 74,
      "is_content": true,
      "is_navigation": false
    },
    {
      "name": "Prerequisites Section",
      "category": "content",
      "start_line": 75,
      "end_line": 81,
      "is_content": true,
      "is_navigation": false
    },
    {
      "name": "Set Up Your Environment Instructions - JavaScript",
      "category": "code",
      "start_line": 82,
      "end_line": 98,
      "is_content": true,
      "is_navigation": false
    },
    {
   

In [None]:
# Show the deletions recorded on the subtractive result; the comparison cell at the end
# of the notebook reads each entry's 'start_line' / 'end_line' (and optional 'reason').
filter_op_subtractive.deletions_applied

In [21]:
# Number of lines the subtractive run dropped (140 in the recorded run, i.e. 980 - 840).
filter_op_subtractive.lines_removed

140

In [None]:
# Persist the subtractive result as markdown and report the path returned by save_content.
sub_md_path = filter_op_subtractive.save_content(
    format="markdown",
    filename="subtractive_result",
    dir="output",
)
print(f"✅ Subtractive results (markdown) saved to: {sub_md_path}")

In [None]:
# NOTE(review): same expression as an earlier cell in this notebook; this repeat looks redundant.
filter_op_subtractive.deletions_applied

In [None]:
# NOTE(review): same expression as an earlier cell in this notebook; this repeat looks redundant.
filter_op_subtractive.deletions_applied

In [None]:
# NOTE(review): same expression as an earlier cell in this notebook; this repeat looks redundant.
filter_op_subtractive.deletions_applied

In [None]:
# Compare Basic (Extractive) vs Subtractive Filtering Results
#
# Reads `sample_md`, `extractive_filter_op` and `filter_op_subtractive` from the cells
# above and prints size, cost and timing for the two runs side by side.
#
# The extractive result is stored as `extractive_filter_op` by the cell that runs it;
# this cell previously referred to an undefined `filter_op_basic`, which raised
# NameError on a fresh kernel.


def _pct(part, whole):
    """Return part/whole as a percentage, or 0 when `whole` is not positive."""
    return (part / whole * 100) if whole > 0 else 0


def _fmt_count(value):
    """Format a numeric count with thousands separators, or 'N/A' if it is not numeric.

    Applying the ',' format option to the string fallback 'N/A' raises ValueError,
    so the fallback is rendered without a format spec.
    """
    return f"{value:,}" if isinstance(value, (int, float)) else "N/A"


print("=" * 80)
print("COMPARISON: Basic (Extractive) vs Subtractive Filtering")
print("=" * 80)

# Original content info
original_chars = len(sample_md)
original_lines = len(sample_md.splitlines())
print(f"\n📄 ORIGINAL CONTENT:")
print(f"   • Characters: {original_chars:,}")
print(f"   • Lines: {original_lines:,}")

# Basic/Extractive mode results
print(f"\n🔵 BASIC/EXTRACTIVE MODE:")
if extractive_filter_op.success:
    # Treat missing/empty content as an empty string so both counts fall out as 0.
    basic_content = extractive_filter_op.content or ""
    basic_chars = len(basic_content)
    basic_lines = len(basic_content.splitlines())
    basic_retention = _pct(basic_chars, original_chars)

    print(f"   • Output Characters: {basic_chars:,} ({basic_retention:.1f}% retained)")
    print(f"   • Output Lines: {basic_lines:,}")
    print(f"   • Elapsed Time: {extractive_filter_op.elapsed_time:.2f} seconds")

    if extractive_filter_op.usage:
        basic_usage = extractive_filter_op.usage
        print(f"   • Input Tokens: {_fmt_count(basic_usage.get('input_tokens'))}")
        print(f"   • Output Tokens: {_fmt_count(basic_usage.get('output_tokens'))}")
        print(f"   • Total Cost: ${basic_usage.get('total_cost', 0):.6f}")
else:
    print(f"   ❌ Failed: {extractive_filter_op.error}")

# Subtractive mode results
print(f"\n🔴 SUBTRACTIVE MODE:")
if filter_op_subtractive.success:
    sub_content = filter_op_subtractive.content or ""
    sub_chars = len(sub_content)
    sub_lines = len(sub_content.splitlines())
    sub_retention = _pct(sub_chars, original_chars)

    print(f"   • Output Characters: {sub_chars:,} ({sub_retention:.1f}% retained)")
    print(f"   • Output Lines: {sub_lines:,}")
    print(f"   • Lines Removed: {filter_op_subtractive.lines_removed:,}")
    print(f"   • Elapsed Time: {filter_op_subtractive.elapsed_time:.2f} seconds")

    if filter_op_subtractive.usage:
        sub_usage = filter_op_subtractive.usage
        print(f"   • Input Tokens: {_fmt_count(sub_usage.get('input_tokens'))}")
        print(f"   • Output Tokens: {_fmt_count(sub_usage.get('output_tokens'))}")
        print(f"   • Total Cost: ${sub_usage.get('total_cost', 0):.6f}")

    # Show deletion details if available (first 5 only, to keep the output short)
    deletions = filter_op_subtractive.deletions_applied
    if deletions:
        print(f"\n   📋 Deletion Ranges ({len(deletions)} total):")
        for i, deletion in enumerate(deletions[:5], 1):
            print(f"      {i}. Lines {deletion['start_line']}-{deletion['end_line']}: {deletion.get('reason', 'N/A')}")
        if len(deletions) > 5:
            print(f"      ... and {len(deletions) - 5} more")
else:
    print(f"   ❌ Failed: {filter_op_subtractive.error}")

# Comparison Summary
print("\n" + "=" * 80)
print("SUMMARY COMPARISON:")
print("=" * 80)

# The per-mode numbers above only exist when the corresponding run succeeded,
# so the summary is printed only when both did.
if extractive_filter_op.success and filter_op_subtractive.success:
    # Character comparison
    char_diff = basic_chars - sub_chars
    char_diff_pct = _pct(char_diff, basic_chars)

    # Cost comparison
    basic_cost = extractive_filter_op.usage.get('total_cost', 0) if extractive_filter_op.usage else 0
    sub_cost = filter_op_subtractive.usage.get('total_cost', 0) if filter_op_subtractive.usage else 0
    cost_diff = basic_cost - sub_cost
    cost_savings = _pct(cost_diff, basic_cost)

    # Time comparison
    time_diff = extractive_filter_op.elapsed_time - filter_op_subtractive.elapsed_time
    time_savings = _pct(time_diff, extractive_filter_op.elapsed_time)

    print(f"📊 Output Size:")
    print(f"   • Basic/Extractive: {basic_chars:,} chars")
    print(f"   • Subtractive: {sub_chars:,} chars")
    if char_diff > 0:
        print(f"   • Difference: Basic has {char_diff:,} more chars ({char_diff_pct:.1f}% more)")
    else:
        print(f"   • Difference: Subtractive has {-char_diff:,} more chars ({-char_diff_pct:.1f}% more)")

    print(f"\n💰 Cost Comparison:")
    print(f"   • Basic/Extractive: ${basic_cost:.6f}")
    print(f"   • Subtractive: ${sub_cost:.6f}")
    if cost_diff > 0:
        print(f"   • Savings: ${cost_diff:.6f} ({cost_savings:.1f}% cheaper with subtractive)")
    else:
        print(f"   • Subtractive costs ${-cost_diff:.6f} more ({-cost_savings:.1f}% more expensive)")

    print(f"\n⏱️ Performance:")
    print(f"   • Basic/Extractive: {extractive_filter_op.elapsed_time:.2f}s")
    print(f"   • Subtractive: {filter_op_subtractive.elapsed_time:.2f}s")
    if time_diff > 0:
        print(f"   • Subtractive is {time_diff:.2f}s faster ({time_savings:.1f}% improvement)")
    else:
        print(f"   • Basic is {-time_diff:.2f}s faster ({-time_savings:.1f}% improvement)")

    print(f"\n🎯 Efficiency Metrics:")
    # Guarded like the other ratios so an empty input document does not divide by zero.
    removed_pct = _pct(filter_op_subtractive.lines_removed, original_lines)
    print(f"   • Subtractive removed {filter_op_subtractive.lines_removed:,} lines ({removed_pct:.1f}% of original)")
    print(f"   • Basic retained {basic_retention:.1f}% of original content")
    print(f"   • Subtractive retained {sub_retention:.1f}% of original content")