In [3]:
!pip install raganything --quiet

In [6]:
import asyncio
import os

In [8]:
async def test_rag_anything_parsing():
    from raganything import RAGAnything, RAGAnythingConfig
    
    # Minimal config - no API keys needed for parsing only
    config = RAGAnythingConfig(
        working_dir="./rag_test",
        parser="mineru",
        parse_method="auto",
        enable_image_processing=True,
        enable_table_processing=True,
    )
    
    # Initialize without LLM functions (just for parsing)
    rag = RAGAnything(config=config)
    
    # Parse the PDF
    print("ðŸ“„ Parsing PDF...")
    content_list = await rag.parse_document(
        file_path="/Users/brunamedeiros/Documents/GitHub/Capstone/Data_Ingestion_VDB/test_dataset/Diagnostic Imaging Genitourinary ( PDFDrive ).pdf",
        output_dir="./output"
    )
    
    # Show results
    print(f"\nâœ… Extracted {len(content_list)} items")
    print(f"   Text: {sum(1 for x in content_list if x['type'] == 'text')}")
    print(f"   Images: {sum(1 for x in content_list if x['type'] == 'image')}")
    print(f"   Tables: {sum(1 for x in content_list if x['type'] == 'table')}")
    
    # Show first few items
    for i, item in enumerate(content_list[:5]):
        print(f"\n[{i+1}] Type: {item['type']}")
        if item['type'] == 'text':
            print(f"    {item['text'][:100]}...")
        elif item['type'] == 'image':
            print(f"    Image: {item.get('img_path', 'N/A')}")
    
    return content_list

# Run it
content = await test_rag_anything_parsing()

INFO: RAGAnything initialized with config:
INFO:   Working directory: ./rag_test
INFO:   Parser: mineru
INFO:   Parse method: auto
INFO:   Multimodal processing - Image: True, Table: True, Equation: True
INFO:   Max concurrent files: 1
INFO: Starting document parsing: /Users/brunamedeiros/Documents/GitHub/Capstone/Data_Ingestion_VDB/test_dataset/Diagnostic Imaging Genitourinary ( PDFDrive ).pdf
INFO: Using mineru parser with method: auto
INFO: Detected PDF file, using parser for PDF...


ðŸ“„ Parsing PDF...


INFO: Parsing /Users/brunamedeiros/Documents/GitHub/Capstone/Data_Ingestion_VDB/test_dataset/Diagnostic Imaging Genitourinary ( PDFDrive ).pdf complete! Extracted 11949 content blocks
INFO: 
Content Information:
INFO: * Total blocks in content_list: 11949
INFO: * Content block types:
INFO:   - text: 10882
INFO:   - image: 1021
INFO:   - table: 46



âœ… Extracted 2 items


TypeError: list indices must be integers or slices, not str