# Content Agnostic Extraction Test

This notebook tests the `ContentAgnosticExtractor` to verify that lab results are extracted correctly, including the CBC differential (basophils, etc.).

In [1]:
import sys
sys.path.insert(0, '../src')

import asyncio
import pymupdf
from medical_ingestion.extractors.content_agnostic_extractor import ContentAgnosticExtractor
from medical_ingestion.extractors.universal_text_extractor import UniversalTextExtractor

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


## 1. Load and Extract Text from PDF

In [2]:
# Test file - LabCorp CBC report
file_path = '../data/samples/labs/labcorp/SampleLabCorpReport.pdf'

# Method 1: Direct PyMuPDF extraction.
# The document is opened as a context manager so it is closed as soon as every
# page's text has been read, instead of staying open for the kernel's lifetime.
with pymupdf.open(file_path) as doc:
    pymupdf_text = ''.join(page.get_text() for page in doc)

# Report the total size and a short preview of the extracted text.
print(f'PyMuPDF text length: {len(pymupdf_text)}')
print('\nFirst 500 chars:')
print(pymupdf_text[:500])

PyMuPDF text length: 6707

First 500 chars:
Faloon, Bill
DOB: 11/07/1954
Patient Report
Patient ID: B0118302589
Age: 66
Account Number: 09134070
Specimen ID: 231-291-0955-0
Sex: Male
Ordering Physician: D BOWLING
 
Ordered Items: CMP14+LP+4AC+CBC/D/Plt; Troponin T; Creatine Kinase,Total
 
Date Collected: 08/19/2021
Date Received: 08/19/2021
Date Reported: 08/22/2021
Fasting: No
CMP14+LP+4AC+CBC/D/Plt
  Test
Current Result and Flag
Previous Result and Date
Units
Reference Interval
 
Chemistries 01
 
Glucose 01
95
mg/dL
65-99
 
Uric Acid 01


In [3]:
# Method 2: UniversalTextExtractor (used by pipeline)
async def get_universal_text():
    extractor = UniversalTextExtractor()
    result = await extractor.extract(file_path)
    return result

text_result = await get_universal_text()
print(f'UniversalTextExtractor text length: {len(text_result.full_text)}')
print(f'Has layout: {text_result.layout is not None}')



UniversalTextExtractor text length: 15922
Has layout: True


## 2. Check for CBC Differential Keywords in Text

In [4]:
# CBC / differential terms to look for, all lower-case because they are
# matched as substrings of the lower-cased extracted text.
cbc_keywords = ['wbc', 'rbc', 'hemoglobin', 'hematocrit', 'platelet',
                'neutrophil', 'lymphocyte', 'monocyte', 'eosinophil', 'basophil', 'baso']

text_lower = text_result.full_text.lower()
found_keywords = list(filter(lambda kw: kw in text_lower, cbc_keywords))
print(f'Found CBC keywords: {found_keywords}')

# Show context around basophils: up to 100 characters before the first
# 'baso' match and 150 characters from where the match starts.
idx = text_lower.find('baso')
if idx != -1:
    print(f'\nBasophils context (position {idx}):')
    window_start = max(0, idx-100)
    print(text_result.full_text[window_start:idx+150])

Found CBC keywords: ['wbc', 'rbc', 'hemoglobin', 'hematocrit', 'platelet', 'neutrophil', 'monocyte', 'baso']

Basophils context (position 9617):
ot Estab.
                                                                                    
     Basos 01               2                                    %        Not Estab.
     Neutrophils (Absolute) 01 2.0                             x10E3/u


## 3. Run ContentAgnosticExtractor

In [5]:
# Run extraction with default config (max_text_length=2500)
async def run_extraction(text, config=None):
    extractor = ContentAgnosticExtractor(config or {})
    result = await extractor.extract(text)
    return result

# Test with default config
result = await run_extraction(text_result.full_text)

print(f'Total test results extracted: {len(result.test_results)}')
print(f'Extraction confidence: {result.extraction_confidence}')
print(f'Warnings: {result.warnings}')

  from .autonotebook import tqdm as notebook_tqdm
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x17f235490>
Unclosed connector
connections: ['deque([(<aiohttp.client_proto.ResponseHandler object at 0x302808910>, 49326.354493958)])']
connector: <aiohttp.connector.TCPConnector object at 0x14b6f3890>


Total test results extracted: 52
Extraction confidence: 0.856


In [6]:
# Show all extracted test results, numbered from 1. A bracketed flag is
# appended only when the entry's abnormal_flag is truthy.
print('=== All Extracted Test Results ===')
for position, entry in enumerate(result.test_results, start=1):
    if entry.abnormal_flag:
        flag = f' [{entry.abnormal_flag}]'
    else:
        flag = ''
    print(f'{position:2}. {entry.name}: {entry.value} {entry.unit or ""}{flag}')

=== All Extracted Test Results ===
 1. Glucose: 95 mg/dL
 2. Uric Acid: 5.4 mg/dL
 3. BUN: 21 mg/dL
 4. Creatinine: 0.93 mg/dL
 5. eGFR: 85 mL/min/1.73
 6. BUN/Creatinine Ratio: 10 
 7. Sodium: 143 mmol/L
 8. Potassium: 4.2 mmol/L
 9. Chloride: 104 mmol/L
10. Carbon Dioxide, Total: 22 mmol/L
11. Calcium: 9.0 mg/dL
12. Phosphorus: 3.9 mg/dL
13. Protein, Total: 6.4 g/dL
14. Albumin: 4.5 g/dL
15. Globulin, Total: 1.9 g/dL
16. A/G Ratio: 2.4  [High]
17. Bilirubin, Total: 0.2 mg/dL
18. Alkaline Phosphatase: 40 IU/L [Low]
19. LDH: 119 IU/L [Low]
20. AST (SGOT): 34 IU/L
21. ALT (SGPT): 29 IU/L
22. Iron: 135 ug/dL
23. Cholesterol, Total: 125 mg/dL
24. Triglycerides: 50 mg/dL
25. HDL Cholesterol: 57 mg/dL
26. VLDL Cholesterol Cal: 12 mg/dL
27. LDL Chol Calc (NIH): 56 mg/dL
28. T. Chol/HDL Ratio: 2.2 ratio
29. VLDL Cholesterol: 12 mg/dL
30. WBC: 3.8 x10E3/uL
31. RBC: 4.35 x10E6/uL
32. Hemoglobin: 13.5 g/dL
33. Hematocrit: 40.3 %
34. MCV: 93 fL
35. MCH: 31.0 pg
36. MCHC: 33.5 g/dL
37. RDW: 12.7 %

In [7]:
# Check specifically for CBC differential: keep every extracted test whose
# lower-cased name contains at least one of the CBC keywords.
cbc_tests = []
for candidate in result.test_results:
    lowered_name = candidate.name.lower()
    if any(keyword in lowered_name for keyword in cbc_keywords):
        cbc_tests.append(candidate)

print(f'\n=== CBC/Differential Tests ({len(cbc_tests)}) ===')
for entry in cbc_tests:
    print(f'  {entry.name}: {entry.value} {entry.unit or ""}')

# Check for basophils specifically ('baso' matches any name containing that substring)
baso_tests = [entry for entry in result.test_results if 'baso' in entry.name.lower()]
if not baso_tests:
    print(f'\n❌ Basophils NOT found in extraction')
else:
    print(f'\n✅ BASOPHILS FOUND:')
    for entry in baso_tests:
        print(f'   {entry.name}: {entry.value} {entry.unit or ""}')


=== CBC/Differential Tests (11) ===
  WBC: 3.8 x10E3/uL
  RBC: 4.35 x10E6/uL
  Hemoglobin: 13.5 g/dL
  Hematocrit: 40.3 %
  Platelets: 142 x10E3/uL
  Neutrophils: 51 %
  Monocytes: 13 %
  Basos: 2 %
  Neutrophils (Absolute): 2.0 x10E3/uL
  Monocytes(Absolute): 0.5 x10E3/uL
  Baso (Absolute): 0.1 x10E3/uL

✅ BASOPHILS FOUND:
   Basos: 2 %
   Baso (Absolute): 0.1 x10E3/uL


## 4. Test with Different Config Options

In [None]:
# Test with different max_text_length values
configs = [
    {'max_text_length': 2000},
    {'max_text_length': 2500},
    {'max_text_length': 4000},
]

for config in configs:
    result = await run_extraction(text_result.full_text, config)
    baso_count = len([t for t in result.test_results if 'baso' in t.name.lower()])
    print(f"max_text_length={config['max_text_length']}: {len(result.test_results)} tests, Basophils: {'✅' if baso_count > 0 else '❌'}")

## 5. Check Patient and Other Extracted Data

In [None]:
# Patient info — printed only when the extraction produced a patient record.
patient = result.patient
if patient:
    print('=== Patient Info ===')
    print(f'  Name: {patient.name}')
    print(f'  DOB: {patient.dob}')
    print(f'  Gender: {patient.gender}')
    print(f'  MRN: {patient.mrn}')

# Providers — one line per provider with its role in parentheses.
providers = result.providers
if providers:
    print('\n=== Providers ===')
    for provider in providers:
        print(f'  {provider.name} ({provider.role})')

# Organizations — names only.
organizations = result.organizations
if organizations:
    print('\n=== Organizations ===')
    for organization in organizations:
        print(f'  {organization.name}')

# Raw fields — key/value pairs from the raw_fields mapping.
raw_fields = result.raw_fields
if raw_fields:
    print('\n=== Raw Fields ===')
    for field_name, field_value in raw_fields.items():
        print(f'  {field_name}: {field_value}')

## 6. Export Results as Dict

In [None]:
# Convert the extraction result to a dictionary, then list its keys and
# report how many entries sit under 'test_results'.
result_dict = result.to_dict()
dict_keys = [*result_dict.keys()]
print('Result dict keys:', dict_keys)
test_results_count = len(result_dict['test_results'])
print(f"\nTest results count: {test_results_count}")