# API Extraction Test

This notebook tests what the API returns when processing a document file.

In [None]:
import requests
import json
import time

# Base URL of the API server under test; the request URLs in this notebook
# are built by appending endpoint paths to it.
API_BASE = 'http://localhost:8000'

## 1. Check API Health

In [None]:
# Probe the health endpoint and show the HTTP status plus the JSON body.
# The explicit timeout keeps the notebook from hanging indefinitely when the
# server is down or unresponsive (requests applies no timeout by default).
response = requests.get(f'{API_BASE}/api/health', timeout=10)
print(f'Status: {response.status_code}')
print(json.dumps(response.json(), indent=2))

## 2. List Available Sample Files

In [None]:
# Fetch the sample listing and preview at most the first ten entries.
samples_url = f'{API_BASE}/api/samples'
response = requests.get(samples_url)
samples = response.json()
sample_count = len(samples)
print(f'Found {sample_count} sample files:')
for sample in samples[:10]:
    # Show the 'path' entry when there is one, otherwise the whole item.
    shown = sample.get('path', sample)
    print(f"  - {shown}")

## 3. Process a Sample Document via API

In [None]:
# Submit the LabCorp sample for processing using the 'router' strategy.
file_path = 'data/samples/labs/labcorp/SampleLabCorpReport.pdf'
process_params = {'file_path': file_path, 'strategy': 'router'}

response = requests.post(f'{API_BASE}/api/v2/samples/process', params=process_params)

# Keep the job id around: the polling and workflow cells look the job up by it.
job_data = response.json()
job_id = job_data.get('job_id')
print(f'Job ID: {job_id}')
print(f'Status: {job_data.get("status")}')

In [None]:
# Poll for completion
def wait_for_job(job_id, timeout=300):
    start = time.time()
    while time.time() - start < timeout:
        response = requests.get(f'{API_BASE}/api/v2/jobs/{job_id}')
        data = response.json()
        status = data.get('status')
        print(f'Status: {status}')
        if status in ['completed', 'failed']:
            return data
        time.sleep(5)
    return None

result = wait_for_job(job_id)
if result is None:
    # wait_for_job returns None when no terminal status was seen in time;
    # fail with a clear message instead of an AttributeError on None.
    raise TimeoutError(f'Job {job_id} did not reach a terminal status before the polling timeout')
print(f'\nFinal status: {result.get("status")}')

## 4. Inspect the Full API Response

In [None]:
# Keys of the job response itself
top_level_keys = list(result.keys())
print('Top-level keys:', top_level_keys)

# Keys of the nested 'result' payload (skipped when it is missing or empty)
result_payload = result.get('result')
if result_payload:
    result_keys = list(result_payload.keys())
    print('\nResult keys:', result_keys)

In [None]:
# Inspect extracted_values (what frontend receives)
# The `or` fallbacks cover a response whose 'result' (or 'extracted_values')
# key is present but null; .get(key, default) would hand back None in that
# case and the next attribute access would raise.
job_result = result.get('result') or {}
extracted_values = job_result.get('extracted_values') or []
print(f'Extracted values count: {len(extracted_values)}')

if extracted_values:
    print('\nFirst extracted value structure:')
    print(json.dumps(extracted_values[0], indent=2))

    print('\n=== All Extracted Values ===')
    for i, ev in enumerate(extracted_values, 1):
        name = ev.get('field_name', '')
        value = ev.get('value', '')
        unit = ev.get('unit', '')
        print(f'{i:2}. {name}: {value} {unit}')

In [None]:
# Inspect universal_extraction (raw extraction data)
# The `or` fallbacks cover keys that are present but null, which
# .get(key, default) would pass through as None.
job_result = result.get('result') or {}
universal = job_result.get('universal_extraction') or {}
print('Universal extraction keys:', list(universal.keys()))

test_results = universal.get('test_results') or []
print(f'\nTest results count: {len(test_results)}')

if test_results:
    print('\nFirst test result structure:')
    print(json.dumps(test_results[0], indent=2))

## 5. Check for CBC Differential in API Response

In [None]:
# Look for CBC differential entries among extracted_values.
# An entry matches when its lowercased field name contains any keyword.
cbc_keywords = [
    'wbc', 'rbc', 'hemoglobin', 'platelet', 'neutrophil',
    'lymph', 'monocyte', 'eos', 'baso',
]

cbc_in_extracted = list(filter(
    lambda item: any(k in item.get('field_name', '').lower() for k in cbc_keywords),
    extracted_values,
))

print(f'CBC tests in extracted_values: {len(cbc_in_extracted)}')
for ev in cbc_in_extracted:
    print(f"  {ev.get('field_name')}: {ev.get('value')}")

# Basophils get their own explicit yes/no report.
baso_extracted = list(filter(
    lambda item: 'baso' in item.get('field_name', '').lower(),
    extracted_values,
))
if not baso_extracted:
    print('\n❌ Basophils NOT in extracted_values')
else:
    print('\n✅ BASOPHILS in extracted_values:')
    for b in baso_extracted:
        print(f"   {b.get('field_name')}: {b.get('value')}")

In [None]:
# Look for CBC differential entries in universal_extraction.test_results.
# An entry matches when its lowercased name contains any keyword.
cbc_in_universal = list(filter(
    lambda item: any(k in item.get('name', '').lower() for k in cbc_keywords),
    test_results,
))

print(f'CBC tests in universal_extraction: {len(cbc_in_universal)}')
for t in cbc_in_universal:
    print(f"  {t.get('name')}: {t.get('value')}")

# Basophils get their own explicit yes/no report.
baso_universal = list(filter(
    lambda item: 'baso' in item.get('name', '').lower(),
    test_results,
))
if not baso_universal:
    print('\n❌ Basophils NOT in universal_extraction')
else:
    print('\n✅ BASOPHILS in universal_extraction:')
    for b in baso_universal:
        print(f"   {b.get('name')}: {b.get('value')}")

## 6. Compare extracted_values vs universal_extraction

In [None]:
# Sizes of the two result collections, side by side
print(f'extracted_values count: {len(extracted_values)}')
print(f'universal_extraction.test_results count: {len(test_results)}')

# Lowercased name sets; their difference is what only universal_extraction has
extracted_names = set(map(lambda ev: ev.get('field_name', '').lower(), extracted_values))
universal_names = set(map(lambda t: t.get('name', '').lower(), test_results))

missing_from_extracted = universal_names.difference(extracted_names)
missing_count = len(missing_from_extracted)
print(f'\nTests in universal but NOT in extracted_values ({missing_count}):')
for name in sorted(missing_from_extracted):
    print(f'  - {name}')

## 7. Inspect Classification Result

In [None]:
# Read the nested payload once; `or {}` covers a 'result' key that is present
# but null, which .get('result', {}) would return as None.
job_result = result.get('result') or {}
classification = job_result.get('classification', {})
print('Classification result:')
print(json.dumps(classification, indent=2))

print(f"\nDocument type: {job_result.get('document_type')}")

In [None]:
# List all jobs (v2 endpoint)
response = requests.get(f'{API_BASE}/api/v2/jobs')
jobs = response.json()

print(f'Total jobs: {len(jobs)}')
for job in jobs[-5:]:  # Show last 5
    # Slicing a missing (None) or non-string id would raise TypeError and
    # abort the listing, so normalise to a string before shortening it.
    short_id = str(job.get('job_id') or '')[:8]
    print(f"  {short_id}... - {job.get('status')} - {job.get('document_type', 'unknown')}")

In [None]:
# Fetch the workflow of the job submitted earlier and print one line per step.
workflow_url = f'{API_BASE}/api/v2/jobs/{job_id}/workflow'
response = requests.get(workflow_url)
workflow = response.json()

print('Workflow steps:')
for step in workflow.get('steps', []):
    status = step.get('status', 'unknown')
    duration = step.get('duration_seconds', 0)
    line = f"  {step.get('name')}: {status}"
    # The duration suffix is appended only for a truthy (non-zero) duration.
    if duration:
        line += f" ({duration:.2f}s)"
    print(line)

## 7b. Jobs and Workflow Steps (v2 endpoints) — covered by the two cells directly above

## 8. Test with File Upload (Optional)

In [None]:
# Send the local PDF to the upload endpoint as a multipart 'file' field.
file_path = '../data/samples/labs/labcorp/SampleLabCorpReport.pdf'
upload_url = f'{API_BASE}/api/upload'

with open(file_path, 'rb') as f:
    response = requests.post(upload_url, files={'file': f})

upload_result = response.json()
print('Upload result:')
print(json.dumps(upload_result, indent=2))

# The processing cell is skipped when the response carries no file_id.
file_id = upload_result.get('file_id')

In [None]:
# Process the uploaded file
if file_id:
    response = requests.post(
        f'{API_BASE}/api/v2/process',
        params={'file_id': file_id, 'strategy': 'router'}
    )

    job_data = response.json()
    upload_job_id = job_data.get('job_id')
    print(f'Job ID: {upload_job_id}')

    # Wait for completion
    result = wait_for_job(upload_job_id)
    if result is None:
        # wait_for_job returns None when no terminal status was seen in time;
        # fail with a clear message instead of an AttributeError on None.
        raise TimeoutError(f'Job {upload_job_id} did not reach a terminal status before the polling timeout')

    # `or` fallbacks cover keys that are present but null.
    extracted_values = (result.get('result') or {}).get('extracted_values') or []
    print(f'\nExtracted values: {len(extracted_values)}')

    baso = [ev for ev in extracted_values if 'baso' in ev.get('field_name', '').lower()]
    print(f'Basophils found: {"YES" if baso else "NO"}')

## 9. Save Full Response for Debugging

In [None]:
# Dump the most recent job response to disk for offline inspection.
# default=str stringifies any value the JSON encoder cannot handle natively.
debug_path = 'api_response_debug.json'
with open(debug_path, 'w') as f:
    f.write(json.dumps(result, indent=2, default=str))

print('Full API response saved to api_response_debug.json')