# 2. AI-Powered Data Extraction

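This notebook guides you through:
1. Loading included papers
2. Extracting brain coordinates using AI
3. Extracting effect sizes using AI
4. Validating and saving the dataset

**Note:** This demo uses manually entered example studies and a simulated paper excerpt rather than real papers. The effect-size extractor is initialized below, but effect-size extraction itself is not demonstrated in this notebook.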

In [None]:
# Setup
import sys
# Prepend the parent directory (resolved relative to the current working
# directory) to the module search path.
# NOTE(review): presumably this is what makes the project-local `core` and
# `extraction` packages importable when the notebook is run from its own
# folder — confirm against the repository layout.
sys.path.insert(0, '..')

import pandas as pd
from pathlib import Path

# Project-local data model classes and AI extractor classes used by the cells below.
from core import Study, Coordinate, EffectSize, MetaAnalysisDataset
from extraction.extractors import CoordinateExtractor, EffectSizeExtractor

## Step 1: Create Dataset and Initialize Extractors

In [None]:
# Start from an empty meta-analysis dataset; studies are attached to it in later cells.
dataset_fields = {
    "name": "T-Maze Decision Making",
    "description": "Meta-analysis of brain activation during T-maze and spatial decision tasks",
}
dataset = MetaAnalysisDataset(**dataset_fields)

# Instantiate the two extractors (coordinates first, then effect sizes).
coord_extractor, es_extractor = CoordinateExtractor(), EffectSizeExtractor()

print(f"Dataset created: {dataset.name}")

## Step 2: Demo - Manual Data Entry

First, let's see how to manually create studies with coordinates.

In [None]:
# Example: assemble one study by hand, building its list of peaks first.
# Each row is (x, y, z, region label, z statistic).
smith2020_peaks = [
    (-24, -8, 52, "Left premotor cortex", 4.2),
    (28, -10, 48, "Right premotor cortex", 3.9),
    (-6, 12, 44, "Pre-SMA", 5.1),
    (-30, -52, 46, "Left parietal", 4.5),
]
smith2020_coordinates = [
    Coordinate(x=px, y=py, z=pz, region=label, statistic_value=stat, statistic_type="z")
    for px, py, pz, label, stat in smith2020_peaks
]

study1 = Study(
    study_id="smith2020",
    title="Neural correlates of spatial decision-making in a virtual T-maze",
    authors=["Smith, J.", "Jones, M.", "Williams, K."],
    year=2020,
    doi="10.1000/example.2020.001",
    n_total=30,
    mean_age=25.4,
    percent_female=53.3,
    task_name="Virtual T-maze",
    imaging_modality="fMRI",
    contrast_name="Decision > Control",
    coordinates=smith2020_coordinates,
)

# Register the study, then report what was added.
dataset.add_study(study1)
n_study1_coords = len(study1.coordinates)
print(f"Added study: {study1.citation}")
print(f"  Coordinates: {n_study1_coords}")

In [None]:
# Add two further example studies, each built from a small table of peak rows.
# Rows are (x, y, z, region label, statistic value).
johnson2019_peaks = [
    (-26, -20, -14, "Left hippocampus", 4.8),
    (28, -18, -12, "Right hippocampus", 4.2),
    (-4, -30, 30, "Posterior cingulate", 3.7),
]
chen2021_peaks = [
    (-42, 32, 20, "Left DLPFC", 5.2),
    (44, 30, 22, "Right DLPFC", 4.9),
    (-2, 28, 40, "ACC", 4.4),
    (-32, -58, 44, "Left SPL", 4.1),
    (34, -56, 46, "Right SPL", 3.8),
]

# This study reports t statistics.
study2 = Study(
    study_id="johnson2019",
    title="Hippocampal activation during spatial navigation decisions",
    authors=["Johnson, A.", "Brown, B."],
    year=2019,
    n_total=25,
    task_name="T-maze navigation",
    coordinates=[
        Coordinate(x=px, y=py, z=pz, region=label, statistic_value=stat, statistic_type="t")
        for px, py, pz, label, stat in johnson2019_peaks
    ],
)

# This study reports z statistics.
study3 = Study(
    study_id="chen2021",
    title="Prefrontal contributions to spatial decision-making",
    authors=["Chen, L.", "Wang, X.", "Liu, Y."],
    year=2021,
    n_total=35,
    coordinates=[
        Coordinate(x=px, y=py, z=pz, region=label, statistic_value=stat, statistic_type="z")
        for px, py, pz, label, stat in chen2021_peaks
    ],
)

# Register both studies in the same order as they were defined.
for extra_study in (study2, study3):
    dataset.add_study(extra_study)

print(f"\nDataset now has {dataset.n_studies} studies with {dataset.n_coordinates} coordinates")

## Step 3: AI Extraction from Paper Text

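Extract coordinates from paper text using Claude. This step requires the `ANTHROPIC_API_KEY` environment variable to be set; if extraction fails, the notebook reports the error and continues with an empty list of extracted coordinates.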

In [None]:
# Example paper text (simulated results section)
# This hand-written excerpt stands in for a real paper: a short results
# paragraph, a whitespace-aligned table of seven activation peaks with
# x/y/z, Z-score and cluster-size columns, and a closing note on the
# coordinate space and statistical threshold. It is the input handed to the
# coordinate extractor in the next code cell.
example_paper_text = """
Results

Whole-brain analysis revealed significant activation during spatial decision-making 
compared to the control condition. Peak activations were observed in the following regions:

Table 2. Significant activation peaks for Decision > Control contrast

Region                  x      y      z      Z-score   Cluster size
Left Premotor Cortex   -26    -6     54     4.82      324
Right Premotor Cortex   28    -8     52     4.45      287
Pre-SMA                 -4    14     48     5.21      512
Left Hippocampus       -28   -22    -12     4.12      156
Right Hippocampus       30   -20    -10     3.89      142
Left DLPFC             -44    34     22     4.67      298
Posterior Cingulate     -2   -32     34     4.33      245

All coordinates are reported in MNI space. Cluster-level FWE correction was applied 
at p < 0.05 with a cluster-forming threshold of p < 0.001.
"""

# Confirmation message only; nothing is read from disk or the network here.
print("Paper text loaded (simulated example)")

In [None]:
# Extract coordinates using AI
# NOTE: This requires ANTHROPIC_API_KEY to be set
#
# Extraction is best-effort: any failure is reported and the notebook carries
# on with an empty `extracted_coords` list, which the following cell checks
# before building a study from the extracted coordinates.

try:
    extracted_coords = coord_extractor.extract(example_paper_text)
    print(f"Extracted {len(extracted_coords)} coordinates:\n")

    for i, coord in enumerate(extracted_coords, 1):
        # Label the statistic with the type recorded on the coordinate instead
        # of always printing "z"; fall back to a neutral label when the type
        # is missing so a t-statistic is never shown as a z-score.
        stat_label = getattr(coord, "statistic_type", None) or "stat"
        print(
            f"{i}. ({coord.x}, {coord.y}, {coord.z}) - {coord.region or 'Unknown'} "
            f"[{stat_label}={coord.statistic_value}]"
        )

except Exception as e:
    # Deliberately broad: this is a demo cell and must not stop the notebook.
    print(f"Extraction error (API key may not be set): {e}")
    print("\nUsing manual coordinates instead...")
    extracted_coords = []

In [None]:
# Create a study from the AI-extracted coordinates, validate the coordinates,
# and add the study to the dataset when validation passes.
#
# Previously the study was built and validated but never added, so it was
# missing from the dataset summary, the plots and the saved files.
# NOTE(review): presumably `add_study` does not de-duplicate, so re-running
# this cell would add the study again — confirm against the dataset class.
if extracted_coords:
    ai_study = Study(
        study_id="ai_extracted_example",
        title="AI-Extracted Example Study",
        authors=["Example, A."],
        year=2023,
        n_total=28,
        coordinates=extracted_coords,
        extraction_confidence=0.85,
        extraction_notes="Extracted by AI from simulated paper text"
    )

    # Validate
    is_valid, errors = coord_extractor.validate(extracted_coords)
    print(f"Validation: {'PASSED' if is_valid else 'FAILED'}")
    for error in errors or []:
        print(f"  - {error}")

    # Only coordinates that pass validation are allowed into the dataset.
    if is_valid:
        dataset.add_study(ai_study)
        print(f"Added study: {ai_study.citation}")
        print(f"  Coordinates: {len(ai_study.coordinates)}")
    else:
        print("AI-extracted study was NOT added to the dataset (validation failed)")
else:
    # Make the skipped path visible instead of silently doing nothing.
    print("No AI-extracted coordinates available; dataset contains only the manually entered studies")

## Step 4: View and Validate Dataset

In [None]:
# Show the dataset's own text summary.
dataset_summary = dataset.summary()
print(dataset_summary)

In [None]:
# Flatten the dataset's coordinates into a DataFrame and preview the first 15 rows.
coords_df = dataset.to_coordinates_df()
coords_df.head(n=15)

In [None]:
# Plot coordinate distribution
import matplotlib.pyplot as plt

# One histogram panel per coordinate column, laid out side by side.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

for panel, axis_name in zip(axes, ("x", "y", "z")):
    axis_label = axis_name.upper()
    panel.hist(coords_df[axis_name], bins=20, edgecolor='black')
    panel.set_xlabel(f"{axis_label} (mm)")
    panel.set_title(f"{axis_label} Distribution")

plt.tight_layout()
plt.show()

## Step 5: Save Dataset

In [None]:
# Save dataset
output_dir = Path("../data")
output_dir.mkdir(exist_ok=True)

# JSON copy: the format intended for loading the dataset back later.
json_path = output_dir / "tmaze_dataset.json"
dataset.save(json_path)
print(f"Saved dataset to {json_path}")

# CSV copy for manual inspection; the call returns a mapping of label -> path.
csv_files = dataset.to_csv(output_dir / "tmaze_csv")
print("\nCSV files:")
for label, csv_path in csv_files.items():
    print(f"  {label}: {csv_path}")

## Next Steps

1. Load the saved dataset in notebook 03 for ALE meta-analysis
2. Alternatively, load it in notebook 04 for effect size meta-analysis
3. Review and manually verify all AI extractions before relying on them