# T-Maze Batch Processing & HPC Integration

This notebook demonstrates automated pipelines:
- Batch processing with checkpointing
- Slurm job submission for HPC
- BIDS format handling
- Report generation

In [None]:
import numpy as np
import sys
from pathlib import Path
import json
import time

sys.path.append('..')

from pipelines.batch import (
    BatchProcessor,
    BatchConfig,
    parallel_map,
    checkpoint_resume,
    create_processing_report
)
from pipelines.hpc import (
    SlurmSubmitter,
    generate_slurm_template
)
from pipelines.bids import (
    BIDSDataset,
    validate_bids,
    bids_to_tmaze,
    tmaze_to_bids
)
from pipelines.reporting import (
    generate_report,
    statistics_table,
    export_for_publication
)

## 1. Batch Processing

In [None]:
# Define a sample processing function
def process_subject(subject_id, data_dir=None, param1=1.0):
    """
    Example processing function for a single subject.
    In practice, this would load data and run analysis.

    Parameters
    ----------
    subject_id : str
        Subject label (e.g. ``'sub-01'``). Also determines the random seed,
        so the same label always yields the same simulated result.
    data_dir : path-like, optional
        Accepted for interface compatibility; not used by this simulation.
    param1 : float
        Echoed back unchanged under the ``'parameter'`` key.

    Returns
    -------
    dict
        ``'subject_id'``, a simulated ``'accuracy'`` in [0.5, 0.8), a
        simulated ``'n_trials'`` in [80, 120) and ``'parameter'``.
    """
    import zlib

    # Simulate processing time
    time.sleep(0.1)

    # Built-in hash() of a str is salted per interpreter process, so it gives
    # a different seed in every run and in every parallel worker. CRC32 is
    # stable across processes and always fits in 32 bits.
    seed = zlib.crc32(str(subject_id).encode('utf-8'))

    # A private generator keeps the caller's global NumPy random state intact.
    rng = np.random.default_rng(seed)

    return {
        'subject_id': subject_id,
        'accuracy': 0.5 + float(rng.random()) * 0.3,
        # Plain int so the record stays JSON-serialisable for checkpoints.
        'n_trials': int(rng.integers(80, 120)),
        'parameter': param1
    }

# Build the ten zero-padded subject labels, sub-01 through sub-10
subject_ids = ['sub-{:02d}'.format(number) for number in range(1, 11)]
print(f"Processing {len(subject_ids)} subjects: {subject_ids}")

In [None]:
# Configure batch processing
# NOTE(review): the exact meaning of each BatchConfig field is defined in
# pipelines.batch, which is not visible from this notebook; the hedged notes
# below are inferred from the field names and should be confirmed there.
config = BatchConfig(
    n_jobs=4,  # Number of parallel jobs
    verbose=10,
    checkpoint_dir=Path('./checkpoints'),  # relative to the current working directory
    checkpoint_freq=5,  # presumably how often a checkpoint is written — TODO confirm units
    continue_on_error=True,
    max_retries=2
)

# Create processor
processor = BatchProcessor(process_subject, config)

# Run batch processing
# data_dir and param1 are presumably forwarded to process_subject as keyword
# arguments (confirm in BatchProcessor.run). '/path/to/data' is a placeholder:
# process_subject accepts data_dir but never reads it.
print("\nStarting batch processing...")
batch_result = processor.run(
    subject_ids,
    data_dir='/path/to/data',
    param1=2.0
)

print(f"\n{batch_result.summary()}")

In [None]:
# Generate processing report
# create_processing_report takes the batch result from processor.run; its
# return value is printed as-is (presumably a text report — confirm in
# pipelines.batch).
report = create_processing_report(batch_result)
print(report)

In [None]:
# Extract results from successful subjects
# Each successful item exposes the dict returned by process_subject as .result
successful = batch_result.get_successful()
accuracies = [r.result['accuracy'] for r in successful]

print(f"\nSuccessful subjects: {len(successful)}")
if accuracies:
    print(f"Mean accuracy: {np.mean(accuracies):.3f} ± {np.std(accuracies):.3f}")
else:
    # np.mean([]) / np.std([]) emit a RuntimeWarning and would print "nan ± nan"
    print("Mean accuracy: n/a (no successful subjects)")

## 2. Parallel Processing with parallel_map

In [None]:
# Simple parallel processing
def compute_roi_accuracy(roi_idx, X=None, y=None):
    """Return a simulated accuracy record for one ROI.

    ``X`` and ``y`` are accepted but not used by this stand-in. The accuracy
    is drawn from NumPy's global random state and lies in [0.5, 0.7).
    """
    # Stand-in for the real per-ROI computation
    time.sleep(0.05)
    simulated_accuracy = 0.5 + np.random.rand() * 0.2
    return dict(roi=roi_idx, accuracy=simulated_accuracy)

# Parallel map across ROIs
roi_indices = list(range(50))

print(f"Running parallel_map on {len(roi_indices)} ROIs...")
# perf_counter() is monotonic and high-resolution; time.time() is wall-clock
# and can jump if the system clock is adjusted while the jobs are running.
start = time.perf_counter()

results = parallel_map(
    compute_roi_accuracy,
    roi_indices,
    n_jobs=4,
    verbose=5
)

elapsed = time.perf_counter() - start
print(f"\nCompleted in {elapsed:.2f}s")
print(f"Results: {len(results)} ROIs processed")

## 3. Checkpoint and Resume

In [None]:
# Processing with checkpoint/resume
def long_running_analysis(subject_id):
    """Simulate a slow per-subject analysis.

    Sleeps briefly, then returns a dict holding the subject label under
    ``'subject'`` and a value drawn from NumPy's global random state under
    ``'result'``.
    """
    time.sleep(0.2)  # stand-in for a long computation
    drawn_value = np.random.rand()
    return dict(subject=subject_id, result=drawn_value)

checkpoint_file = Path('./analysis_checkpoint.json')

# The try/finally guarantees the checkpoint file is removed even when one of
# the runs raises. A leftover file would otherwise be picked up by the
# "first run" the next time this cell executes, so it would no longer start
# from scratch.
try:
    # First run - process some subjects
    print("First run (will checkpoint after each subject)...")
    partial_subjects = subject_ids[:5]

    results = checkpoint_resume(
        long_running_analysis,
        partial_subjects,
        checkpoint_file
    )

    print(f"Processed: {len(results)} subjects")

    # Second run - resume with more subjects
    # The same checkpoint file is passed again, now with the full subject list.
    print("\nSecond run (resuming with all subjects)...")
    all_results = checkpoint_resume(
        long_running_analysis,
        subject_ids,
        checkpoint_file
    )

    print(f"Total processed: {len(all_results)} subjects")
finally:
    # Cleanup
    if checkpoint_file.exists():
        checkpoint_file.unlink()

## 4. HPC Job Submission (Slurm)

In [None]:
# Configure Slurm submitter
# NOTE(review): partition, account and conda_env are cluster- and
# user-specific values; replace them before submitting on another system.
submitter = SlurmSubmitter(
    partition='main',
    time='04:00:00',  # presumably HH:MM:SS wall-time limit — confirm in pipelines.hpc
    mem='16G',
    cpus_per_task=4,
    account='jss388',  # Your account
    output_dir=Path('./slurm_jobs'),
    conda_env='tmaze'
)

# Echo the settings back from the submitter's attributes
print("Slurm configuration:")
print(f"  Partition: {submitter.partition}")
print(f"  Time limit: {submitter.time}")
print(f"  Memory: {submitter.mem}")
print(f"  CPUs: {submitter.cpus_per_task}")

In [None]:
# Generate a template script (dry run)
template_path = Path('./slurm_jobs/analysis_template.sh')

# Template settings gathered in one place and unpacked into the call below
template_options = dict(
    python_script='run_analysis.py',
    job_name='tmaze_analysis',
    partition='main',
    time='04:00:00',
    mem='16G',
    cpus=4,
    conda_env='tmaze',
)
generate_slurm_template(template_path, **template_options)

# Display template
print("\nGenerated Slurm template:")
print("-" * 50)
print(template_path.read_text())

In [None]:
# Submit job array (dry run)
print("\nSubmitting job array (DRY RUN)...")

# NOTE(review): how array_batch_size groups subjects into jobs, and whether
# output_dir is forwarded to the analysis script, is decided inside
# SlurmSubmitter.submit_array — confirm in pipelines.hpc.
jobs = submitter.submit_array(
    python_script='scripts/run_subject_analysis.py',
    subject_ids=subject_ids,
    job_name='tmaze_batch',
    array_batch_size=5,
    dry_run=True,  # Don't actually submit
    output_dir='./results'
)

# Each returned job exposes .name, .script_path and a .metadata dict; the
# 'subjects' entry is optional, so a missing key is counted as 0 subjects.
print(f"\nWould submit {len(jobs)} job(s):")
for job in jobs:
    print(f"  Job: {job.name}")
    print(f"  Script: {job.script_path}")
    print(f"  Subjects: {len(job.metadata.get('subjects', []))}")

## 5. BIDS Format Handling

In [None]:
# Create a mock BIDS structure for demonstration
bids_root = Path('./mock_bids')
bids_root.mkdir(exist_ok=True)

# Top-level dataset description
desc = dict(
    Name='T-Maze EEG-fMRI Dataset',
    BIDSVersion='1.8.0',
    Authors=['Test Author'],
)
(bids_root / 'dataset_description.json').write_text(json.dumps(desc))

# Five mock participants, sub-01 through sub-05
mock_labels = [f'sub-{number:02d}' for number in range(1, 6)]

# participants.tsv: header row followed by one tab-separated row per subject
tsv_rows = ['participant_id\tage\tsex\n']
tsv_rows.extend(
    f'{label}\t{20 + number}\tM\n'
    for number, label in enumerate(mock_labels, start=1)
)
(bids_root / 'participants.tsv').write_text(''.join(tsv_rows))

# One eeg/ folder per subject, each holding an empty placeholder recording
for label in mock_labels:
    eeg_dir = bids_root / label / 'eeg'
    eeg_dir.mkdir(parents=True, exist_ok=True)
    (eeg_dir / f'{label}_task-tmaze_eeg.set').touch()

print(f"Created mock BIDS dataset at: {bids_root}")

In [None]:
# Validate BIDS structure
# The result is read as a mapping with the keys 'valid', 'n_subjects',
# 'issues' and 'warnings'.
validation = validate_bids(bids_root)

print("BIDS Validation Results:")
print(f"  Valid: {validation['valid']}")
print(f"  Subjects: {validation['n_subjects']}")
# Issues and warnings are only printed when non-empty
if validation['issues']:
    print(f"  Issues: {validation['issues']}")
if validation['warnings']:
    print(f"  Warnings: {validation['warnings']}")

In [None]:
# Load BIDS dataset
# validate=False presumably skips validation on load — confirm in pipelines.bids
dataset = BIDSDataset(bids_root, validate=False)

subjects = dataset.get_subjects()
print(f"\nSubjects in dataset: {subjects}")

# Get files for a subject
# NOTE(review): subjects[0] assumes at least one subject was found; an empty
# dataset would presumably raise here (IndexError if a list is returned).
subj = subjects[0]
eeg_files = dataset.get_eeg_files(subj, task='tmaze')
print(f"\nEEG files for {subj}: {eeg_files}")

In [None]:
# Export results to BIDS derivatives format
# Hard-coded example metrics keyed by subject label (not computed above)
analysis_results = {
    'sub-01': {'accuracy': 0.72, 'auc': 0.78},
    'sub-02': {'accuracy': 0.68, 'auc': 0.74},
    'sub-03': {'accuracy': 0.75, 'auc': 0.81}
}

# The mock dataset root is passed as output_dir; the returned path is printed
# below. NOTE(review): presumably written under <root>/derivatives — confirm
# in pipelines.bids.
deriv_dir = tmaze_to_bids(
    analysis_results,
    output_dir=bids_root,
    analysis_name='tmaze-classification',
    description='T-maze reward vs no-reward classification'
)

print(f"\nDerivatives exported to: {deriv_dir}")

## 6. Report Generation

In [None]:
# Prepare results for report
# Hard-coded illustrative numbers; they are not derived from the batch run
# or from the mock dataset created earlier in this notebook.
report_results = {
    'n_subjects': 20,
    'classification': {
        'accuracy': 0.72,
        'accuracy_std': 0.08,
        'auc': 0.78
    },
    # One entry per named test, each with statistic / p_value / effect_size
    'group_stats': {
        'Overall': {
            'statistic': 4.52,
            'p_value': 0.0002,
            'effect_size': 0.85
        },
        'Reward Effect': {
            'statistic': 3.21,
            'p_value': 0.003,
            'effect_size': 0.62
        }
    }
}

# Generate HTML report
# The returned value is printed as the report location
report_path = generate_report(
    report_results,
    output_path=Path('./tmaze_report.html')
)

print(f"Report generated: {report_path}")

In [None]:
# Generate statistics table in different formats
# Reuses the 'group_stats' mapping from report_results
stats = report_results['group_stats']

# The same stats are rendered twice, once per output format
print("\nMarkdown Table:")
print(statistics_table(stats, format='markdown'))

print("\n\nLaTeX Table:")
print(statistics_table(stats, format='latex'))

In [None]:
# Export for publication
pub_dir = Path('./publication_outputs')
exported = export_for_publication(report_results, pub_dir, format='latex')

# List every exported artefact next to the path it was written to
print("\nExported files:")
for label, location in exported.items():
    print(f"  {label}: {location}")

## Summary

In [None]:
# Cleanup mock files
import shutil

# Directories produced by the demo cells: remove each tree if it is present
for dir_name in ('./mock_bids', './checkpoints', './slurm_jobs', './publication_outputs'):
    target = Path(dir_name)
    if target.exists():
        shutil.rmtree(target)

# Stand-alone files produced by the demo cells
for file_name in ('./tmaze_report.html',):
    target = Path(file_name)
    if target.exists():
        target.unlink()

# Print a static recap of the notebook between two 60-character rules.
# The text is a fixed string; nothing in it is computed from earlier cells.
print("\n" + "="*60)
print("PIPELINE AUTOMATION SUMMARY")
print("="*60)
print("""
Demonstrated capabilities:

1. Batch Processing
   - BatchProcessor for multi-subject analysis
   - Parallel execution with joblib
   - Automatic checkpointing and resume
   - Error handling with retries

2. HPC Integration (Slurm)
   - SlurmSubmitter for job submission
   - Job array support for batch subjects
   - Template generation
   - Job monitoring

3. BIDS Format
   - Dataset validation
   - File querying (EEG, fMRI, events)
   - Derivatives export

4. Reporting
   - HTML report generation
   - Statistics tables (Markdown, LaTeX)
   - Publication export
""")
print("="*60)