# Tutorial: Batch Processing

This tutorial demonstrates how to classify multiple publications from a CSV file.

In [None]:
from openness_classifier.batch import classify_csv, BatchJob
from openness_classifier.config import load_config
from pathlib import Path
import pandas as pd

## 1. Basic Batch Processing

In [None]:
# Process a CSV file
input_path = Path('../../resources/abpoll-open-b71bd12/data/processed/articles_reviewed.csv')

def progress(processed, total):
    """Print a single-line, in-place progress indicator.

    Passed to ``classify_csv`` as ``progress_callback``.

    Args:
        processed: Count of items handled so far (presumably rows of the
            input CSV -- confirm against ``classify_csv``).
        total: Count of items in the whole batch.
    """
    # Guard against an empty batch: dividing by a zero total would raise
    # ZeroDivisionError and abort the run, so report 0% instead.
    percent = 100 * processed / total if total else 0
    # "\r" with end="" redraws the same console line on every call.
    print(f"\rProcessing: {processed}/{total} ({percent:.0f}%)", end="")

if input_path.exists():
    # Load a small subset for demo
    df = pd.read_csv(input_path).head(10)
    subset_path = Path('../../data/demo_subset.csv')
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target folder exists before writing the subset.
    subset_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(subset_path, index=False)

    # Process with progress callback
    job = classify_csv(
        input_path=subset_path,
        output_path=Path('../../data/demo_classified.csv'),
        progress_callback=progress,
    )

    # Leading newlines move past the in-place progress line before the summary.
    print(f"\n\n{job.summary()}")
else:
    print(f"Input file not found: {input_path}")

## 2. Review Results

In [None]:
# Read the classified CSV back in and print a compact report
result_path = Path('../../data/demo_classified.csv')
if result_path.exists():
    results = pd.read_csv(result_path)

    # Identifier column plus the classification/confidence pairs
    # for data and for code.
    report_columns = [
        'doi',
        'data_classification',
        'data_confidence',
        'code_classification',
        'code_confidence',
    ]
    print("Classification Results:")
    report_text = results[report_columns].to_string()
    print(report_text)

    # Number of rows carrying each data classification value
    print("\n\nData Classification Distribution:")
    data_label_counts = results['data_classification'].value_counts()
    print(data_label_counts)

## 3. Error Handling Options

The `error_handling` parameter controls behavior on failures:
- `'skip'`: Skip failed rows, continue processing
- `'fail'`: Stop on first error
- `'log'`: Log error and continue (default)

In [None]:
# Report the failures recorded on the batch job, if a job was created.
# `job` only exists when the processing cell found its input file, hence
# the membership check before touching it.
if 'job' in globals():
    if not job.error_log:
        print("No errors encountered!")
    else:
        print("Errors encountered:")
        # Each entry unpacks into an identifier and its error
        for pub_id, error in job.error_log:
            print(f"  {pub_id}: {error}")

## Summary

Key components:
- `classify_csv()`: Process a CSV file with multiple publications
- `BatchJob`: Track progress, statistics, and errors
- `progress_callback` (argument to `classify_csv()`): Monitor progress in real time

Output columns added:
- `data_classification`: Openness category for data
- `data_confidence`: Confidence score (0-1)
- `code_classification`: Openness category for code
- `code_confidence`: Confidence score (0-1)