# Data Module

> Data loading, training examples, and sentence embeddings.

This module handles:
- Loading training data from articles_reviewed.csv
- Creating TrainingExample objects with ground truth labels
- Computing sentence embeddings for kNN example selection
- Train/test splitting for validation

In [None]:
#| default_exp data

In [None]:
#| export
from __future__ import annotations
import pandas as pd
import numpy as np
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional, List, Tuple, Dict, Any
from datetime import date
import logging

from sklearn.model_selection import train_test_split as sklearn_split

from openness_classifier.core import (
    OpennessCategory,
    ClassificationType,
    DataError
)

## Data Classes

### Publication

Represents a scholarly article with data and code availability statements.

In [None]:
#| export
@dataclass
class Publication:
    """A scholarly publication with data and code availability statements.

    A statement counts as present only when it is a string containing
    non-whitespace text other than the placeholder ``'nothing'``. The
    placeholder comparison is case-insensitive and ignores surrounding
    whitespace.

    Attributes:
        id: Unique identifier (DOI, PMID, or internal ID)
        data_statement: Data availability statement text (None if missing)
        code_statement: Code availability statement text (None if missing)
        metadata: Additional publication metadata
    """
    id: str
    data_statement: Optional[str] = None
    code_statement: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    @staticmethod
    def _has_content(statement: Optional[str]) -> bool:
        """Return True if ``statement`` holds real text (not blank/'nothing')."""
        # Non-string values (None, or NaN coming from a pandas row) are
        # treated as "missing" instead of raising AttributeError on .strip().
        if not isinstance(statement, str):
            return False
        # Strip *before* comparing so ' Nothing ' is recognised as the
        # placeholder as well.
        text = statement.strip().lower()
        return bool(text) and text != 'nothing'

    def has_data_statement(self) -> bool:
        """Check if publication has a valid data statement."""
        return self._has_content(self.data_statement)

    def has_code_statement(self) -> bool:
        """Check if publication has a valid code statement."""
        return self._has_content(self.code_statement)

### TrainingExample

A manually coded example used for few-shot learning.

In [None]:
#| export
@dataclass
class TrainingExample:
    """A manually coded training example for few-shot learning.

    Attributes:
        id: Unique identifier
        statement_text: Data or code availability statement
        ground_truth: Human-coded openness classification
        statement_type: Whether this is a data or code statement
        source: Source of the example (e.g., 'articles_reviewed.csv')
        embedding: Sentence embedding for kNN selection (None until computed).
            Excluded from equality comparison.
    """
    id: str
    statement_text: str
    ground_truth: OpennessCategory
    statement_type: ClassificationType
    source: str = 'articles_reviewed.csv'
    # compare=False: the dataclass-generated __eq__ compares fields as a
    # tuple, and `==` on two multi-element numpy arrays yields an array whose
    # truth value is ambiguous, so comparing two examples would raise
    # ValueError once embeddings are attached.
    embedding: Optional[np.ndarray] = field(default=None, compare=False)

    def to_prompt_example(self) -> str:
        """Format as a few-shot prompt example.

        Returns:
            Two lines: the statement text, then the ground-truth label value.
        """
        return (
            f"Statement: {self.statement_text}\n"
            f"Classification: {self.ground_truth.value}"
        )

## Loading Training Data

In [None]:
#| export
def load_training_data(
    path: str | Path,
    filter_dropped: bool = True,
    filter_climate: bool = False,
    min_statement_length: int = 10
) -> Tuple[List[TrainingExample], List[TrainingExample]]:
    """Load training examples from articles_reviewed.csv.

    Returns separate lists for data and code training examples. A row
    contributes a data (resp. code) example only when both its statement and
    its label pass validation and the label can be parsed into an
    ``OpennessCategory``; rows whose label cannot be parsed are skipped with
    a warning.

    Args:
        path: Path to articles_reviewed.csv
        filter_dropped: Exclude rows with dropped=1 (ignored if the column
            is absent)
        filter_climate: Only include rows with is_climate=1 (ignored if the
            column is absent)
        min_statement_length: Minimum statement length to include

    Returns:
        Tuple of (data_examples, code_examples)

    Raises:
        DataError: If file not found, unreadable, or required columns missing
    """
    path = Path(path)
    if not path.exists():
        raise DataError(f"Training data file not found: {path}")

    # Load CSV with latin-1 encoding to handle non-UTF-8 characters
    try:
        df = pd.read_csv(path, encoding='latin-1')
    except Exception as e:
        # Chain the cause so the underlying parser/IO error stays visible.
        raise DataError(f"Error reading training data: {e}") from e

    # Validate required columns
    required_cols = ['data_statement', 'code_statement', 'data_open', 'code_open']
    missing_cols = [c for c in required_cols if c not in df.columns]
    if missing_cols:
        raise DataError(f"Missing required columns: {missing_cols}")

    # Apply filters
    if filter_dropped and 'dropped' in df.columns:
        df = df[df['dropped'] != 1]

    if filter_climate and 'is_climate' in df.columns:
        df = df[df['is_climate'] == 1]

    data_examples: List[TrainingExample] = []
    code_examples: List[TrainingExample] = []

    # One entry per statement kind:
    # (statement column, label column, id suffix / log tag, type, destination)
    targets = (
        ('data_statement', 'data_open', 'data', ClassificationType.DATA, data_examples),
        ('code_statement', 'code_open', 'code', ClassificationType.CODE, code_examples),
    )

    for idx, row in df.iterrows():
        # Fall back to the row index when the DOI column is absent *or* the
        # cell is empty/NaN; otherwise every DOI-less row would share the
        # id prefix "nan".
        doi = row.get('doi')
        has_doi = doi is not None and not pd.isna(doi) and str(doi).strip() != ''
        pub_id = doi if has_doi else str(idx)

        for stmt_col, label_col, kind, stmt_type, bucket in targets:
            stmt = row.get(stmt_col, '')
            label = row.get(label_col, '')

            if not (_is_valid_statement(stmt, min_statement_length)
                    and _is_valid_label(label)):
                continue

            try:
                ground_truth = OpennessCategory.from_string(label)
            except ValueError as e:
                logging.warning(f"Skipping row {idx} {kind}: {e}")
                continue

            bucket.append(TrainingExample(
                id=f"{pub_id}_{kind}",
                statement_text=str(stmt).strip(),
                ground_truth=ground_truth,
                statement_type=stmt_type,
            ))

    logging.info(f"Loaded {len(data_examples)} data examples and {len(code_examples)} code examples")

    return data_examples, code_examples


def _is_valid_statement(stmt: Any, min_length: int) -> bool:
    """Check if statement is valid for training."""
    if pd.isna(stmt) or stmt is None:
        return False
    stmt_str = str(stmt).strip().lower()
    if stmt_str == 'nothing' or stmt_str == '' or len(stmt_str) < min_length:
        return False
    return True


def _is_valid_label(label: Any) -> bool:
    """Check if label is valid for training."""
    if pd.isna(label) or label is None:
        return False
    label_str = str(label).strip().lower()
    if label_str == '' or label_str == 'nothing':
        return False
    return True

## Sentence Embeddings

Sentence embeddings are computed with sentence-transformers and used for kNN example selection.

In [None]:
#| export
class EmbeddingModel:
    """Wrapper for sentence-transformers embedding model.

    Computes sentence embeddings for semantic similarity-based
    kNN example selection.

    Example:
        >>> model = EmbeddingModel('all-MiniLM-L6-v2')
        >>> embedding = model.encode("Data available at Zenodo")
    """

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the named sentence-transformers model.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``
        """
        # Imported here rather than at module level, so importing this module
        # does not itself require sentence-transformers.
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer(model_name)
        self.model_name = model_name

    def encode(self, text: str | List[str]) -> np.ndarray:
        """Encode text(s) to embedding vector(s).

        Args:
            text: Single string or list of strings

        Returns:
            Embedding vector(s) as numpy array
        """
        return self.model.encode(text, convert_to_numpy=True)

    def compute_similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
        """Compute cosine similarity between two embeddings.

        Args:
            embedding1: First embedding vector
            embedding2: Second embedding vector

        Returns:
            Cosine similarity as a Python float. If either vector has zero
            norm the similarity is undefined; 0.0 is returned instead of NaN
            so the result can still be ranked.
        """
        norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
        if norm_product == 0:
            # Avoid 0/0 -> NaN (and the accompanying numpy RuntimeWarning).
            return 0.0
        return float(np.dot(embedding1, embedding2) / norm_product)

In [None]:
#| export
def compute_embeddings(
    examples: List[TrainingExample],
    model: EmbeddingModel
) -> List[TrainingExample]:
    """Attach an embedding to every training example.

    The examples are mutated in place: each one's ``embedding`` attribute is
    set from a single batched ``model.encode`` call over all statement texts.
    An empty list is returned untouched without calling the model.

    Args:
        examples: List of training examples
        model: EmbeddingModel to use

    Returns:
        The same list object that was passed in
    """
    if examples:
        # One batched call instead of one call per example.
        vectors = model.encode([item.statement_text for item in examples])
        for item, vector in zip(examples, vectors):
            item.embedding = vector

    return examples

## Train/Test Split

In [None]:
#| export
def train_test_split(
    examples: List[TrainingExample],
    test_size: float = 0.2,
    stratify: bool = True,
    random_state: int = 42
) -> Tuple[List[TrainingExample], List[TrainingExample]]:
    """Split training examples into train and test sets.

    With fewer than 5 examples no split is performed and the input list is
    returned as the training set with an empty test set. Stratification is
    dropped (with a warning) when a class has fewer than 2 samples, or when
    the stratified split itself is rejected (e.g. the test partition would be
    smaller than the number of classes).

    Args:
        examples: List of training examples
        test_size: Fraction for test set (default: 0.2)
        stratify: Whether to stratify by ground_truth label
        random_state: Random seed for reproducibility

    Returns:
        Tuple of (train_examples, test_examples)
    """
    if len(examples) < 5:
        logging.warning(f"Very few examples ({len(examples)}), not splitting")
        return examples, []

    labels = [ex.ground_truth.value for ex in examples] if stratify else None

    # Check if stratification is possible (need at least 2 samples per class)
    if labels is not None:
        from collections import Counter
        label_counts = Counter(labels)
        if min(label_counts.values()) < 2:
            logging.warning("Some classes have <2 samples, disabling stratification")
            labels = None

    try:
        train, test = sklearn_split(
            examples,
            test_size=test_size,
            stratify=labels,
            random_state=random_state
        )
    except ValueError as err:
        if labels is None:
            # Not a stratification problem; nothing to fall back to.
            raise
        # A stratified split also needs the train and test partitions to hold
        # at least one sample per class; small multi-class sets fail that
        # even when every class has >= 2 samples. Retry unstratified.
        logging.warning(
            f"Stratified split failed ({err}), falling back to unstratified split"
        )
        train, test = sklearn_split(
            examples,
            test_size=test_size,
            stratify=None,
            random_state=random_state
        )

    return list(train), list(test)

## Data Validation

In [None]:
#| export
def validate_training_data(
    examples: List[TrainingExample],
    min_class_fraction: float = 0.1
) -> Dict[str, Any]:
    """Summarise class balance and statement lengths of training examples.

    Args:
        examples: Training examples to validate
        min_class_fraction: A class whose share of the examples is strictly
            below this fraction produces a warning

    Returns:
        For an empty input, ``{'valid': False, 'error': ...}``. Otherwise a
        dictionary with the example count, per-class counts and fractions,
        the list of warnings, statement-length statistics, and ``valid``
        set to True only when no warning was produced.
    """
    from collections import Counter

    if not examples:
        return {'valid': False, 'error': 'No examples provided'}

    total = len(examples)
    distribution = Counter(ex.ground_truth.value for ex in examples)
    fractions = {label: count / total for label, count in distribution.items()}

    # One message per under-represented class.
    warnings = [
        f"Class '{label}' has only {distribution[label]} examples ({share:.1%}), "
        f"below threshold of {min_class_fraction:.1%}"
        for label, share in fractions.items()
        if share < min_class_fraction
    ]

    lengths = [len(ex.statement_text) for ex in examples]
    length_stats = {
        'min': min(lengths),
        'max': max(lengths),
        'mean': np.mean(lengths),
        'median': np.median(lengths),
    }

    return {
        'valid': not warnings,
        'total_examples': total,
        'class_distribution': dict(distribution),
        'class_fractions': fractions,
        'warnings': warnings,
        'statement_length': length_stats,
    }

In [None]:
#| export
def reload_training_data(
    path: str | Path,
    embedding_model: Optional[EmbeddingModel] = None,
    **kwargs
) -> Tuple[List[TrainingExample], List[TrainingExample]]:
    """Load the training CSV again and compute fresh embeddings.

    Intended for use after the training CSV has been edited.

    Args:
        path: Path to training data CSV
        embedding_model: Model for computing embeddings; a default
            ``EmbeddingModel()`` is constructed when None
        **kwargs: Additional arguments passed to load_training_data

    Returns:
        Tuple of (data_examples, code_examples) with embeddings
    """
    data_examples, code_examples = load_training_data(path, **kwargs)

    # Build the default model only after loading succeeded.
    model = EmbeddingModel() if embedding_model is None else embedding_model

    for batch in (data_examples, code_examples):
        compute_embeddings(batch, model)

    n_data, n_code = len(data_examples), len(code_examples)
    logging.info(f"Reloaded {n_data} data and {n_code} code examples with embeddings")

    return data_examples, code_examples

In [None]:
# Smoke test: load the reviewed-articles CSV if it can be found
import os
from pathlib import Path

# Try the path relative to the working directory first, then the
# absolute location under the repo root.
_candidates = [
    Path('resources/abpoll-open-b71bd12/data/processed/articles_reviewed.csv'),
    Path('/home/user/open_sesame/resources/abpoll-open-b71bd12/data/processed/articles_reviewed.csv'),
]
test_path = next((p for p in _candidates if p.exists()), _candidates[-1])

if not test_path.exists():
    print(f"Test file not found at {test_path}")
else:
    data_ex, code_ex = load_training_data(test_path)
    print(f"Loaded {len(data_ex)} data examples and {len(code_ex)} code examples")

    # Show class distribution for the data examples, plus any balance warnings
    if data_ex:
        validation = validate_training_data(data_ex)
        print(f"Data class distribution: {validation['class_distribution']}")
        if validation['warnings']:
            print(f"Warnings: {validation['warnings']}")

In [None]:
#| hide
# Run nbdev's export step for this notebook.
# NOTE(review): presumably this writes the `#| export` cells to the module
# named by `#| default_exp data` — confirm against the nbdev docs.
import nbdev; nbdev.nbdev_export()