# Module 5

In [None]:
import os
import json
import re
from spacy.tokens import DocBin

# Assuming this script is in the same root directory as your project_data folder
# Import directory paths from your main processor
from e_inference import TOKEN_DIR, CORRECTIONS_DIR, DATA_DIR

# Define where to save the final training data
SPACY_TRAINING_DIR = os.path.join(DATA_DIR, 'spacy_training')
os.makedirs(SPACY_TRAINING_DIR, exist_ok=True)


def find_token_sequence(tokens, target_text):
    """
    Searches for a sequence of tokens that perfectly matches the target text.
    Handles cases where the target text is split across multiple tokens.
    
    Returns:
        A list of indices of the matching tokens, or None if not found.
    """
    target_words = target_text.strip().split()
    if not target_words:
        return None

    all_token_texts = [tok['text'] for tok in tokens]
    
    for i in range(len(all_token_texts) - len(target_words) + 1):
        # Create a window of tokens to check
        window = all_token_texts[i : i + len(target_words)]
        
        # Check if the sequence of tokens in the window matches the target words
        if window == target_words:
            return list(range(i, i + len(target_words))) # Return indices
            
    return None


def create_training_data(limit=None):
    """
    Processes corrected JSON files and original tokens to create a labeled dataset
    in the BIO format, then converts it to spaCy's DocBin format. Can set limit to 80%
    """
    training_data = []
    
    # Get a list of corrected files(ground truth)
    corrected_files = [f for f in os.listdir(CORRECTIONS_DIR) if f.endswith('_corrected.json')]
    if limit:
        corrected_files = corrected_files[:limit]
        
    print(f"Found {len(corrected_files)} corrected files to process.")

    for filename in corrected_files:
        # Construct paths for corrected data and original tokens
        base_name = filename.replace('_corrected.json', '')
        corrected_json_path = os.path.join(CORRECTIONS_DIR, filename)
        token_json_path = os.path.join(TOKEN_DIR, f"{base_name}_tokens.json")

        if not os.path.exists(token_json_path):
            print(f"WARNING: Token file not found for {filename}. Skipping.")
            continue

        with open(corrected_json_path, 'r', encoding='utf-8') as f:
            corrected_data = json.load(f)
        with open(token_json_path, 'r', encoding='utf-8') as f:
            tokens = json.load(f)

        # Initialize labels for all tokens as 'O' (Outside)
        token_labels = ['O'] * len(tokens)

        # --- 1. Tag Patient Info ---
        for key, value in corrected_data.get('patient_info', {}).items():
            # Create a simple, robust label from the key
            label = key.upper().replace(' ', '_').replace('/', '_')
            
            indices = find_token_sequence(tokens, value)
            if indices:
                # Apply BIO tagging
                token_labels[indices[0]] = f"B-{label}"
                for i in indices[1:]:
                    token_labels[i] = f"I-{label}"

        # --- 2. Tag Lab Results ---
        for result in corrected_data.get('lab_results', []):
            for key, cell_data in result.items():
                # The value might be a simple string or a dict {'value': '...', 'confidence': ...}
                value = cell_data if isinstance(cell_data, str) else cell_data.get('value', '')
                if not value:
                    continue
                
                label = key.upper().replace(' ', '_').replace('/', '_')
                
                indices = find_token_sequence(tokens, value)
                if indices:
                    token_labels[indices[0]] = f"B-{label}"
                    for i in indices[1:]:
                        token_labels[i] = f"I-{label}"

        # --- 3. Convert labeled tokens to spaCy's training format ---
        # We'll treat the entire document as one large text block for simplicity
        full_text = " ".join([tok['text'] for tok in tokens])
        entities = []
        
        current_char_pos = 0
        for i, token in enumerate(tokens):
            label = token_labels[i]
            if label != 'O':
                start_char = current_char_pos
                end_char = start_char + len(token['text'])
                # The BIO prefix (B- or I-) is only for our internal logic.
                # spaCy just needs the core label (e.g., "PATIENT_NAME").
                clean_label = label[2:] 
                entities.append((start_char, end_char, clean_label))
            
            # Update position for the next token (text + one space)
            current_char_pos += len(token['text']) + 1
        
        if entities:
            training_data.append((full_text, {"entities": entities}))

    print(f"\nSuccessfully processed {len(training_data)} documents.")
    return training_data


if __name__ == '__main__':
    # Generate the training data in spaCy's format
    spacy_formatted_data = create_training_data()

    # Save the data to a JSON file for inspection (optional, but recommended)
    output_json_path = os.path.join(SPACY_TRAINING_DIR, 'spacy_training_data.json')
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(spacy_formatted_data, f, indent=4)
        
    print(f"\n✅ Training data prepared and saved for inspection at: {output_json_path}")
    print("\nNext step is to use this data to train a spaCy model.")

Setup complete. Directories created and paths defined.
 Trained spaCy model loaded successfully from models/model-best.
Found 9 corrected files to process.

Successfully processed 9 documents.

✅ Training data prepared and saved for inspection at: project_data\spacy_training\spacy_training_data.json

Next step is to use this data to train a spaCy model.


In [None]:
# ! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [3]:
import json
import random
from pathlib import Path
import spacy
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

# Import the path to the prepared data from the previous script
# from prepare_training_data import SPACY_TRAINING_DIR

def convert_to_spacy_format():
    """
    Loads the JSON-formatted training data, splits it into training and validation sets,
    and saves them in spaCy's .spacy binary format.
    """
    input_json_path = Path(SPACY_TRAINING_DIR) / 'spacy_training_data.json'
    output_dir = Path(SPACY_TRAINING_DIR)
    
    # Load the prepared data
    with open(input_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Split data into training (80%) and development (20%) sets
    train_data, dev_data = train_test_split(data, test_size=0.2, random_state=42)
    
    print(f"Data split: {len(train_data)} training examples, {len(dev_data)} development examples.")

    # Create a blank English model for tokenization
    nlp = spacy.blank("en")

    # Process and save the training data
    db_train = DocBin()
    for text, annotations in train_data:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annotations.get("entities"):
            span = doc.char_span(start, end, label=label)
            if span is None:
                print(f"Skipping entity: Span couldn't be formed for '{text[start:end]}' in '{text}'")
            else:
                ents.append(span)
        doc.ents = ents
        db_train.add(doc)
    db_train.to_disk(output_dir / "train.spacy")
    print(f" Created train.spacy file at {output_dir}")

    # Process and save the development (validation) data
    db_dev = DocBin()
    for text, annotations in dev_data:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annotations.get("entities"):
            span = doc.char_span(start, end, label=label)
            if span is None:
                print(f"Skipping entity: Span couldn't be formed for '{text[start:end]}' in '{text}'")
            else:
                ents.append(span)
        doc.ents = ents
        db_dev.add(doc)
    db_dev.to_disk(output_dir / "dev.spacy")
    print(f"✅ Created dev.spacy file at {output_dir}")

if __name__ == '__main__':
    convert_to_spacy_format()

Data split: 7 training examples, 2 development examples.
 Created train.spacy file at project_data\spacy_training
✅ Created dev.spacy file at project_data\spacy_training


In [4]:
! python -m spacy train config.cfg --output ./models --paths.train ./project_data/spacy_training/train.spacy --paths.dev ./project_data/spacy_training/dev.spacy

[38;5;4mℹ Saving to output directory: models[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    254.87    0.00    0.00    0.00    0.00
 28     200       4429.25   7441.21   54.84   85.00   40.48    0.55
 57     400        109.73    146.33   60.32   90.48   45.24    0.60
 85     600         19.81     12.03   62.02   88.89   47.62    0.62
114     800         48.07     13.07   62.02   88.89   47.62    0.62
142    1000         27.96      7.07   60.00   84.78   46.43    0.60
171    1200        181.06     53.97   57.14   85.71   42.86    0.57
200    1400        228.70     48.79   50.79   76.19   38.10    0.51
228    1600        337.26     55.99   55.81   80.00   42.86    0.56
257    1800          8.22      2.18   62.02   