## Synthetic Data Generator for Text Anonymization Model Training

This notebook generates synthetic training data for a text anonymization model. The generated data contains various types of documents (medical, banking, business, etc.) along with their anonymized counterparts.

### Requirements
- Python 3.10+
- Installed packages from `requirements.txt`
- `.env` file with OpenAI API key (OPENAI_API_KEY)

### Data Structure
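Generated data is saved in JSON format (`../data/synthetic_data.json`) and contains:
- Original text (`context`)
- Anonymized version (`anonymized_context`)
- Anonymization labels used (`used_labels`)
- Metadata (creation and last-update dates, total number of examples, number of examples per document type)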

In [None]:
# Importing required libraries

import json
from pathlib import Path
from datetime import datetime
from typing import List, Union, Literal
from enum import Enum

from dotenv import load_dotenv
from pydantic import BaseModel
from openai import AsyncOpenAI
import asyncio


# Load variables from the local `.env` file into the process environment.
# NOTE(review): the OpenAI client is created without an explicit key further
# down, so it presumably picks up OPENAI_API_KEY from here — confirm.
load_dotenv()


In [2]:
# Core functionalities for synthetic data generation

class DocumentType(str, Enum):
    """
    Categories of documents the generator can be asked to produce.

    The class mixes in ``str``, so every member is also a string equal to
    its lowercase value (e.g. ``DocumentType.MEDICAL == "medical"``).
    """

    # Documents, records and public posts
    MEDICAL = "medical"
    BANKING = "banking"
    BUSINESS = "business"
    RECRUITMENT = "recruitment"
    SOCIAL_MEDIA = "social_media"
    LEGAL = "legal"
    EDUCATIONAL = "educational"
    INSURANCE = "insurance"

    # Conversational formats (chats and email threads)
    CHAT_PERSONAL = "chat_personal"
    CHAT_BUSINESS = "chat_business"
    CHAT_SUPPORT = "chat_support"
    EMAIL_THREAD = "email_thread"

class FineTunedDataItem(BaseModel):
    """
    Single training data item containing original and anonymized text.
    
    Attributes:
        context: Original text with sensitive data
        anonymized_context: Text with sensitive data replaced by tags
        used_labels: String containing list of anonymization tags used
    """
    # NOTE(review): this model is nested in FineTunedData, which is passed as
    # `response_format` to the chat-completions parse call. pydantic normally
    # copies a model's docstring into the generated JSON schema as its
    # description, so rewording the docstring above may change what the API
    # receives — confirm before editing it.
    context: str  # generated text with the sensitive data left in place
    anonymized_context: str  # same text with sensitive data replaced by tags such as [NAME_1]
    used_labels: str  # tags used in anonymized_context, as one string (format is not validated here)
    
class FineTunedData(BaseModel):
    """Container for multiple training data items."""
    # Passed as `response_format` to the chat-completions parse call, so one
    # API response is parsed into this wrapper holding all generated examples.
    items: list[FineTunedDataItem]
    
def get_document_prompt(doc_type: DocumentType) -> str:
    """
    Look up the generation instructions for a document type.

    Args:
        doc_type: Document type whose instructions are requested

    Returns:
        The instruction text registered for ``doc_type``. Types without a
        dedicated entry in the table below receive a generic instruction.
    """
    # Used for every document type that has no entry of its own.
    fallback_prompt = "Generate examples with personal and sensitive information."

    instructions_by_type = {
        DocumentType.MEDICAL: """
            Generate medical records, patient cards, hospital discharge summaries, and medical test results.
            Include multiple participants like doctors, nurses, and patients.
        """,
        DocumentType.BANKING: """
            Generate bank statements, transfer confirmations, loan applications, and credit card documents.
            Make sure to include various transaction types and multiple account holders.
        """,
        DocumentType.BUSINESS: """
            Generate business correspondence, invoices, contracts, and company internal documents.
            Include various business contexts and multiple stakeholders.
        """,
        DocumentType.RECRUITMENT: """
            Generate CVs, job applications, recommendation letters, and employment contracts.
            Include different positions and multiple references.
        """,
        DocumentType.SOCIAL_MEDIA: """
            Generate social media posts, user profiles, and public comments.
            Include interactions between multiple users and various post types.
        """,
        DocumentType.CHAT_PERSONAL: """
            Generate personal chat conversations between friends or family members.
            Include:
            - Multiple participants in the conversation
            - Mix of formal and informal language
            - References to personal information, dates, and locations
            - Message timestamps
            - Sharing of contact information
            Format as a chat with multiple messages, using timestamps and sender names.
        """,
        DocumentType.CHAT_BUSINESS: """
            Generate business chat conversations between colleagues or business partners.
            Include:
            - Multiple participants discussing business matters
            - References to meetings, projects, and deadlines
            - Sharing of business contact information
            - Professional language and terms
            - Discussion of company-specific information
            Format as a business chat with timestamps and professional titles.
        """,
        DocumentType.CHAT_SUPPORT: """
            Generate customer support chat conversations.
            Include:
            - Customer service representative and customer interaction
            - Account verification process
            - Personal information sharing for verification
            - Problem description and resolution
            - Reference numbers and case IDs
            Format as a support chat with timestamps and roles.
        """,
        DocumentType.EMAIL_THREAD: """
            Generate email thread conversations with multiple participants.
            Include:
            - Email headers with addresses and timestamps
            - Multiple replies and forwards
            - Signature blocks with contact information
            - CC and BCC fields
            - References to attachments and previous emails
            Format as an email thread with proper headers and quotations.
        """,
    }

    if doc_type in instructions_by_type:
        return instructions_by_type[doc_type]
    return fallback_prompt

async def generate_training_data_async(
    document_type: DocumentType,
    num_examples: int,
    additional_labels: List[str] | None = None,
    temperature: float = 0.3
) -> FineTunedData:
    """
    Generate synthetic training data for text anonymization model using GPT API.
    
    This function asks the model for examples of documents containing sensitive data,
    along with their anonymized versions where sensitive data is replaced with tags.
    The response is parsed directly into a FineTunedData object.
    
    Args:
        document_type: Type of documents to generate (medical, banking etc.)
        num_examples: Number of examples to request in this single API call
        additional_labels: Optional list of additional anonymization tags to use;
            each is appended to the system prompt as "- [LABEL]"
        temperature: GPT API temperature parameter for controlling randomness
        
    Returns:
        FineTunedData object containing generated examples
        
    Raises:
        ValueError: If the response carries no parsed payload (for example
            because the model refused the request)
        
    Example:
        >>> data = await generate_training_data_async(
        ...     DocumentType.BANKING,
        ...     num_examples=5,
        ...     additional_labels=["CURRENCY", "TRANSACTION_ID"]
        ... )
    """
    
    base_system_message = """
    You are an assistant that creates examples for training a text anonymization model.
    Your task is to generate input-output pairs where:
    1. Input text (context) should contain real personal and sensitive data according to GDPR (like real names, emails, addresses etc.)
    2. Output text (anonymized_context) should be the same text but with sensitive data replaced with appropriate tags
    3. CRITICAL: Tag numbering MUST start from 1 in EACH new example - NEVER continue numbering from previous examples
    4. For chat and email threads, maintain the conversation flow and format while anonymizing personal data

    IMPORTANT RULES FOR TAG NUMBERING:
    - Each example is completely independent (each item in items list is independent)!
    - Tag numbering MUST start from 1 in each new example
    - NEVER continue numbering from previous examples
    - Each type of tag ([NAME], [EMAIL], etc.) starts counting from 1 in each new example

    Examples showing correct tag numbering:

    Example 1:
    Context: "Hello, my name is John Smith and my email is john.smith@gmail.com. I work with Mary Johnson (mary.j@company.com)."
    Anonymized context: "Hello, my name is [NAME_1] and my email is [EMAIL_1]. I work with [NAME_2] ([EMAIL_2])."

    Example 2 (notice numbering starts from 1 again):
    Context: "Hi, I'm Alice Brown and my colleague Bob White can be reached at bob.white@work.com"
    Anonymized context: "Hi, I'm [NAME_1] and my colleague [NAME_2] can be reached at [EMAIL_1]"

    Example 3 (again, numbering starts from 1):
    Context: "Contact our support: Tom Wilson (tom@support.com) or Jane Davis (jane@support.com)"
    Anonymized context: "Contact our support: [NAME_1] ([EMAIL_1]) or [NAME_2] ([EMAIL_2])"

    Supported GDPR tags for anonymization (use only in anonymized_context):
    - [NAME] - first and last names (use [NAME_1], [NAME_2] etc. within single example)
    - [EMAIL] - email addresses (use [EMAIL_1], [EMAIL_2] etc. within single example) 
    - [PHONE] - phone numbers (use [PHONE_1], [PHONE_2] etc. within single example)
    - [ADDRESS] - physical addresses (use [ADDRESS_1], [ADDRESS_2] etc. within single example)
    - [PESEL] - Polish national ID numbers
    - [NIP] - Polish tax identification numbers
    - [DATE] - birth dates and other personal dates (use [DATE_1], [DATE_2] etc. within single example)
    - [ACCOUNT] - bank account numbers (use [ACCOUNT_1], [ACCOUNT_2] etc. within single example)
    - [USERNAME] - usernames in chats and social media
    - [CURRENCY] - monetary amounts with currency 
    """
    
    # Extra tags are listed after the base tag list, one "- [LABEL]" per line.
    if additional_labels:
        base_system_message += "\nAdditional supported tags for anonymization:\n" + "\n".join([f"- [{label}]" for label in additional_labels])
    
    base_system_message += """
    Generated examples should be realistic and contain real sensitive data according to GDPR in the context field.
    The anonymized_context field should contain the same text but with all sensitive data replaced with appropriate tags.
    Make sure to use as many different types of sensitive data as possible to have equal distribution of tags in the dataset.
    You're not allowed to generate new tag types - use only the provided list.
    Remember: Each example is independent - you must start numbering tags from 1 in each new example.
    """
    
    document_specific_prompt = get_document_prompt(document_type)
    
    user_message = f"Generate {num_examples} realistic examples from {document_type.value} documents. {document_specific_prompt}"
    
    # A client is created per call; `async with` closes it (and its HTTP
    # connections) when the request is done instead of leaking one client
    # per generated batch.
    async with AsyncOpenAI() as client:
        completion = await client.beta.chat.completions.parse(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": base_system_message},
                {"role": "user", "content": user_message},
            ],
            response_format=FineTunedData,
            temperature=temperature
        )
    
    message = completion.choices[0].message
    if message.parsed is None:
        # Without a parsed payload there is nothing to return; fail here with
        # a clear message rather than handing `None` to the caller.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"No parsed data returned for {document_type.value} documents"
            + (f": {refusal}" if refusal else "")
        )
    
    return message.parsed

In [3]:
# Function to append new data to the synthetic data file

def _new_dataset_skeleton() -> dict:
    """Build the empty on-disk structure: file-level metadata plus an empty example list."""
    now = datetime.now().isoformat()
    return {
        "metadata": {
            "creation_date": now,
            "last_update": now,
            "total_examples": 0,
            "data_types": {}
        },
        "data": []
    }


def _load_dataset(file_path: Path) -> dict:
    """Load the dataset stored at file_path, or return a fresh skeleton if it is missing or unusable."""
    if not file_path.exists():
        return _new_dataset_skeleton()
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        loaded = None
    
    if isinstance(loaded, dict) and isinstance(loaded.get("data"), list):
        # Fill in any metadata keys that are missing so the update step
        # below never fails with a KeyError on an older or hand-edited file.
        defaults = _new_dataset_skeleton()["metadata"]
        defaults["total_examples"] = len(loaded["data"])
        metadata = loaded.get("metadata")
        if not isinstance(metadata, dict):
            metadata = loaded["metadata"] = {}
        for key, value in defaults.items():
            metadata.setdefault(key, value)
        return loaded
    
    # The file cannot be used. Move it aside instead of overwriting it, so
    # previously generated examples can still be recovered by hand.
    backup_path = file_path.with_name(
        f"{file_path.name}.corrupt-{datetime.now():%Y%m%d%H%M%S}"
    )
    file_path.replace(backup_path)
    print(f"Error in file {file_path}. Creating new file.")
    print(f"Previous contents kept at {backup_path}")
    return _new_dataset_skeleton()


def append_to_synthetic_data(
    new_data: Union[FineTunedData, List[FineTunedDataItem]],
    document_type: str | None = None,
    file_path: Union[str, Path] = "../data/synthetic_data.json",
):
    """
    Appends new data to the synthetic data JSON file, creating the file if it doesn't exist.
    
    The whole file is read, extended in memory and written back. The write goes
    to a temporary sibling file first and is then moved over the target, so an
    interrupted write does not leave a half-written dataset behind.
    
    Args:
        new_data: New data to add (FineTunedData or list of FineTunedDataItem;
            list entries that are not FineTunedDataItem are stored as-is)
        document_type: Document type (optional, used for the per-type counters
            in the metadata)
        file_path: Location of the dataset file; defaults to
            ../data/synthetic_data.json
    """
    file_path = Path(file_path)
    
    # Convert new_data to list of items if we received FineTunedData
    items_to_add = new_data.items if isinstance(new_data, FineTunedData) else new_data
    
    file_path.parent.mkdir(parents=True, exist_ok=True)
    existing_data = _load_dataset(file_path)
    
    # Update metadata
    metadata = existing_data["metadata"]
    metadata["last_update"] = datetime.now().isoformat()
    metadata["total_examples"] += len(items_to_add)
    
    if document_type:
        type_counts = metadata["data_types"]
        type_counts[document_type] = type_counts.get(document_type, 0) + len(items_to_add)
    
    # Add new data (pydantic items are converted to plain dicts for JSON)
    existing_data["data"].extend(
        item.model_dump() if isinstance(item, FineTunedDataItem) else item
        for item in items_to_add
    )
    
    # Save updated data: write a temporary file, then swap it into place
    tmp_path = file_path.with_name(file_path.name + ".tmp")
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=2)
    tmp_path.replace(file_path)
    
    print(f"Added {len(items_to_add)} new examples to {file_path}")
    print(f"Total number of examples: {metadata['total_examples']}")
    if document_type:
        print(f"Data type distribution: {json.dumps(metadata['data_types'], indent=2)}")


In [4]:
# Configuration for synthetic data generation.
# Maps each DocumentType to a tuple:
# - First value in tuple: Total number of examples to generate for that type
# - Second value in tuple: List of additional labels to include in generation
#   (passed on as `additional_labels` and listed in the system prompt)
# NOTE(review): some additional labels (e.g. CURRENCY, USERNAME) also appear in
# the base tag list of the system prompt, so they end up listed twice there.
doc_config = {
    DocumentType.MEDICAL: (550, ["DIAGNOSIS", "MEDICATION", "DOCTOR", "HOSPITAL"]),
    DocumentType.BANKING: (400, ["CURRENCY", "TRANSACTION_ID", "ACCOUNT_TYPE"]),
    DocumentType.BUSINESS: (400, ["COMPANY", "POSITION", "DEPARTMENT"]),
    DocumentType.RECRUITMENT: (300, ["SKILL", "EXPERIENCE", "POSITION"]),
    DocumentType.SOCIAL_MEDIA: (300, ["USERNAME", "MENTION"]),
    DocumentType.LEGAL: (300, ["CASE_NUMBER", "LAW", "COURT"]),
    DocumentType.EDUCATIONAL: (300, ["SCHOOL", "GRADE", "DEGREE"]),
    DocumentType.INSURANCE: (300, ["POLICY_NUMBER", "CLAIM_ID", "INSURER"]),
    DocumentType.CHAT_PERSONAL: (550, ["USERNAME", "CHAT_ID"]),
    DocumentType.CHAT_BUSINESS: (550, ["USERNAME", "COMPANY", "POSITION", "PROJECT"]),
    DocumentType.CHAT_SUPPORT: (500, ["USERNAME", "TICKET_ID", "ORDER_NUMBER"]),
    DocumentType.EMAIL_THREAD: (400, ["SUBJECT", "THREAD_ID", "SIGNATURE"])
}

async def generate_batch_async(doc_type: DocumentType, batch_size: int, additional_labels: List[str]):
    """
    Generate one batch of examples for a document type and store it.
    
    The batch is requested from the generation function and then appended to
    the synthetic data file. Any exception raised along the way is reported
    on stdout and turned into a ``False`` result instead of propagating.
    
    Args:
        doc_type (DocumentType): Type of document to generate
        batch_size (int): Number of examples to request for this batch
        additional_labels (List[str]): Extra anonymization labels to allow
        
    Returns:
        bool: True when the batch was generated and saved, False on any error
    """
    try:
        generated = await generate_training_data_async(
            document_type=doc_type,
            num_examples=batch_size,
            additional_labels=additional_labels,
            temperature=0.3,  # Lower temperature for more focused generation
        )
        append_to_synthetic_data(generated, document_type=doc_type.value)
    except Exception as error:
        print(f"Error generating data for {doc_type.value}: {error!s}")
        return False
    else:
        return True

async def generate_all_data_async(doc_config: dict):
    """
    Main function to generate all synthetic data asynchronously.
    Processes data in batches of 5 examples for better performance and error handling.
    
    Args:
        doc_config (dict): Configuration dictionary defining parameters for each document type
        
    Prints summary of successful and failed batch generations.
    """
    tasks = []
    for doc_type, (num_examples, additional_labels) in doc_config.items():
        print(f"Planning to generate {num_examples} examples for {doc_type.value}...")
        for batch_start in range(0, num_examples, 5):
            batch_size = min(5, num_examples - batch_start)
            task = asyncio.create_task(
                generate_batch_async(doc_type, batch_size, additional_labels)
            )
            tasks.append(task)
    
    results = await asyncio.gather(*tasks)
    successful = sum(1 for r in results if r)
    failed = len(results) - successful
    print(f"\nData generation completed:")
    print(f"Successful batches: {successful}")
    print(f"Failed batches: {failed}")


In [None]:
# Run the generation for every document type listed in doc_config.
# NOTE(review): top-level `await` presumably relies on the notebook's running
# event loop; a plain script would need asyncio.run(...) instead — confirm.
await generate_all_data_async(doc_config)