In [None]:
pip install --upgrade datasets

# Load Data

In [4]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification, pipeline
import numpy as np
import json
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Load only english texts
dataset = load_dataset("ai4privacy/pii-masking-400k")
english_dataset = dataset.filter(lambda x: x['language'] == 'en')

device = 0 if torch.cuda.is_available() else -1
print("device", device)

device 0


In [5]:
print("\nFirst lines of train:")
print(english_dataset['train'][:3])


First lines of train:
{'source_text': ['<p>My child faozzsd379223 (DOB: May/58) will undergo treatment with Dr. faozzsd379223, office at Hill Road. Our ZIP code is 28170-6392. Consult policy M.UE.227995. Contact number: 0070.606.322.6244. Handle transactions with 6225427220412963. Queries? Email: faozzsd379223@outlook.com.</p>', 'Guardians:*BF6* and *BF6* grant permission for their child *BF6*, born on *1960-08-01T00:00:00*, to participate in the Early Intervention Program at *Bicester Bucknell* University. The programme leader, Dr. *BF6* can be contacted at *52siddharta@aol.com* or *536373370485280*.', 'We, *bahara.cathers19* and *bahara.cathers19* reside at *358*, *Totnes*, *United Kingdom*, consent for our child *bahara.cathers19*, born on *August/72*, to participate. Please reach us at *18C@gmail.com* or *888517851168245*.'], 'locale': ['US', 'GB', 'GB'], 'language': ['en', 'en', 'en'], 'split': ['train', 'train', 'train'], 'privacy_mask': [[{'label': 'USERNAME', 'start': 12, 'end

In [6]:
df = pd.DataFrame(english_dataset['train'][:5])
print(df)

                                         source_text locale language  split  \
0  <p>My child faozzsd379223 (DOB: May/58) will u...     US       en  train   
1  Guardians:*BF6* and *BF6* grant permission for...     GB       en  train   
2  We, *bahara.cathers19* and *bahara.cathers19* ...     GB       en  train   
3  Student: Blagojka van der Boog\nDOB: 8th Janua...     US       en  train   
4  Child: Anna-Louise Dolderer\nDate of Birth: 05...     US       en  train   

                                        privacy_mask     uid  \
0  [{'label': 'USERNAME', 'start': 12, 'end': 25,...  302521   
1  [{'label': 'USERNAME', 'start': 11, 'end': 14,...  120409   
2  [{'label': 'USERNAME', 'start': 5, 'end': 21, ...  120411   
3  [{'label': 'GIVENNAME', 'start': 9, 'end': 17,...  128429   
4  [{'label': 'GIVENNAME', 'start': 7, 'end': 18,...  128431   

                                         masked_text  \
0  <p>My child [USERNAME_2] (DOB: [DATEOFBIRTH_1]...   
1  Guardians:*[USERNAME_4]* 

In [7]:
for column in english_dataset['train'].column_names:
    first_value = english_dataset['train'][0][column]
    print(f"Column: {column}, Type: {type(first_value)}")

Column: source_text, Type: <class 'str'>
Column: locale, Type: <class 'str'>
Column: language, Type: <class 'str'>
Column: split, Type: <class 'str'>
Column: privacy_mask, Type: <class 'list'>
Column: uid, Type: <class 'int'>
Column: masked_text, Type: <class 'str'>
Column: mbert_tokens, Type: <class 'list'>
Column: mbert_token_classes, Type: <class 'list'>


# Utils

In [8]:
def parse_privacy_masks(privacy_masks):
    """
    Parse privacy masks from various input formats into a consistent list
    """
    # If input is a string, try parsing as JSON or using eval
    if isinstance(privacy_masks, str):
        try:
            # parse as JSON, replacing single quotes with double quotes
            return json.loads(privacy_masks.replace("'", '"'))
        except:
            try:
                return eval(privacy_masks)
            except:
                # Both parsing methods fail
                return []
    
    # If input is already a list, no parsing needed
    elif isinstance(privacy_masks, list):
        return privacy_masks
    return []

def extract_labels(privacy_masks):
    """
    Extract unique labels from privacy masks
    """
    masks = parse_privacy_masks(privacy_masks)
    return set(mask['label'] for mask in masks)

def prepare_model_and_tokenizer(dataset):
    """
    Prepare the model and tokenizer for named entity recognition
    """
    # Collect all unique labels from the training dataset
    all_labels = set()
    for privacy_mask in dataset['train']['privacy_mask']:
        all_labels.update(extract_labels(privacy_mask))
    
    # Create label list with BIO (Begin, Inside, Outside)
    # 'O' represents tokens not part of any named entity
    # 'B-' prefix marks the beginning of an entity
    # 'I-' prefix marks inside/continuation of an entity
    label_list = ['O'] + [f'B-{label}' for label in all_labels] + [f'I-{label}' for label in all_labels]
    
    # Create bidirectional label-id mappings
    label2id = {label: idx for idx, label in enumerate(label_list)}
    id2label = {idx: label for label, idx in label2id.items()}
    
    # Load pre-trained BERT model and tokenizer
    model_checkpoint = "bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )
    
    return tokenizer, model, label_list, label2id

def tokenize_and_align_labels(examples, tokenizer, label2id):
    """
    Tokenize input text and align privacy mask labels with tokenized inputs
    """
    # Tokenize input text
    tokenized_inputs = tokenizer(
        examples['source_text'],
        truncation=True,
        padding=True,
        is_split_into_words=False,
        return_offsets_mapping=True
    )
    
    labels = []
    # Process batch
    for i, source_text in enumerate(examples['source_text']):
        # Get token offsets and word ids for the current example
        offsets = tokenized_inputs['offset_mapping'][i]
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        
        # Initialize labels with 'O' (Outside) (BIO)
        label = np.full(len(offsets), label2id['O'], dtype=int)
        
        # Process privacy masks
        privacy_masks = parse_privacy_masks(examples['privacy_mask'][i])
        
        # Assign labels based on token offsets and privacy masks
        for mask in privacy_masks:
            start, end, label_type = mask['start'], mask['end'], mask['label']
            
            # Check each token's alignment with privacy mask
            for j, (token_start, token_end) in enumerate(offsets):
                if word_ids[j] is None:
                    continue
                
                # Check if token is within the privacy mask span
                # First token gets 'B-' (Begin) (BIO) 
                # the following tokens get 'I-' (Inside) (BIO)
                if token_start >= start and token_end <= end:
                    if token_start == start:
                        label[j] = label2id[f'B-{label_type}']
                    else:
                        label[j] = label2id[f'I-{label_type}']
        
        labels.append(label)
    
    # Remove offset mapping and add labels to tokenized inputs
    tokenized_inputs.pop("offset_mapping")
    tokenized_inputs['labels'] = labels
    
    return tokenized_inputs

def compute_metrics(pred):
    """
    Compute evaluation metrics for model predictions.
    """
    # Extract true labels and model predictions
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Create a mask to ignore special tokens
    mask = labels != -100
    
    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels[mask], preds[mask], average='weighted'
    )
    acc = accuracy_score(labels[mask], preds[mask])
    
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Train Model

In [9]:
tokenizer, model, label_list, label2id = prepare_model_and_tokenizer(english_dataset)

tokenized_datasets = english_dataset.map(
    lambda examples: tokenize_and_align_labels(examples, tokenizer, label2id),
    batched=True,
    remove_columns=english_dataset['train'].column_names
)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # Desactivate WandB
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/68275 [00:00<?, ? examples/s]

Map:   0%|          | 0/17046 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0102,0.011441,0.996053,0.9963,0.996053,0.996132
2,0.0082,0.009346,0.996748,0.996879,0.996748,0.996782
3,0.0056,0.008927,0.997203,0.997203,0.997203,0.997195


TrainOutput(global_step=25605, training_loss=0.012590204792712273, metrics={'train_runtime': 7559.2616, 'train_samples_per_second': 27.096, 'train_steps_per_second': 3.387, 'total_flos': 3.087168124996782e+16, 'train_loss': 0.012590204792712273, 'epoch': 3.0})

# Save and Reload Model

In [10]:
model_path = "./saved_model"

def save_trained_model(model_path):
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    print("Model Saved")

save_trained_model(model_path)

Model Saved


In [11]:
def load_trained_model(model_path="./saved_model"):
    """
    Load the trained model from the result directory
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    print("Model reloaded")
    
    return tokenizer, model

# Test Model

In [12]:
def create_pipeline(tokenizer, model):
    """
    Create a named entity recognition (NER) pipeline
    """
    return pipeline(
        "ner", 
        model=model, 
        tokenizer=tokenizer, 
        aggregation_strategy="simple"
    )

def test_pii_detection(ner_pipeline, masked_text):
    """
    Detect and mask (PII) in a text
    """
    # Run the NER pipeline
    pipeline = ner_pipeline(masked_text)
    
    # Sort the results by the start position to avoid offset issues
    sorted_results = sorted(pipeline, key=lambda x: x['start'], reverse=True)
    
    # Mask each detected entity
    for entity in sorted_results:
        start, end = entity['start'], entity['end']
        label = entity['entity_group']
        
        masked_text = masked_text[:start] + f"[{label}]" + masked_text[end:]
    
    return masked_text, pipeline

def main():
    test_texts = [
        "My name is John Doe and my email is john.doe@example.com. My phone number is 123-456-7890.",
        "I live at 123 Main Street, Anytown, USA 12345. My credit card is 4111-1111-1111-1111.",
        "Contact Dr. Smith at his office: 0070.606.322.6244 or email: doctor@hospital.org",
        "John Doe lives in Paris, and his phone number is 123-456-7890.",
        "Alice lives in New York. Her email is alice@example.com."
    ]
    
    # Load model and create pipeline
    tokenizer, model = load_trained_model()
    ner_pipeline = create_pipeline(tokenizer, model)
    
    # Tester chaque texte
    for text in test_texts:
        print("\nOriginal text:\n", text)
        
        masked_text, entities = test_pii_detection(ner_pipeline, text)
        
        print("\nMasked text:\n", masked_text)
        
        print("\nPII detected:")
        for entity in entities:
            print(f"- {entity['word']}: {entity['entity_group']} (confidence: {entity['score']:.2f})")

if __name__ == "__main__":
    main()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Model reloaded

Original text:
 My name is John Doe and my email is john.doe@example.com. My phone number is 123-456-7890.

Masked text:
 My name is John Doe and my email is [EMAIL]. My phone number is [TELEPHONENUM].

PII detected:
- john. doe @ example. com: EMAIL (confidence: 0.99)
- 123 - 456 - 7890: TELEPHONENUM (confidence: 1.00)

Original text:
 I live at 123 Main Street, Anytown, USA 12345. My credit card is 4111-1111-1111-1111.

Masked text:
 I live at [BUILDINGNUM] [STREET], [CITY], USA [ZIPCODE]. My credit card is [CREDITCARDNUMBER].

PII detected:
- 123: BUILDINGNUM (confidence: 1.00)
- Main Street: STREET (confidence: 0.79)
- Anytown: CITY (confidence: 1.00)
- 12345: ZIPCODE (confidence: 0.66)
- 4111 - 1111 - 1111 - 1111: CREDITCARDNUMBER (confidence: 0.99)

Original text:
 Contact Dr. Smith at his office: 0070.606.322.6244 or email: doctor@hospital.org

Masked text:
 Contact Dr. Smith at his office: [TELEPHONENUM] or email: doctor@hospital.org

PII detected:
- 0070. 606. 

# Save Model In Zip

In [13]:
ls "./saved_model"

  pid, fd = os.forkpty()


config.json        special_tokens_map.json  tokenizer_config.json
model.safetensors  tokenizer.json           vocab.txt


In [19]:
!zip -r saved_model.zip saved_model

  adding: saved_model/ (stored 0%)
  adding: saved_model/tokenizer_config.json (deflated 76%)
  adding: saved_model/special_tokens_map.json (deflated 42%)
  adding: saved_model/vocab.txt (deflated 45%)
  adding: saved_model/model.safetensors (deflated 7%)
  adding: saved_model/tokenizer.json (deflated 67%)
  adding: saved_model/config.json (deflated 64%)


In [20]:
from IPython.display import FileLink
FileLink("saved_model.zip")