In [1]:
pip install "numpy<2.0"




In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [3]:
pip install tqdm



In [4]:
pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=15ceeab58bae77da89fff7b0dcbfc9813377c2a367b469f576cde52fb4e7de28
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [5]:
pip install pytorch-crf


Collecting pytorch-crf
  Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)
Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [6]:
pip install seaborn



In [7]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from functools import partial
import uuid
from collections import Counter
from seqeval.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np

In [None]:
import logging

In [None]:
import re

In [8]:
class BERTForTokenClassification(nn.Module):
    def __init__(self, bert_model, num_labels, dropout=0.1):
        super(BERTForTokenClassification, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_model)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.classifier(sequence_output)
        return logits

In [9]:
class PIIDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        item = self.examples[idx]
        encoding = self.tokenizer(item['source_text'], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        label_ids = torch.tensor(item['labels'], dtype=torch.long)
        label_ids = torch.where(attention_mask == 1, label_ids, torch.tensor(-100))
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label_ids
        }


In [10]:
def process_data(example, labels2int, tokenizer, max_length, ignore_subwords=True):
    text, labels = example["source_text"], example["privacy_mask"]
    encoding = tokenizer(text, truncation=True, padding='max_length', max_length=max_length,
                         return_offsets_mapping=True, return_special_tokens_mask=True)
    label_ids = [labels2int["O"]] * len(encoding['input_ids'])
    for label in labels:
        start_char, end_char = label['start'], label['end']
        token_start = encoding.char_to_token(start_char)
        token_end = encoding.char_to_token(end_char - 1)
        if token_start is not None and token_end is not None:
            label_type = label['label']
            if label_type != "CARDISSUER":  # Skip CARDISSUER labels
                # Merge GIVENNAME and LASTNAME labels
                if label_type.startswith("GIVENNAME"):
                    label_type = "GIVENNAME"
                elif label_type.startswith("LASTNAME"):
                    label_type = "LASTNAME"

                label_ids[token_start] = labels2int.get(f'B-{label_type}', labels2int["O"])
                for i in range(token_start + 1, token_end + 1):
                    label_ids[i] = labels2int.get(f'I-{label_type}', labels2int["O"])
    for i, (input_id, special_token_mask) in enumerate(zip(encoding['input_ids'], encoding['special_tokens_mask'])):
        if special_token_mask == 1 or (ignore_subwords and tokenizer.convert_ids_to_tokens(input_id).startswith("##")):
            label_ids[i] = labels2int["O"]
    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'labels': label_ids
    }


In [11]:
def train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs, id2label, patience=3):
    best_val_loss = float('inf')
    epochs_no_improve = 0
    best_model = None
    train_losses = []
    val_losses = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            progress_bar.set_postfix({'train_loss': f"{loss.item():.4f}"})
        avg_train_loss = total_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        print(f"\nEpoch {epoch+1}/{num_epochs}, Average training loss: {avg_train_loss:.4f}")
        model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
                val_loss += loss.item()
                preds = torch.argmax(outputs, dim=2)
                for i, mask in enumerate(attention_mask):
                    pred = [id2label[p.item()] for p, m in zip(preds[i], mask) if m.item() == 1]
                    true = [id2label[l.item()] for l, m in zip(labels[i], mask) if m.item() == 1]
                    if pred and true:
                        all_preds.append(pred)
                        all_labels.append(true)
        avg_val_loss = val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        print(f"Validation loss: {avg_val_loss:.4f}")
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            best_model = model.state_dict()
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print(f"Early stopping triggered after epoch {epoch+1}")
                model.load_state_dict(best_model)
                break
        print("Label distribution in true labels:")
        print(Counter(sum(all_labels, [])))
        print("\nLabel distribution in predictions:")
        print(Counter(sum(all_preds, [])))
        print(f"\nNumber of sequences evaluated: {len(all_labels)}")
        print(f"Number of sequences with predictions: {len(all_preds)}")
        print(f"Number of sequences with labels: {len(all_labels)}")
        print("\nClassification Report:")
        print(classification_report(all_labels, all_preds))
    # Plot learning curve
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(train_losses) + 1), train_losses, label='Training Loss')
    plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Learning Curve')
    plt.legend()
    plt.savefig('learning_curve.png')
    plt.close()
    return model, all_labels, all_preds

In [12]:

def plot_confusion_matrix(true_labels, pred_labels, label_list):
    # Flatten the lists of labels
    true_flat = [item for sublist in true_labels for item in sublist]
    pred_flat = [item for sublist in pred_labels for item in sublist]
    # Count occurrences of each label
    true_counts = Counter(true_flat)
    pred_counts = Counter(pred_flat)
    print("True label counts:", true_counts)
    print("Pred label counts:", pred_counts)
    # Create confusion matrix
    cm = confusion_matrix(true_flat, pred_flat, labels=label_list)
    # Plot confusion matrix
    plt.figure(figsize=(20, 16))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_list, yticklabels=label_list)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('confusion_matrix.png')
    plt.close()
    print(f"Confusion matrix saved as 'confusion_matrix.png'")

In [None]:
from datasets import load_from_disk

In [14]:
def main():
    # Define labels (excluding CARDISSUER)
    labels = [
        "BOD", "BUILDING", "CITY", "COUNTRY", "DATE", "DRIVERLICENSE",
        "EMAIL", "GEOCOORD", "GIVENNAME", "IDCARD", "IP", "LASTNAME",
        "PASS", "PASSPORT", "POSTCODE", "SECADDRESS", "SEX",
        "SOCIALNUMBER", "STATE", "STREET", "TEL", "TIME", "TITLE", "USERNAME"
    ]

    label_list = ["O"] + [f"B-{label}" for label in labels] + [f"I-{label}" for label in labels]
    labels = [f"I-{label}" for label in labels] + [f"B-{label}" for label in labels] + ["O"]
    label2id = {label: i for i, label in enumerate(labels)}
    id2label = {v: k for k, v in label2id.items()}

    # Load tokenizer
    bert_model = "bert-base-cased"
    tokenizer = AutoTokenizer.from_pretrained(bert_model)

    # Load and process dataset
    ds = load_dataset("ai4privacy/pii-masking-300k")
    ds = ds.filter(lambda x: x["language"] == "English", num_proc=4)

    # Remove rows with "CARDISSUER" label
    ds = ds.filter(lambda x: "CARDISSUER" not in x["mbert_bio_labels"], num_proc=4)

    max_length = 512
    ds = ds.map(
        partial(process_data, labels2int=label2id, tokenizer=tokenizer, max_length=max_length),
        remove_columns=['privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_proc=8
    )

    # Split the validation set into validation and test sets
    val_test_split = 0.3
    val_dataset = ds["validation"]
    val_dataset = val_dataset.shuffle(seed=42)
    val_size = int(len(val_dataset) * (1 - val_test_split))
    test_size = len(val_dataset) - val_size
    val_dataset, test_dataset = val_dataset.select(range(val_size)), val_dataset.select(range(val_size, len(val_dataset)))

    # Create datasets and dataloaders
    train_dataset = PIIDataset(ds["train"], tokenizer, max_length)
    val_dataset = PIIDataset(val_dataset, tokenizer, max_length)
    test_dataset = PIIDataset(test_dataset, tokenizer, max_length)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    test_loader = DataLoader(test_dataset, batch_size=64)

    # Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = BERTForTokenClassification(
        bert_model=bert_model,
        num_labels=len(labels)
    ).to(device)

    # Define optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    # Train the model
    num_epochs = 3
    print(f"Starting training for {num_epochs} epochs...")
    model, true_labels, pred_labels = train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs, id2label)

    # Save the model
    model_save_path = f"bert_pii_model_{str(uuid.uuid4())}.pth"
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved successfully to {model_save_path}")

    # Test the model
    model.eval()
    test_preds = []
    test_labels = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs, dim=2)

            for i, mask in enumerate(attention_mask):
                pred = [id2label[p.item()] for p, m in zip(preds[i], mask) if m.item() == 1]
                true = [id2label[l.item()] for l, m in zip(labels[i], mask) if m.item() == 1]
                if pred and true:
                    test_preds.append(pred)
                    test_labels.append(true)

    # Print test results
    print("\nTest Results:")
    print(classification_report(test_labels, test_preds))

    # Plot confusion matrix for test results
    plot_confusion_matrix(test_labels, test_preds, label_list)

In [15]:
if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/103M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/114M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/108M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/104M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/102M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/29.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/27.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/177677 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/47728 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/177677 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/47728 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/29908 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/7946 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/29908 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/7946 [00:00<?, ? examples/s]

Using device: cuda


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Starting training for 3 epochs...


Epoch 1/3: 100%|██████████| 935/935 [45:15<00:00,  2.90s/it, train_loss=0.0288]



Epoch 1/3, Average training loss: 0.1664


Validation: 100%|██████████| 87/87 [04:18<00:00,  2.97s/it]


Validation loss: 0.0408
Label distribution in true labels:
Counter({'O': 627117, 'I-IP': 15136, 'I-EMAIL': 8108, 'I-BOD': 6299, 'I-TEL': 6233, 'I-TIME': 5348, 'I-SOCIALNUMBER': 5193, 'I-DATE': 4523, 'I-PASS': 4177, 'I-DRIVERLICENSE': 3010, 'B-TIME': 2618, 'B-LASTNAME': 2240, 'B-USERNAME': 2023, 'I-STREET': 1957, 'B-EMAIL': 1846, 'B-IDCARD': 1825, 'B-GIVENNAME': 1824, 'B-SOCIALNUMBER': 1724, 'B-PASSPORT': 1696, 'B-DRIVERLICENSE': 1693, 'B-BOD': 1656, 'B-IP': 1527, 'B-SEX': 1437, 'B-CITY': 1410, 'B-STATE': 1398, 'I-GEOCOORD': 1383, 'B-TITLE': 1361, 'B-BUILDING': 1356, 'B-TEL': 1342, 'B-POSTCODE': 1326, 'B-STREET': 1326, 'B-DATE': 1213, 'I-USERNAME': 1207, 'B-PASS': 1166, 'B-COUNTRY': 1124, 'I-POSTCODE': 949, 'I-CITY': 821, 'I-SECADDRESS': 623, 'B-SECADDRESS': 594, 'I-COUNTRY': 490, 'I-SEX': 385, 'I-LASTNAME': 205, 'I-GIVENNAME': 182, 'B-GEOCOORD': 152, 'I-TITLE': 82, 'I-STATE': 73, 'I-PASSPORT': 38, 'I-IDCARD': 26, 'I-BUILDING': 2})

Label distribution in predictions:
Counter({'O': 62641

Epoch 2/3: 100%|██████████| 935/935 [45:25<00:00,  2.91s/it, train_loss=0.0195]



Epoch 2/3, Average training loss: 0.0404


Validation: 100%|██████████| 87/87 [04:19<00:00,  2.99s/it]


Validation loss: 0.0343
Label distribution in true labels:
Counter({'O': 627117, 'I-IP': 15136, 'I-EMAIL': 8108, 'I-BOD': 6299, 'I-TEL': 6233, 'I-TIME': 5348, 'I-SOCIALNUMBER': 5193, 'I-DATE': 4523, 'I-PASS': 4177, 'I-DRIVERLICENSE': 3010, 'B-TIME': 2618, 'B-LASTNAME': 2240, 'B-USERNAME': 2023, 'I-STREET': 1957, 'B-EMAIL': 1846, 'B-IDCARD': 1825, 'B-GIVENNAME': 1824, 'B-SOCIALNUMBER': 1724, 'B-PASSPORT': 1696, 'B-DRIVERLICENSE': 1693, 'B-BOD': 1656, 'B-IP': 1527, 'B-SEX': 1437, 'B-CITY': 1410, 'B-STATE': 1398, 'I-GEOCOORD': 1383, 'B-TITLE': 1361, 'B-BUILDING': 1356, 'B-TEL': 1342, 'B-POSTCODE': 1326, 'B-STREET': 1326, 'B-DATE': 1213, 'I-USERNAME': 1207, 'B-PASS': 1166, 'B-COUNTRY': 1124, 'I-POSTCODE': 949, 'I-CITY': 821, 'I-SECADDRESS': 623, 'B-SECADDRESS': 594, 'I-COUNTRY': 490, 'I-SEX': 385, 'I-LASTNAME': 205, 'I-GIVENNAME': 182, 'B-GEOCOORD': 152, 'I-TITLE': 82, 'I-STATE': 73, 'I-PASSPORT': 38, 'I-IDCARD': 26, 'I-BUILDING': 2})

Label distribution in predictions:
Counter({'O': 62558

Epoch 3/3: 100%|██████████| 935/935 [45:28<00:00,  2.92s/it, train_loss=0.0243]



Epoch 3/3, Average training loss: 0.0294


Validation: 100%|██████████| 87/87 [04:17<00:00,  2.95s/it]


Validation loss: 0.0311
Label distribution in true labels:
Counter({'O': 627117, 'I-IP': 15136, 'I-EMAIL': 8108, 'I-BOD': 6299, 'I-TEL': 6233, 'I-TIME': 5348, 'I-SOCIALNUMBER': 5193, 'I-DATE': 4523, 'I-PASS': 4177, 'I-DRIVERLICENSE': 3010, 'B-TIME': 2618, 'B-LASTNAME': 2240, 'B-USERNAME': 2023, 'I-STREET': 1957, 'B-EMAIL': 1846, 'B-IDCARD': 1825, 'B-GIVENNAME': 1824, 'B-SOCIALNUMBER': 1724, 'B-PASSPORT': 1696, 'B-DRIVERLICENSE': 1693, 'B-BOD': 1656, 'B-IP': 1527, 'B-SEX': 1437, 'B-CITY': 1410, 'B-STATE': 1398, 'I-GEOCOORD': 1383, 'B-TITLE': 1361, 'B-BUILDING': 1356, 'B-TEL': 1342, 'B-POSTCODE': 1326, 'B-STREET': 1326, 'B-DATE': 1213, 'I-USERNAME': 1207, 'B-PASS': 1166, 'B-COUNTRY': 1124, 'I-POSTCODE': 949, 'I-CITY': 821, 'I-SECADDRESS': 623, 'B-SECADDRESS': 594, 'I-COUNTRY': 490, 'I-SEX': 385, 'I-LASTNAME': 205, 'I-GIVENNAME': 182, 'B-GEOCOORD': 152, 'I-TITLE': 82, 'I-STATE': 73, 'I-PASSPORT': 38, 'I-IDCARD': 26, 'I-BUILDING': 2})

Label distribution in predictions:
Counter({'O': 62607

Testing: 100%|██████████| 38/38 [01:51<00:00,  2.94s/it]



Test Results:
               precision    recall  f1-score   support

          BOD       0.95      0.96      0.96       781
     BUILDING       0.98      0.99      0.98       586
         CITY       0.94      0.98      0.96       715
      COUNTRY       0.94      0.97      0.96       443
         DATE       0.90      0.92      0.91       573
DRIVERLICENSE       0.93      0.97      0.95      1298
        EMAIL       0.97      1.00      0.99      2212
     GEOCOORD       0.97      0.95      0.96       122
    GIVENNAME       0.84      0.90      0.87       743
       IDCARD       0.92      0.93      0.93       834
           IP       0.99      1.00      0.99      2856
     LASTNAME       0.91      0.88      0.90      1035
         PASS       0.93      0.97      0.95       871
     PASSPORT       0.93      0.91      0.92       752
     POSTCODE       0.94      0.99      0.97       797
   SECADDRESS       0.96      0.97      0.97       398
          SEX       0.96      0.98      0.97     