In [1]:
import obfuscator
import spacy
from spacy.tokens import Doc, DocBin

import random
import pandas as pd
import json

from presidio_evaluator import InputSample, Span

from tqdm.auto import tqdm
# Shared spaCy pipeline; its vocab is what the DocBin is deserialized against below.
nlp = spacy.load('en_core_web_sm')    

import logging
# Configure the root logger so INFO-level (and higher) messages are shown.
logging.basicConfig(level=logging.INFO)

# Version number embedded, zero-padded to two digits, in the output file names.
VERSION = 6

In [2]:
# Read the serialized documents from disk and materialize them as Doc objects
# using the shared pipeline's vocab.
doc_bin = DocBin().from_disk('data/ubiai_export.spacy')
docs = list(doc_bin.get_docs(nlp.vocab))
# Sanity check: number of documents loaded.
print(len(docs))

22728


In [3]:
# Drop documents whose entire text is the literal string 'text'
# (presumably header-row artifacts of the export — TODO confirm).
docs = [doc for doc in docs if doc.text != 'text']
# Sanity check: number of documents remaining after the filter.
print(len(docs))

22688


In [4]:
# Project-local anonymizer applied to every document in the loop below.
# NOTE(review): remember_replacements="document" presumably scopes replacement
# consistency to a single document — confirm against the obfuscator module.
hider = obfuscator.SurrogateAnonymizer(remember_replacements="document")

In [None]:
# Outputs of this cell, consumed by the cells below:
#   anonymized_samples  - one InputSample per document (obfuscated text + spans)
#   obfuscation_records - one flat dict per (original entity, replacement) pair
obfuscation_records = []
anonymized_samples = []

for doc in tqdm(docs):
    results = hider.anonymize(doc)

    # Order the replacements by where they start in the obfuscated text.
    items = sorted(results.items, key=lambda x: x.start)

    # One span per replacement. Span's second positional argument is the
    # entity's surface text (entity_value), not a score, so pass the
    # replacement text instead of a numeric placeholder.
    spans = [
        Span(item.entity_type, item.text, item.start, item.end) for item in items
    ]
    anonymized_samples.append(
        InputSample(
            full_text=results.text,
            spans=spans,
            create_tags_from_span=True,
            token_model_version="en_core_web_sm",
            scheme="BIO",
        )
    )

    # NOTE(review): zip() pairs original entities with replacements purely by
    # position and stops at the shorter sequence. This assumes the anonymizer
    # returns exactly one item per entity in doc.ents, in the same order —
    # confirm, otherwise records are silently misaligned or dropped.
    for ent, item in zip(doc.ents, items):
        obfuscation_records.append(
            {
                "orig_text": doc.text,
                "obfuscated_text": results.text,
                "orig_pii": ent.text,
                "obfuscated_pii": item.text,
                "pii_type": ent.label_,
                "obfuscation_strategy": item.operator,
                "orig_start": ent.start_char,
                "orig_end": ent.end_char,
                "obfuscated_start": item.start,
                "obfuscated_end": item.end,
            }
        )

In [6]:
# Tabulate the per-entity obfuscation records: one row per record dict.
df = pd.DataFrame.from_records(obfuscation_records)

In [8]:
# Persist the audit table; VERSION is zero-padded to two digits in the file name.
# NOTE(review): the file name spells "obfsucation" (sic) — left unchanged in case
# downstream consumers rely on the existing path.
df.to_csv(f'output/obfsucation_records_{VERSION:02}.csv')

## Competition formats

### CONLL

In [9]:
def to_conll(sample: InputSample):
    """Flatten one sample into a list of per-token rows.

    Each row is a dict holding the token's text (``token``), whether the token
    is followed by whitespace (``trailing_space``), and the tag stored at the
    same position in ``sample.tags`` (``label``).
    """
    return [
        {
            "token": tok.text,
            "trailing_space": bool(tok.whitespace_),
            # Tags are looked up by position, so a tag list shorter than the
            # token list raises IndexError.
            "label": sample.tags[position],
        }
        for position, tok in enumerate(sample.tokens)
    ]

def create_conll_dataset(dataset):
    """Build one token-level DataFrame covering every sample in ``dataset``.

    Args:
        dataset: Iterable of samples accepted by ``to_conll``.

    Returns:
        DataFrame with the columns ``document`` (0-based position of the
        sample in ``dataset``), ``token``, ``trailing_space`` and ``label``,
        one row per token. A dataset that yields no tokens produces an empty
        frame with those same columns.
    """
    columns = ["document", "token", "trailing_space", "label"]
    rows = []
    for doc_id, sample in enumerate(tqdm(dataset)):
        for row in to_conll(sample):
            row["document"] = doc_id
            rows.append(row)

    # Passing ``columns`` to the constructor (instead of selecting them
    # afterwards) fixes the column order and keeps the schema when there are
    # no rows; selecting columns from an empty frame would raise KeyError.
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Token-level table for the whole anonymized dataset (one row per token).
conll = create_conll_dataset(anonymized_samples)

In [12]:
# BIO tags that are not kept in the competition data. The JSON export cell
# further down reads this list as well.
to_remove = [
    'B-LOCATION', 'I-LOCATION',
    'B-EDUCATION', 'I-EDUCATION',
    'B-NAME_INSTRUCTOR', 'I-NAME_INSTRUCTOR',
    'B-EMPLOYER', 'I-EMPLOYER',
    'B-DATE', 'I-DATE',
    'B-OTHER', 'I-OTHER',
    'B-AGE', 'I-AGE',
]

# Derive a separate frame in which every excluded tag is relabelled 'O';
# `conll` itself is left untouched.
conll_competition = conll.assign(
    label=conll['label'].mask(conll['label'].isin(to_remove), 'O')
)

In [14]:
# Persist the competition-label table; VERSION is zero-padded to two digits.
# NOTE(review): no index=False is passed, so pandas also writes the row index
# as an unnamed first column — confirm this is intended.
conll_competition.to_csv(f'output/obfuscated_data_{VERSION:02}.csv')

### JSON

In [15]:
from presidio_evaluator import span_to_tag

In [16]:
def to_dict(dataset, labels_to_remove=None):
    """Convert samples into JSON-serializable dicts, one per document.

    Args:
        dataset: Iterable of samples exposing ``full_text``, ``tokens`` (items
            with ``text`` and ``whitespace_``) and ``tags``.
        labels_to_remove: Tags to replace with ``"O"``. When omitted, the
            notebook-level ``to_remove`` list is used, so the existing call
            ``to_dict(dataset)`` behaves as before.

    Returns:
        List of dicts with the keys ``full_text``, ``document`` (0-based
        position of the sample in ``dataset``), ``tokens``,
        ``trailing_whitespace`` and ``labels``.
    """
    if labels_to_remove is None:
        # Fall back to the notebook global, looked up at call time.
        labels_to_remove = to_remove
    excluded = set(labels_to_remove)

    example_dicts = []
    for doc_id, example in enumerate(dataset):
        example_dicts.append(
            {
                "full_text": example.full_text,
                "document": doc_id,
                "tokens": [t.text for t in example.tokens],
                "trailing_whitespace": [
                    bool(t.whitespace_) for t in example.tokens
                ],
                "labels": ["O" if tag in excluded else tag for tag in example.tags],
            }
        )

    return example_dicts

In [17]:
# Per-document dicts for the JSON export, with excluded tags relabelled "O".
example_dicts = to_dict(anonymized_samples)

In [19]:
# Write all documents as a single JSON array to a UTF-8 file.
# ensure_ascii=False keeps non-ASCII characters as-is instead of \u-escaping them.
with open(f"output/obfuscated_data_{VERSION:02}.json", "w+", encoding="utf-8") as f:
    json.dump(example_dicts, f, ensure_ascii=False)

### Zip of .txt

In [20]:
import zipfile


def write_zip(dataset):
    """Store each sample's text as its own ``.txt`` entry in a zip archive.

    The archive is created at ``output/obfuscated_data_<VERSION>.zip`` with
    VERSION zero-padded to two digits. Entries are named ``<index>.txt``,
    where the index is the sample's 0-based position in ``dataset``.
    """
    archive_path = f"output/obfuscated_data_{VERSION:02}.zip"
    with zipfile.ZipFile(archive_path, "w") as archive:
        for index, example in enumerate(dataset):
            archive.writestr(f"{index}.txt", example.full_text)

In [21]:
# Export the obfuscated text of every sample as individual .txt files in one zip.
write_zip(anonymized_samples)