In [2]:
import spacy
from spacy.tokens import DocBin

# Load the small English pipeline; its vocab is passed to DocBin.get_docs below.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Read the serialized annotation export from disk and materialize every
# document it contains into a list, using the loaded pipeline's vocab.
doc_bin = DocBin().from_disk('../data/ubiai_export.spacy')
docs = list(doc_bin.get_docs(nlp.vocab))

In [4]:
# Sanity check: number of documents loaded from the export.
len(docs)

22728

In [6]:
from collections import defaultdict

# Maps each entity label to the list of entity spans carrying that label,
# collected across all loaded documents.
entities = defaultdict(list)

for doc in docs:
    for ent in doc.ents:
        # Group the span under its label (spans are stored, not just counted)
        entities[ent.label_].append(ent)

## The Obfuscation Plan
Obfuscate with surrogate name generator:
 - NAME_STUDENT
 - NAME_INSTRUCTOR

Obfuscate with URL randomizer:
 - URL

Obfuscate with random character replacement:
 - ID_NUM
 - AGE
 - DATE

Obfuscate with Faker:
 - EMAIL
 - PHONE_NUM
 - STREET_ADDRESS
 - USERNAME

Manually review:
 - OTHER

Obfuscate with shuffling:
 - LOCATION
 - EDUCATION
 - EMPLOYER


### Manually create and save surrogates for OTHER

In [7]:
import ipywidgets as widgets
from ipywidgets import Layout, HTML
from spacy import displacy
from pathlib import Path
import shelve


In [8]:
class ManualSurrogation:
    """Interactive widget for hand-assigning surrogate strings to entities.

    One instance handles a single entity label. Entities of that label are
    shown one at a time, highlighted inside a window of surrounding tokens,
    and the surrogate typed by the user is persisted in a ``shelve`` database
    keyed by the entity text. Entities whose text already has a saved mapping
    are skipped when moving forward.

    Class attributes:
        all_ents: mapping of entity label -> list of entity spans.
        color: CSS background used by displaCy to highlight the entity.
        mappings_path: directory holding one shelf per entity label.
    """

    all_ents = entities
    color = 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'
    mappings_path = Path('../data/manual_surrogate_mappings')

    def __init__(self, entity_type: str):
        """Open the shelf for ``entity_type``, build the widgets and display them.

        Args:
            entity_type: entity label to annotate; also used as the shelf file name.
        """
        # Create/Load surrogate mappings file. The directory is created if it
        # is missing, and the path is passed as ``str`` because older Python
        # versions do not accept path-like objects in shelve/dbm.
        self.mappings_path.mkdir(parents=True, exist_ok=True)
        self.shelf = shelve.open(str(self.mappings_path / entity_type), 'c')

        # Index of the entity currently shown. Starts at -1 because
        # get_next_ent() advances *before* reading, so the first entity shown
        # is index 0 (starting at 0 silently skipped the first entity).
        self.counter = -1
        # Set once every entity has been visited and the shelf is closed.
        self._done = False
        self.entity_type = entity_type
        self.ents = self.all_ents[self.entity_type]
        self.current_ent = None

        # Displacy Options: only highlight the label being annotated.
        self.options = {
            'ents': [self.entity_type],
            'colors': {self.entity_type: self.color}
        }

        # Header for displaying ent + context
        self.header = HTML(
            value='<h1>PII will appear here.</h1>',
            layout=Layout(width='auto', height='auto')
        )
        self.header_box = widgets.HBox(
            [self.header],
            layout=Layout(height='400px', overflow_y='auto',
                          margin='10px 10px 10px 10px')
        )

        # Main interaction widgets
        self.text_input = widgets.Text(value='',
                                       continuous_update=False)
        self.text_input_label = widgets.Label(value='Surrogate:')
        # Labelled 'Start' until the first click, then 'Confirm'.
        self.confirm_button = widgets.Button(description='Start')
        self.not_pii_button = widgets.Button(description='Not PII')
        self.previous_button = widgets.Button(description='Previous')
        self.next_button = widgets.Button(description='Next')

        self.center_box = widgets.HBox(
            [self.text_input_label,
             self.text_input,
             self.confirm_button,
             self.previous_button,
             self.next_button,
             self.not_pii_button]
        )

        self.confirm_button.on_click(self.submit_surrogate)
        self.previous_button.on_click(self.get_last_ent)
        self.next_button.on_click(self.get_next_ent)
        self.not_pii_button.on_click(self.submit_not_pii)

        self.rows = []  # Most-recent-first log lines shown in the footer
        self.log_box = HTML(
            value='<h4>Saved Mappings:</h4>',
            layout=Layout(width='auto', height='50px')
        )

        self.widget = widgets.AppLayout(
            header=self.header_box,
            center=self.center_box,
            footer=self.log_box,
            width='50%',
            height='500px',
            pane_heights=['300px', '40px', '160px'],
        )

        # Not read anywhere in this class; kept so the attribute still exists.
        self.log = ['']

        display(self.widget)

    def update_widgets(self, window=25):
        """Refresh the header, label and text box for ``self.current_ent``.

        Args:
            window: number of tokens of context to show on each side of the entity.
        """
        doc = self.current_ent.doc
        start = max(0, self.current_ent.start - window)
        end = min(self.current_ent.end + window, len(doc))
        # jupyter=False forces displaCy to return the markup as a string.
        # With auto-detection it displays the markup itself and returns None
        # when run inside a notebook kernel, which cannot be assigned to
        # the HTML widget's value.
        self.header.value = displacy.render(
            doc[start:end],
            style='ent',
            jupyter=False,
            options=self.options
        )
        self.text_input_label.value = f'Surrogate for {self.current_ent.text}:'
        # Pre-fill with the saved surrogate when revisiting a mapped entity.
        self.text_input.value = self.shelf.get(self.current_ent.text, '')

    def _finish(self):
        """Close the shelf and lock the UI once every entity has been visited."""
        self._done = True
        self.counter = len(self.ents)
        # Disable every button: all of their handlers touch the (now closed) shelf.
        for button in (self.confirm_button, self.previous_button,
                       self.next_button, self.not_pii_button):
            button.disabled = True
        self.shelf.close()
        self.header.value = '<h1>Done!</h1>'

    def get_next_ent(self, *args):
        """Advance to the next entity whose text has no saved mapping yet.

        Accepts and ignores positional arguments so it can be used directly as
        a button click handler. When no entity is left, the shelf is closed and
        the widget shows 'Done!'.
        """
        if self._done:
            return
        index = self.counter + 1
        total = len(self.ents)
        # Iterative skip: recursing once per already-mapped entity could
        # exceed the recursion limit and re-rendered after the shelf closed.
        while index < total and self.ents[index].text in self.shelf:
            index += 1
        if index >= total:
            self._finish()
            return
        self.counter = index
        self.current_ent = self.ents[index]
        self.update_widgets()

    def get_last_ent(self, *args):
        """Step back one entity (mapped or not) so its surrogate can be edited.

        Does nothing before the first entity has been shown, at the first
        entity, or after annotation has finished, so the index can never go
        negative and wrap around to the end of the list.
        """
        if self._done or self.counter <= 0:
            return
        self.counter -= 1
        self.current_ent = self.ents[self.counter]
        self.update_widgets()

    def submit_surrogate(self, *args):
        """Save the text box content as the surrogate for the current entity, then advance.

        On the very first click (no entity shown yet) nothing is saved; the
        button is relabelled 'Confirm' and the first entity is displayed.
        """
        if self._done:
            return
        self.confirm_button.description = 'Confirm'
        # Explicit None check: do not rely on the truthiness of the span.
        if self.current_ent is not None:
            self.shelf[self.current_ent.text] = self.text_input.value
            # Flush immediately so a kernel crash does not lose annotations.
            self.shelf.sync()
            self.log_box.value = self.add_log()
        self.get_next_ent()

    def submit_not_pii(self, *args):
        """Record the sentinel surrogate 'NOT_PII' for the current entity and advance."""
        if self._done:
            return
        self.text_input.value = 'NOT_PII'
        self.submit_surrogate()

    def add_log(self):
        """Prepend the mapping just saved to the log and return the footer HTML.

        Only the ``max_rows`` most recent entries are kept.

        Returns:
            HTML string: a heading followed by a bullet list of recent mappings.
        """
        from html import escape

        max_rows = 5
        self.rows.insert(
            0, f'{self.counter}: {self.current_ent.text} -> {self.text_input.value}'
        )
        # Trim using max_rows itself instead of a second hard-coded index.
        del self.rows[max_rows:]
        # Escape entity/surrogate text so characters such as '<' or '&'
        # cannot break the rendered list.
        items = ''.join(f'<li>{escape(row)}</li>' for row in self.rows)
        return f'<h4>Saved Mappings:</h4><ul>{items}</ul>'
    
    
        
# Launch the annotation widget for entities labelled OTHER (the "Manually review" category).
other = ManualSurrogation('OTHER')

AppLayout(children=(HBox(children=(HTML(value='<h1>PII will appear here.</h1>', layout=Layout(height='auto', w…

In [None]:
import json
# Export the manually curated OTHER surrogates from the shelf to a JSON file.
# Finish (or otherwise close) the annotation widget first so its shelf handle
# is released and all mappings are on disk.
#
# The shelf is opened read-only ('r') so that a wrong path raises instead of
# silently creating an empty database, and inside a context manager so the
# handle is always closed.
with shelve.open('../data/manual_surrogate_mappings/OTHER', 'r') as shelf:
    other_dict = dict(shelf)

# NOTE(review): the output path previously was 'data/other_mappings.json',
# which does not match the '../data/' directory used everywhere else in this
# notebook; it now points at the same data directory. Confirm this is the
# intended location.
with open('../data/other_mappings.json', 'w', encoding='utf-8') as f:
    json.dump(other_dict, f)