In [2]:
#Removing names from any document, e.g GDPR Compliance

import spacy
import textacy
from spacy import displacy
nlp = spacy.load('en_core_web_lg')

In [3]:
text = "Barcelona forward Malcom has described Lionel Messi as a football alien and said he is learning so much just from playing alongside the Argentine. Malcom, 21, moved to the Camp Nou from Bordeaux in the summer for an initial €41 million, scoring his first goal in the 1-1 draw against Inter Milan in the Champions League. Following the departure of Andres Iniesta in the summer, Messi was named Barcelona captain. And Malcom told ESPN Brasil that his skipper is an inspiring figure in the dressing room and has a way of getting the best out of his teammates. A few minutes before starting the match, in the tunnel, he told me that when he gets the ball he will find me, Malcom said. And he does because he's a football alien."

#Parse the text with spaCy. This runs the entire NLP pipeline.
doc = nlp(text)

In [4]:
#Print all the named entities detected
for entity in doc.ents:
    print(entity.text,": ",entity.label_)

Barcelona :  GPE
Malcom :  PERSON
Lionel Messi :  PERSON
Argentine :  NORP
Malcom :  PERSON
21 :  DATE
Bordeaux :  GPE
the summer :  DATE
€41 million :  MONEY
first :  ORDINAL
1 :  CARDINAL
Inter Milan :  ORG
the Champions League :  ORG
Andres Iniesta :  PERSON
the summer :  DATE
Messi :  PERSON
Barcelona :  GPE
Malcom :  PERSON
ESPN :  ORG
Brasil :  PERSON
A few minutes :  TIME
Malcom :  PERSON


In [5]:
print(doc.ents)

(Barcelona, Malcom, Lionel Messi, Argentine, Malcom, 21, Bordeaux, the summer, €41 million, first, 1, Inter Milan, the Champions League, Andres Iniesta, the summer, Messi, Barcelona, Malcom, ESPN, Brasil, A few minutes, Malcom)


In [6]:
entity.label_

'PERSON'

In [7]:
spacy.explain('GPE')

'Countries, cities, states'

In [8]:
spacy.explain('ORDINAL')

'"first", "second", etc.'

In [9]:
def deidentify_names(text):
    doc = nlp(text)
    deidentified_sentence = []
    for token in doc:
        if(token.ent_type_ == "PERSON"):
            deidentified_sentence.append("[DEIDENTIFIED]")
        else:
            deidentified_sentence.append(token.string)
    return "".join(deidentified_sentence)

In [10]:
deidentify_names(text)

"Barcelona forward [DEIDENTIFIED]has described [DEIDENTIFIED][DEIDENTIFIED]as a football alien and said he is learning so much just from playing alongside the Argentine. [DEIDENTIFIED], 21, moved to the Camp Nou from Bordeaux in the summer for an initial €41 million, scoring his first goal in the 1-1 draw against Inter Milan in the Champions League. Following the departure of [DEIDENTIFIED][DEIDENTIFIED]in the summer, [DEIDENTIFIED]was named Barcelona captain. And [DEIDENTIFIED]told ESPN [DEIDENTIFIED]that his skipper is an inspiring figure in the dressing room and has a way of getting the best out of his teammates. A few minutes before starting the match, in the tunnel, he told me that when he gets the ball he will find me, [DEIDENTIFIED]said. And he does because he's a football alien."

In [11]:
#Two consecutive deidentified means two words in a name. Merging that to one.
def deidentify_names(text):
    doc = nlp(text)
    deidentified_sentence = []
    for ent in doc.ents:
        ent.merge()
    for token in doc:
        if(token.ent_type_ == "PERSON"):
            deidentified_sentence.append("[DEIDENTIFIED]")
        else:
            deidentified_sentence.append(token.string)
    return "".join(deidentified_sentence)

In [12]:
deidentify_names(text)

"Barcelona forward [DEIDENTIFIED]has described [DEIDENTIFIED]as a football alien and said he is learning so much just from playing alongside the Argentine. [DEIDENTIFIED], 21, moved to the Camp Nou from Bordeaux in the summer for an initial €41 million, scoring his first goal in the 1-1 draw against Inter Milan in the Champions League. Following the departure of [DEIDENTIFIED]in the summer, [DEIDENTIFIED]was named Barcelona captain. And [DEIDENTIFIED]told ESPN [DEIDENTIFIED]that his skipper is an inspiring figure in the dressing room and has a way of getting the best out of his teammates. A few minutes before starting the match, in the tunnel, he told me that when he gets the ball he will find me, [DEIDENTIFIED]said. And he does because he's a football alien."