# Normalization of Persons & Places using NER
## Whitelist Determination (Identifying Specific Entities of Interest)

In [1]:
import spacy
import re
from glob import glob
from collections import Counter

In [2]:
nlp = spacy.load('en_core_web_md')

In [3]:
DATA_DIR = "./dataset/preproc1/"

files = glob(DATA_DIR + '*.txt')

In [4]:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')

In [5]:
# Notable NER:Persons that should *NOT* be converted to "XXX"
PERSONS_WHITELIST = [
    'Vladimir V. Putin', 'Vladimir Putin', 'Putin',
    'Volodymyr Zelensky', 'Volodymyr Zelenskyy', 'Zelensky',
    'Biden',
    'Xi Jinping', 'Xi',
    'Olaf Scholz', 'Scholz',
    'Boris Johnson', 'Johnson',
    'Emmanuel Macron', 'Macron',
    'Sergey V. Lavrov', 'Lavrov',
    'Aleksei A. Navalny', 'Navalny',
    'Jens Stoltenberg', 'Stoltenberg',
    'Antony J. Blinken', 'Blinken',
    'Mark A. Milley', 'Milley',
    'Kamala Harris', 'Harris',
    'Barack Obama', 'Obama',
    'Donald J. Trump', 'Trump',
    ### Misclassified as PERSON (by Spacy) ###
    'Stinger', 'Javelin', 'Brexit', 'C.I.A.', 'Twitter', 'Mykolaiv'
]

In [6]:
# Notable NER:Places that should *NOT* be converted to "YYY"
PLACES_WHITELIST = [
    'U.S.', 'United States', 'Washington',
    'Russia', 'Soviet Union', 'Moscow', 'Crimea', 'Belarus', 'Chechnya',
    'Ukraine', 'Kyiv', 'Kharkiv', 'Lviv', 'Kherson', 'Odessa', 'Mariupol', 'Donetsk', 'Irpin', 'Mykolaiv',
    'China', 'Beijing',
    'Germany', 'Berlin',
    'U.K.', 'Britain', 'London',
    'France', 'Paris',
    'Poland', 'Warsaw',
    'Brussels', 'Netherlands',
    'Lithuania', 'Romania', 'Latvia', 'Estonia', 'Moldova', 'Slovakia',
    'Canada', 'China', 'Israel', 'Syria', 'Afghanistan', 'Iran', 'Iraq', 'North Korea',
    ### Misclassified as PLACE (my Spacy) ###
    'Ukrainian'
]

In [7]:
pa_apostr = re.compile(r"(.+)'s")

def process_persons(doc, persons):
    for ent in doc.ents:
        if ent.label != 380: continue
        text = ent.text
        if m := pa_apostr.match(text):
            text = m.group(1)
        if not text in PERSONS_WHITELIST:
            persons.update([ent.text])

In [8]:
pa_the = re.compile(r"the (.+)")
places_whitelist = [place.lower() for place in PLACES_WHITELIST]

def process_places(doc, places):
    for ent in doc.ents:
        if ent.label != 384: continue
        text = ent.text.lower()
        if m := pa_the.match(text):
            text = m.group(1)
        if not text in places_whitelist:
            places.update([ent.text])

In [9]:
persons = Counter()
places = Counter()

for file in files:
    with open(file) as f:
        # print(file.split('/')[-1])
        print('.', end='')
        text = f.read()
        doc = nlp(text)
        process_persons(doc, persons)
        process_places(doc, places)

............................................................

## Remaining Person Enties (will be Normalized to 'XXX')

In [10]:
persons.most_common(40)

[('Albagir', 14),
 ('Kim', 12),
 ('Sullivan', 11),
 ('Stetsenko', 11),
 ('Maslova', 10),
 ('Lebedev', 9),
 ('Burns', 9),
 ('Novaya Gazeta', 8),
 ('Venediktov', 7),
 ('Jen Psaki', 6),
 ('Naftali Bennett', 6),
 ('Kushnir', 6),
 ('Kovalchuk', 6),
 ('Sindeyeva', 6),
 ('Lloyd J. Austin III', 5),
 ('Smetana', 5),
 ('Jake Sullivan', 5),
 ('Yashenko', 5),
 ('Poterek', 5),
 ('Lutsk', 5),
 ('Vitali Klitschko', 4),
 ('Morawiecki', 4),
 ('Andryushchenko', 4),
 ('Ursula von der', 4),
 ('Leyen', 4),
 ('Bernstam', 4),
 ('Zelenskyy', 4),
 ('Melitopol', 4),
 ('Kovalensky', 4),
 ('Chernihiv', 4),
 ('Yang', 4),
 ('Aleksandr G. Lukashenko', 4),
 ('Bennett', 4),
 ('Kolykhaev', 4),
 ('Grandi', 4),
 ('Watling', 4),
 ('Ania', 4),
 ('Oleksiy Arestovich', 4),
 ('Roskomnadzor', 4),
 ('Igor Konashenkov', 3)]

## Remaining Places Enties (will be Normalized to 'YYY')

In [11]:
places.most_common(40)

[('Hungary', 11),
 ('Georgia', 10),
 ('Istanbul', 8),
 ('Venezuela', 7),
 ('Sumy', 6),
 ('Rome', 6),
 ('Turkey', 6),
 ('Sweden', 6),
 ('the Czech Republic', 5),
 ('Bulgaria', 5),
 ('Taiwan', 5),
 ('Australia', 5),
 ('Soviet republic', 5),
 ('Novoyavorivsk', 5),
 ('Switzerland', 5),
 ('Lutsk', 5),
 ('Munich', 4),
 ('Madrid', 4),
 ('Geneva', 4),
 ('Italy', 4),
 ('Japan', 4),
 ('Finland', 4),
 ('America', 4),
 ('the Russian Federation', 3),
 ('Slovenia', 3),
 ('Siret', 3),
 ("the Soviet Union's", 3),
 ('Norway', 3),
 ('Berdyansk', 3),
 ('Donbas', 3),
 ('Luhansk', 3),
 ('Prague', 3),
 ('Gazprom', 3),
 ('Sudan', 3),
 ('St. Petersburg', 3),
 ('Pakistan', 3),
 ('Jerusalem', 3),
 ('Pennsylvania', 3),
 ('Cherkasy', 3),
 ('New York', 2)]