# Named entity recognition (NER)

In [3]:
import nltk
import os
import ast

We import the dataset that includes the clean words, but not lemmatized yet. 

In [2]:
dataset = []
for f in os.listdir('sep_articles_clean/'):
    with open('sep_articles_clean/' + f) as raw_text:
        dataset.append(ast.literal_eval(raw_text.read()))

## Finding Proper Nouns

We count each proper noun in the dataset and get its frequency.

In [4]:
from collections import defaultdict
frequency_counter = defaultdict(int)

for text in dataset:
    for sentence in text:
        for word in sentence:
            if word[1] == 'NNP':
                if len(word[0]) > 2:
                    frequency_counter[word[0]] += 1

In [5]:
import pandas as pd
import seaborn as sns

In [6]:
frequencies = pd.DataFrame.from_dict(frequency_counter, orient='index')

In [10]:
frequencies.sort_values(0, ascending=False)[:20]

Unnamed: 0,0
God,19423
Aristotle,9132
Kant,7732
Plato,4502
Hume,4007
John,3680
Russell,3565
Locke,3465
Leibniz,3275
Lewis,3145


It is an interesting first approximation, but has several flaws.

* It does not consider names and last names as one proper noun, but separates them. This is why "John" is among the most mentioned names. A possible solution would be to eliminate first names and only take the last names, but this would entail problems with, for instances, "James."
* Proper nouns also include things like "Philosophy" here. 

# With SpaCy

In [11]:
dataset = []
for f in os.listdir('sep_articles/'):
    with open('sep_articles/' + f) as raw_text:
        dataset.append(raw_text.read().decode('utf8'))

In [12]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [None]:
all_labels = []
all_items = []
for text in dataset:
    doc = nlp(text)
    labels = [x.label_ for x in doc.ents]
    items = [x.text for x in doc.ents if x.label_ == 'PERSON']
    all_items += items
    all_labels += labels

In [71]:
Counter(all_labels)

Counter({u'PERSON': 3018, u'CARDINAL': 1899, u'ORG': 1818, u'DATE': 1386, u'ORDINAL': 614, u'GPE': 583, u'NORP': 492, u'WORK_OF_ART': 76, u'LOC': 72, u'PRODUCT': 64, u'FAC': 51, u'MONEY': 36, u'LANGUAGE': 25, u'EVENT': 24, u'LAW': 18, u'PERCENT': 4, u'TIME': 4, u'QUANTITY': 3})

In [78]:
Counter(items).most_common()

[(u'Leibniz', 212), (u'Monadologie', 10), (u'Syst\xe8me', 6), (u'Russell', 4), (u'Judas', 4), (u'Principes de la Nature et de la Grace', 4), (u'Arnauld', 4), (u'Jesus', 3), (u'Burnett', 3), (u'Remond', 2), (u'CP:31', 2), (u'Charles Hugony', 2), (u'Logicarum A.VI.iv.a.624\u2013630', 1), (u'Garber 2009', 1), (u'Hartz 2007', 1), (u'Biber', 1), (u'Antoine\nArnauld', 1), (u'Simon Foucher', 1), (u'AG:138', 1), (u"Bertrand Russell's", 1), (u'Works', 1), (u'Plato', 1), (u'Esoteric', 1), (u'Geometrical', 1), (u'AG:215', 1), (u'AG:214', 1), (u'Essais de', 1), (u'Nelson', 1), (u'Grace', 1), (u'De Obligatione Credendi', 1), (u'Adams', 1), (u'Nicolas Remond', 1), (u'G. W. Leibniz', 1), (u'Confessio Philosophi', 1), (u'AG:213', 1), (u'Russell 1945', 1), (u'Pierre Bayle', 1), (u'Esoteric Form', 1), (u'Kant', 1), (u'Sophia', 1), (u'Essay', 1), (u'Curley', 1), (u'Jesuit', 1), (u'Syst\xe8me Nouveau\n\n\n', 1), (u'Spinoza', 1), (u'De Notionibus', 1), (u'Notionum Praeparanda A.VI.iv.a.630\u2013635', 1), (