# Named entity extraction

#### After the exploratory analysis, the focus is on identifying and extracting the named entities in the text.

In [1]:
import nltk # imports nltk

In [2]:
# Fetch every NLTK resource this notebook relies on, in the order listed.
for resource in (
    "punkt",  # punkt tokenizer models
    'averaged_perceptron_tagger',  # the algorithm for predicting the part of speech information
    'maxent_ne_chunker',  # the maximum entropy chunker that has been trained on the ACE 2004 corpus https://catalog.ldc.upenn.edu/LDC2005T09
    'words',  # the list of English language words
    'stopwords',  # the list of stopwords
):
    nltk.download(resource)

# Word tokenizer, part of speech tagger and named entity tagger.
from nltk import word_tokenize, pos_tag, ne_chunk

[nltk_data] Downloading package punkt to /home/harish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/harish/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/harish/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /home/harish/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /home/harish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from IPython.display import display
import string
import urllib2
import os
import re
from collections import OrderedDict
import sys
import codecs
# Re-register the "strict" error-handler name so that it maps to
# codecs.ignore_errors.
# NOTE(review): this presumably makes codec operations that look up the "strict"
# handler drop undecodable data silently instead of raising, for the whole
# process — confirm that this global override is intended.
codecs.register_error("strict", codecs.ignore_errors)

#### Function for extracting named entities

In [None]:
def extract_entity_names(t, entity_type): # this function has been defined here: https://gist.github.com/onyxfish/322906
    """Collect the names of all entities of ``entity_type`` found in ``t``.

    ``t`` is a chunk tree node (anything exposing ``label()`` and iterating
    over its children) or a leaf. A node whose label equals ``entity_type``
    contributes one name: the first element of each of its children joined
    with single spaces. Any other node is searched recursively. Objects
    without a ``label`` attribute (leaves) contribute nothing.

    Returns a list of entity-name strings, in tree order.
    """
    # Leaves carry no label, so there is nothing to extract from them.
    if not hasattr(t, 'label') or not t.label:
        return []

    # A matching node yields exactly one name and is not searched any deeper.
    if t.label() == entity_type:
        tokens = [leaf[0] for leaf in t]
        return [' '.join(tokens)]

    # Otherwise gather whatever the children contain.
    found = []
    for subtree in t:
        found.extend(extract_entity_names(subtree, entity_type))
    return found

#### Function for converting a list to a dictionary structure

In [None]:
def list_to_dict(li):
    """Count how many times each item occurs in ``li``.

    Args:
        li: iterable of hashable items (for example entity-name strings).

    Returns:
        A dict mapping every distinct item to its number of occurrences.
        An empty input yields an empty dict.
    """
    dct = {}
    for item in li:
        # dict.get is used instead of dict.has_key: has_key no longer exists in
        # Python 3, while this form behaves the same on Python 2 and 3.
        dct[item] = dct.get(item, 0) + 1
    return dct

#### File opening and reading 

In [None]:
url = "https://www.gutenberg.org/files/140/140-0.txt" #assigns a link to a variable
response = urllib2.urlopen(url.encode('utf-8')) #opens up the link/file
try:
    raw = response.read()
finally:
    response.close() # release the connection even if reading fails

# Decode once, right after the download, and work on text from here on.
# The earlier unicode(data, errors="ignore") call named no encoding, so it used
# the interpreter's default codec and silently dropped every byte that codec
# could not decode.
# NOTE(review): the file is assumed to be UTF-8 (possibly starting with a BOM,
# which "utf-8-sig" strips) — confirm for other texts.
data = raw.decode("utf-8-sig", "ignore") # undecodable bytes are still ignored
data = data.replace("\r\n", " ")
# re.DOTALL lets ".*?" cross any line breaks that remain, so the metadata is
# removed even when the text did not collapse onto a single line.
data = re.sub('^(.*?)START OF THIS PROJECT GUTENBERG EBOOK', '', data, flags=re.DOTALL) #removes Project Gutenberg metadata
data = re.sub('End of the Project Gutenberg(.*?)$', '', data, flags=re.DOTALL) #removes Project Gutenberg metadata
data[:10000] # preview the beginning of the cleaned text
#Alternative texts to use
#https://www.gutenberg.org/files/140/140-0.txt The Jungle by Upton Sinclair
#https://www.gutenberg.org/files/1342/1342-0.txt Pride and Prejudice by Jane Austen
#https://www.gutenberg.org/files/543/543-0.txt Main Street by Sinclair Lewis

#### Sentence tokenization

In [None]:
# Split the cleaned text into a list of sentences.
sentences = nltk.sent_tokenize(data)
sentences[20:40]  # preview a slice of the sentences

In [None]:
senten = len(sentences) # number of sentences produced by the sentence tokenizer
senten # display the count

####  Word tokenization

In [None]:
# Tokenize sentence by sentence instead of tokenizing the whole text at once:
# the result holds one list of tokens per sentence, so the sentence boundaries
# are maintained.
words = [nltk.word_tokenize(sentence) for sentence in sentences]


In [None]:
words[50:70] # preview a slice; each inner list holds the tokens of one sentence, so the sentence boundaries are visible

#### Extracting part-of-speech information from the text

In [None]:
# Tag one sentence at a time (``words`` holds one token list per sentence, so
# the sentence boundaries are maintained), assigning each token its most
# probable part of speech.
tagged_sentences = [nltk.pos_tag(tokens) for tokens in words]

In [None]:
#%store tagged_sentences >> pos_pride_and_prejudice.txt
#%store tagged_sentences >> pos_the_main_street.txt
#%store tagged_sentences >> pos_the_jungle.txt

In [None]:
tagged_sentences[100:150] # preview a slice; each inner list is one tagged sentence, so the sentence boundaries are visible

In [None]:
# Accumulators for the extracted entity names. Only geopolitical entities are
# collected by default; uncomment another list to collect that type as well.
gpe = []  # geopolitical entities
#locations = []  # locations
#persons = []  # persons
#organizations = []  # organizations

#### Extracting named entities from the text

In [None]:
# Chunk every part-of-speech tagged sentence and collect the entity names of
# the wanted type.
for tagged_sentence in tagged_sentences:
    # Predict which noun phrases are named entities. With binary=True the
    # model only predicts whether something is a named entity or not; with
    # binary=False it predicts the type of the named entity.
    entity_tree = nltk.ne_chunk(tagged_sentence, binary=False)
    found_gpe = extract_entity_names(entity_tree, 'GPE')  # geopolitical entities in this sentence
    gpe.extend(found_gpe)
    #locations.extend(extract_entity_names(entity_tree, 'LOCATION'))  # locations in this sentence
    #persons.extend(extract_entity_names(entity_tree, 'PERSON'))  # persons in this sentence
    #organizations.extend(extract_entity_names(entity_tree, 'ORGANIZATION'))  # organizations in this sentence

In [None]:
length = len(gpe) # total number of extracted geopolitical entity mentions (repeated names are counted each time)
#length = len(locations)
#length = len(persons)
#length = len(organizations)

In [None]:
length # display the number of extracted entity mentions


In [None]:
set(gpe) # display the distinct geopolitical entity names extracted from the text
#set(locations)
#set(persons)
#set(organizations)

In [None]:
# Convert a list of entity names into a dictionary that counts the occurrences of each name.
#di_locations = list_to_dict(locations)
#di_persons = list_to_dict(persons)
#di_organizations = list_to_dict(organizations)
di_gpe = list_to_dict(gpe) # occurrences of each geopolitical entity

In [None]:
# Order the counts from the most to the least frequent entity; an OrderedDict
# keeps the entries in that order.
#di_locations = OrderedDict(sorted(di_locations.items(), key=lambda pair: pair[1], reverse=True))
#di_persons = OrderedDict(sorted(di_persons.items(), key=lambda pair: pair[1], reverse=True))
#di_organizations = OrderedDict(sorted(di_organizations.items(), key=lambda pair: pair[1], reverse=True))
ranked_gpe = sorted(di_gpe.items(), key=lambda pair: pair[1], reverse=True)
di_gpe = OrderedDict(ranked_gpe)

In [None]:
#di_locations
#di_persons
#di_organizations
di_gpe # display the geopolitical entity counts, most frequent first

In [None]:
#%store di_persons >> dic_pride_and_prejudice_per.txt 
#%store di_locations >> dic_pride_and_prejudice_loc.txt
#%store di_organizations >> dic_pride_and_prejudice_org.txt

#%store di_persons >> dic_main_street_per.txt
#%store di_locations >> dic_main_street_loc.txt
#%store di_organizations >> dic_main_street_org.txt

#%store di_persons >> dic_the_jungle_per.txt
#%store di_locations >> dic_the_jungle_loc.txt
#%store di_organizations >> dic_the_jungle_org.txt 

#### Extracting noun phrases

In [None]:
# Collect simple noun phrases from every part-of-speech tagged sentence.
nounphrases = []
# Regular expression over part-of-speech tags: an optional determiner (DT),
# any number of adjectives (JJ), then a noun tagged NN.
grammar = "NP: {<DT>?<JJ>*<NN>}"
# The parser depends only on the grammar, so it is built once instead of once
# per sentence.
cp = nltk.RegexpParser(grammar)
for sent in tagged_sentences:
    result = cp.parse(sent)
    # A separate loop name keeps the outer ``sent`` from being overwritten.
    for subtree in result.subtrees():
        if subtree.label() == "NP":
            # Each phrase is stored as the string form of its list of leaves.
            nounphrases.append(str(subtree.leaves()))
for i in nounphrases:
    print (i)

#print (nounphrases)
#%store nounphrases >> nounphrases_pride_and_prejudice.txt
#%store nounphrases >> nounphrases_main_street.txt
#%store nounphrases >> nounphrases_the_jungle.txt