# Named entities extraction

#### After the exploratory analysis, the focus is on identifying and extracting named entities in the text

In [1]:
import nltk # imports nltk

In [2]:
nltk.download("punkt") # downloads punkt tokenizer models
nltk.download('averaged_perceptron_tagger') # downloads the algorithm for predicting the part of speech information
nltk.download('maxent_ne_chunker') # downloads the maximum entropy chunker that has been trained on the ACE 2004 corpus https://catalog.ldc.upenn.edu/LDC2005T09
nltk.download('words') #downloads the list of English language words
nltk.download('stopwords') # downloads the list of stopwords
from nltk import word_tokenize, pos_tag, ne_chunk #imports a word tokenizer, part of speech tagger and named entity tagger

[nltk_data] Downloading package punkt to /home/harish/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/harish/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/harish/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/harish/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /home/harish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from IPython.display import display
import string
import urllib2
import os
import re
from collections import OrderedDict
import sys
import codecs
codecs.register_error("strict", codecs.ignore_errors)

#### Function for extracting named entities

In [4]:
def extract_entity_names(t, entity_type): # this function has been defined here: https://gist.github.com/onyxfish/322906
    entity_names = []

    if hasattr(t, 'label') and t.label:
        if t.label() == entity_type:
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child, entity_type))

    return entity_names

#### Function for converting a list to a dictionary structure

In [5]:
def list_to_dict(li):  
     dct = {}  
     for item in li:  
         if dct.has_key(item):  
             dct[item] = dct[item] + 1  
         else:  
             dct[item] = 1  
     return dct  

#### File opening and reading 

In [6]:
url = "https://www.gutenberg.org/files/140/140-0.txt" #assigns a link to a variable
data = urllib2.urlopen(url.encode('utf-8')).read() #opens up the link/file
data = data.replace("\r\n", " ")
data = re.sub('^(.*?)START OF THIS PROJECT GUTENBERG EBOOK', '', data) #removes Project Gutenberg metadata
data = re.sub('End of the Project Gutenberg(.*?)$', '', data)#removes Project Gutenberg metadata
data = unicode(data, errors="ignore") #ignores encoding errors
data[:10000]
#Alternative texts to use
#https://www.gutenberg.org/files/140/140-0.txt The Jungle by Upton Sinclair
# https://www.gutenberg.org/files/1342/1342-0.txt Pride and Prejudice by Jane Austen
#https://www.gutenberg.org/files/543/543-0.txt The Main Street by Sinclair Lewis

u' THE JUNGLE ***     Produced by David Meltzer, Christy Phillips, Scott Coulter, Leroy Smith and David Widger       THE JUNGLE  by Upton Sinclair   (1906)      Chapter 1   It was four oclock when the ceremony was over and the carriages began to arrive. There had been a crowd following all the way, owing to the exuberance of Marija Berczynskas. The occasion rested heavily upon Marijas broad shoulders--it was her task to see that all things went in due form, and after the best home traditions; and, flying wildly hither and thither, bowling every one out of the way, and scolding and exhorting all day with her tremendous voice, Marija was too eager to see that others conformed to the proprieties to consider them herself. She had left the church last of all, and, desiring to arrive first at the hall, had issued orders to the coachman to drive faster. When that personage had developed a will of his own in the matter, Marija had flung up the window of the carriage, and, leaning out, proceede

#### Sentence tokenization

In [7]:
sentences = nltk.sent_tokenize(data) #tokenizes the file into sentences
sentences [20:40]

[u'She stood in the doorway, shepherded by Cousin Marija, breathless from pushing through the crowd, and in her happiness painful to look upon.',
 u'There was a light of wonder in her eyes and her lids trembled, and her otherwise wan little face was flushed.',
 u'She wore a muslin dress, conspicuously white, and a stiff little veil coming to her shoulders.',
 u'There were five pink paper roses twisted in the veil, and eleven bright green rose leaves.',
 u'There were new white cotton gloves upon her hands, and as she stood staring about her she twisted them together feverishly.',
 u'It was almost too much for her--you could see the pain of too great emotion in her face, and all the tremor of her form.',
 u'She was so young--not quite sixteen--and small for her age, a mere child; and she had just been married--and married to Jurgis,* (*Pronounced Yoorghis) of all men, to Jurgis Rudkus, he with the white flower in the buttonhole of his new black suit, he with the mighty shoulders and the 

In [8]:
senten = len(sentences)
senten

6643

####  Word tokenization

In [9]:
words = [nltk.word_tokenize(sent) for sent in sentences] # rather than tokenizing the entire file that has been converted into a string, this function tokenizes each of the sentence -- the sentence boundaries are maintained


In [26]:
words[20:70] # notice the sentence boundaries

[[u'She',
  u'stood',
  u'in',
  u'the',
  u'doorway',
  u',',
  u'shepherded',
  u'by',
  u'Cousin',
  u'Marija',
  u',',
  u'breathless',
  u'from',
  u'pushing',
  u'through',
  u'the',
  u'crowd',
  u',',
  u'and',
  u'in',
  u'her',
  u'happiness',
  u'painful',
  u'to',
  u'look',
  u'upon',
  u'.'],
 [u'There',
  u'was',
  u'a',
  u'light',
  u'of',
  u'wonder',
  u'in',
  u'her',
  u'eyes',
  u'and',
  u'her',
  u'lids',
  u'trembled',
  u',',
  u'and',
  u'her',
  u'otherwise',
  u'wan',
  u'little',
  u'face',
  u'was',
  u'flushed',
  u'.'],
 [u'She',
  u'wore',
  u'a',
  u'muslin',
  u'dress',
  u',',
  u'conspicuously',
  u'white',
  u',',
  u'and',
  u'a',
  u'stiff',
  u'little',
  u'veil',
  u'coming',
  u'to',
  u'her',
  u'shoulders',
  u'.'],
 [u'There',
  u'were',
  u'five',
  u'pink',
  u'paper',
  u'roses',
  u'twisted',
  u'in',
  u'the',
  u'veil',
  u',',
  u'and',
  u'eleven',
  u'bright',
  u'green',
  u'rose',
  u'leaves',
  u'.'],
 [u'There',
  u'were',
  u

#### Extracting part of speech information from the text 

In [11]:
tagged_sentences = [nltk.pos_tag(word) for word in words] # for each sentence in words (sentence boundaries are maintained in words) assign the most probable part of speech for each word in the sentence

In [28]:
import os
os.getcwd()

'/home/harish/PycharmProjects/Events/DHCS/DHCS-workshop-notebooks/DHCS-workshop-notebooks'

In [27]:
#%store tagged_sentences >> pos_pride_and_prejudice.txt
%store tagged_sentences >> pos_the_main_street.txt
#%store tagged_sentences >> pos_the_jungle.txt

Writing 'tagged_sentences' (list) to file 'pos_the_main_street.txt'.


In [13]:
tagged_sentences[100:150] # notice the sentence boundaries

[[(u'Then', 'RB'),
  (u'the', 'DT'),
  (u'tears', 'NNS'),
  (u'begin', 'VBP'),
  (u'to', 'TO'),
  (u'come', 'VB'),
  (u'into', 'IN'),
  (u'her', 'PRP$'),
  (u'eyes', 'NNS'),
  (u';', ':'),
  (u'and', 'CC'),
  (u'as', 'IN'),
  (u'she', 'PRP'),
  (u'is', 'VBZ'),
  (u'ashamed', 'VBN'),
  (u'to', 'TO'),
  (u'wipe', 'VB'),
  (u'them', 'PRP'),
  (u'away', 'RB'),
  (u',', ','),
  (u'and', 'CC'),
  (u'ashamed', 'VBD'),
  (u'to', 'TO'),
  (u'let', 'VB'),
  (u'them', 'PRP'),
  (u'run', 'VB'),
  (u'down', 'RP'),
  (u'her', 'PRP$'),
  (u'cheeks', 'NN'),
  (u',', ','),
  (u'she', 'PRP'),
  (u'turns', 'VBZ'),
  (u'and', 'CC'),
  (u'shakes', 'VBZ'),
  (u'her', 'PRP$'),
  (u'head', 'NN'),
  (u'a', 'DT'),
  (u'little', 'JJ'),
  (u',', ','),
  (u'and', 'CC'),
  (u'then', 'RB'),
  (u'flushes', 'NNS'),
  (u'red', 'VBD'),
  (u'when', 'WRB'),
  (u'she', 'PRP'),
  (u'sees', 'VBZ'),
  (u'that', 'IN'),
  (u'Jurgis', 'NNP'),
  (u'is', 'VBZ'),
  (u'watching', 'VBG'),
  (u'her', 'PRP'),
  (u'.', '.')],
 [(u'When'

In [14]:
#locations = [] # opens an empty list for locations
gpe =[] #opens an empty list for geopolitical entities
#persons = [] #opens an empty list for persons
#organizations = [] #opens an empty list for organizations

#### Extracting named entities from the text

In [15]:
for i in tagged_sentences: # for each part of speech tagged sentence in a tagged_sentences list, extract named entities based on the chunking
    chunked_sentence = nltk.ne_chunk(i, binary=False) # predict which noun phrases in the text are named entities. If binary is set to True, ask the model to predict whether something is a named entity or not. If set to False, the model predicts the type of named entity.   
    gpe.extend(extract_entity_names(chunked_sentence, 'GPE'))#extract geopolitical entities from the text
    #locations.extend(extract_entity_names(chunked_sentence, 'LOCATION')) # extract loctions from the text
    #persons.extend(extract_entity_names(chunked_sentence, 'PERSON')) #extract persons from the text
    #organizations.extend(extract_entity_names(chunked_sentence, 'ORGANIZATION')) #extract organizations from the text

In [16]:
length = len(gpe)
#length = len(locations)
#length = len(persons)
#length = len(organizations)

In [17]:
length


1125

In [18]:
set(gpe) #get a unique set of geopolitical entities from the text
#set(locations)
#set(persons)
#set(organizations)

{u'Africa',
 u'Again',
 u'Agriculture',
 u'Ah',
 u'Aha',
 u'Ai',
 u'Aint',
 u'Aladdin',
 u'Aldermen',
 u'Always',
 u'America',
 u'American',
 u'Americans',
 u'And',
 u'Aniele',
 u'Anieles',
 u'Antanas',
 u'Anyhow',
 u'Anything',
 u'Appeal',
 u'Argentina',
 u'August',
 u'Aunt',
 u'Australia',
 u'Awful',
 u'Ballroom',
 u'Bean',
 u'Beda',
 u'Been',
 u'Bethlehem',
 u'Beyond',
 u'Bohemian',
 u'Bohemians',
 u'Boston',
 u'Bribery',
 u'Brotherhood',
 u'Brown',
 u'Browns',
 u'Bungtown',
 u'Bureau',
 u'CHICAGO',
 u'California',
 u'Canada',
 u'Capital',
 u'Cheap',
 u'Chicago',
 u'Chicagos',
 u'Chinamen',
 u'Christ',
 u'Christmas',
 u'Cincinnati',
 u'City',
 u'Cleo',
 u'Colorado',
 u'Come',
 u'Comrade',
 u'Couldnt',
 u'County',
 u'Creek',
 u'Cut',
 u'Czia',
 u'De',
 u'Dekui',
 u'Delicatessen',
 u'Der',
 u'Detroit',
 u'Dieve',
 u'Dont',
 u'Dot',
 u'Downstairs',
 u'Doyle',
 u'Duane',
 u'Duanes',
 u'Duck',
 u'Durham',
 u'Durhams',
 u'Durys',
 u'Dutchwoman',
 u'East',
 u'Eastern',
 u'Eat',
 u'Ecrasez'

In [19]:
#di_locations = list_to_dict(locations) # convert a list to a dictionary that counts the occurrences of each of geographical entity
#di_persons = list_to_dict(persons)
#di_organizations = list_to_dict(organizations)
di_gpe = list_to_dict(gpe)

In [20]:
#di_locations = OrderedDict(sorted(di_locations.items(), key=lambda t: t[1], reverse=True))
#di_persons = OrderedDict(sorted(di_persons.items(), key=lambda t: t[1], reverse=True))
#di_organizatios = OrderedDict(sorted(di_organizations.items(), key=lambda t: t[1], reverse=True))
di_gpe = OrderedDict(sorted(di_gpe.items(), key=lambda t: t[1], reverse=True)) #sort the dictionary in a decreasing order

In [21]:
#di_locations
#di_persons
#di_organizations
di_gpe

OrderedDict([(u'Jurgis', 166),
             (u'Packingtown', 72),
             (u'Chicago', 64),
             (u'Ona', 45),
             (u'America', 34),
             (u'Durhams', 18),
             (u'Jurgiss', 18),
             (u'English', 17),
             (u'Lithuanian', 15),
             (u'Onas', 15),
             (u'American', 14),
             (u'Kotrina', 13),
             (u'Browns', 12),
             (u'Hamilton', 11),
             (u'United States', 10),
             (u'Lithuania', 10),
             (u'Elzbieta', 9),
             (u'Marija', 9),
             (u'Im', 9),
             (u'Polish', 8),
             (u'Ah', 8),
             (u'Dont', 7),
             (u'New York', 7),
             (u'Socialism', 6),
             (u'Good', 6),
             (u'Irish', 6),
             (u'Guvner', 5),
             (u'Marijas', 5),
             (u'French', 5),
             (u'Antanas', 5),
             (u'Little', 5),
             (u'Nobody', 4),
             (u'Massachusetts', 4),

In [22]:
#%store di_persons >> dic_pride_and_prejudice_per.txt 
#%store di_locations >> dic_pride_and_prejudice_loc.txt
#%store di_organizations >> dic_pride_and_prejudice_org.txt

#%store di_persons >> dic_main_street_per.txt
#%store di_locations >> dic_main_street_per_loc.txt
#%store di_organizations >> dic_main_street_org_org.txt

#%store di_persons >> dic_the_jungle_per.txt
#%store di_locations >> dic_the_jungle_loc.txt
#%store di_organizations >> dic_the_jungle_org.txt 

#### Extracting noun phrases

In [23]:
nounphrases = []
grammar = "NP: {<DT>?<JJ>*<NN>}" #running regular expressions on part of speech to obtain noun phrase constituents from the text
for sent in tagged_sentences:
    cp = nltk.RegexpParser(grammar)
    result = cp.parse(sent)
    for sent in result.subtrees():
            if sent.label() == "NP":
                nounphrases.append(str(sent.leaves()))
for i in nounphrases:
    print (i)

#print (nounphrases)
#%store nounphrases >> nounphrases_pride_and_prejudice.txt
#%store nounphrases >> nounphrases_main_street.txt
#%store nounphrases >> nounphrases_the_jungle.txt

[(u'Chapter', 'NN')]
[(u'oclock', 'NN')]
[(u'the', 'DT'), (u'ceremony', 'NN')]
[(u'a', 'DT'), (u'crowd', 'NN')]
[(u'the', 'DT'), (u'way', 'NN')]
[(u'the', 'DT'), (u'exuberance', 'NN')]
[(u'The', 'DT'), (u'occasion', 'NN')]
[(u'task', 'NN')]
[(u'due', 'JJ'), (u'form', 'NN')]
[(u'home', 'NN')]
[(u'out', 'NN')]
[(u'the', 'DT'), (u'way', 'NN')]
[(u'all', 'DT'), (u'day', 'NN')]
[(u'tremendous', 'JJ'), (u'voice', 'NN')]
[(u'the', 'DT'), (u'church', 'NN')]
[(u'the', 'DT'), (u'hall', 'NN')]
[(u'the', 'DT'), (u'coachman', 'NN')]
[(u'faster', 'NN')]
[(u'that', 'DT'), (u'personage', 'NN')]
[(u'the', 'DT'), (u'matter', 'NN')]
[(u'the', 'DT'), (u'window', 'NN')]
[(u'the', 'DT'), (u'carriage', 'NN')]
[(u'opinion', 'NN')]
[(u'the', 'DT'), (u'advantage', 'NN')]
[(u'altitude', 'NN')]
[(u'the', 'DT'), (u'driver', 'NN')]
[(u'ground', 'NN')]
[(u'the', 'DT'), (u'result', 'NN')]
[(u'a', 'DT'), (u'furious', 'JJ'), (u'altercation', 'NN')]
[(u'the', 'DT'), (u'way', 'NN')]
[(u'a', 'DT'), (u'new', 'JJ'), (u'swar

[(u'an', 'DT'), (u'extraordinary', 'JJ'), (u'piece', 'NN')]
[(u'good', 'JJ'), (u'fortune', 'NN')]
[(u'this', 'DT'), (u'juncture', 'NN')]
[(u'the', 'DT'), (u'morning', 'NN')]
[(u'a', 'DT'), (u'woeful', 'JJ'), (u'voyage', 'NN')]
[(u'a', 'DT'), (u'man', 'NN')]
[(u'part', 'NN')]
[(u'the', 'DT'), (u'day', 'NN')]
[(u'this', 'DT'), (u'new', 'JJ'), (u'world', 'NN')]
[(u'the', 'DT'), (u'point', 'NN')]
[(u'a', 'DT'), (u'boardinghouse', 'NN')]
[(u'the', 'DT'), (u'other', 'JJ'), (u'side', 'NN')]
[(u'choice', 'NN')]
[(u'the', 'DT'), (u'moment', 'NN')]
[(u'nothing', 'NN')]
[(u'practical', 'JJ'), (u'experience', 'NN')]
[(u'this', 'DT'), (u'land', 'NN')]
[(u'the', 'DT'), (u'cruel', 'NN')]
[(u'fact', 'NN')]
[(u'a', 'DT'), (u'land', 'NN')]
[(u'the', 'DT'), (u'poor', 'JJ'), (u'man', 'NN')]
[(u'any', 'DT'), (u'other', 'JJ'), (u'corner', 'NN')]
[(u'the', 'DT'), (u'earth', 'NN')]
[(u'a', 'DT'), (u'night', 'NN')]
[(u'wealth', 'NN')]
[(u'the', 'DT'), (u'discovery', 'NN')]
[(u'spending', 'NN')]
[(u'money', 'NN

[(u'the', 'DT'), (u'balance', 'NN')]
[(u'a', 'DT'), (u'month', 'NN')]
[(u'the', 'DT'), (u'total', 'NN')]
[(u'the', 'DT'), (u'sale', 'NN')]
[(u'a', 'DT'), (u'house', 'NN')]
[(u'the', 'DT'), (u'house', 'NN')]
[(u'lot', 'NN')]
[(u'everything', 'NN')]
[(u'the', 'DT'), (u'lawyer', 'NN')]
[(u'any', 'DT'), (u'sort', 'NN')]
[(u'the', 'DT'), (u'world', 'NN')]
[(u'anything', 'NN')]
[(u'question', 'NN')]
[(u'mute', 'NN')]
[(u'agony', 'NN')]
[(u'fate', 'NN')]
[(u'questioning', 'NN')]
[(u'the', 'DT'), (u'time', 'NN')]
[(u'the', 'DT'), (u'bargain', 'NN')]
[(u'twice', 'NN')]
[(u'this', 'DT'), (u'lawyer', 'NN')]
[(u'the', 'DT'), (u'truth', 'NN')]
[(u'the', 'DT'), (u'conspiracy', 'NN')]
[(u'excuse', 'NN')]
[(u'the', 'DT'), (u'room', 'NN')]
[(u'decision', 'NN')]
[(u'half', 'JJ'), (u'blind', 'NN')]
[(u'jacket', 'NN')]
[(u'the', 'DT'), (u'precious', 'JJ'), (u'money', 'NN')]
[(u'a', 'DT'), (u'corner', 'NN')]
[(u'the', 'DT'), (u'room', 'NN')]
[(u'a', 'DT'), (u'fever', 'NN')]
[(u'fright', 'NN')]
[(u'stepmoth

[(u'the', 'DT'), (u'blow', 'NN')]
[(u'an', 'DT'), (u'enemy', 'NN')]
[(u'a', 'DT'), (u'day', 'NN')]
[(u'the', 'DT'), (u'rain', 'NN')]
[(u'all', 'DT'), (u'day', 'NN')]
[(u'no', 'DT'), (u'laughing', 'JJ'), (u'matter', 'NN')]
[(u'a', 'DT'), (u'working', 'JJ'), (u'girl', 'NN')]
[(u'waterproofs', 'NN')]
[(u'the', 'DT'), (u'streetcar', 'NN')]
[(u'this', 'DT'), (u'car', 'NN')]
[(u'line', 'NN')]
[(u'money', 'NN')]
[(u'the', 'DT'), (u'city', 'NN')]
[(u'an', 'DT'), (u'ordinance', 'NN')]
[(u'a', 'DT'), (u'rage', 'NN')]
[(u'a', 'DT'), (u'rule', 'NN')]
[(u'the', 'DT'), (u'fare', 'NN')]
[(u'the', 'DT'), (u'passenger', 'NN')]
[(u'the', 'DT'), (u'transfer', 'NN')]
[(u'the', 'DT'), (u'conductor', 'NN')]
[(u'a', 'DT'), (u'transfer', 'NN')]
[(u'way', 'NN')]
[(u'the', 'DT'), (u'conductor', 'NN')]
[(u'the', 'DT'), (u'time', 'NN')]
[(u'the', 'DT'), (u'transfer', 'NN')]
[(u'the', 'DT'), (u'conductor', 'NN')]
[(u'a', 'DT'), (u'language', 'NN')]
[(u'a', 'DT'), (u'word', 'NN')]
[(u'the', 'DT'), (u'bell', 'NN')]


[(u'no', 'DT'), (u'place', 'NN')]
[(u'a', 'DT'), (u'prostitute', 'NN')]
[(u'a', 'DT'), (u'decent', 'JJ'), (u'girl', 'NN')]
[(u'a', 'DT'), (u'population', 'NN')]
[(u'low-class', 'NN')]
[(u'the', 'DT'), (u'verge', 'NN')]
[(u'starvation', 'NN')]
[(u'dependent', 'NN')]
[(u'life', 'NN')]
[(u'the', 'DT'), (u'whim', 'NN')]
[(u'every', 'DT'), (u'bit', 'NN')]
[(u'the', 'DT'), (u'old-time', 'JJ'), (u'slave', 'NN')]
[(u'immorality', 'NN')]
[(u'prevalent', 'NN')]
[(u'the', 'DT'), (u'system', 'NN')]
[(u'chattel', 'NN')]
[(u'slavery', 'NN')]
[(u'the', 'DT'), (u'packing', 'NN')]
[(u'the', 'DT'), (u'time', 'NN')]
[(u'everybody', 'NN')]
[(u'the', 'DT'), (u'old', 'JJ'), (u'slavery', 'NN')]
[(u'no', 'DT'), (u'difference', 'NN')]
[(u'color', 'NN')]
[(u'master', 'NN')]
[(u'slave', 'NN')]
[(u'morning', 'NN')]
[(u'home', 'NN')]
[(u'the', 'DT'), (u'man-doctor', 'NN')]
[(u'whim', 'NN')]
[(u'a', 'DT'), (u'fine', 'JJ'), (u'baby', 'NN')]
[(u'an', 'DT'), (u'enormous', 'JJ'), (u'big', 'JJ'), (u'boy', 'NN')]
[(u'a',

[(u'scrubbing', 'NN')]
[(u'vigorous', 'JJ'), (u'exercise', 'NN')]
[(u'nothing', 'NN')]
[(u'discovery', 'NN')]
[(u'a', 'DT'), (u'substance', 'NN')]
[(u'energy', 'NN')]
[(u'an', 'DT'), (u'unlimited', 'JJ'), (u'time', 'NN')]
[(u'power', 'NN')]
[(u'the', 'DT'), (u'food', 'NN')]
[(u'the', 'DT'), (u'table', 'NN')]
[(u'taste', 'NN')]
[(u'the', 'DT'), (u'whole', 'JJ'), (u'family', 'NN')]
[(u'vomiting', 'NN')]
[(u'anything', 'NN')]
[(u'stomach', 'NN')]
[(u'a', 'DT'), (u'knife', 'NN')]
[(u'fork', 'NN')]
[(u'mouth', 'NN')]
[(u'throat', 'NN')]
[(u'the', 'DT'), (u'poison', 'NN')]
[(u'spite', 'NN')]
[(u'the', 'DT'), (u'plant', 'NN')]
[(u'stand', 'NN')]
[(u'the', 'DT'), (u'blinding', 'NN')]
[(u'clouds', 'NN')]
[(u'dust', 'NN')]
[(u'the', 'DT'), (u'end', 'NN')]
[(u'the', 'DT'), (u'week', 'NN')]
[(u'a', 'DT'), (u'fertilizer', 'NN')]
[(u'man', 'NN')]
[(u'life', 'NN')]
[(u'head', 'NN')]
[(u'another', 'DT'), (u'summer', 'NN')]
[(u'a', 'DT'), (u'summer', 'NN')]
[(u'prosperity', 'NN')]
[(u'the', 'DT'), (u'c

[(u'a', 'DT'), (u'man', 'NN')]
[(u'the', 'DT'), (u'aspect', 'NN')]
[(u'prison', 'NN')]
[(u'life', 'NN')]
[(u'the', 'DT'), (u'arrival', 'NN')]
[(u'a', 'DT'), (u'cell', 'NN')]
[(u'mate', 'NN')]
[(u'face', 'NN')]
[(u'the', 'DT'), (u'wall', 'NN')]
[(u'sulk', 'NN')]
[(u'the', 'DT'), (u'conversation', 'NN')]
[(u'man', 'NN')]
[(u'wonder', 'NN')]
[(u'the', 'DT'), (u'other', 'JJ'), (u'told', 'NN')]
[(u'midnight', 'NN')]
[(u'a', 'DT'), (u'night', 'NN')]
[(u'The', 'DT'), (u'young', 'JJ'), (u'fellow', 'NN')]
[(u'an', 'DT'), (u'amused', 'JJ'), (u'contempt', 'NN')]
[(u'a', 'DT'), (u'sort', 'NN')]
[(u'mule', 'NN')]
[(u'injustice', 'NN')]
[(u'the', 'DT'), (u'time', 'NN')]
[(u'war', 'NN')]
[(u'society', 'NN')]
[(u'a', 'DT'), (u'genial', 'JJ'), (u'freebooter', 'NN')]
[(u'the', 'DT'), (u'enemy', 'NN')]
[(u'fear', 'NN')]
[(u'shame', 'NN')]
[(u'annihilation', 'NN')]
[(u'spirit', 'NN')]
[(u'a', 'DT'), (u'goodhearted', 'JJ'), (u'fellow', 'NN')]
[(u'story', 'NN')]
[(u'the', 'DT'), (u'first', 'JJ'), (u'day', '

[(u'headache', 'NN')]
[(u'heartache', 'NN')]
[(u'fertilizer', 'NN')]
[(u'mill', 'NN')]
[(u'job', 'NN')]
[(u'the', 'DT'), (u'boss', 'NN')]
[(u'head', 'NN')]
[(u'place', 'NN')]
[(u'no', 'DT'), (u'room', 'NN')]
[(u'nothing', 'NN')]
[(u'perplexity', 'NN')]
[(u'the', 'DT'), (u'matter', 'NN')]
[(u'work', 'NN')]
[(u'look', 'NN')]
[(u'cold', 'JJ'), (u'indifference', 'NN')]
[(u'nothing', 'NN')]
[(u'the', 'DT'), (u'dreadful', 'JJ'), (u'meaning', 'NN')]
[(u'that', 'DT'), (u'incident', 'NN')]
[(u'a', 'DT'), (u'sinking', 'NN')]
[(u'the', 'DT'), (u'heart', 'NN')]
[(u'stand', 'NN')]
[(u'the', 'DT'), (u'mob', 'NN')]
[(u'the', 'DT'), (u'snow', 'NN')]
[(u'the', 'DT'), (u'time', 'NN')]
[(u'station', 'NN')]
[(u'breakfastless', 'NN')]
[(u'the', 'DT'), (u'throng', 'NN')]
[(u'the', 'DT'), (u'police', 'NN')]
[(u'no', 'DT'), (u'work', 'NN')]
[(u'that', 'DT'), (u'day', 'NN')]
[(u'a', 'DT'), (u'drink', 'NN')]
[(u'a', 'DT'), (u'sandwich', 'NN')]
[(u'old', 'JJ'), (u'union', 'NN')]
[(u'a', 'DT'), (u'dime', 'NN')]
[

[(u'the', 'DT'), (u'end', 'NN')]
[(u'day', 'NN')]
[(u'bed', 'NN')]
[(u'night', 'NN')]
[(u'an', 'DT'), (u'unfortunate', 'JJ'), (u'decision', 'NN')]
[(u'oclock', 'NN')]
[(u'an', 'DT'), (u'hour', 'NN')]
[(u'dinner', 'NN')]
[(u'the', 'DT'), (u'sunlight', 'NN')]
[(u'no', 'DT'), (u'place', 'NN')]
[(u'a', 'DT'), (u'barroom', 'NN')]
[(u'no', 'DT'), (u'place', 'NN')]
[(u'light', 'NN')]
[(u'warmth', 'NN')]
[(u'a', 'DT'), (u'little', 'JJ'), (u'music', 'NN')]
[(u'sit', 'NN')]
[(u'a', 'DT'), (u'companion', 'NN')]
[(u'talk', 'NN')]
[(u'home', 'NN')]
[(u'no', 'DT'), (u'affection', 'NN')]
[(u'life', 'NN')]
[(u'the', 'DT'), (u'pitiful', 'JJ'), (u'mockery', 'NN')]
[(u'the', 'DT'), (u'camaraderie', 'NN')]
[(u'vice', 'NN')]
[(u'a', 'DT'), (u'church', 'NN')]
[(u'an', 'DT'), (u'ill-smelling', 'JJ'), (u'workingman', 'NN')]
[(u'vermin', 'NN')]
[(u'neck', 'NN')]
[(u'course', 'NN')]
[(u'corner', 'NN')]
[(u'unheated', 'JJ'), (u'room', 'NN')]
[(u'a', 'DT'), (u'window', 'NN')]
[(u'a', 'DT'), (u'blank', 'JJ'), (u'w

[(u'leader', 'NN')]
[(u'the', 'DT'), (u'clerk', 'NN')]
[(u'the', 'DT'), (u'court', 'NN')]
[(u'a', 'DT'), (u'decent', 'JJ'), (u'fellow', 'NN')]
[(u'the', 'DT'), (u'fine', 'NN')]
[(u'somebody', 'NN')]
[(u'chose', 'NN')]
[(u'the', 'DT'), (u'future', 'NN')]
[(u'money', 'NN')]
[(u'different', 'JJ'), (u'standard', 'NN')]
[(u'a', 'DT'), (u'great', 'JJ'), (u'deal', 'NN')]
[(u'drinking', 'NN')]
[(u'a', 'DT'), (u'workingman', 'NN')]
[(u'exhaustion', 'NN')]
[(u'hopelessness', 'NN')]
[(u'something', 'NN')]
[(u'an', 'DT'), (u'active', 'JJ'), (u'man', 'NN')]
[(u'kept', 'JJ'), (u'sober', 'NN')]
[(u'friend', 'NN')]
[(u'a', 'DT'), (u'good', 'JJ'), (u'deal', 'NN')]
[(u'fonder', 'NN')]
[(u'both', 'DT'), (u'wine', 'NN')]
[(u'thing', 'NN')]
[(u'the', 'DT'), (u'saloon', 'NN')]
[(u'night', 'NN')]
[(u'a', 'DT'), (u'country', 'NN')]
[(u'customer', 'NN')]
[(u'a', 'DT'), (u'buyer', 'NN')]
[(u'an', 'DT'), (u'out-of-town', 'JJ'), (u'merchant', 'NN')]
[(u'half', 'NN')]
[(u'no', 'DT'), (u'one', 'NN')]
[(u'the', 'DT'

[(u'a', 'DT'), (u'voice', 'NN')]
[(u'business', 'NN')]
[(u'this', 'DT'), (u'time', 'NN')]
[(u'the', 'DT'), (u'crowd', 'NN')]
[(u'part', 'NN')]
[(u'sheepish-looking', 'NN')]
[(u'a', 'DT'), (u'joke', 'NN')]
[(u'the', 'DT'), (u'paint', 'NN')]
[(u'cheeks', 'NN')]
[(u'black-eyed', 'JJ'), (u'young', 'JJ'), (u'girl', 'NN')]
[(u'the', 'DT'), (u'top', 'NN')]
[(u'the', 'DT'), (u'balustrade', 'NN')]
[(u'slippered', 'JJ'), (u'foot', 'NN')]
[(u'the', 'DT'), (u'ankle', 'NN')]
[(u'down', 'NN')]
[(u'the', 'DT'), (u'floor', 'NN')]
[(u'the', 'DT'), (u'hall', 'NN')]
[(u'fun', 'NN')]
[(u'the', 'DT'), (u'procession', 'NN')]
[(u'a', 'DT'), (u'bright', 'JJ'), (u'red', 'JJ'), (u'kimono', 'NN')]
[(u'a', 'DT'), (u'voice', 'NN')]
[(u'the', 'DT'), (u'hall', 'NN')]
[(u'a', 'DT'), (u'glance', 'NN')]
[(u'a', 'DT'), (u'start', 'NN')]
[(u'a', 'DT'), (u'cry', 'NN')]
[(u'half', 'NN')]
[(u'sprang', 'NN')]
[(u'amazement', 'NN')]
[(u'Jurgis', 'NN')]
[(u'the', 'DT'), (u'street', 'NN')]
[(u'a', 'DT'), (u'silence', 'NN')]
[(u

[(u'the', 'DT'), (u'beautiful', 'JJ'), (u'lady', 'NN')]
[(u'side', 'NN')]
[(u'the', 'DT'), (u'neck', 'NN')]
[(u'the', 'DT'), (u'man', 'NN')]
[(u'the', 'DT'), (u'other', 'JJ'), (u'side', 'NN')]
[(u'a', 'DT'), (u'hobo', 'NN')]
[(u'no', 'DT'), (u'place', 'NN')]
[(u'that', 'DT'), (u'night', 'NN')]
[(u'the', 'DT'), (u'meeting', 'NN')]
[(u'the', 'DT'), (u'audience', 'NN')]
[(u'an', 'DT'), (u'agony', 'NN')]
[(u'uncertainty', 'NN')]
[(u'the', 'DT'), (u'vision', 'NN')]
[(u'last', 'JJ'), (u'forever', 'NN')]
[(u'the', 'DT'), (u'thing', 'NN')]
[(u'seat', 'NN')]
[(u'the', 'DT'), (u'same', 'JJ'), (u'row', 'NN')]
[(u'the', 'DT'), (u'aisle', 'NN')]
[(u'person', 'NN')]
[(u'the', 'DT'), (u'address', 'NN')]
[(u'nobody', 'NN')]
[(u'the', 'DT'), (u'door', 'NN')]
[(u'the', 'DT'), (u'night', 'NN')]
[(u'air', 'NN')]
[(u'desperation', 'NN')]
[(u'nothing', 'NN')]
[(u'that', 'DT'), (u'speech', 'NN')]
[(u'the', 'DT'), (u'name', 'NN')]
[(u'the', 'DT'), (u'orator', 'NN')]
[(u'man', 'NN')]
[(u'tramp', 'NN')]
[(u'an'