# 3-2 Grammars & Entities

## Imports, Custom Function, Data

In [1]:
# IMPORTS
import re
import nltk


# Improved normalization function
# Removes all punctuation except periods (for sentences)
# Lowercases all words and tokenizes
def tknize(a_string):
    """
    Normalize a string and split it into word tokens.

    Every character that is not an ASCII letter, a space, or a period is
    replaced with a space, the result is lowercased, and the cleaned text
    is handed to NLTK's word tokenizer.

    a_string -- the raw text to normalize

    Returns the list of tokens produced by nltk.tokenize.word_tokenize.
    """
    # Raw string: "\." must reach the regex engine as written rather than be
    # read as an (invalid) string escape, which newer Pythons warn about.
    clean = re.sub(r'[^a-zA-Z \.]', ' ', a_string).lower()
    # Create the list of sub-strings (tokens) and return it
    return nltk.tokenize.word_tokenize(clean)

# DATA
# Read the whole story into a single string
with open("../data/mdg.txt", mode="r", encoding="utf-8") as story_file:
    mdg = story_file.read()

# Peek at the first 50 characters to confirm the load worked
print(mdg[:50])

"Off there to the right -- somewhere -- is a large


In [2]:
# Normalize the raw text and split it into a list of tokens
mdg_ = tknize(mdg)

# Show the first ten tokens
print(mdg_[:10])

['off', 'there', 'to', 'the', 'right', 'somewhere', 'is', 'a', 'large', 'island']


## Tagging

See the [list of PoS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) on the UPenn website for details on tags. For a complete list, run the following: `nltk.help.upenn_tagset()` -- please make sure you download the tag set first, `nltk.download("tagsets")`. 

Here are the codes I use most often:

|Tag | Part of Speech |
|----|----------------|
|JJ	 | Adjectives     | 
|NN	 | Nouns          |
|RB	 | Adverbs        |
|PRP | Pronouns       |
|VB	 | Verbs          |

But this [alphabetical list](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) on a UPenn web page is really the most helpful.

In [3]:
# mdg_ is the cleaned token list from tknize(): lowercased, with all
# punctuation except periods removed. Tag each token with its part of speech.
mdg_tagged = nltk.pos_tag(mdg_)

# Display the first ten (token, tag) pairs
mdg_tagged[:10]

[('off', 'RB'),
 ('there', 'EX'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('right', 'NN'),
 ('somewhere', 'RB'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('large', 'JJ'),
 ('island', 'NN')]

QUIZ: Describe the data structure above.

In [5]:
# Keep only the tag from each (word, tag) pair to see what the parser is going to see
reassembled = [tag for _word, tag in mdg_tagged[:10]]
print(" ".join(reassembled))

RB EX TO DT NN RB VBZ DT JJ NN


The purpose of the code block above is to demonstrate that the tagged document is simply a list of tuples, which you can manipulate in a number of ways. Here I am simply grabbing the second value in each tuple (the tag), `tagged[1]`.

We could also do something like this:

In [10]:
# Print every word among the first 100 tokens that NLTK tagged as an adjective (JJ)
for word, tag in mdg_tagged[:100]:
    if tag == 'JJ':
        print(word)

large
rainsford
old
ship
trap
suggestive
curious
i
dank
tropical
palpable
thick
warm
good


***Uh oh!*** NLTK isn't infallible! The NLTK has tagged "rainsford" and "i" as adjectives. (Tagging the "I" as an adjective is something I have encountered before.) First, you should know there are alternatives to NLTK -- I recommend spaCy over TextBlob -- but you also have to determine if these are one-off errors that you can ignore or if you can ignore these errors because of the scale of your work. This is an analytical judgment call: you must make it for yourself, but you must **document** it for others. *Documentation makes **you** look smart!*

### TextBlob

In [11]:
# Get the functionality we need
from textblob import TextBlob

# Create the blob from normalized text: everything except ASCII letters,
# spaces, and periods becomes a space, then the result is lowercased.
# The pattern is a raw string so "\." is not read as an invalid string escape.
blob = TextBlob(re.sub(r'[^a-zA-Z \.]', ' ', mdg).lower())

# Get the noun phrases (so easy!)
mdg_np = blob.noun_phrases

# See some results
for phrase in mdg_np[:10]:
    print(phrase)

large island
old charts
ship trap island whitney
suggestive name isn t
curious dread
i don t
dank tropical night
warm blackness
ve good eyes
i ve


***Lions, tigers, and bears! Oh my!*** TextBlob produces weird results as well. This is not easy! (Think about the implications for large language models and how many inaccurate results are being "averaged" out.)

### spaCy

In [12]:
import spacy

# Load spaCy's small English pipeline and run it over a sample sentence
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One row per token: text, lemma, coarse part of speech, fine-grained tag
for tok in doc:
    print(tok.text, tok.lemma_, tok.pos_, tok.tag_)


Apple Apple PROPN NNP
is be AUX VBZ
looking look VERB VBG
at at ADP IN
buying buy VERB VBG
U.K. U.K. PROPN NNP
startup startup VERB VBD
for for ADP IN
$ $ SYM $
1 1 NUM CD
billion billion NUM CD


In [13]:
# Run the full story through the spaCy pipeline
mdg_doc = nlp(mdg)

# Show the first ten tokens with their coarse part-of-speech labels
for tok in mdg_doc[:10]:
    print(tok.text, tok.pos_)

" PUNCT
Off ADV
there ADV
to ADP
the DET
right NOUN
-- PUNCT
somewhere ADV
-- PUNCT
is AUX


In [14]:
# Gather the text of every token spaCy labels as a proper noun
mdg_PPNs = [token.text for token in mdg_doc if token.pos_ == "PROPN"]

# Printing a set collapses the repeats
print(set(mdg_PPNs))

{'Crimea', 'khaki', 'Swede', 'Russian', 'Dusk', 'New', 'English', 'Mr.', 'Captain', 'Rainsford', 'Madame', 'bush', 'Czar', 'French', 'china', 'borsch', 'cr^epes', 'God', 'huskily', 'Paris', 'Mirage', 'Island', 'Chablis', 'City', 'Watch', 'General', 'Trap', 'gray', "anything?'--", 'Monte', 'jut', 'Swam', 'Caucasus', 'Providence', 'Pistol', 'Bah', 'Lazarus', 'Boredom', 'White', 'Puritan', 'Zaroff', '.the', 'American', 'Marcus', 'Ugh', 'Americans', 'Suzette', 'weary', 'au', 'Au', 'Roger', 'Cliquot', 'staccato', 'Ennui', 'Malacca', 'Carlo', 'revoir', 'Amazon', 'Lucar', 'Doubtless', 'Death', 'Swamp', 'Folies', 'Automatically', 'Cossack', 'Moscow', 'Purdey', 'bay', 'Sleep', 'Rains', 'Sea', 'nearer', 'London', 'Africa', 'York', 'astrakhan', 'Nielsen', 'Gasping', 'Caribbean', 'Uganda', 'Laughter', 'I.', 'Tibet', 'Russia', 'Splendid', 'liqueur', 'San', 'Russians', 'Guns', 'Rio', 'Butterfly', 'Cape', 'Great', 'Aurelius', 'Ship', 'ford', 'Pol', 'general', 'Whitney', 'Bergere', 'Chambertin', 'Sang

In [34]:
# Collect the words NLTK tagged "NN" (singular common noun) among the first 100
# tokens, for comparison with the proper nouns spaCy found above.
# mdg_tagged already holds nltk.pos_tag(mdg_), so reuse it instead of re-tagging
# the whole text. Those tokens were lowercased by tknize(), which is presumably
# why names such as "whitney" and "rainsford" show up here as plain nouns.
mdg_nouns = [word for word, tag in mdg_tagged[:100] if tag == "NN"]

print(mdg_nouns)

['right', 'island', 'whitney', 'mystery', 'island', 'island', 'whitney', 'name', 'isn', 'dread', 'place', 'superstition', 'rainsford', 'night', 'blackness', 'yacht', 'whitney', 'laugh', 'i']


In [39]:
def getPOS(POS, a_string):
    """
    Return the words in a string that carry a given part-of-speech tag.

    The string is tokenized with nltk.word_tokenize and tagged with
    nltk.pos_tag; only tokens whose tag is exactly equal to POS are kept.

    POS      -- the tag to keep, e.g. "NN"
    a_string -- the text to tokenize and tag

    Returns a list of the matching words (plain strings, not (word, tag)
    tuples) in the order they occur in the text, repeats included.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(a_string))
    return [word for word, tag in tagged if tag == POS]

In [41]:
# Unlike mdg_tagged, which was built from the lowercased tknize() tokens,
# getPOS() tokenizes the raw text itself before tagging it.
nouns = getPOS("NN", mdg)

print(nouns)

['right', 'island', 'mystery', 'island', 'name', 'dread', 'place', 'superstition', 'night', 'blackness', 'yacht', 'laugh', 'moose', 'fall', 'bush', 'night', 'moist', 'velvet', 'jaguar', 'hunting', 'sport', 'sport', 'world', 'hunter', 'jaguar', 'rot', 'hunter', 'philosopher', 'jaguar', 'jaguar', 'understanding', 'thing', 'fear', 'fear', 'pain', 'fear', 'death', 'weather', 'realist', 'world', 'island', 'dark', 'place', 'reputation', 'one', 'place', 'sailor', 'lore', 'crew', 'bit', 'today', 'bit', 'devil', 'light', 'look', 'place', 'name', 'sir', 'anything', 'air', 'something', 'chill', 'breeze', 'sea', 'plate-glass', 'window', 'island', 'chill', 'sort', 'dread', 'imagination', 'sailor', 'ship', 'company', 'fear', 'sense', 'danger', 'thing', 'sound', 'light', 'place', 'evil', 'zone', 'pipe', 'afterdeck', 'night', 'breakfast', 'night', 'sound', 'night', 'throb', 'engine', 'yacht', 'darkness', 'ripple', 'wash', 'propeller', 'steamer', 'chair', 'brier', 'drowsiness', 'night', 'night', 'sound

## Chunking

In [None]:
# How to read the grammar below -- an NP (noun phrase) chunk is:
#   <DT>?  an optional (?) determiner
#   <JJ>*  followed by any number (*) of adjectives
#   <NN>   ending with a noun
grammar = "NP: {<DT>?<JJ>*<NN>}"

# Build the chunk parser from the grammar
parser = nltk.RegexpParser(grammar)

# Chunk our tagged text
tree = parser.parse(mdg_tagged)

# See some results
for node in tree[:20]:
    print(node)

In [None]:
# Look at a single child of the chunked tree (the one at index 6)
print(tree[6])

In [None]:
# Number of top-level children in the tree, followed by the tree's height
print(len(tree), tree.height())

In [None]:
# First 20 entries of the list returned by tree.treepositions()
print(tree.treepositions()[0:20])

In [None]:
# Keep only the top-level subtrees labeled "NP". isinstance() is the idiomatic
# type check; plain (word, tag) tuples fail it and are skipped before .label()
# is ever called on them.
NPtrees = [subtree for subtree in tree if isinstance(subtree, nltk.Tree) and subtree.label() == "NP"]

print(len(NPtrees))
print(NPtrees[:20])

In [None]:
# For every top-level "NP" subtree, keep just its leaves: a list of (word, tag)
# pairs. isinstance() is the idiomatic type check; plain (word, tag) tuples fail
# it and are skipped before .label() is ever called on them.
NPleaves = [subtree.leaves() for subtree in tree if isinstance(subtree, nltk.Tree) and subtree.label() == "NP"]

print(len(NPleaves))
print(NPleaves[:20])

In [None]:
# Turn the first 20 noun-phrase chunks back into readable text by joining the
# word of each (word, tag) leaf. The word list is deliberately not called `np`,
# which would shadow the conventional numpy alias in the notebook namespace.
for a_list in NPleaves[:20]:
    phrase = " ".join(word for word, _tag in a_list)
    print(phrase)

### Named Entities

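You will need to download the named entity chunker first: `nltk.download("maxent_ne_chunker")`. The chunker also relies on the `words` corpus, so run `nltk.download("words")` as well if you have not already.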

For more on the kinds of named entities: https://www.nltk.org/book/ch07.html#sec-ner.

In [None]:
# NOTE(review): this rebinds `tree`, replacing the chunk-parser tree built in the
# Chunking section, and the result is not used by the code visible below.
# mdg_tagged comes from the lowercased tknize() tokens -- TODO confirm that
# lowercased input is what is wanted for named-entity chunking.
tree = nltk.ne_chunk(mdg_tagged, binary=True)

def extract_ne(text):
    """
    Return the set of named-entity strings that NLTK finds in ``text``.

    The text is word-tokenized, tagged for part of speech, and passed to
    nltk.ne_chunk with binary=True, so entities are labeled only as "NE"
    rather than by kind. The words of each entity are joined with single
    spaces.
    """
    # Tokenize by word, then apply part-of-speech tags to those words
    tagged_tokens = nltk.pos_tag(nltk.tokenize.word_tokenize(text))
    # Chunk named entities based on those tags
    chunked = nltk.ne_chunk(tagged_tokens, binary=True)

    entities = set()
    for node in chunked:
        # Only nodes that expose .label() and are labeled "NE" are entities
        if hasattr(node, "label") and node.label() == "NE":
            entities.add(" ".join(leaf[0] for leaf in node))
    return entities

In [None]:
# Extract the named entities from the raw story text and show the resulting set
mdg_ne = extract_ne(mdg)
print(mdg_ne)