# 1   Information Extraction

In [1]:
import nltk, re, pprint
locs = [('Omnicom', 'IN', 'New York'),
        ('DDB Needham', 'IN', 'New York'),
        ('Kaplan Thaler Group', 'IN', 'New York'),
        ('BBDO South', 'IN', 'Atlanta'),
        ('Georgia-Pacific', 'IN', 'Atlanta')]
query = [e1 for (e1, rel, e2) in locs if e2=='Atlanta']
print(query)

['BBDO South', 'Georgia-Pacific']


## 1.1   Information Extraction Architecture

In [2]:
def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]

# 2   Chunking

## 2.1   Noun Phrase Chunking

In [3]:
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
            ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]

grammar = "NP: {<DT>?<JJ>*<NN>}"

cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [4]:
result.draw()

## 2.2   Tag Patterns

## 2.3   Chunking with Regular Expressions

In [5]:
grammar = r"""
  NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
      {<NNP>+}                # chunk sequences of proper nouns
"""
cp = nltk.RegexpParser(grammar)
sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"),
            ("her", "PP$"), ("long", "JJ"), ("golden", "JJ"), ("hair", "NN")]

In [6]:
print(cp.parse(sentence))

(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))


In [7]:
nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
grammar = "NP: {<NN>+}  # Chunk one or more (two {<NN>{2}}) consecutive nouns"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))

(S (NP money/NN market/NN fund/NN))


## 2.4   Exploring Text Corpora

In [8]:
def find_chunks(chunk_str: str):
    cp = nltk.RegexpParser(chunk_str)
    brown = nltk.corpus.brown
    for sent in brown.tagged_sents():
        tree = cp.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == chunk_str.split(":")[0]: 
                yield subtree

In [9]:
pprint.pprint(list(find_chunks("NOUNS: {<N.*>{4,}}")))

[Tree('NOUNS', [('Court', 'NN-TL'), ('Judge', 'NN-TL'), ('Durwood', 'NP'), ('Pye', 'NP')]),
 Tree('NOUNS', [('Mayor-nominate', 'NN-TL'), ('Ivan', 'NP'), ('Allen', 'NP'), ('Jr.', 'NP')]),
 Tree('NOUNS', [("Georgia's", 'NP$'), ('automobile', 'NN'), ('title', 'NN'), ('law', 'NN')]),
 Tree('NOUNS', [('State', 'NN-TL'), ('Welfare', 'NN-TL'), ("Department's", 'NN$-TL'), ('handling', 'NN')]),
 Tree('NOUNS', [('Fulton', 'NP-TL'), ('Tax', 'NN-TL'), ("Commissioner's", 'NN$-TL'), ('Office', 'NN-TL')]),
 Tree('NOUNS', [('Mayor', 'NN-TL'), ('William', 'NP'), ('B.', 'NP'), ('Hartsfield', 'NP')]),
 Tree('NOUNS', [('Mrs.', 'NP'), ('J.', 'NP'), ('M.', 'NP'), ('Cheshire', 'NP')]),
 Tree('NOUNS', [('E.', 'NP'), ('Pelham', 'NP'), ('Rd.', 'NN-TL'), ('Aj', 'NN')]),
 Tree('NOUNS', [('State', 'NN-TL'), ('Party', 'NN-TL'), ('Chairman', 'NN-TL'), ('James', 'NP'), ('W.', 'NP'), ('Dorsey', 'NP')]),
 Tree('NOUNS', [('Texas', 'NP'), ('Sen.', 'NN-TL'), ('John', 'NP'), ('Tower', 'NP')]),
 Tree('NOUNS', [('Lt.', 'NN-T

## 2.5   Chinking

In [10]:
grammar = r"""
  NP:
    {<.*>+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
  """
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
            ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]
cp = nltk.RegexpParser(grammar)

In [11]:
print(cp.parse(sentence))

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


## 2.6   Representing Chunks: Tags vs Trees

# 3   Developing and Evaluating Chunkers

## 3.1   Reading IOB Format and the CoNLL 2000 Corpus

In [12]:
text = '''
he PRP B-NP 
accepted VBD B-VP 
the DT B-NP 
position NN I-NP 
of IN B-PP 
vice NN B-NP 
chairman NN I-NP 
of IN B-PP 
Carlyle NNP B-NP 
Group NNP I-NP 
, , O 
a DT B-NP 
merchant NN I-NP 
banking NN I-NP 
concern NN I-NP 
. . O
'''
nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()

In [13]:
from nltk.corpus import conll2000
print(conll2000.chunked_sents('train.txt')[99])

(S
  (PP Over/IN)
  (NP a/DT cup/NN)
  (PP of/IN)
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  (VP told/VBD)
  (NP his/PRP$ story/NN)
  ./.)


In [14]:
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])

(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)


## 3.2   Simple Evaluation and Baselines

In [15]:
from nltk.corpus import conll2000
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [16]:
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


In [17]:
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [18]:
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


In [19]:
postags = sorted(set(pos for sent in train_sents
                     for (word,pos) in sent.leaves()))
print(unigram_chunker.tagger.tag(postags))

[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]


In [20]:
class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [21]:
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.3%%
    Precision:     82.3%%
    Recall:        86.8%%
    F-Measure:     84.5%%


## 3.3   Training Classifier-Based Chunkers

In [22]:
class ConsecutiveNPChunkTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.MaxentClassifier.train(train_set, trace=0)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w,t),c) for (w,t,c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)

In [23]:
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    return {"pos": pos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

KeyboardInterrupt: 

In [None]:
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "prevpos": prevpos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

In [None]:
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    return {"pos": pos, "word": word, "prevpos": prevpos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

In [None]:
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]
    if i == len(sentence)-1:
        nextword, nextpos = "<END>", "<END>"
    else:
        nextword, nextpos = sentence[i+1]
    return {"pos": pos,
            "word": word,
            "prevpos": prevpos,
            "nextpos": nextpos,
            "prevpos+pos": "%s+%s" % (prevpos, pos),
            "pos+nextpos": "%s+%s" % (pos, nextpos),
            "tags-since-dt": tags_since_dt(sentence, i)}

In [None]:
def tags_since_dt(sentence, i):
    tags = set()
    for word, pos in sentence[:i]:
        if pos == 'DT':
            tags = set()
        else:
            tags.add(pos)
    return '+'.join(sorted(tags))

In [None]:
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

# 4   Recursion in Linguistic Structure

## 4.1   Building Nested Structure with Cascaded Chunkers

In [24]:
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
cp = nltk.RegexpParser(grammar)
sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
    ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]

In [25]:
print(cp.parse(sentence))

(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))


In [26]:
sentence = [("John", "NNP"), ("thinks", "VBZ"), ("Mary", "NN"),
    ("saw", "VBD"), ("the", "DT"), ("cat", "NN"), ("sit", "VB"),
    ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(cp.parse(sentence))

(S
  (NP John/NNP)
  thinks/VBZ
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))


In [27]:
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))

(S
  (NP John/NNP)
  thinks/VBZ
  (CLAUSE
    (NP Mary/NN)
    (VP
      saw/VBD
      (CLAUSE
        (NP the/DT cat/NN)
        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))


## 4.2   Trees

In [28]:
tree1 = nltk.Tree('NP', ['Alice'])
print(tree1)

(NP Alice)


In [29]:
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
print(tree2)

(NP the rabbit)


In [30]:
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4)

(S (NP Alice) (VP chased (NP the rabbit)))


In [31]:
print(tree4[1])

(VP chased (NP the rabbit))


In [32]:
tree4[1].label()

'VP'

In [33]:
tree4.leaves()

['Alice', 'chased', 'the', 'rabbit']

In [34]:
tree4[1][1][1]

'rabbit'

In [35]:
tree3.draw()

## 4.3   Tree Traversal

In [36]:
def traverse(t):
    try:
        t.label()
    except AttributeError:
        print(t, end=" ")
    else:
        # Now we know that t.node is defined
        print('(', t.label(), end=" ")
        for child in t:
            traverse(child)
        print(')', end=" ")

t = nltk.Tree.fromstring('(S (NP Alice) (VP chased (NP the rabbit)))')
traverse(t)

( S ( NP Alice ) ( VP chased ( NP the rabbit ) ) ) 

# 5   Named Entity Recognition

In [37]:
sent = nltk.corpus.treebank.tagged_sents()
pprint.pprint(list(nltk.ne_chunk_sents(sent, binary=True)))

[Tree('S', [Tree('NE', [('Pierre', 'NNP'), ('Vinken', 'NNP')]), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]),
 Tree('S', [Tree('NE', [('Mr.', 'NNP'), ('Vinken', 'NNP')]), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), Tree('NE', [('Elsevier', 'NNP'), ('N.V.', 'NNP')]), (',', ','), ('the', 'DT'), Tree('NE', [('Dutch', 'NNP')]), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')]),
 Tree('S', [Tree('NE', [('Rudolph', 'NNP'), ('Agnew', 'NNP')]), (',', ','), ('55', 'CD'), ('years', 'NNS'), ('old', 'JJ'), ('and', 'CC'), ('former', 'JJ'), ('chairman', 'NN'), ('of', 'IN'), Tree('NE', [('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP'), ('PLC', 'NNP')]), (',', ','), ('was', 'VBD'), ('named', 'VBN'), ('*-1', '-NONE-'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('of', 

 Tree('S', [('When', 'WRB'), Tree('NE', [('Bell', 'NNP')]), ('established', 'VBD'), ('that', 'IN'), ('the', 'DT'), Tree('NE', [('Berliner', 'NNP')]), ('patent', 'NN'), ('caveat', 'NN'), ('was', 'VBD'), ('registered', 'VBN'), ('*-145', '-NONE-'), ('10', 'CD'), ('days', 'NNS'), ('before', 'IN'), Tree('NE', [('Edison', 'NNP')]), ("'s", 'POS'), ('application', 'NN'), ('*T*-1', '-NONE-'), (',', ','), Tree('NE', [('Western', 'NNP'), ('Union', 'NNP')]), ('dropped', 'VBD'), ('the', 'DT'), ('lawsuit', 'NN'), ('and', 'CC'), ('agreed', 'VBD'), ('*-2', '-NONE-'), ('never', 'RB'), ('to', 'TO'), ('enter', 'VB'), ('the', 'DT'), ('telephone', 'NN'), ('business', 'NN'), ('--', ':'), ('the', 'DT'), ('basis', 'NN'), ('for', 'IN'), ('the', 'DT'), ('company', 'NN'), ("'s", 'POS'), ('current', 'JJ'), ('plight', 'NN'), ('.', '.')]),
 Tree('S', [Tree('NE', [('Oliver', 'NNP'), ('Berliner', 'NNP'), ('Beverly', 'NNP'), ('Hills', 'NNP')]), (',', ','), Tree('NE', [('Calif', 'NNP')]), ('.', '.')]),
 Tree('S', [Tree

 Tree('S', [('``', '``'), ('I', 'PRP'), ('do', 'VBP'), ("n't", 'RB'), ('want', 'VB'), ('*-1', '-NONE-'), ('to', 'TO'), ('denounce', 'VB'), ('it', 'PRP'), ('because', 'IN'), ('*', '-NONE-'), ('denouncing', 'VBG'), ('it', 'PRP'), ('would', 'MD'), ('be', 'VB'), ('like', 'IN'), ('*', '-NONE-'), ('denouncing', 'VBG'), ('capitalism', 'NN'), (',', ','), ("''", "''"), ('he', 'PRP'), ('explains', 'VBZ'), ('*T*-2', '-NONE-'), ('.', '.')]),
 Tree('S', [('And', 'CC'), ('surprising', 'JJ'), ('numbers', 'NNS'), ('of', 'IN'), ('small', 'JJ'), ('investors', 'NNS'), ('seem', 'VBP'), ('*-1', '-NONE-'), ('to', 'TO'), ('be', 'VB'), ('adapting', 'VBG'), ('to', 'TO'), ('greater', 'JJR'), ('stock', 'NN'), ('market', 'NN'), ('volatility', 'NN'), ('and', 'CC'), ('say', 'VB'), ('0', '-NONE-'), ('they', 'PRP'), ('can', 'MD'), ('live', 'VB'), ('with', 'IN'), ('program', 'NN'), ('trading', 'NN'), ('.', '.')]),
 Tree('S', [Tree('NE', [('Glenn', 'NNP'), ('Britta', 'NNP')]), (',', ','), ('a', 'DT'), ('25-year-old', '

In [38]:
pprint.pprint(list(nltk.ne_chunk_sents(sent)))

[Tree('S', [Tree('PERSON', [('Pierre', 'NNP')]), Tree('ORGANIZATION', [('Vinken', 'NNP')]), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]),
 Tree('S', [Tree('PERSON', [('Mr.', 'NNP')]), Tree('PERSON', [('Vinken', 'NNP')]), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), Tree('ORGANIZATION', [('Elsevier', 'NNP')]), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), Tree('GPE', [('Dutch', 'NNP')]), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')]),
 Tree('S', [Tree('PERSON', [('Rudolph', 'NNP')]), Tree('GPE', [('Agnew', 'NNP')]), (',', ','), ('55', 'CD'), ('years', 'NNS'), ('old', 'JJ'), ('and', 'CC'), ('former', 'JJ'), ('chairman', 'NN'), ('of', 'IN'), Tree('ORGANIZATION', [('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP')]), ('PLC', 'NNP'), (',', ','), ('was', 'VBD'), ('named', '

 Tree('S', [('The', 'DT'), ('program-trading', 'JJ'), ('issue', 'NN'), ('is', 'VBZ'), ('heating', 'VBG'), ('up', 'RP'), ('on', 'IN'), Tree('PERSON', [('Capitol', 'NNP'), ('Hill', 'NNP')]), ('as', 'IN'), ('it', 'PRP'), ('is', 'VBZ'), ('*?*', '-NONE-'), ('on', 'IN'), Tree('FACILITY', [('Wall', 'NNP'), ('Street', 'NNP')]), (',', ','), ('and', 'CC'), ('several', 'JJ'), ('legislators', 'NNS'), ('want', 'VB'), ('*-2', '-NONE-'), ('to', 'TO'), ('grant', 'VB'), ('the', 'DT'), ('SEC', 'NNP'), ('the', 'DT'), ('power', 'NN'), ('*', '-NONE-'), ('to', 'TO'), ('shut', 'VB'), ('off', 'RP'), ('the', 'DT'), ('programs', 'NNS'), ('when', 'WRB'), ('trading', 'VBG'), ('becomes', 'VBZ'), ('too', 'RB'), ('volatile', 'JJ'), ('*T*-3', '-NONE-'), ('.', '.')]),
 Tree('S', [Tree('ORGANIZATION', [('SEC', 'NNP')]), ('Chairman', 'NNP'), Tree('PERSON', [('Richard', 'NNP'), ('Breeden', 'NNP')]), ('has', 'VBZ'), ('said', 'VBD'), ('0', '-NONE-'), ('he', 'PRP'), ('would', 'MD'), ('be', 'VB'), ('willing', 'JJ'), ('*-1', 

 Tree('S', [('The', 'DT'), Tree('ORGANIZATION', [('Constitution', 'NNP')]), ('does', 'VBZ'), ('not', 'RB'), ('expressly', 'RB'), ('give', 'VB'), ('the', 'DT'), ('president', 'NN'), ('such', 'JJ'), ('power', 'NN'), ('.', '.')]),
 Tree('S', [('However', 'RB'), (',', ','), ('the', 'DT'), ('president', 'NN'), ('does', 'VBZ'), ('have', 'VB'), ('a', 'DT'), ('duty', 'NN'), ('*', '-NONE-'), ('not', 'RB'), ('to', 'TO'), ('violate', 'VB'), ('the', 'DT'), ('Constitution', 'NNP'), ('.', '.')]),
 Tree('S', [('The', 'DT'), ('question', 'NN'), ('is', 'VBZ'), ('whether', 'IN'), ('his', 'PRP$'), ('only', 'JJ'), ('means', 'NNS'), ('of', 'IN'), ('defense', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('veto', 'NN'), ('.', '.')]),
 Tree('S', [('Excision', 'NN'), ('of', 'IN'), ('appropriations', 'NNS'), ('riders', 'NNS'), ('that', 'WDT'), ('*T*-1', '-NONE-'), ('trespass', 'VBP'), ('on', 'IN'), ('the', 'DT'), ('president', 'NN'), ("'s", 'POS'), ('duties', 'NNS'), ('and', 'CC'), ('prerogative', 'NN'), ('under', 'IN'

 Tree('S', [('``', '``'), ('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('demand', 'NN'), ('that', 'WDT'), ('*T*-3', '-NONE-'), ('must', 'MD'), ('be', 'VB'), ('met', 'VBN'), ('*-1', '-NONE-'), (',', ','), ('regardless', 'RB'), ('of', 'IN'), ('the', 'DT'), ('price', 'NN'), ('of', 'IN'), ('oil', 'NN'), (',', ','), ("''", "''"), ('said', 'VBD'), ('*T*-2', '-NONE-'), Tree('PERSON', [('Mr.', 'NNP'), ('Stevenson', 'NNP')]), ('.', '.')]),
 Tree('S', [Tree('GPE', [('Brazil', 'NNP')]), ('is', 'VBZ'), ('the', 'DT'), ('third-largest', 'JJ'), ('producer', 'NN'), ('*RNR*-1', '-NONE-'), ('and', 'CC'), ('the', 'DT'), ('fifth-largest', 'JJ'), ('exporter', 'NN'), ('*RNR*-1', '-NONE-'), ('of', 'IN'), ('sugar', 'NN'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('.', '.')]),
 Tree('S', [('A', 'DT'), ('shift', 'NN'), ('to', 'TO'), ('*', '-NONE-'), ('producing', 'VBG'), ('more', 'JJR'), ('alcohol', 'NN'), ('and', 'CC'), ('less', 'JJR'), ('sugar', 'NN'), ('had', 'VBD'), ('been', 'VBN'), ('expected', 'VBN'),

## 6   Relation Extraction

In [39]:
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc,
                                     corpus='ieer', pattern = IN):
        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']


In [40]:
from nltk.corpus import conll2002
vnv = """
(
is/V|    # 3rd sing present and
was/V|   # past forms of the verb zijn ('be')
werd/V|  # and also present
wordt/V  # past of worden ('become)
)
.*       # followed by anything
van/Prep # followed by van ('of')
"""
VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
    for rel in nltk.sem.extract_rels('PER', 'ORG', doc,
                                   corpus='conll2002', pattern=VAN):
        print(nltk.sem.clause(rel, relsym="VAN"))

VAN("cornet_d'elzius", 'buitenlandse_handel')
VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
VAN('annie_lennox', 'eurythmics')


In [41]:
print(nltk.rtuple(rel, lcon=True, rcon=True))

...'Door/Prep rugproblemen/N van/Prep zangeres/N')[PER: 'Annie/N Lennox/N'] 'wordt/V het/Art concert/N van/Prep' [ORG: 'Eurythmics/N']('vandaag/Adv in/Prep'...
