In [None]:
### CREATE VIRTUAL DISPLAY ###
!apt-get install -y xvfb # Install X Virtual Frame Buffer
import os
os.system('Xvfb :1 -screen 0 1600x1200x16  &')    # create virtual display with size 1600x1200 and 16 bit color. Color can be changed to 24 or 8
os.environ['DISPLAY']=':1.0'    # tell X clients to use our virtual DISPLAY :1.0.

%matplotlib inline

### INSTALL GHOSTSCRIPT (Required to display NLTK trees) ###
!apt-get update
!apt install ghostscript python3-tk

# Parsing with NLTK. Probabilistic context-free grammars
Taken from the NLTK book

NLTK: several treebanks available:
* English: nltk_data/corpora/treebank/combined

* Probabilistic context-free grammars (PCFG)
* How to obtain a PCFG

## Example grammar:

In [None]:
import nltk

# Toy grammar in which the single word 'fish' can be either an NP or a V,
# and an NP can be extended recursively through Sbar.
FISH_RULES = """
 S -> NP V NP
 NP -> NP Sbar
 Sbar -> NP V
 NP -> 'fish'
 V -> 'fish'
 """
grammar = nltk.CFG.fromstring(FISH_RULES)
chart_parser = nltk.ChartParser(grammar)

# Two test sentences, five and seven tokens long.
sent1 = "fish fish fish fish fish".split()
sent2 = "fish fish fish fish fish fish fish".split()



### The problem of ambiguity in NLP

In [None]:
# Print every parse tree the chart parser yields for sent1.
for parse in chart_parser.parse(sent1):
    print(parse)

In [None]:
# Show each parse of sent2 twice: as bracketed text and as a rendered tree.
for parse in chart_parser.parse(sent2):
    print(parse)
    display(parse)

In [None]:
# How many trees?
sent3 = "fish fish fish fish fish fish fish fish fish fish fish".split()
# Only the total matters here, so tally the parses without keeping them.
count = sum(1 for _ in chart_parser.parse(sent3))
print("Number of parses: ", count)

### Examples:
* _I saw the man with the telescope_
  * I used a telescope?
  * The man had a telescope?

* _I saw the man on the hill with the telescope_
  * How many trees?

In [None]:
# Small grammar where a PP can hang either from the VP (V NP PP) or from
# an NP (NP PP).
TELESCOPE_RULES = """
 S -> NP VP
 NP -> PRON | DT NN | NP PP
 VP -> V NP | V NP PP
 DT -> 'the'
 NN -> 'man' | 'hill' |'telescope' |'knife'
 PP -> P NP
 V -> 'saw'
 PRON -> 'I'
 P -> 'on' | 'with'
"""
grammar2 = nltk.CFG.fromstring(TELESCOPE_RULES)
chart_parser = nltk.ChartParser(grammar2)

sent1 = "I saw the man with the telescope".split()

# Print every analysis of the sentence.
for parse in chart_parser.parse(sent1):
    print(parse)

In [None]:
sent2 = "I saw the man on the hill with the knife".split()

# Two prepositional phrases: show every analysis as text and as a rendered tree.
for parse in chart_parser.parse(sent2):
    print(parse)
    display(parse)

## Solution: use a PCFG!

In [None]:
# Probabilistic version of the grammar: each rule carries a probability in
# square brackets.
PCFG_RULES = """
   S -> NP VP [1.0]
   VP -> V NP [0.5]
   VP -> V NP PP [0.5]
   V -> 'saw' [1.0]
   DT -> 'the' [1.0]
   NP -> PRON  [0.22]
   NP -> DT NN [0.56]
   NP -> NP PP [0.22]
   NN -> 'knife' [0.2]
   NN -> 'telescope' [0.2]
   NN -> 'man' [0.4]
   NN -> 'hill' [0.2]
   PP -> P NP [1.0]
   PRON -> 'I' [1.0]
   P -> 'on' [0.33]
   P -> 'with' [0.67]
"""
grammar = nltk.PCFG.fromstring(PCFG_RULES)

# Two parsers built on the same PCFG.
viterbi_parser = nltk.ViterbiParser(grammar)
inside_parser = nltk.parse.InsideChartParser(grammar)

Now we can have the different alternatives ordered by their probability:

In [None]:
sent1 = "I saw the man with the telescope".split()
trees1 = inside_parser.parse(sent1)

# Show each analysis from the probabilistic parser as text and as a rendered tree.
for ranked_tree in trees1:
    print(ranked_tree)
    display(ranked_tree)

In [None]:
sent2 = "I saw the man on the hill with the knife".split()
trees2 = inside_parser.parse(sent2)

# Same as the previous cell, for the sentence with two prepositional phrases.
for ranked_tree in trees2:
    print(ranked_tree)
    display(ranked_tree)

## Annotating a treebank
### We must select a set of sentences to be annotated, for example:
 * _I saw the man with the telescope_
 * _I saw the man on the hill with the telescope_


### Then the trees must be annotated (manually or semiautomatically)

**I saw the man with the telescope** <br>
(S  (NP (PRON I)) <br>
     (VP <br>
       (V saw)  <br>
       (NP (DT the) (NN man)) <br>
       (PP (P with) (NP (DT the) (NN telescope))))) <br> <br>
**I saw the man on the hill with the telescope** <br>
(S <br>
  (NP (PRON I)) <br>
  (VP <br>
    (V saw) <br>
    (NP <br>
      (NP <br>
        (NP (DT the) (NN man)) <br>
        (PP (P on) (NP (DT the) (NN hill)))) <br>
      (PP (P with) (NP (DT the) (NN telescope)))))) <br>

### How to obtain a PCFG? Counting rules <br>
S -> NP VP      [2/2] <br>
 <br>
NP -> PRON    [2/9] <br>
NP -> DT NN   [5/9] <br>
NP -> NP PP    [2/9] <br>
 <br>
VP -> V NP      [1/2] <br>
VP -> V NP PP [1/2] <br>
PP -> P NP      [3/3] <br>
 <br>
DT -> 'the'           [5/5] <br>
 <br>
NN -> 'man'          [2/5] <br>
NN -> 'hill'            [1/5] <br>
NN -> 'telescope' [2/5] <br>
 <br>
PRON -> 'I'             [2/2] <br>
 <br>
P -> 'with'            [2/3] <br>
P -> 'on'               [1/3] <br>
 <br>
V -> 'saw'             [2/2] <br>


### We have a PCFG:

S -> NP VP [1.0] <br>
VP -> V NP [0.5] <br>
VP -> V NP PP [0.5] <br>
V -> 'saw' [1.0] <br>
DT -> 'the' [1.0] <br>
NP -> PRON  [0.22] <br>
NP -> DT NN [0.56] <br>
NP -> NP PP [0.22] <br>
NN -> 'telescope' [0.4] <br>
NN -> 'man' [0.4] <br>
NN -> 'hill' [0.2] <br>
PP -> P NP [1.0] <br>
PRON -> 'I' [1.0] <br>
P -> 'on' [0.33] <br>
P -> 'with' [0.67] <br>

## Exercise: given a (small) treebank, induce a PCFG
( N ( A long) ( N ( A red) ( N hair) ) )

( N ( A nice) ( N tie) )

( N ( A ( A dark) ( A red) ) ( N hair) )

### Obtain its corresponding PCFG:

In [None]:
# Exercise template: XXX, YYY and 0.xxx are placeholders to be replaced with
# the rules and probabilities counted from the three trees listed above.
# NOTE(review): as written the cell presumably fails to parse (0.xxx is not a
# number) until the placeholders are filled in — confirm.
grammar = nltk.PCFG.fromstring("""
   N -> XXX YYY [1.0]
   N -> 'hair' [0.xxx] # Lexical rules
""")
# Parser over the grammar above; the next cell uses it on 'nice red hair'.
inside_parser = nltk.parse.InsideChartParser(grammar)

### Calculate the best tree for the sentence *nice red hair*:

In [None]:
sentence = 'nice red hair'.split()
trees = inside_parser.parse(sentence)

# Show every analysis the parser yields, as text and as a rendered tree.
for candidate in trees:
    print(candidate)
    display(candidate)

### Automatically calculating a PCFG given a set of bracketed trees:

In [None]:
# Toy treebank: one bracketed tree per line (adjacent literals are
# concatenated into a single string).
trees = (
    "(N ( A long) ( N ( A red) ( N hair) ) )\n"
    "( N ( A nice) ( N tie) )\n"
    "( N ( A ( A dark) ( A red) ) ( N hair) )\n"
)

# Given a string with one bracketed tree per line, get a list of all the productions
def calculate_productions(bracketed_trees):
    """Collect the productions of every tree in a newline-separated treebank.

    Args:
        bracketed_trees: string containing one bracketed tree per line.

    Returns:
        A flat list with the productions of all the trees, in input order.
        Repeated productions are kept, one entry per use.
    """
    prods = []
    # Iterate over the argument, not the notebook-level `trees` variable, so the
    # function keeps working after `trees` is rebound by a later cell.
    for line in bracketed_trees.splitlines():
        if not line.strip():
            continue  # skip blank lines instead of handing them to the tree reader
        # Only `nltk` itself is imported in this notebook, so qualify Tree.
        tree = nltk.Tree.fromstring(line)
        prods += tree.productions()  # add the productions of this tree
    return prods


# Every rule occurrence found in the toy treebank.
productions = calculate_productions(trees)
print('PRODUCTIONS: ', productions)

# Induce a PCFG from those productions, with N as the start symbol.
N = nltk.Nonterminal('N')
grammar = nltk.induce_pcfg(N, productions)
print(grammar)

# Syntactic analyzer (parser) over the induced grammar; trace=2 is passed
# to the parser constructor.
inside_parser = nltk.parse.InsideChartParser(
    grammar,
    trace=2,
)

In [None]:
sentence = 'nice red hair'.split()
trees = inside_parser.parse(sentence)

# Show every analysis as text and as a rendered tree.
for candidate in trees:
    print(candidate)
    display(candidate)

## Creating a real PCFG (English)

In [None]:
from itertools import islice
# Fetch the treebank sample corpus used below.
nltk.download('treebank')

# Start symbol and (initially empty) rule list for the grammar built in the next cell.
S = nltk.Nonterminal('S')
productions = []

# Print some annotated sentences from the Penn treebank
for annotated in nltk.corpus.treebank.parsed_sents('wsj_0018.mrg'):
    print(annotated)

### Count the number of times that every rule has been used

In [None]:
# Build the production list from scratch inside this cell rather than appending
# to a list created elsewhere: re-running the cell then yields the same list,
# and productions left over from other cells cannot leak into the grammar.
productions = [
    production
    for tree in nltk.corpus.treebank.parsed_sents('wsj_0018.mrg')
    for production in tree.productions()
]

# Induce the PCFG with start symbol S (defined in the previous cell) and build
# both parsers on it; trace=2 is passed to each parser constructor.
grammar = nltk.induce_pcfg(S, productions)
inside_parser = nltk.parse.InsideChartParser(grammar, trace=2)
viterbi_parser = nltk.parse.ViterbiParser(grammar, trace=2)
print(grammar)

### Applying the PCFG <br>
* No smoothing: every word of the sentence must be present in the treebank. An unseen word has zero probability under the grammar, and NLTK's parsers raise a `ValueError` because the grammar does not cover it.

To obtain the analysis with highest probability:

In [None]:
sentence1 = 'Cray Computer has applied to trade on Nasdaq .'.split()

# Find the tree with the highest probability
trees = viterbi_parser.parse(sentence1)
for best_tree in trees:
    print(best_tree)

All the analyses:

In [None]:
# Find all the trees and print each one as bracketed text.
trees = inside_parser.parse(sentence1)
for candidate in trees:
    print(candidate)

In [None]:
# Parse again (the previous result was already iterated) and render each tree.
trees = inside_parser.parse(sentence1)
for candidate in trees:
    display(candidate)

In [None]:
sentence2 = 'Gregory employs the engineering on the Nasdaq with the stock .'.split()

# Find all the trees and render each one.
trees = inside_parser.parse(sentence2)
for candidate in trees:
    display(candidate)