# Parsing with NLTK. Unification-based grammars

In [None]:
### CREATE VIRTUAL DISPLAY ###
!apt-get install -y xvfb # Install X Virtual Frame Buffer
import os
os.system('Xvfb :1 -screen 0 1600x1200x16  &')    # create virtual display with size 1600x1200 and 16 bit color. Color can be changed to 24 or 8
os.environ['DISPLAY']=':1.0'    # tell X clients to use our virtual DISPLAY :1.0.

%matplotlib inline

### INSTALL GHOSTSCRIPT (Required to display NLTK trees) ###
!apt-get update
!apt install ghostscript python3-tk

## Input data. Grammar

In [None]:
import nltk
from nltk.parse import load_parser
from nltk import grammar, parse

In [None]:
# Grammar source text in NLTK's feature-grammar (.fcfg) notation.
# The productions share NUM and TENSE values between constituents via
# the ?n / ?t variables; lines starting with '#' inside the string are
# comments of the grammar format, not Python comments.
g = """
## Natural Language Toolkit: feat0.fcfg
##
## First example of a feature-based grammar for English, illustrating
## value-sharing of NUM and TENSE features.
## Used in Feature-Based Grammars chapter.
## 
## Author: Ewan Klein <ewan@inf.ed.ac.uk> 
## URL: <http://nltk.sourceforge.net>
## For license information, see LICENSE.TXT

% start S
# ###################
# Grammar Productions
# ###################

# S expansion productions
S -> NP[NUM=?n] VP[NUM=?n]

# NP expansion productions
NP[NUM=?n] -> N[NUM=?n] 
NP[NUM=?n] -> PropN[NUM=?n] 
NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
NP[NUM=pl] -> N[NUM=pl] 

# VP expansion productions
VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n]
VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP

# ###################
# Lexical Productions
# ###################

Det[NUM=sg] -> 'this' | 'every'
Det[NUM=pl] -> 'these' | 'all'
Det -> 'the' | 'some' | 'several'

PropN[NUM=sg]-> 'Kim' | 'Jody' | 'Peter' | 'John'

N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child'
N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children' 

IV[TENSE=pres,  NUM=sg] -> 'disappears' | 'walks'
TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes'

IV[TENSE=pres,  NUM=pl] -> 'disappear' | 'walk'
TV[TENSE=pres, NUM=pl] -> 'see' | 'like'

IV[TENSE=past] -> 'disappeared' | 'walked'
TV[TENSE=past] -> 'saw' | 'liked'
"""
# Build a feature grammar object from the text above.
grammar1 = grammar.FeatureGrammar.fromstring(g)

# Show the grammar as NLTK parsed it, to check the productions were read as intended.
print(grammar1)

# Chart parser for this grammar, bound to `cp`, the name the parsing cells use.
# trace=1 asks the parser for trace output while parsing
# (presumably the chart edges; see the NLTK docs for the verbosity levels).
cp = parse.FeatureEarleyChartParser(grammar1, trace=1)

## Alternatively: load a grammar

In [None]:
# Fetch the grammar collection that ships the packaged feat0.fcfg file.
nltk.download('book_grammars')

# Print the packaged grammar, then build the parser straight from the file.
# `load_parser` is the function imported from nltk.parse in the imports cell;
# the resulting parser replaces whatever `cp` was bound to before.
nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg')
cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=1)


## Analyze a sentence

In [None]:
# Split the example sentence on whitespace to get its list of tokens.
sentence1 = "Kim likes children".split()

# Run the current parser (`cp`) on the tokens.
trees = cp.parse(sentence1)

# Print every analysis found, in plain-text form.
for tree in trees:
    print(tree)

### Graphically:

In [None]:
# Run the parser on the same sentence again for this cell.
trees = cp.parse(sentence1)
# Emit two newlines so the output that follows is set apart.
print("\n")


# Alternative when running locally on your own computer (pop-up windows allowed):
#for tree in trees: tree.draw()
# Render each tree with the notebook's rich display.
for tree in trees:
    display(tree)

### Another sentence

In [None]:
# A second example: determiner, plural noun, plural verb, bare plural object.
sentence2 = "these dogs like children".split()

# Parse it with the current parser (`cp`).
trees = cp.parse(sentence2)
# Emit two newlines so the output that follows is set apart.
print("\n")


# Render each analysis with the notebook's rich display.
for tree in trees:
    display(tree)

# Another, more sophisticated, grammar:


In [None]:
# Extended grammar text. On top of NUM/TENSE agreement it carries HEAD
# values from the lexical nouns up through NP, and exposes them on S as
# SUBJ (head of the subject NP) and OBJ (head of the object NP; the
# intransitive entries and their VP rule use OBJ=null).
# NOTE: this rebinds `g`, which held the previous grammar text.
g = """
% start S
# ###################
# Grammar Productions
# ###################

# S expansion productions
S[SUBJ=?hs, OBJ=?o] -> NP[NUM=?n, HEAD=?hs] VP[NUM=?n, OBJ=?o]

# NP expansion productions
NP[NUM=?n, HEAD=?h] -> N[NUM=?n, HEAD=?h] 
NP[NUM=?n, HEAD=?h] -> PropN[NUM=?n, HEAD=?h] 
NP[NUM=?n, HEAD=?h] -> Det[NUM=?n] N[NUM=?n, HEAD=?h]
NP[NUM=pl, HEAD=?h] -> N[NUM=pl, HEAD=?h] 

# VP expansion productions
VP[TENSE=?t, NUM=?n, OBJ=null] -> IV[TENSE=?t, NUM=?n, OBJ=null]
VP[TENSE=?t, NUM=?n, OBJ=?o] -> TV[TENSE=?t, NUM=?n, OBJ=?o] NP[HEAD=?o]

# ###################
# Lexical Productions
# ###################

Det[NUM=sg] -> 'this' | 'every'
Det[NUM=pl] -> 'these' | 'all'
Det -> 'the' | 'some' | 'several'

PropN[NUM=sg, HEAD=Kim]-> 'Kim'
PropN[NUM=sg, HEAD=Jody]-> 'Jody'

N[NUM=sg, HEAD=dog] -> 'dog' 
N[NUM=sg, HEAD=girl] -> 'girl' 
N[NUM=sg, HEAD=car] -> 'car'
N[NUM=sg, HEAD=child] -> 'child'

N[NUM=pl, HEAD=dog] -> 'dogs' 
N[NUM=pl, HEAD=girl] -> 'girls' 
N[NUM=pl, HEAD=car] -> 'cars'
N[NUM=pl, HEAD=child] -> 'children'


IV[TENSE=pres,  NUM=sg, OBJ=null] -> 'disappears' | 'walks'
TV[TENSE=pres,  NUM=sg          ] -> 'sees' | 'likes'

IV[TENSE=pres,  NUM=pl, OBJ=null] -> 'disappear' | 'walk'
TV[TENSE=pres,  NUM=pl          ] -> 'see' | 'like'

IV[TENSE=past, OBJ=null] -> 'disappeared' | 'walked'
TV[TENSE=past          ] -> 'saw' | 'liked'
"""

# Build the feature grammar from the text above; `grammar1` is rebound,
# so the earlier grammar object is no longer reachable under this name.
grammar1 = grammar.FeatureGrammar.fromstring(g)

# Show the grammar as NLTK parsed it.
print(grammar1)

# Rebind `cp` to a parser for the extended grammar; cells run after this
# one parse with it. trace=1 asks the parser for trace output while parsing.
cp = parse.FeatureEarleyChartParser(grammar1, trace=1)

In [None]:
# Tokenise the example sentence on whitespace.
sentence1 = "these dogs like children".split()

# Parse it with the current parser (`cp`).
trees = cp.parse(sentence1)
# Emit two newlines so the output that follows is set apart.
print("\n")

# Render each analysis with the notebook's rich display.
for tree in trees:
    display(tree)

## Assignment

* Take the previous grammar as a starting point
* Make a copy in your local directory
* Try analyzing different sentences. For example:

In [None]:
# Three variants of one sentence that differ in the number marking of the
# determiner and the verb. All three are tokenised; only sentence1 is
# parsed below, the other two are left for you to try.
sentence1 = "this dog walks".split()
sentence2 = "these dog walks".split()
sentence3 = "this dog walk".split()

# Parse the first variant and print every analysis found.
trees = cp.parse(sentence1)
for tree in trees:
    print(tree)

Write rules for:
* A grammar of your favourite language (Basque, Spanish, English, ...), dealing with more sophisticated agreement (e.g. 'los gatos negros comen ratones', 'gizonak txoria ikusi du', ...)
* Ditransitive verbs:

In [None]:
# Target sentence for the ditransitive exercise: a verb followed by two
# noun phrases. The grammar strings defined in this notebook have no
# lexical entry for 'gives', so new rules are needed before it can parse.
sentence1 = "Kim gives Jody the dog".split()

* Subcategorization (see http://www.nltk.org/book/ch09.html, section 3.1)
* Auxiliary verbs and inversion (see http://www.nltk.org/book/ch09.html, section 3.3)

## Important
* Save each successive version of your grammar (my_grammar1.fcfg, my_grammar2.fcfg, my_grammar3.fcfg ...) together with the example sentences it currently analyzes, so that you can go back to a working version if a later modification introduces errors