In [1]:
import nltk
from nltk import tree
from nltk import Nonterminal
from nltk.draw.tree import TreeView

def loadData(path):
    """Read a text file and return its non-blank lines.

    Args:
        path: Path of the text file to read.

    Returns:
        A list of strings, one per non-blank line, without the trailing
        newline character.
    """
    with open(path, 'r') as f:
        # Iterate line by line instead of read().split('\n'): a file that ends
        # with a newline (or contains empty lines) would otherwise yield ''
        # entries, which cannot be parsed as trees downstream.
        return [line.rstrip('\n') for line in f if line.strip()]

def getTreeData(data):
    """Parse every string in ``data`` into a tree.

    Args:
        data: Iterable of strings, each passed to ``tree.Tree.fromstring``.

    Returns:
        A list with one parsed tree per input string, in input order.
    """
    # Return a concrete list rather than a lazy ``map`` object: a map iterator
    # is exhausted after one pass, so re-running any cell that loops over the
    # result would silently see no trees.
    return [tree.Tree.fromstring(s) for s in data]

# Main script: load the tree strings, parse them, and gather every
# production used by any of the resulting trees into one flat list.
gram_rules = []
print("loading data..")
data = loadData('parseTrees.txt')
print("generating trees..")
treeData = getTreeData(data)

# Each tree contributes all of its productions (duplicates are kept on
# purpose: the list length is the total number of rule occurrences).
gram_rules.extend(
    rule
    for parsed in treeData
    for rule in parsed.productions()
)

print("Total rules: " + str(len(gram_rules)) + ' loaded for our system' )

loading data..
generating trees..
Total rules: 214289 loaded for our system


In [20]:
#We now calculate the probability of each rule as its count divided by the total count of all rules that share the same left-hand side
from collections import Counter
import pandas as pd

#Build one row per distinct rule, enriched with counters and probabilities
rules_count = dict(Counter(gram_rules))
rules_df = pd.DataFrame(list(rules_count.items()), columns=['rule','count'])
#Column-wise map() replaces the row-wise apply(axis=1) calls: same values, far less overhead
rules_df['rule_str'] = rules_df['rule'].map(str)
rules_df['lhs'] = rules_df['rule'].map(lambda rule: rule.lhs())
rules_df['rhs'] = rules_df['rule'].map(lambda rule: rule.rhs())
#total_count = number of occurrences of all rules sharing this rule's left-hand side.
#The string 'sum' is used instead of the builtin sum, which recent pandas versions warn about.
rules_df['total_count'] = rules_df.groupby('lhs')['count'].transform('sum')
rules_df['prob'] = rules_df['count'] / rules_df['total_count']
rules_df['production_prob'] = rules_df['rule_str'] + ' ' + rules_df['prob'].map(str)

#Convert the rules into a grammar in order to create parse trees.
#The grammar is built directly from the Production objects with an explicit start
#symbol, instead of serialising the rules to text and parsing that text back.
rules_list = rules_df['rule'].tolist()
grammar = nltk.CFG(Nonterminal('S'), rules_list)

#Generate the rule -> probability dictionary used for each tree's calculation
rules_prob = dict(zip(rules_df['rule'], rules_df['prob']))

In [15]:
#Convert the rules into a grammar in order to create parse trees.
#S is passed explicitly as the start symbol and the Production objects are used
#as-is, which avoids a str() -> CFG.fromstring() round-trip whose start symbol
#would silently depend on which rule happens to come first in rules_list.
S = Nonterminal('S')
rules_list = rules_df['rule'].tolist()
grammar = nltk.CFG(S, rules_list)

In [30]:
#Let's now parse our sentence
sentence = 'Show me the meals on the flight from Phoenix'
#Only the first character is lowercased: verbs are lowercase in our grammar since they can
#also appear in the middle of a sentence. Phoenix must keep its capital letter because it is
#a proper noun (the city, not the bird).
sent = (sentence[0].lower() + sentence[1:]).split()
parser = nltk.ChartParser(grammar)

#Generate every parse allowed by our grammar and calculate each tree's probability.
#The loop variable is deliberately NOT called `tree`: that name is the nltk.tree module
#imported at the top of the notebook and used by getTreeData, so shadowing it would
#break a re-run of the loading cell.
for tree_number, parse_tree in enumerate(parser.parse(sent), start=1):
    print("Tree number " + str(tree_number) + ':')
    #A tree's probability is the product of the probabilities of its productions
    prob = 1
    for production in parse_tree.productions():
        prob *= rules_prob[production]
    print("Tree probability " + str(prob))
    print(parse_tree)
    print('\n')

Tree number 1:
Tree probability 1.2761534830418347e-08
(S
  (IVP
    (IVerb show)
    (NP (Pronoun me))
    (NP
      (Det the)
      (Nominal
        (Nominal
          (Nominal (Noun meals))
          (PP
            (Preposition on)
            (NP (Det the) (Nominal (Noun flight)))))
        (PP (Preposition from) (NP (Proper_Noun Phoenix)))))))


Tree number 2:
Tree probability 7.873603043279275e-09
(S
  (IVP
    (IVerb show)
    (NP (Pronoun me))
    (NP
      (Det the)
      (Nominal
        (Nominal (Noun meals))
        (PP
          (Preposition on)
          (NP
            (NP (Det the) (Nominal (Noun flight)))
            (PP (Preposition from) (NP (Proper_Noun Phoenix)))))))))


Tree number 3:
Tree probability 1.2761534830418347e-08
(S
  (IVP
    (IVerb show)
    (NP (Pronoun me))
    (NP
      (Det the)
      (Nominal
        (Nominal (Noun meals))
        (PP
          (Preposition on)
          (NP
            (Det the)
            (Nominal
              (Nominal (Noun

In [31]:
#In order to validate our results we'll use the NLTK package to induce a PCFG
#from the same rules and parse the same sentence with its probabilistic parsers.
from nltk import induce_pcfg

print("Let's build us some PCFG")
S = Nonterminal('S')
#NOTE: this rebinds `grammar` from the plain CFG built earlier to the induced PCFG.
grammar = induce_pcfg(S, gram_rules)

sent = "show me the meals on the flight from Phoenix".split()

print("Using the Viterbi parser to most likely parse for our ambiguous sentence")
viterbi_parser = nltk.ViterbiParser(grammar)
#The loop variable is not called `tree` so that the nltk.tree module imported at the
#top of the notebook (and used by getTreeData) is not overwritten.
for parse_tree in viterbi_parser.parse(sent):
    print(parse_tree)

print("")
print("And now parsing all trees with InsideChart parser...")
inside_parser = nltk.InsideChartParser(grammar)
# viterbi_parser.trace(3)

for tree_number, parse_tree in enumerate(inside_parser.parse(sent), start=1):
    print('Tree number ' + str(tree_number) + ":")
    print(parse_tree)
#     TreeView(parse_tree)._cframe.print_to_file('parse' + str(tree_number) + '.ps')

Let's build us some PCFG
Using the Viterbi parser to most likely parse for our ambiguous sentence
(S
  (IVP
    (IVerb show)
    (NP (Pronoun me))
    (NP (Det the) (Nominal (Noun meals)))
    (PP
      (Preposition on)
      (NP
        (Det the)
        (Nominal
          (Nominal (Noun flight))
          (PP
            (Preposition from)
            (NP (Proper_Noun Phoenix)))))))) (p=2.19769e-08)

And now parsing all trees with InsideChart parser...
Tree number 1:
(S
  (IVP
    (IVerb show)
    (NP (Pronoun me))
    (NP (Det the) (Nominal (Noun meals)))
    (PP
      (Preposition on)
      (NP
        (Det the)
        (Nominal
          (Nominal (Noun flight))
          (PP
            (Preposition from)
            (NP (Proper_Noun Phoenix)))))))) (p=2.19769e-08)
Tree number 2:
(S
  (IVP
    (IVerb show)
    (NP (Pronoun me))
    (NP
      (Det the)
      (Nominal
        (Nominal (Noun meals))
        (PP
          (Preposition on)
          (NP (Det the) (Nominal (Noun flight)

In [5]:
# print(grammar)