# Code to generate Vocab.gr
##### 1. Read Monty Python and the Holy Grail (`grail.txt` from the NLTK webtext corpus) and allowed_words.txt
##### 2. Word Tokenize
##### 3. Identify parts of speech using RegexpTagger, UnigramTagger, BigramTagger and PerceptronTagger
##### 4. Write it to Vocab.gr

In [20]:
# Notebook setup: NLTK supplies the corpora and taggers, pandas the tabular output.
import nltk
from nltk.corpus import brown
import pandas as pd
# Raise pandas' display limits to 999 rows/columns.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)
# NOTE(review): wildcard import; presumably the source of the `treebank` name
# used further down, since nothing else visible imports it -- confirm.
from nltk.book import *
from nltk.corpus import webtext
# Raw contents of 'grail.txt' from the webtext corpus, as a single string.
data = webtext.raw('grail.txt')

In [21]:
# Read the whole allowed-words file into one string. No encoding is passed,
# so the platform default encoding is used.
with open('allowed_words.txt', 'r') as myfile:
    actual_data=myfile.read()

In [22]:
# Tokenize both raw strings with nltk.word_tokenize:
# `text` comes from grail.txt, `actual_text` from allowed_words.txt.
text = nltk.word_tokenize(data)
actual_text = nltk.word_tokenize(actual_data)

In [25]:
# Tagged sentences of the Brown corpus; used below to train every tagger.
train_sents = brown.tagged_sents()

In [26]:
# Alias for the tokenized grail.txt text.
# NOTE(review): not referenced anywhere else in the visible cells -- confirm it is still needed.
test_sents = text

In [27]:
# (regex, tag) fallback rules for the RegexpTagger.
#
# Rules are tried top to bottom and the first one that matches decides the tag,
# so ordering matters:
#   * exact closed-class words come first; otherwise words such as "his",
#     "its", "yours", "across" and "towards" are swallowed by the generic
#     plural rule `.*s$`;
#   * `.*ness$` sits before `.*s$` for the same reason;
#   * the catch-all `.*` must stay last.
patterns = [
    # --- closed-class words (exact matches) ---
    (r'(The|the|A|a|An|an)$', 'AT'),   # articles
    (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns
    (r'(His|his|Her|her|Its|its)$', 'PRP$'),    # possessive
    (r'(my|Your|your|Yours|yours)$', 'PRP$'),   # possessive
    (r'(on|On|in|In|at|At|since|Since)$', 'IN'),# time prepositions
    (r'(for|For|ago|Ago|before|Before)$', 'IN'),# time prepositions
    (r'(till|Till|until|Until)$', 'IN'),        # time prepositions
    (r'(by|By|beside|Beside)$', 'IN'),          # space prepositions
    (r'(under|Under|below|Below)$', 'IN'),      # space prepositions
    (r'(over|Over|above|Above)$', 'IN'),        # space prepositions
    (r'(across|Across|through|Through)$', 'IN'),# space prepositions
    (r'(into|Into|towards|Towards)$', 'IN'),    # space prepositions
    (r'(onto|Onto|from|From)$', 'IN'),          # space prepositions
    # --- numbers (the decimal point is escaped so only a literal '.' matches) ---
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    # --- suffix heuristics ---
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*ness$', 'NN'),               # nouns formed from adjectives
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'.*able$', 'JJ'),               # adjectives
    (r'(.*ly|Alright|alright)$', 'RB'),                  # adverbs
    # WARNING : Put the default value in the end
    (r'.*', 'NN')                      # nouns (default)
    ]


In [28]:
from nltk.tag.perceptron import PerceptronTagger
# Create a perceptron tagger with load=False (presumably: do not load a
# pretrained model -- confirm) and train it on the Brown tagged sentences.
pct_tag = PerceptronTagger(load=False)
# NOTE(review): `model` holds whatever train() returns. In NLTK this appears to
# be None (the learned weights stay on `pct_tag`) -- confirm before relying on it.
model = pct_tag.train(train_sents)

In [29]:
# Backoff chain, as wired by the `backoff=` arguments below:
# t3 (bigram) -> t2 (unigram) -> t1 (unigram) -> t0 (regex patterns).
t0 = nltk.RegexpTagger(patterns)
# NOTE(review): `model` is the return value of the perceptron's train() call.
# UnigramTagger's `model` argument presumably expects a context-to-tag mapping,
# and t2 below is the same tagger without it, so t1 may be redundant -- confirm intent.
t1 = nltk.UnigramTagger(train_sents,model=model, backoff=t0)
t2 = nltk.UnigramTagger(train_sents, backoff=t1)
# cutoff=2 is passed through to BigramTagger; TODO confirm its exact threshold semantics.
t3 = nltk.BigramTagger(train_sents,cutoff=2, backoff=t2)

In [9]:
# Tag the grail.txt tokens with the full backoff chain, keep each distinct
# tagged token once, and put the result in sorted order.
pos_list = list(set(t3.tag(text)))
pos_list.sort()

# Same steps for the allowed-words tokens.
actual_pos_list = list(set(t3.tag(actual_text)))
actual_pos_list.sort()

In [10]:
# Prefix every (word, tag) pair with a placeholder weight of 1, giving
# (weight, word, tag) triples. The weight is meant to be tuned later.
pos_list = [(1, word, tag) for word, tag in pos_list]
actual_pos_list = [(1, word, tag) for word, tag in actual_pos_list]

In [11]:
# Tabulate the grail.txt triples, then reorder the columns to Weight / POS / Word.
df = pd.DataFrame(pos_list, columns=['Weight', 'Word', 'POS'])[['Weight', 'POS', 'Word']]

# Same treatment for the allowed-words triples.
actual_df = pd.DataFrame(
    actual_pos_list, columns=['Weight', 'Word', 'POS']
)[['Weight', 'POS', 'Word']]

# Stack both tables, drop repeated rows, and order the result by tag.
new_df = (
    pd.concat([df, actual_df], axis=0)
    .drop_duplicates()
    .sort_values('POS')
)

# Write Vocab.gr as tab-separated rows with no header line and no index column.
new_df.to_csv('Vocab.gr', sep='\t', index=False, header=False)

In [16]:
import re
def get_rules(tree):
    """Collect the production rules used inside ``tree``.

    Every subtree is visited. For each node whose children are themselves
    labelled nodes, one tab-separated rule string is built: the node's own
    label followed by its children's labels, each passed through
    ``fix_label``. Nodes with no children, or whose first child is a plain
    string (a leaf token), are skipped, as are rules rejected by
    ``exclude_rule``.

    Args:
        tree: An object exposing ``subtrees()``; each subtree must be
            iterable over its children and provide ``label()``.

    Returns:
        A list of rule strings in traversal order. Duplicates are kept so
        that callers can count rule frequencies.
    """
    rules = []

    for subtree in tree.subtrees():
        children = list(subtree)

        # Skip childless nodes and nodes that directly dominate leaf tokens.
        # isinstance (rather than an exact type comparison) also treats
        # str subclasses as leaves.
        if not children or isinstance(children[0], str):
            continue

        # Only the child labels are normalised; the parent label is used as-is.
        child_labels = [fix_label(child.label()) for child in children]
        curr_rule = '\t'.join([subtree.label()] + child_labels)

        # Exclude rules with unneeded nodes
        if not exclude_rule(curr_rule):
            rules.append(curr_rule)

    return rules


def fix_label(label):
    """Return ``label``, prefixed with ``'Punc-'`` when it contains no ASCII letter.

    Labels that contain at least one character in ``a-z`` or ``A-Z`` are
    returned unchanged.
    """
    has_letter = re.search(r'[a-zA-Z]', label) is not None
    return label if has_letter else 'Punc-' + label


def exclude_rule(rule):
    """Return True when ``rule`` contains any entry of ``unneeded_nodes`` as a substring."""
    return any(marker in rule for marker in unneeded_nodes)

In [15]:
from collections import Counter
# Substrings that cause a rule to be dropped; exclude_rule() checks each rule
# string against these.
unneeded_nodes = ['$', '-NONE-']

In [7]:
# Load the parsed sentences of the Penn Treebank sample, one collection per corpus file.
trees_by_file = [treebank.parsed_sents(fileid) for fileid in treebank.fileids()]
# Flatten the per-file collections into a single list of trees.
trees = [parsed for per_file in trees_by_file for parsed in per_file]

# Convert each tree to Chomsky normal form. The call's return value is not
# used; the trees in `trees` are what the later cells read.
for tree in trees:
    tree.chomsky_normal_form()

In [18]:
# Gather the rules of every tree into one flat list; duplicates are kept so
# they can be counted.
rules = [rule for tree in trees for rule in get_rules(tree)]

# Frequency of each rule, then the distinct rules in sorted order.
rule_counts = Counter(rules)
rules = sorted(set(rules))

# Save rules: a two-line comment header, then "<count><TAB><TAB><rule>" per distinct rule.
with open('S2.gr', 'w') as f:
    header = '# Rules found in Penn Treebank\n# Number of rules: {0}\n'.format(len(rules))
    f.write(header)
    f.writelines('{0}\t\t{1}\n'.format(rule_counts[r], r) for r in rules)