In [1]:
from nltk.corpus import treebank
import nltk
import re
import pandas as pd
import itertools
import csv

In [21]:
# Suppress every FutureWarning raised from this point on.
import warnings

warnings.simplefilter('ignore', FutureWarning)

In [2]:
def get_rules(tree):
    """Return the production rules found in ``tree`` as plain strings.

    Each rule is the label of a subtree followed by the labels of its
    immediate children, joined by single spaces (e.g. ``"S NP VP"``).
    Subtrees with at most one child -- such as preterminals sitting
    directly above a word -- produce no rule.

    Parameters
    ----------
    tree : object exposing ``subtrees()``; every subtree must be iterable
        over its children and provide ``label()``.

    Returns
    -------
    list of str
        One rule string per qualifying subtree, in ``subtrees()`` order.
    """
    collected = []

    for node in tree.subtrees():
        kids = list(node)

        # Only nodes with several children describe a branching rule.
        if len(kids) <= 1:
            continue

        # Parent label first, then the label of each immediate child.
        parts = [node.label()]
        parts.extend(kid.label() for kid in kids)
        collected.append(' '.join(parts))

    return collected

In [3]:
# Parsed sentences of the treebank corpus, grouped by corpus file
trees_by_file = [treebank.parsed_sents(file_id) for file_id in treebank.fileids()]

# Flatten the per-file groups into a single list of sentence trees
trees = [sentence for file_sentences in trees_by_file for sentence in file_sentences]

# Convert every tree to Chomsky Normal Form (the conversion happens in place)
for tree in trees:
    tree.chomsky_normal_form()

In [22]:
 # Collect rules from treebank
rules = []
for tree in trees:
    # get_rules returns a list of rule strings for one sentence tree
    rules.extend(get_rules(tree))

In [23]:
# One row per extracted rule string, held in a single 'rules' column
rules_df = pd.DataFrame(rules, columns=['rules'])

In [24]:
# Split every rule string into its three components (LHS -> RHS1 RHS2)
rule_parts = rules_df['rules'].apply(lambda rule: rule.split(' '))

rules_df['lhs'] = rule_parts.apply(lambda parts: parts[0])
rules_df['rhs1'] = rule_parts.apply(lambda parts: parts[1])
rules_df['rhs2'] = rule_parts.apply(lambda parts: parts[2])

# Strip the `|<...>` annotation from the right-hand-side labels
# EX: NP-SBJ NP NP-SBJ|<,-ADJP-,>    to      NP-SBJ NP NP-SBJ
# NOTE(review): `lhs` is left as-is, so annotated labels can still appear on
# the left-hand side of a rule -- confirm that this is intended.
annotation_pattern = re.compile(r'\|[^)]*\>')

rules_df['rhs1_clean'] = rules_df['rhs1'].apply(lambda label: annotation_pattern.sub('', label))
rules_df['rhs2_clean'] = rules_df['rhs2'].apply(lambda label: annotation_pattern.sub('', label))

In [25]:
# Rebuild each rule from its cleaned components, separated by three spaces.
# Column-wise string concatenation avoids a Python-level call per row and
# also works when the frame is empty.
rules_df['rules_clean'] = (
    rules_df['lhs'] + '   ' + rules_df['rhs1_clean'] + '   ' + rules_df['rhs2_clean']
)


In [31]:
def _rule_frequencies(selected_rules):
    """Count how often each cleaned rule occurs in ``selected_rules``.

    Returns a frame with the columns ``rule`` and ``frequency``, sorted
    alphabetically by rule. The column names are set explicitly because the
    names produced by ``value_counts().reset_index()`` differ between pandas
    versions.
    """
    return (
        selected_rules['rules_clean']
        .value_counts()
        .rename_axis('rule')
        .reset_index(name='frequency')
        .sort_values('rule')
    )


lhs_labels = rules_df['lhs']
lhs_is_s = lhs_labels == 'S'
lhs_starts_with_s = lhs_labels.str.startswith('S')

# Order of the grammar: rules for 'S' first, then rules whose left-hand side
# merely starts with 'S', then every remaining rule.
rules_clean = pd.concat([
    _rule_frequencies(rules_df[lhs_is_s]),
    _rule_frequencies(rules_df[lhs_starts_with_s & ~lhs_is_s]),
    _rule_frequencies(rules_df[~lhs_starts_with_s]),
])

In [32]:
# Removing rules that conflict with allowed_words.txt

# Symbols are matched literally; re.escape takes care of regex
# metacharacters such as '$'.
EXCLUDED_SYMBOLS = ['$', '-NONE-', '-LRB-', '#']
excluded_pattern = '|'.join(re.escape(symbol) for symbol in EXCLUDED_SYMBOLS)

# Keep only the rules that mention none of the excluded symbols
rules_clean = rules_clean[~rules_clean['rule'].str.contains(excluded_pattern)]


In [33]:
# Put the initial rule (TOP S, frequency 1) in front of the counted rules
initial_rule = pd.DataFrame({'rule': ['TOP S'], 'frequency': [1]})
rules_clean = pd.concat([initial_rule, rules_clean])

In [34]:
# Grammar line for each rule: "<frequency>   <rule>" (three-space separator)
rules_clean['final'] = rules_clean['frequency'].astype(str) + '   ' + rules_clean['rule']

In [35]:
# Preview the first rows of the grammar table
rules_clean.head()

Unnamed: 0,rule,frequency,final
0,TOP S,1,1 TOP S
253,S '' S,1,1 S UCP-ADV S
44,"S , S",3,1 S NP-SBJ-112 S
35,S : S,4,1 S NP-SBJ-103 VP
191,S ADJP-PRD NP-SBJ,1,2 S NP-SBJ-83 VP


In [54]:
# Write the grammar file: only the 'final' column, one entry per line,
# without a header row or the index
rules_clean.to_csv(
    'probabilistic_grammar.txt',
    columns=['final'],
    sep='\t',
    header=False,
    index=False,
)