In [1]:
from nltk.corpus import treebank
from nltk.corpus import brown
import nltk
import re
import pandas as pd
import itertools
import csv

In [2]:
import warnings

# Suppress every FutureWarning raised from here on in this session
warnings.simplefilter('ignore', category=FutureWarning)

In [3]:
# treebank.fileids()[0:2]

In [4]:
# treebank.parsed_sents('wsj_0001.mrg')

In [5]:
# brown.fileids()[0:2]

In [6]:
# brown.tagged_sents('ca02')

In [7]:
def get_rules(tree):
    """Collect the branching productions of a parse tree as strings.

    Every subtree with more than one child contributes one entry of the
    form ``"PARENT CHILD1 CHILD2 ..."`` (labels joined by single spaces).
    Subtrees with zero or one child are skipped; that covers preterminals
    over a single word, and also any other unary node.

    Args:
        tree: An object exposing ``subtrees()``. Each yielded subtree must
            be iterable over its children and provide ``label()``; every
            child of a branching subtree must provide ``label()`` as well.

    Returns:
        list: One rule string per branching subtree, in the order in which
        ``tree.subtrees()`` yields them.
    """
    productions = []

    for node in tree.subtrees():
        kids = list(node)

        # Only branching nodes produce a rule
        if len(kids) < 2:
            continue

        # Parent label first, then the label of each immediate child
        labels = [node.label()] + [kid.label() for kid in kids]
        productions.append(' '.join(labels))

    return productions

In [8]:
# One sequence of parsed sentences per treebank file
trees_by_file = [treebank.parsed_sents(fileid) for fileid in treebank.fileids()]
# trees_by_file = trees_by_file[0]

# Flatten the per-file sequences into a single list of trees
trees = list(itertools.chain.from_iterable(trees_by_file))

# Convert every tree to Chomsky Normal Form. The return value is not used,
# so this relies on chomsky_normal_form() modifying each tree in place.
for tree in trees:
    tree.chomsky_normal_form()

In [9]:
 # Gather the rule strings of every tree into one flat list
rules = list(itertools.chain.from_iterable(map(get_rules, trees)))

In [10]:
# One row per extracted rule string (duplicates kept so they can be counted later)
rules_df = pd.DataFrame({'rules': rules})

In [11]:
# Create one column per rule component (LHS -> RHS1  RHS2).
# Each rule string is split on single spaces once and the pieces reused.
rule_parts = rules_df.rules.map(lambda rule: rule.split(' '))

rules_df['lhs'] = rule_parts.map(lambda parts: parts[0])
rules_df['rhs1'] = rule_parts.map(lambda parts: parts[1])
rules_df['rhs2'] = rule_parts.map(lambda parts: parts[2])

# Cleaned copies of the right-hand-side labels: drop the "|<...>" suffix
# EX: NP-SBJ NP NP-SBJ|<,-ADJP-,>    to      NP-SBJ NP NP-SBJ
binarization_suffix = re.compile(r'\|[^)]*\>')
rules_df['rhs1_clean'] = rules_df['rhs1'].map(lambda label: binarization_suffix.sub('', label))
rules_df['rhs2_clean'] = rules_df['rhs2'].map(lambda label: binarization_suffix.sub('', label))

In [12]:
# Re-assemble every rule as "LHS   RHS1   RHS2" (three-space separator).
# Whole columns are concatenated at once instead of calling a Python lambda
# per row through DataFrame.apply(axis=1); the resulting strings are the same.
#
# Variant built from the cleaned right-hand-side labels (currently disabled):
# rules_df['rules_clean'] = rules_df.lhs + '   ' + rules_df.rhs1_clean + '   ' + rules_df.rhs2_clean
rules_df['rules_clean'] = rules_df.lhs + '   ' + rules_df.rhs1 + '   ' + rules_df.rhs2

In [13]:
# Frequency table of the distinct rules, in three groups stacked in this order:
#   1. rules whose LHS is exactly 'S'
#   2. rules whose LHS starts with 'S' but is not 'S'
#   3. all remaining rules
# Within each group the rows are sorted alphabetically by rule string.
lhs_starts_with_s = rules_df.lhs.str.startswith('S')
lhs_is_s = rules_df.lhs == 'S'

lhs_groups = [
    # Pre-sort kept from the original; the final row order of the group is
    # decided by the sort on 'rule' below.
    rules_df[lhs_is_s].sort_values('rhs1'),
    rules_df[lhs_starts_with_s & ~lhs_is_s],
    rules_df[~lhs_starts_with_s],
]

# The output columns are named explicitly with rename_axis / reset_index(name=...).
# Relying on the default names of value_counts().reset_index() is fragile:
# pandas 2.0 changed them (the counts column became 'count' and the values
# column took the source column's name), so sorting on 'index' raises
# KeyError there.
rules_clean = pd.concat([
    group['rules_clean']
    .value_counts()
    .rename_axis('rule')
    .reset_index(name='frequency')
    .sort_values('rule')
    for group in lhs_groups
])

In [14]:
# Removing rules that have conflict with allowed_words.txt
#
# A rule is dropped when it contains any of these markers. They are matched
# as literal substrings (regex=False), so no regex escaping is needed; the
# previous patterns '\$' and '\#' were invalid escape sequences in ordinary
# (non-raw) string literals.
conflicting_markers = ['$', '-NONE-', '-LRB-', '#']

for marker in conflicting_markers:
    rules_clean = rules_clean[~rules_clean.rule.str.contains(marker, regex=False)]


In [15]:
# Put the initial rule "TOP S" (frequency 1) in front of the counted rules
top_rule = pd.DataFrame({'rule': ['TOP S'], 'frequency': [1]})
rules_clean = pd.concat([top_rule, rules_clean])

In [17]:
# Text of each grammar line: "<frequency>   <rule>" (three-space separator).
# Built column-wise instead of row by row through DataFrame.apply(axis=1);
# the resulting strings are the same.
rules_clean['final'] = rules_clean.frequency.astype(str) + '   ' + rules_clean.rule

In [18]:
# Preview the first five rows of the grammar table
rules_clean.head(5)

Unnamed: 0,rule,frequency,final
0,TOP S,1,1 TOP S
521,S '' S|<NP-SBJ-2-VP-.>,1,1 S '' S|<NP-SBJ-2-VP-.>
769,"S , S|<PP-,-NP-SBJ-VP>",1,"1 S , S|<PP-,-NP-SBJ-VP>"
487,"S , S|<PP-,-PP-NP-SBJ-VP>",1,"1 S , S|<PP-,-PP-NP-SBJ-VP>"
477,"S , S|<SBAR-ADV-,-NP-SBJ-3-VP>",1,"1 S , S|<SBAR-ADV-,-NP-SBJ-3-VP>"


In [19]:
# Save the grammar file: only the 'final' column, one entry per line,
# without a header row or an index column
rules_clean.to_csv(
    'probabilistic_grammar.txt',
    columns=['final'],
    sep='\t',
    header=False,
    index=False,
)