In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.parse import CoreNLPParser
from nltk.tree import Tree
from nltk.parse.corenlp import CoreNLPDependencyParser
from nltk.corpus import webtext

In [2]:
parser = CoreNLPParser(url='http://localhost:9000')

# This parser talks to a CoreNLP server at the URL above. The server has to be installed and
# started before running the notebook, as described in
# https://stanfordnlp.github.io/CoreNLP/corenlp-server.html

In [3]:
# Get allowed words (one word per line of 'allowed_words.txt')
# The file is read inside a context manager so the handle is closed as soon as
# the words are loaded instead of staying open for the life of the kernel.
with open('allowed_words.txt', 'r') as allowed_words_file:
    allowed_words = allowed_words_file.read().split('\n')

## Loading Monty Python Corpus

In [4]:
# Raw text of 'grail.txt' from NLTK's webtext corpus, as a single string
data = webtext.raw('grail.txt')

# Getting Vocabulary

In [5]:
# Get tokens and part of speech (POS)
tokens = nltk.word_tokenize(data)
pos_list = nltk.pos_tag(tokens)

# Creating a pandas dataframe to store tokens and POS
tokens_df = pd.DataFrame(pos_list,columns=['token','POS'])

# set count = 1 for each occurrence of a token; summing it per group below gives the token frequency
tokens_df['count'] = 1

# keep only the tokens listed in allowed_words
tokens_df = tokens_df[tokens_df.token.isin(allowed_words)]

# calculate the frequency of every (token, POS) pair, in the column order written to the vocabulary file
tokens_df = tokens_df.groupby(['token','POS']).sum().reset_index()[['count','POS','token']].sort_values(['POS','token'])

# Prefix the punctuation tags with "Punc-".
# This is the same prefix fix_label() puts on punctuation labels in the grammar rules. With the
# former "Punc_" prefix the punctuation symbols used by the rules (e.g. 'Punc-.') had no
# matching entry in the vocabulary.
tokens_df['POS'] = tokens_df['POS'].apply(lambda x: 'Punc-'+x if x in ['(',')','.',':',',','"',"''"] else x)

In [6]:
# Write the vocabulary to 'Vocab.gr': tab-separated, without header row and without the index column
tokens_df.to_csv('Vocab.gr', sep='\t', header=False, index=False)

# Getting Grammar

In [7]:
# Split the Monty Python corpus into sentences, then break every sentence at its line breaks,
# so each entry of the final flat list is one line of one sentence
sents_monty_python = [
    line
    for sentence in nltk.sent_tokenize(data)
    for line in sentence.split('\n')
]

In [8]:
# Substrings that mark a rule as unwanted: exclude_rule() drops any rule string containing one of them
unneeded_nodes = [' $ ', '-$', '-NONE-', '#']

# Method to extract grammatical rules from trees

def get_rules(tree):
    """Collect the grammar rules found in a parse tree.

    Every subtree whose children are themselves nodes contributes one rule string:
    the subtree's label followed by its children's labels (each passed through
    fix_label), joined with two spaces. Subtrees without children, or whose first
    child is a plain string (a leaf), are skipped, and so are rules rejected by
    exclude_rule. Returns the list of rule strings in traversal order.
    """
    rules = []

    for node in tree.subtrees():
        kids = list(node)

        # Nothing to emit for an empty node or for a node that leads to leaves
        if not kids or type(kids[0]) is str:
            continue

        # Parent label first, then the (fixed) label of each immediate child
        rule_parts = [node.label()]
        rule_parts.extend(fix_label(kid.label()) for kid in kids)
        curr_rule = '  '.join(rule_parts)

        # Drop rules that mention unneeded nodes
        if exclude_rule(curr_rule):
            continue

        rules.append(curr_rule)

    return rules


def fix_label(label):
    """Return label unchanged if it contains an ASCII letter, otherwise prefixed with 'Punc-'."""
    has_letter = re.search(r'[a-zA-Z]', label) is not None
    return label if has_letter else 'Punc-' + label

def exclude_rule(rule):
    """Return True when any entry of unneeded_nodes occurs as a substring of rule."""
    return any(node in rule for node in unneeded_nodes)

def clean_string(string):
    """Strip stage directions and leading labels from one line of the script.

    Removes every bracketed stage direction (e.g. '[wind]'), collapses the double
    spaces this leaves behind, removes everything up to and including the last ': '
    (labels such as 'KING ARTHUR: '), drops carriage returns and trims surrounding
    whitespace. Returns the cleaned string, which may be empty.
    """
    # [^\]]* stops at the first closing bracket, so each direction is removed on its own.
    # The former class [^)] let one match run from the first '[' to the last ']', which
    # deleted the dialogue between two directions and missed directions containing ')'.
    string = re.sub(r'\[[^\]]*\]','',string).replace('  ',' ')
    # NOTE(review): this match is greedy, so a ': ' inside the dialogue also removes
    # the text before it -- confirm that is acceptable.
    string = re.sub(r'.*: ','',string)
    string = re.sub(r'\r','',string)
    string = string.strip()
    return string

def to_CNF(sentence):
    """Parse a sentence with the module-level parser and return the rules of its tree.

    The first tree produced by parser.raw_parse is taken, chomsky_normal_form() is
    called on it (for its effect on the tree; its return value is not used), and the
    rules are then extracted with get_rules.
    """
    parse_tree = next(parser.raw_parse(sentence))
    parse_tree.chomsky_normal_form()
    return get_rules(parse_tree)
    
def get_grammar(sentences_df):
    """Build a dataframe holding each sentence, its cleaned text and its grammar rules.

    sentences_df is handed to pd.DataFrame as the single column 'sentences'.
    The returned dataframe has the columns:
      - 'sentences':       the input sentences
      - 'sentences_clean': clean_string() applied to each sentence
      - 'rules_CNF':       to_CNF() of the cleaned sentence, or '' when it is blank
    Progress messages are printed while the two steps run.
    """
    def rules_or_blank(text):
        # Blank sentences are not sent to the parser; they get '' instead of a rule list
        if text.strip():
            return to_CNF(text)
        return ''

    result = pd.DataFrame(sentences_df, columns=['sentences'])

    print('cleaning strings...',end='')
    result['sentences_clean'] = result['sentences'].apply(clean_string)
    print('Done!')

    print('getting rules and converting to CNF...',end='')
    result['rules_CNF'] = result['sentences_clean'].apply(rules_or_blank)
    print('Done!')

    return result


In [9]:
# get_grammar does the following:
# 1) reads the Monty Python sentences into a pandas dataframe
# 2) cleans the strings (removes parts like 'SCENE 1:', 'KING ARTHUR:', '[wind]', '[clop clop clop]', etc.)
# 3) parses each sentence into a tree, converts the tree to CNF and extracts its rules
# 4) returns everything as a pandas dataframe

df = get_grammar(sents_monty_python)

cleaning strings...Done!
getting rules and converting to CNF...Done!


In [10]:
# Preview the first ten sentences with their cleaned text and extracted rules
df.head(10)

Unnamed: 0,sentences,sentences_clean,rules_CNF
0,SCENE 1: [wind] [clop clop clop],,
1,KING ARTHUR: Whoa there!,Whoa there!,"[ROOT FRAG, FRAG ADVP FRAG|<NP-.>, ADVP RB..."
2,[clop clop clop],,
3,SOLDIER #1: Halt!,Halt!,"[ROOT S, S VP Punc-., VP VB]"
4,Who goes there?,Who goes there?,"[ROOT SBARQ, SBARQ WHNP SBARQ|<SQ-.>, WHNP ..."
5,"ARTHUR: It is I, Arthur, son of Uther Pendrago...","It is I, Arthur, son of Uther Pendragon, from ...","[ROOT S, S NP S|<VP-.>, NP PRP, S|<VP-.> ..."
6,"King of the Britons, defeator of the Saxons, s...","King of the Britons, defeator of the Saxons, s...","[ROOT NP, NP NP NP|<PP-,-NP-,-NP-.>, NP NN..."
7,SOLDIER #1: Pull the other one!,Pull the other one!,"[ROOT S, S VP Punc-., VP VB NP, NP DT N..."
8,"ARTHUR: I am, ... and this is my trusty serva...","I am, ... and this is my trusty servant Patsy.","[ROOT S, S S S|<:-CC-S-.>, S NP S|<VP-,>,..."
9,We have ridden the length and breadth of the l...,We have ridden the length and breadth of the l...,"[ROOT S, S NP S|<VP-.>, NP PRP, S|<VP-.> ..."


# Grouping rules by Tree

In [11]:
# Keep only the sentences that produced rules ('' marks sentences that were not parsed)
# and build a dataframe with one list of CNF rules per tree, renumbered from 0
has_rules = df['rules_CNF'] != ''
rules_cnf = df.loc[has_rules, ['rules_CNF']].reset_index(drop=True)
rules_cnf.head()

Unnamed: 0,rules_CNF
0,"[ROOT FRAG, FRAG ADVP FRAG|<NP-.>, ADVP RB..."
1,"[ROOT S, S VP Punc-., VP VB]"
2,"[ROOT SBARQ, SBARQ WHNP SBARQ|<SQ-.>, WHNP ..."
3,"[ROOT S, S NP S|<VP-.>, NP PRP, S|<VP-.> ..."
4,"[ROOT NP, NP NP NP|<PP-,-NP-,-NP-.>, NP NN..."


In [12]:
# Unpack the per-tree rule lists into one record per rule.
# Trees are identified by treeid (row index + 1); rules are numbered from 1 inside each tree (ruleid).
rules_trees = [
    (tree_number, rule_number, rule)
    for tree_number, tree_rules in zip(rules_cnf.index + 1, rules_cnf['rules_CNF'])
    for rule_number, rule in enumerate(tree_rules, start=1)
]

# Long-format table: one row per (tree, rule)
rules_cnf = pd.DataFrame(rules_trees,columns=['treeid','ruleid','rule_CNF'])

In [13]:
# Preview the first ten rows: one rule per row, identified by treeid and ruleid
rules_cnf.head(10)

Unnamed: 0,treeid,ruleid,rule_CNF
0,1,1,ROOT FRAG
1,1,2,FRAG ADVP FRAG|<NP-.>
2,1,3,ADVP RB
3,1,4,FRAG|<NP-.> NP Punc-.
4,1,5,NP RB
5,2,1,ROOT S
6,2,2,S VP Punc-.
7,2,3,VP VB
8,3,1,ROOT SBARQ
9,3,2,SBARQ WHNP SBARQ|<SQ-.>


# Getting rules frequency

In [14]:
# Get the frequency of each rule; the frequencies are saved with the rules in the grammar file (S2.gr) at the end

In [15]:
# Count how many times each rule occurs across all trees (most frequent first).
# The columns are named explicitly with rename_axis / reset_index(name=...), so the result is
# ['rule_CNF', 'frequency'] regardless of how value_counts() labels its output. Renaming the
# 'index' and 'rule_CNF' columns afterwards only works on pandas < 2.0; newer versions name the
# counts 'count' and put the column name on the index, which left no 'rule_CNF' column to merge on.
rules_frequency = rules_cnf['rule_CNF'].value_counts().rename_axis('rule_CNF').reset_index(name='frequency')

In [16]:
# Attach to every rule row the number of times that rule occurs (a left join keeps all rule rows)
rules_cnf = rules_cnf.merge(rules_frequency, on='rule_CNF', how='left')

In [17]:
# Rename ROOT to S2, and the label S to S2 wherever it is followed by a space.
# The pattern matches S only as a whole label (start of the string or whitespace before it).
# The plain substring replace of 'S ' also rewrote every label that merely ends in S,
# e.g. 'NNS ' -> 'NNS2 ' and 'POS ' -> 'POS2 '.
# NOTE(review): as before, an S that ends the rule (no trailing space) is left unchanged,
# which is why 'ROOT  S' becomes 'S2  S' -- confirm this is intended.
rules_cnf['rule_CNF'] = rules_cnf['rule_CNF'].apply(lambda x: re.sub(r'(?<!\S)S(?= )', 'S2', x.replace('ROOT','S2')))

In [18]:
# Preview the first ten rows after adding the frequency column and renaming ROOT/S to S2
rules_cnf.head(10)

Unnamed: 0,treeid,ruleid,rule_CNF,frequency
0,1,1,S2 FRAG,419
1,1,2,FRAG ADVP FRAG|<NP-.>,5
2,1,3,ADVP RB,346
3,1,4,FRAG|<NP-.> NP Punc-.,37
4,1,5,NP RB,34
5,2,1,S2 S,964
6,2,2,S2 VP Punc-.,225
7,2,3,VP VB,151
8,3,1,S2 SBARQ,78
9,3,2,SBARQ WHNP SBARQ|<SQ-.>,52


In [19]:
# Write the grammar to 'S2.gr': one line per distinct (frequency, rule) pair, ordered by rule,
# tab-separated, without header row and without the index column
grammar_rows = rules_cnf[['frequency','rule_CNF']].drop_duplicates()
grammar_rows = grammar_rows.sort_values('rule_CNF')
grammar_rows.to_csv('S2.gr', sep='\t', header=False, index=False)