In [34]:
from nltk.corpus import treebank
# use pandas to create a dataframe with the vocabulary only to sort by POS
import pandas as pd
import nltk

def get_tags(word, preterminals):
    tags = []
    for pt in preterminals:
        leaf = [child for child in pt][0]

        if word == leaf and not pt.label() in tags:
            tags.append(pt.label())

    return tags

In [35]:
# Get allowed words
allowed_words_file = open('allowed_words.txt', 'r')
allowed_words = allowed_words_file.read().split('\n')

In [36]:
# get trees for Penn Treebank
trees_by_file = [treebank.parsed_sents(id) for id in treebank.fileids()]
trees = [tree for files in trees_by_file for tree in files]

# converting trees into Chomsky Normal Forms
for i in range(len(trees)):
    tree = trees[i]
    tree.chomsky_normal_form()

In [37]:
preterminals = [subtree for tree in trees for subtree in tree.subtrees() if subtree.height() == 2]

In [115]:
# Creating a pandas dataframe with the terminals and POS
terminals_df = pd.DataFrame(data={'POS':[], 'Word':[]})
for leaf in preterminals:
    terminals_df = pd.concat([terminals_df,
                              pd.DataFrame(data={'POS':[leaf.label()], 'Word':[leaf.leaves()[0]]})])

# Creating a column with the number of occurrences of the word in the corpus
terminals_df['count'] = 1
terminals_df_group = terminals_df.groupby(['POS','Word']).sum().reset_index()

# Filter only words present in Allowed_words.txt
terminals_df_group = terminals_df_group[terminals_df_group.Word.isin(allowed_words)]

# Get words from allowed_words.txt not identified
missing_words = list(set(allowed_words)-set(terminals_df.Word.values))

missing_words_df = pd.DataFrame([nltk.pos_tag([word])[0] for word in missing_words[1:]],columns=['Word','POS'])
missing_words_df['count'] = 1

terminals_df_group = pd.concat([terminals_df_group,missing_words_df])

# Saving file
terminals_df_group[['count','POS','Word']].to_csv('newvocab.txt',sep='\t',index=False,header=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [68]:
# Selecting and saving the unassigned words
unassigned_words = terminals_df[terminals_df.Word.isin(allowed_words) == False]['Word'].unique()

with open('unassigned_words.txt', 'w') as f:
    f.write('\n'.join(unassigned_words))