In [1]:
import os
from tree_utils import Tree

In [2]:
# Root of the parsed WSJ section of the Penn Treebank. The PTB_WSJ_DIR
# environment variable overrides the default, so the notebook can run on a
# machine where the corpus lives somewhere else without editing this cell.
data_dir = os.environ.get(
    'PTB_WSJ_DIR', '/Users/xy/data/PennTreebank_full/parsed/prd/wsj')
# Require a directory (not merely an existing path): it is passed to
# os.listdir() further down, which fails on a regular file.
assert os.path.isdir(data_dir), 'data_dir not found'

In [3]:
# Collect the names of the immediate subdirectories of data_dir, sorted
# lexicographically, then report how many there are.
sub_dirs = sorted(
    name
    for name in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, name))
)
print(f'Found {len(sub_dirs)} subdirectories in {data_dir}')
print(sub_dirs)

Found 25 subdirectories in /Users/xy/data/PennTreebank_full/parsed/prd/wsj
['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24']


In [4]:
# Gather the full path of every .prd file found directly inside each
# subdirectory, visiting the subdirectories in sub_dirs order.
prd_files = [
    os.path.join(sub_path, name)
    for sub_path in (os.path.join(data_dir, sub_dir) for sub_dir in sub_dirs)
    for name in os.listdir(sub_path)
    if name.endswith('.prd')
]
print(f'Found {len(prd_files)} .prd files in total')
print('Examples:', prd_files[:1])

Found 2312 .prd files in total
Examples: ['/Users/xy/data/PennTreebank_full/parsed/prd/wsj/00/wsj_0042.prd']


In [5]:
# Work on a .prd example
example_prd_file = '/Users/xy/data/PennTreebank_full/parsed/prd/wsj/00/wsj_0008.prd'


def read_tree_strings(prd_path):
    """Split a .prd file into one string per top-level bracketed tree.

    Lines are stripped, blank lines are skipped, and the remaining lines are
    accumulated until the number of '(' seen equals the number of ')' seen;
    the accumulated lines then form one tree string.

    Args:
        prd_path: path of the .prd file to read.

    Returns:
        A list of tree strings, in file order.

    Raises:
        ValueError: if a line closes more parentheses than are open, or if
            the file ends while a tree is still open.
    """
    trees = []
    pending = []  # stripped lines of the tree currently being accumulated
    depth = 0     # '(' minus ')' counted so far within the current tree
    with open(prd_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            depth += line.count('(') - line.count(')')
            if depth < 0:
                raise ValueError('Something wrong with the parentheses')
            pending.append(line)
            if depth == 0:
                # NOTE(review): lines are joined with no separator, so a word
                # ending one line and a word starting the next would fuse --
                # confirm this cannot happen in the .prd layout.
                trees.append(''.join(pending))
                pending = []
    if pending:
        # Without this check a truncated final tree would be dropped silently.
        raise ValueError(
            f'Unbalanced parentheses at end of {prd_path}: '
            f'{depth} still open')
    return trees


tree_strings = read_tree_strings(example_prd_file)

print(f'Found {len(tree_strings)} trees in {example_prd_file}')
print('Examples:')
print(tree_strings[0])

Found 6 trees in /Users/xy/data/PennTreebank_full/parsed/prd/wsj/00/wsj_0008.prd
Examples:
( (S (NP-SBJ The federal government)(VP suspended(NP (NP sales)(PP of(NP U.S. savings bonds)))(SBAR-PRP because(S (NP-SBJ Congress)(VP hasn't(VP lifted(NP (NP the ceiling)(PP-LOC on(NP government debt)))))))).))


In [7]:
def _show_tree(tree):
    """Print a tree, then print the list returned by its leaves() method."""
    print(tree)
    print(tree.leaves())


# Build a Tree from each of the first two tree strings and display it.
tree0 = Tree(tree_strings[0])
_show_tree(tree0)

tree1 = Tree(tree_strings[1])
_show_tree(tree1)

(S ('NP-SBJ', ['The federal government'])('VP', ['suspended', ('NP', [('NP', ['sales']), ('PP', ['of', ('NP', ['U.S. savings bonds'])])]), ('SBAR-PRP', ['because', ('S', [('NP-SBJ', ['Congress']), ('VP', ["hasn't", ('VP', ['lifted', ('NP', [('NP', ['the ceiling']), ('PP-LOC', ['on', ('NP', ['government debt'])])])])])])])]).)
['The federal government', 'suspended', 'sales', 'of', 'U.S. savings bonds', 'because', 'Congress', "hasn't", 'lifted', 'the ceiling', 'on', 'government debt', '.']
(S ('S-TPC-1', [('SBAR-TMP', ['Until', ('S', [('NP-SBJ', ['Congress']), ('VP', ['acts'])])]), ',', ('NP-SBJ', ['the government']), ('VP', ["hasn't", ('NP', ['any authority', ('S', [('NP-SBJ', ['*']), ('VP', ['to', ('VP', ['issue', ('NP', [('NP', ['new debt obligations']), ('PP', ['of', ('NP', ['any kind'])])])])])])])])]),('NP-SBJ', ['the Treasury'])('VP', ['said', ('SBAR', ['0', ('S', ['*T*-1'])])]).)
['Until', 'Congress', 'acts', ',', 'the government', "hasn't", 'any authority', '*', 'to', 'issue', '