# natural language syntax

compositional syntax - structure of language

# represent English syntax

the tree data abstraction can represent the structure of a sentence

In [1]:
def tree(label, branches=[]):
    """Build a tree as a list: the label first, then each branch in order.

    The default ``branches`` list is only read (its items are copied into
    the result), never mutated.
    """
    return [label, *branches]

def label(tree):
    """Give back the label, which is stored as the first item of ``tree``."""
    root_value = tree[0]
    return root_value

def branches(tree):
    """Give back everything stored after the label: the list of sub-trees."""
    subtrees = tree[1:]
    return subtrees

def is_leaf(tree):
    """Report whether ``tree`` has no branches (True) or at least one (False)."""
    return len(branches(tree)) == 0

def leaves(tree):
    """Return a list containing the leaf labels of tree, in branch order.

    A leaf contributes its own label; any other tree contributes the
    leaves of each of its branches, concatenated in order. The tree is
    only inspected through ``is_leaf``, ``label`` and ``branches``.
    """
    if is_leaf(tree):
        return [label(tree)]
    # Extend one accumulator instead of ``sum(list_of_lists, [])``, which
    # copies the growing result once per branch.
    collected = []
    for b in branches(tree):
        collected.extend(leaves(b))
    return collected
    
def is_tree(tree):
    """Check that ``tree`` is a non-empty list whose branches are all trees."""
    if type(tree) is not list or not tree:
        return False
    return all(is_tree(sub) for sub in branches(tree))

def print_tree(t, indent=0):
    """Print the label of ``t`` and of every descendant, one label per line.

    Each line is prefixed with three spaces per level; ``indent`` is the
    level at which ``t`` itself is printed, and branches go one level deeper.
    """
    padding = '   ' * indent
    print(padding + str(label(t)))
    deeper = indent + 1
    for sub in branches(t):
        print_tree(sub, deeper)

In [2]:
# Parse tree for the fragment "a little bug.": each word is a leaf placed
# under its own tag node (DT, JJ, NN and '.').
example = tree('ROOT', [
    tree('FRAG', [
        tree('NP', [
            tree('DT', [tree('a')]),
            tree('JJ', [tree('little')]),
            tree('NN', [tree('bug')]),
        ]),
        tree('.', [tree('.')]),
    ]),
])

In [3]:
example

['ROOT',
 ['FRAG',
  ['NP', ['DT', ['a']], ['JJ', ['little']], ['NN', ['bug']]],
  ['.', ['.']]]]

In [4]:
leaves(example)

['a', 'little', 'bug', '.']

In [5]:
'a little bug.'

'a little bug.'

In [6]:
from string import punctuation
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
# Contraction suffix tokens; `words` joins these to the preceding text
# without inserting a space.
contractions = ["n't", "'s", "'re", "'ve"]

In [8]:
['they', "'re", 'coming', 'over']

['they', "'re", 'coming', 'over']

In [12]:
def words(t):
    """Join the leaf labels of ``t`` into one sentence string.

    Leaves are separated by a single space, except that no space is put
    before a leaf found in ``punctuation`` (other than '$') or listed in
    ``contractions``, and none is put before the first text.
    """
    sentence = ''
    for leaf in leaves(t):
        attaches = leaf in contractions or (leaf != '$' and leaf in punctuation)
        separator = '' if (attaches or not sentence) else ' '
        sentence += separator + leaf
    return sentence
    
def replace(t, s, w):
    """Return a copy of ``t`` where each node labelled ``s`` holds only leaf ``w``.

    A node whose label equals ``s`` is rebuilt as ``s`` over the single
    leaf ``w`` (its previous branches are dropped). Any other node keeps
    its label and has the replacement applied to each of its branches.
    """
    if label(t) != s:
        rebuilt = [replace(b, s, w) for b in branches(t)]
        return tree(label(t), rebuilt)
    return tree(s, [tree(w)])

In [13]:
replace(example, 'JJ', 'huge')

['ROOT',
 ['FRAG',
  ['NP', ['DT', ['a']], ['JJ', ['huge']], ['NN', ['bug']]],
  ['.', ['.']]]]

In [14]:
words(example)

'a little bug.'

# files, strings and lists

Some files are plain text and can be read directly into Python as strings.

- `.strip()` returns a copy of the string with leading and trailing whitespace (spaces, tabs, newlines, etc.) removed
- `.split()` returns a list of the substrings that were separated by whitespace
- `.replace(a, b)` returns a copy of the string with all instances of string `a` replaced by `b`


In [15]:
def read_trees(lines):
    """Split parenthesized text into one flat token list per tree.

    Blank lines are skipped. Each parenthesis becomes a token of its own
    and the rest of the line is split on whitespace. Lines accumulate into
    the current tree until its '(' and ')' counts match; tokens of a final
    tree that never balances are not included in the result.
    """
    finished = []
    pending = []
    for raw in lines:
        if not raw.strip():
            continue
        spaced = raw.replace('(', ' ( ').replace(')', ' ) ')
        pending += spaced.split()
        if pending.count('(') == pending.count(')'):
            finished.append(pending)
            pending = []
    return finished


def all_trees(path='CHILDESTreebank-curr/suppes.parsed'):
    """Read the file at ``path`` and return its trees via ``read_trees``.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as its lines have been read, rather than being left open.
    """
    with open(path) as treebank:
        lines = treebank.readlines()
    return read_trees(lines)

In [16]:
all_trees()

[['(',
  'ROOT',
  '(',
  'S',
  '(',
  'NP',
  '(',
  'NN',
  'this',
  ')',
  ')',
  '(',
  'VP',
  '(',
  'COP',
  'is',
  ')',
  '(',
  'NP',
  '(',
  'DT',
  'a',
  ')',
  '(',
  'NN',
  'book',
  ')',
  ')',
  ')',
  '(',
  '.',
  '?',
  ')',
  ')',
  ')'],
 ['(',
  'ROOT',
  '(',
  'SBARQ',
  '(',
  'WHNP',
  '(',
  'WP',
  'what',
  ')',
  ')',
  '(',
  'SQ',
  '(',
  'AUX',
  'do',
  ')',
  '(',
  'NP',
  '(',
  'PRP',
  'you',
  ')',
  ')',
  '(',
  'VP',
  '(',
  'VB',
  'see',
  ')',
  '(',
  'PP',
  '(',
  'IN',
  'in',
  ')',
  '(',
  'NP',
  '(',
  'DT',
  'the',
  ')',
  '(',
  'NN',
  'book',
  ')',
  ')',
  ')',
  ')',
  ')',
  '(',
  '.',
  '?',
  ')',
  ')',
  ')'],
 ['(',
  'ROOT',
  '(',
  'SBARQ',
  '(',
  'WHADJP',
  '(',
  'WRB',
  'how',
  ')',
  '(',
  'JJ',
  'many',
  ')',
  ')',
  '(',
  'SQ',
  '(',
  'NP',
  '(',
  'NNS',
  'rabbits',
  ')',
  ')',
  ')',
  '(',
  '.',
  '?',
  ')',
  ')',
  ')'],
 ['(',
  'ROOT',
  '(',
  'FRAG',
  '(',
  'WRB',
  'how'

In [17]:
data = all_trees()

In [18]:
len(data)

35906

In [19]:
data[0]

['(',
 'ROOT',
 '(',
 'S',
 '(',
 'NP',
 '(',
 'NN',
 'this',
 ')',
 ')',
 '(',
 'VP',
 '(',
 'COP',
 'is',
 ')',
 '(',
 'NP',
 '(',
 'DT',
 'a',
 ')',
 '(',
 'NN',
 'book',
 ')',
 ')',
 ')',
 '(',
 '.',
 '?',
 ')',
 ')',
 ')']

In [20]:
min(data, key=len)

['(', 'ROOT', '(', 'CC', 'and', ')', ')']

# tree representation

a tree represented as a list of tokens

In [21]:
def tree(label, branches=[]):
    """Construct a tree in the flat token-list representation.

    With no branches the result is the one-token list ``[label]``.
    Otherwise it is ``'('``, the label, the tokens of every branch in
    order, and a closing ``')'``.

    The default ``branches`` list is only read, never mutated.
    """
    if not branches:
        return [label]
    # Extend a single list rather than ``sum(branches, [])``, which re-copies
    # the accumulated tokens once per branch.
    tokens = ['(', label]
    for branch in branches:
        tokens.extend(branch)
    tokens.append(')')
    return tokens
    
def label(tree):
    """Return the label of a token-list tree.

    A one-token tree yields that token. Otherwise the first token must be
    an opening parenthesis and the label is the token that follows it.
    """
    if len(tree) != 1:
        assert tree[0] == '(', tree
        return tree[1]
    return tree[0]
    
def branches(tree):
    """Return the branches of a token-list tree, each as its own token list.

    A one-token tree has no branches. Otherwise the tokens after the label
    are scanned while tracking parenthesis depth (starting at 1 for the
    tree's own opening parenthesis); each time the depth is back at 1 the
    tokens gathered so far form one complete branch.
    """
    if len(tree) == 1:
        return []
    assert tree[0] == '('
    depth = 1
    found = []
    pending = []
    for tok in tree[2:]:
        pending.append(tok)
        # +1 for an opening parenthesis, -1 for a closing one, 0 otherwise.
        depth += (tok == '(') - (tok == ')')
        if depth == 1:
            found.append(pending)
            pending = []
    # The tree's own closing parenthesis must bring the depth to zero.
    assert depth == 0
    return found

def is_leaf(tree):
    """Tell whether ``tree`` has no branches."""
    return len(branches(tree)) == 0

In [22]:
# Parse tree for the fragment "a little bug.", assembled only through the
# `tree` constructor: each word is a leaf under its own tag node.
example = tree('ROOT', [
    tree('FRAG', [
        tree('NP', [
            tree('DT', [tree('a')]),
            tree('JJ', [tree('little')]),
            tree('NN', [tree('bug')]),
        ]),
        tree('.', [tree('.')]),
    ]),
])

In [23]:
leaves(example)

['a', 'little', 'bug', '.']

In [24]:
words(example)

'a little bug.'

In [None]:
def replace_all(s, w):
    """Show each tree from ``all_trees()`` that ``replace(t, s, w)`` changes.

    For every changed tree, the words of the original and of the rewritten
    tree are printed, then the loop waits on ``input()`` before continuing.
    """
    for original in all_trees():
        changed = replace(original, s, w)
        if original == changed:
            continue
        print(words(original))
        print(words(changed))
        input()

In [None]:
replace_all('NP', 'Oski')