In [1]:
from swda_time import CorpusReader
from random import randrange
import complexity
from nltk.tree import Tree

# Corpus reader shared by every cell below.
# NOTE(review): the arguments look like the corpus directory and its metadata CSV,
# both relative to the notebook's working directory — confirm against swda_time.
corpus = CorpusReader('swda', 'swda/swda-metadata.csv')

In [2]:
# Find a random example and compute the non-normalized measures.
iterator = corpus.iter_utterances()

# Always advance at least once: randrange(1000) may return 0, which would
# otherwise leave `s` unassigned (NameError on a fresh kernel) or silently
# reuse a stale `s` from an earlier run.
for _ in range(randrange(1000) + 1):
    s = next(iterator)

# An utterance can carry several parse trees; report every measure for each.
for tree in s.trees:
    print(tree)
    print('Length: {}'.format(complexity.length(tree)))
    print('Depth: {}'.format(complexity.depth(tree)))
    print('Width: {}'.format(complexity.width(tree)))
    print('Depth*width: {}'.format(complexity.balanced(tree)))
    print('Average depth: {}'.format(complexity.avdepth(tree)))

utterance 214

(S
  (CC and)
  (EDITED
    (RM (-DFL- \[))
    (NP-SBJ-UNF (XX tha-))
    (, ,)
    (IP (-DFL- \+)))
  (NP-SBJ (PRP they))
  (RS (-DFL- \]))
  (VP
    (VBD did)
    (RB n't)
    (VP
      (VB find)
      (NP (NP (DT any) (NN problem)) (PP (IN with) (NP (DT that))))
      (PRN (, ,) (S (NP-SBJ (PRP you)) (VP (VBP know))))))
  (, ,)
  (-DFL- E_S))
Length: 18
Depth: 8
Width: 1.5
Depth*width: 12.0
Average depth: 5.26315789474


In [None]:
# Report the normalized variant of each measure for the sampled utterance.
# Each pair is (output template, function producing the value).
normalized_report = (
    ('Normalized depth: {}', complexity.ndepth),
    ('Normalized width: {}', complexity.nwidth),
    ('Normalized depth*width: {}', complexity.nbalanced),
    ('Normalized average depth: {}', complexity.n_avdepth),
)

for tree in s.trees:
    for template, measure_fn in normalized_report:
        print(template.format(measure_fn(tree)))

In [None]:
# Display name -> function computing that normalized measure, for easy iteration.
measures = {'normalized depth': complexity.ndepth,
            'normalized width': complexity.nwidth,
            # Must be the normalized variant (nbalanced), matching the label;
            # the non-normalized complexity.balanced was used here by mistake.
            'normalized depth*width': complexity.nbalanced,
            'normalized average depth': complexity.n_avdepth}

# One set per measure so that duplicate (conversation, score, tree) entries collapse.
results = {measure: set() for measure in measures}

for conv in corpus.iter_transcripts(display_progress=False):
    for utt in conv.utterances:
        for tree in utt.trees:
            # Trees need to be cast to string to be hashable; the string does
            # not depend on the measure, so build it once per tree.
            tree_str = str(tree)
            for measure, measure_fn in measures.items():
                results[measure].add((conv.conversation_no, measure_fn(tree), tree_str))

def reconstruct_tree(item):
    """
    Rebuild a result entry whose third field holds a tree in string form.

    The entry is copied, its element at index 2 is parsed back into a Tree
    with Tree.fromstring, and the result is returned as a tuple; all other
    fields are passed through untouched.
    """
    fields = list(item)
    parsed = Tree.fromstring(fields[2])
    return tuple(fields[:2] + [parsed] + fields[3:])

# Replace each result set with a list of entries (trees parsed back from
# their string form), ordered by ascending score (field 1).
for measure in results:
    results[measure] = sorted(map(reconstruct_tree, results[measure]),
                              key=lambda entry: entry[1])

# The same entries re-ranked by how far their score lies from 1.
# NOTE(review): this treats 1 as the "average" normalized score — confirm
# against the normalization used in the complexity module.
av_results = {}
for measure in measures:
    ranked = results[measure]
    av_results[measure] = sorted(ranked, key=lambda entry: abs(entry[1] - 1))

In [None]:
# Modifiers with associated slicing functions. The result lists are sorted by
# ascending score, so the last ten entries are the highest-scoring ("Most")
# and the first ten the lowest-scoring ("Least").
mods = {'Most': lambda x: x[len(x)-10:], 'Least': lambda x: x[:10]}


def print_ranked(items):
    """
    Print a numbered listing of result entries.

    Each entry is indexed as item[0] (conversation number) and item[2] (a
    Tree). For every entry the flattened tree is printed on one line next to
    the conversation number, followed by the full tree. Numbering starts at 1.
    """
    for i, item in enumerate(items, 1):
        # Strip newlines so the flattened tree stays on a single line.
        text = str(item[2].flatten()).replace('\n', '')
        print('{}. {item[0]} {} \n {item[2]!s}'.format(i, text, item=item))


for measure in measures:
    for modifier in mods:
        print('{} complex trees by {}:\n'.format(modifier, measure))
        print_ranked(mods[modifier](results[measure]))
        print('\n')

for measure in av_results:
    # Fixed: the heading referenced an undefined `name`, and the enumerate
    # call carried a stray ')' that made the whole cell a SyntaxError.
    print('Closest to average trees by {}'.format(measure))
    print_ranked(av_results[measure][:10])