In [1]:
from swda_time import CorpusReader
from random import randrange
import complexity
from nltk.tree import Tree
import numpy as np

# Shared corpus handle used by every cell below.
# NOTE(review): the two arguments are presumably the corpus directory and the
# metadata CSV path, both relative to the notebook's working directory - confirm.
corpus = CorpusReader('swda', 'swda/swda-metadata.csv')

In [2]:
#find a random example and compute the non-normalized measures
iterator = corpus.iter_utterances()

for i in range(randrange(1000)):
    s = iterator.next()

for tree in s.trees:
    print tree
    print 'Length: {}'.format(complexity.length(tree))
    print 'Depth: {}'.format(complexity.depth(tree))
    print 'Width: {}'.format(complexity.width(tree))
    #print 'Depth*width: {}'.format(complexity.balanced(tree))
    #print 'Balanced depth*width: {}'.format(complexity.balanced2(tree))
    #print 'Average depth: {}'.format(complexity.avdepth(tree))
print "Lu's measures:"
for item in complexity.lus_measures(s.trees).items():
    print'{}:{}'.format(*item)
    

utterance 527

(S
  (CC Or)
  (NP-SBJ (NNS people))
  (VP
    (VBP send)
    (NP (PRP you))
    (ADVP-DIR (RB there))
    (PP (IN as) (NP (DT a) (JJ last) (NN resort))))
  (. .)
  (-DFL- E_S))
Length: 10
Depth: 6
Width: 1.55555555556
Lu's measures:
(ROOT (S (CC Or) (NP (NNS people)) (VP (VBP send) (NP (PRP you)) (ADVP (RB there)) (PP (IN as) (NP (DT a) (JJ last) (NN resort)))) (. .) (-DFL- E_S)))
S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])] (S (CC Or) (NP (NNS people)) (VP (VBP send) (NP (PRP you)) (ADVP (RB there)) (PP (IN as) (NP (DT a) (JJ last) (NN resort)))) (. .) (-DFL- E_S))
S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP] (S (CC Or) (NP (NNS people)) (VP (VBP send) (NP (PRP you)) (ADVP (RB there)) (PP (IN as) (NP (DT a) (JJ last) (NN resort)))) (. .) (-DFL- E_S))
ROOT
{'S < (VP <# VBG|TO) $+ VP': 0, 'S|SBARQ|SINV|SQ [> ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]] << (SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ

In [None]:
#print the normalized measures for the random example
for tree in s.trees:
    print 'Normalized depth: {}'.format(complexity.ndepth(tree))
    print 'Normalized width: {}'.format(complexity.nwidth(tree))
    print 'Normalized depth*width: {}'.format(complexity.nbalanced(tree))
    print 'Normalized balanced depth*width: {}'.format(complexity.nbalanced2(tree))
    print 'Normalized average depth: {}'.format(complexity.n_avdepth(tree))

In [None]:
# Complexity measures, keyed by the display name used in the reports below.
# Add additional measures to exactly ONE of the two groups; `measures` is
# derived from them so the three dicts cannot drift apart.

# Raw measures: the closest-to-average ranking compares these to the mean
# score over all collected trees.
non_normalized_measures = {'length': complexity.length,
                           'depth': complexity.depth,
                           'width': complexity.width,
                           'depth*width': complexity.balanced,
                           'balanced depth*width': complexity.balanced2,
                           # was only listed in `measures`, so it never got
                           # a closest-to-average ranking
                           'average depth': complexity.avdepth}

# Normalized measures: the closest-to-average ranking compares these to 1.
normalized_measures = {'normalized depth': complexity.ndepth,
                       'normalized width': complexity.nwidth,
                       'normalized depth*width': complexity.nbalanced,
                       'normalized average depth': complexity.n_avdepth,
                       'normalized balanced depth*width': complexity.nbalanced2}

# Every measure, for easy iteration (union of the two groups above).
measures = dict(non_normalized_measures)
measures.update(normalized_measures)

# One list per measure, filled in corpus order, so position k refers to the
# same tree in every list. Duplicates are kept here and removed later.
ordered_results = {measure: [] for measure in measures}

for conv in corpus.iter_transcripts(display_progress=False):
    for utterance in conv.utterances:
        for tree in utterance.trees:
            # Store the tree as a string: the entries must be hashable so
            # that duplicates can be dropped with a set afterwards.
            tree_str = str(tree)
            for name, score in measures.items():
                entry = (conv.conversation_no, score(tree), tree_str)
                ordered_results[name].append(entry)

# Per measure: drop duplicate (conversation, score, tree) entries, then
# order what is left by ascending score.
results = {}
for measure in ordered_results:
    unique_entries = set(ordered_results[measure])
    results[measure] = sorted(unique_entries, key=lambda entry: entry[1])

# Rank the entries of each measure by distance from a reference score,
# farthest first, so the entries closest to average sit at the END of
# each list.
av_results = {}

# Non-normalized measures: the reference is the mean score of the measure.
for measure in non_normalized_measures:
    entries = results[measure]
    mean = np.mean([entry[1] for entry in entries])
    av_results[measure] = sorted(entries,
                                 key=lambda entry: abs(entry[1] - mean),
                                 reverse=True)

# Normalized measures: the reference is fixed at 1.
for measure in normalized_measures:
    entries = results[measure]
    av_results[measure] = sorted(entries,
                                 key=lambda entry: abs(entry[1] - 1),
                                 reverse=True)

In [None]:
# code to display top, bot and closest to average n trees per measure

n = 5 #length of lists to be displayed

# modifiers with associated functions
mods = {'Most': lambda x: x[len(x)-n:], 'Least': lambda x: x[:n]}

for measure in measures:
    for modifier in mods:
        print '{} complex trees by {}:\n'.format(modifier, measure)
        for i, item in enumerate(mods[modifier](results[measure]), 1):
            tree = Tree.fromstring(item[2])
            text = str(tree.flatten()).replace('\n', '')
            print '{}.(from conv. no. {}) {} \n {}\n'.format(i, item[0], text, tree)
        print '\n'

for measure in av_results:
    print 'Closest to average trees by {}\n'.format(measure)
    for i, item in enumerate(mods['Most'](av_results[measure]), 1):
        tree = Tree.fromstring(item[2])
        text = str(tree.flatten()).replace('\n', '')
        print '{}.(from conv. no. {}) {} \n {}\n'.format(i, item[0], text, tree)

In [None]:
for measure in results:
    print len(results[measure])
    
for i, item in enumerate(ordered_results['normalized depth']):
    assert item[2] == ordered_results['normalized average depth'][i][2]

In [None]:
# code to find distinguishing sentences for each (sensible) pair of normalized measures

i = 0
measure_list = list(normalized_measures)
diff = {}

while i < len(measure_list):
    one = measure_list[i]
    data_one = [x[1] for x in ordered_results[one]]
    for other in measure_list[i + 1:]:
        data_other = [x[1] for x in ordered_results[other]]
        comp = zip(data_one, data_other)
        index, values = max(enumerate(comp), key=lambda x: abs(x[1][0] - x[1][1]))
        # sanity check: Are these actually values for the same tree?
        #               Do they actually have these values?
        assert ordered_results[one][index][2] == ordered_results[other][index][2]
        assert ordered_results[one][index][1] == values[0]
        assert ordered_results[other][index][1] == values[1]
        tree = ordered_results[one][index][2]
        diff[(one, other)] = (tree, values)
    i += 1
    
for pair in diff:
    print 'Largest difference in value between {} and {}:'.format(*pair)
    tree, values = diff[pair]
    print 'Value for {pair[0]}: {values[0]}, value for {pair[1]}: {values[1]}'.format(pair=pair, values=values)
    print Tree.fromstring(tree)