In [1]:
from swda_time import CorpusReader
from random import randrange
from itertools import chain
from collections import defaultdict
from complexity import ComplexityMeasures
from nltk.tree import Tree
import numpy as np

# Readers for the two corpora. Each CorpusReader is constructed from two
# strings; presumably the corpus directory and the path of its metadata CSV
# (TODO confirm against swda_time.CorpusReader).
swbd = CorpusReader('swda_complete', 'swda_complete/swda-metadata.csv')
bnc = CorpusReader('bnc_complete', 'bnc_complete/bnc-metadata.csv')

# `corpus` is the corpus the cells below iterate over (Switchboard here);
# `complexity` is the measure object built from it and used for every
# per-tree measure in this notebook.
corpus = swbd
complexity = ComplexityMeasures(corpus)

In [2]:
#find a random example and compute the non-normalized measures
iterator = corpus.iter_utterances()

for i in range(randrange(1000)):
    s = iterator.next()

for tree in s.trees:
    print tree
    print 'Length: {}'.format(complexity.length(tree))
    print 'Depth: {}'.format(complexity.depth(tree))
    print 'Width: {}'.format(complexity.width(tree))
    #print 'Depth*width: {}'.format(complexity.balanced(tree))
    print 'Balanced depth*width: {}'.format(complexity.balanced2(tree))
    print 'Average depth: {}'.format(complexity.avdepth(tree))
print "Lu's measures:"
for item in complexity.lus_measures(s.trees).items():
    print'{}:{}'.format(*item)
    

utterance 77

(S
  (ADVP (RB probably))
  (NP-SBJ
    (NP (CD one))
    (PP
      (IN of)
      (NP
        (NP (DT the) (JJS biggest) (NNS decisions))
        (PRN (S (NP-SBJ (PRP I)) (VP (VBP think))))
        (SBAR
          (WHNP-1 (WDT that))
          (S
            (NP-SBJ (-NONE- *T*-1))
            (VP
              (VBD was)
              (ADJP-PRD
                (RB very)
                (JJ strengthening)
                (PP (IN for) (NP (PRP$ our) (NN family))))))))))
  (VP-UNF
    (VBD was)
    (SBAR
      (SBAR
        (IN rather)
        (IN than)
        (S
          (NP-SBJ (-NONE- *))
          (VP
            (VB have)
            (S
              (NP-SBJ (CD one) (NN child))
              (VP (VBP make) (NP (DT that) (NN decision)))))))
      (SBAR
        (IN than)
        (ADVP (RB just))
        (S (NP-SBJ (-NONE- *)) (VP (VB delegate) (NP (PRP it)))))))
  (. .)
  (-DFL- E_S))
Length: 32
Depth: 12
Width: 1.47058823529
Balanced depth*width: 1.59411764706
Average depth: 7.9393

In [3]:
#print the normalized measures for the random example
for tree in s.trees:
    print 'Normalized depth: {}'.format(complexity.ndepth(tree))
    print 'Normalized width: {}'.format(complexity.nwidth(tree))
    #print 'Normalized depth*width: {}'.format(complexity.nbalanced(tree))
    print 'Normalized balanced depth*width: {}'.format(complexity.nbalanced2(tree))
    print 'Normalized average depth: {}'.format(complexity.n_avdepth(tree))

Normalized depth: 0.899871078642
Normalized width: 0.976439812435
Normalized balanced depth*width: 0.845779697886
Normalized average depth: 0.975203438457


In [19]:
# Fast order-preserving removal of duplicates:
# https://stackoverflow.com/questions/480214/how-do-you-remove-duplicates-from-a-list-in-whilst-preserving-order
def f7(seq):
    """Return the items of `seq` as a list with later duplicates dropped.

    The first occurrence of every item is kept and the original order is
    preserved. Items must be hashable because membership is tracked in a set.
    """
    already_seen = set()
    unique = []
    for element in seq:
        if element in already_seen:
            continue
        already_seen.add(element)
        unique.append(element)
    return unique

# Dicts with the measures for easy iteration.
# Each dict maps a display name to a method of `complexity` that is called
# with a single tree. Add additional measures here.
#
# `measures` holds every measure that is computed for each tree of the corpus.
measures = {'length': complexity.length,
            'depth': complexity.depth,
            'width': complexity.width,
            #'depth*width': complexity.balanced,
            'balanced depth*width': complexity.balanced2,
            'average depth': complexity.avdepth,
            'normalized depth': complexity.ndepth,
            'normalized width': complexity.nwidth,
            #'normalized depth*width': complexity.nbalanced,
            'normalized average depth': complexity.n_avdepth,
            #'normalized balanced depth*width': complexity.nbalanced2
           }

# Measures whose "closest to average" ranking is taken relative to the mean
# of the observed values.
non_normalized_measures = {'length': complexity.length,
                           'depth': complexity.depth,
                           'width': complexity.width,
                           'average depth': complexity.avdepth,
                           #'depth*width': complexity.balanced,
                           }

# Measures whose "closest to average" ranking is taken relative to 1.
# Note that 'balanced depth*width' is grouped here although its name does not
# say "normalized".
# Every key of the two dicts above and below must also be a key of `measures`,
# because the rankings look the values up in the results computed per key of
# `measures`.
normalized_measures = {'normalized depth': complexity.ndepth,
                       'normalized width': complexity.nwidth,
                       #'normalized depth*width': complexity.nbalanced,
                       'normalized average depth': complexity.n_avdepth,
                       #'normalized balanced depth*width': complexity.nbalanced2,
                       'balanced depth*width': complexity.balanced2}

# Per measure: a list of (conversation number, value, tree string) triples in
# corpus order. Lists (not sets) are used so that the corpus order survives;
# duplicates are removed afterwards with the order-preserving f7.
ordered_results = dict([(measure, []) for measure in measures])
#normalized_values = dict([(measure, defaultdict(list)) for measure in normalized_measures])

for conv in corpus.iter_transcripts(display_progress=True):
    for utt in conv.utterances:
        for tree in utt.trees:
            # Trees need to be cast to string to be hashable for duplicate removal
            string = str(tree)
            for measure, measure_fn in measures.items():
                value = measure_fn(tree)
                ordered_results[measure].append((conv.conversation_no, value, string))

# Remove duplicate triples. `ordered_results` keeps corpus order (the same
# tree ends up at the same index for every measure, as long as a measure
# always returns the same value for the same tree); `results` holds the same
# triples sorted, i.e. by conversation number, then value, then tree string.
results = dict([(measure, []) for measure in measures])
for measure in ordered_results:
    ordered_results[measure] = f7(ordered_results[measure])
    results[measure] = sorted(ordered_results[measure])

# Rank the triples by distance from the "average" value.
# The lists are sorted farthest-first (reverse=True), so the entries CLOSEST
# to the average sit at the END of each list; the display code takes the tail
# of the list and reverses it.
av_results = {}

# Non-normalized measures: distance from the mean of the observed values.
for measure in non_normalized_measures:
    mean = np.mean([x[1] for x in results[measure]])
    av_results[measure] = sorted(results[measure], key=lambda x: abs(x[1] - mean), reverse=True)
# Normalized measures: distance from 1.
for measure in normalized_measures:
    av_results[measure] = sorted(results[measure], key=lambda x: abs(x[1] - 1), reverse=True)


transcript 1127


In [34]:
# Compute standard deviations per length for every measure in
# `normalized_values` and store them on the `complexity` object.
#
# NOTE(review): `normalized_values` is not assigned in any visible cell (its
# only definition is a commented-out line), so this cell raises NameError on
# a fresh kernel. From the usage below it is expected to map
# measure -> length -> collection of hashable items whose first element is
# the value -- TODO confirm and restore the code that fills it.
for measure in normalized_values:
    stdev = {}
    means = {}
    for le in normalized_values[measure]:
        # drop duplicate entries (order is not preserved by the set round-trip)
        normalized_values[measure][le] = list(set(normalized_values[measure][le]))
        stdev[le] = np.std([x[0] for x in normalized_values[measure][le]])
        #no need to actually compute the means
    # NOTE(review): `means` is never filled, so this stores an EMPTY dict in
    # complexity.averages[measure]. Code that loops over the lengths in
    # complexity.averages[...] (such as the means/stdevs CSV export) will then
    # see no lengths for this measure -- confirm this is intended.
    complexity.averages[measure] = means
    complexity.stdevs[measure] = stdev

In [40]:
from csv import DictWriter

# CSV column suffix -> key under which the measure is stored in
# complexity.averages / complexity.stdevs
legend = {'depth': 'depth',
          'width': 'width',
          'balanced': 'balanced2',
          'avdepth' : 'avdepth',
          'ndepth' : 'normalized depth',
          'nwidth' : 'normalized width',
          'navdepth' : 'normalized average depth'
         }

# Column names of the CSV: 'length' first, then a mean/stdev pair per measure.
header = ['length']
# length -> {column name -> value}
data = defaultdict(dict)

for measure in legend:
    mean_column = 'mean_' + measure
    stdev_column = 'stdev_' + measure
    header.extend([mean_column, stdev_column])
    averages = complexity.averages[legend[measure]]
    stdevs = complexity.stdevs[legend[measure]]
    for le in averages:
        # The stored key is shifted by one for the output; presumably the
        # keys are zero-based lengths -- TODO confirm against ComplexityMeasures.
        data[le + 1][mean_column] = averages[le]
        data[le + 1][stdev_column] = stdevs[le]

# One row per length, written in increasing length order so that the file is
# deterministic (plain dict iteration order is arbitrary in Python 2).
# Columns a length has no value for are left empty by DictWriter.
data_list = [dict(data[le], length=le) for le in sorted(data)]

# The file name is fixed: the former `.format(measure)` call had no
# placeholder to fill and wrongly suggested one file per measure.
# 'wb' is the mode the Python 2 csv module expects.
with open('means_stdevs.csv', 'wb') as output:
    writer = DictWriter(output, header)
    writer.writeheader()
    writer.writerows(data_list)


In [20]:
import csv

# CSV column name -> key of the measure in `ordered_results`
legend = {'length': 'length',
          'depth': 'depth',
          'width': 'width',
          'balanced': 'balanced depth*width',
          'avdepth' : 'average depth',
          'ndepth' : 'normalized depth',
          'nwidth' : 'normalized width',
          'navdepth' : 'normalized average depth'
         }

# One column per measure: the value (second field) of every stored triple,
# in the order of `ordered_results`.
data = {}
for measure in legend:
    data[measure] = [entry[1] for entry in ordered_results[legend[measure]]]

# Header row first, then one row per position across all columns
# (zip stops at the shortest column).
with open('data.csv', 'wb') as output:
    writer = csv.writer(output)
    columns = data.keys()
    writer.writerow(columns)
    for row in zip(*[data[column] for column in columns]):
        writer.writerow(row)
    

In [22]:
for measure in ordered_results:
    print measure, ordered_results[measure][0]

average depth (2005, 3.0, '(INTJ (UH Okay) (. .) (-DFL- E_S))')
normalized average depth (2005, 0.6428571428571429, '(INTJ (UH Okay) (. .) (-DFL- E_S))')
depth (2005, 3, '(INTJ (UH Okay) (. .) (-DFL- E_S))')
normalized depth (2005, 0.59999999999999998, '(INTJ (UH Okay) (. .) (-DFL- E_S))')
width (2005, 1.5, '(INTJ (UH Okay) (. .) (-DFL- E_S))')
length (2005, 3, '(INTJ (UH Okay) (. .) (-DFL- E_S))')
normalized width (2005, 1.142388394784561, '(INTJ (UH Okay) (. .) (-DFL- E_S))')
balanced depth*width (2005, 2.0, '(INTJ (UH Okay) (. .) (-DFL- E_S))')


In [6]:
# code to display top, bot and closest to average n trees per measure
from nltk_to_qtree import nltk_to_qtree

n = 5 #length of lists to be displayed

# modifiers with associated functions
mods = {'Most': lambda x: reversed(x[len(x)-n:]), 'Least': lambda x: x[:n]}

for measure in measures:
    for modifier in mods:
        print '{} complex trees by {}:\n'.format(modifier, measure)
        for i, item in enumerate(mods[modifier](results[measure]), 1):
            tree = Tree.fromstring(item[2])
            text = str(tree.flatten()).replace('\n', '')
            print '{}.(from conv. no. {}) {} \n'.format(i, item[0], text)
            #tree.pretty_print()
            print nltk_to_qtree(tree)
        print '\n'

for measure in av_results:
    print 'Closest to average trees by {}\n'.format(measure)
    for i, item in enumerate(mods['Most'](av_results[measure]), 1):
        tree = Tree.fromstring(item[2])
        text = str(tree.flatten()).replace('\n', '')
        print '{}.(from conv. no. {}) {} \n'.format(i, item[0], text, tree)
        #tree.pretty_print()
        print nltk_to_qtree(tree)

Most complex trees by normalized depth:

1.(from conv. no. 3371) (ROOT  but  it  is  nice  to  have  it  because  there  's  ,  uh  ,  with  our  size  of  family  it  's  nice  to  know  that  there  's  one  that  's  always  straight  and  clean  if  you  have  someone  come  just  on  the  spur  of  the  moment  .) 

\Tree [.ROOT [.S [.CC but ] [.NP [.PRP it ]  ] [.VP [.VBZ is ] [.ADJP [.JJ nice ] [.S [.VP [.TO to ] [.VP [.VB have ] [.NP [.PRP it ]  ] [.SBAR [.IN because ] [.S [.NP [.EX there ]  ] [.VP [.BES 's ] [., , ] [.INTJ [.UH uh ]  ] [., , ] [.PP [.IN with ] [.NP [.NP [.PRP$ our ] [.NN size ]  ] [.PP [.IN of ] [.NP [.NP [.NN family ]  ] [.SBAR [.S [.NP [.PRP it ]  ] [.VP [.BES 's ] [.ADJP [.JJ nice ] [.S [.VP [.TO to ] [.VP [.VB know ] [.SBAR [.IN that ] [.S [.NP [.EX there ]  ] [.VP [.BES 's ] [.NP [.NP [.CD one ]  ] [.SBAR [.WHNP [.WDT that ]  ] [.S [.VP [.BES 's ] [.ADVP [.RB always ]  ] [.ADJP [.JJ straight ] [.CC and ] [.JJ clean ]  ] [.SBAR [.IN if ] [.S [.NP [.PRP you

\Tree [.S [.NP-SBJ [.PRP I ]  ] [.ADVP [.RB definitely ]  ] [.VP [.VBP feel ] [.SBAR [.IN like ] [.S [.NP-SBJ-1 [.PRP we ]  ] [.VP [.VBP need ] [.S [.NP-SBJ [.-NONE- *-1 ]  ] [.VP [.TO to ] [.VP [.VB keep ] [.S [.NP-SBJ [.PRP it ]  ] [.ADVP [.IN at ] [.JJS least ]  ] [.ADJP-PRD [.JJ unanimous ]  ]  ] [.-DFL- E_S ] [.SBAR-PRP [.RB because ] [., , ] [.INTJ [.UH uh ]  ] [.PRN [., , ] [.S [.NP-SBJ [.PRP you ]  ] [.VP [.VBP know ]  ]  ] [., , ]  ] [.S [.NP-SBJ [.EX there ]  ] [.VP [.BES 's ] [.NP-PRD [.NP [.DT the ] [.JJ classic ] [.NN sort ]  ] [.PP [.IN of ] [.NP [.NP [.S-TTL [.NP-SBJ [.-NONE- * ]  ] [.VP [.TO TO ] [.VP [.VB KILL ] [.NP [.DT A ] [.NN MOCKINGBIRD ]  ]  ]  ]  ] [.NN kind ]  ] [.PP [.IN of ] [.NP [.NN story ]  ]  ]  ]  ] [.SBAR [.WHADVP-5 [.WRB where ]  ] [.S [.NP-SBJ [.PRP you ]  ] [.VP [.VBP get ] [.PRN [., , ] [.S [.NP-SBJ [.PRP you ]  ] [.VP [.VBP know ]  ]  ] [., , ]  ] [.NP [.NP [.NN jury ]  ] [.SBAR [.WHNP-2 [.WDT which ]  ] [.S [.NP-SBJ [.-NONE- *T*-2 ]  ] [.VP [.MD 

\Tree [.S [.NP-SBJ [.PRP I ]  ] [.VP [.MD could ] [.RB n't ] [.VP [.VB put ] [.PRT [.IN in ]  ] [.NP [.NP [.JJ enough ] [.NN overtime ]  ] [.SBAR [.SBAR [.WHNP-1 [.WDT that ]  ] [.S [.NP-SBJ [.-NONE- *T*-1 ]  ] [.VP [.VBD was ] [.ADJP-UNF-PRD [.JJ necess- ]  ]  ]  ]  ] [., , ] [.CC or ] [.EDITED [.RM [.-DFL- \[ ]  ] [.SBAR-UNF [.WHNP [.DT that ]  ]  ] [., , ] [.IP [.-DFL- \+ ]  ]  ]  ] [.RS [.-DFL- \] ]  ] [.PRN [.S [.NP-SBJ [.PRP you ]  ] [.VP [.VBP know ]  ]  ] [., , ]  ] [.NP [.NP [.NNS things ]  ] [.SBAR [.WHNP-2 [.WDT that ]  ] [.S [.NP-SBJ [.-NONE- *T*-2 ]  ] [.VP [.MD would ] [.VP [.VB come ] [.PRT [.RP up ]  ] [.PP-LOC [.IN at ] [.NP [.NN work ]  ]  ]  ]  ]  ]  ] [., , ] [.SBAR [.WHNP-3 [.WDT that ]  ] [.S [.NP-SBJ [.-NONE- *T*-3 ]  ] [.VP [.MD would ] [.VP [.VB require ] [.S [.NP-SBJ [.PRP me ]  ] [.VP [.VP [.TO to ] [.VP [.VB stay ] [.ADVP-TMP [.RB late ]  ]  ]  ] [.CC or ] [.VP [.TO to ] [.VP [.VB come ] [.PRT [.IN in ]  ] [.ADVP-TMP [.JJ early ]  ]  ]  ] [.CC or ] [.VP [.TO

\Tree [.S [.EDITED [.RM [.-DFL- \[ ]  ] [.EDITED [.RM [.-DFL- \[ ]  ] [.NP-SBJ [.PRP I ]  ] [., , ] [.IP [.-DFL- \+ ]  ]  ] [.NP-SBJ [.PRP I ]  ] [., , ] [.RS [.-DFL- \] ]  ] [.IP [.-DFL- \+ ]  ]  ] [.NP-SBJ-1 [.PRP I ]  ] [.RS [.-DFL- \] ]  ] [.VP [.VBP like ] [.S [.NP-SBJ [.-NONE- *-1 ]  ] [.VP [.TO to ] [.VP [.VB refer ] [.PP [.IN to ] [.NP [.DT this ]  ]  ] [.PP [.IN as ] [.NP [.NP [.ADVP [.NN sort ] [.IN of ]  ] [.NNP CONSUMER ] [.NNP REPORT ] [.NN phenomenon ]  ] [.. . ] [.SBAR [.WHNP-2 [.WDT Which ]  ] [.S [.NP-SBJ [.-NONE- *T*-2 ]  ] [.VP [.VBZ is ] [.SBAR-PRD [.-NONE- 0 ] [.S [.S [.NP-SBJ [.NP [.NN everybody ]  ] [.SBAR [.WHNP-3 [.WDT that ]  ] [.S [.NP-SBJ [.PRP I ]  ] [.VP [.VBP know ] [.NP [.-NONE- *T*-3 ]  ]  ]  ]  ]  ] [.VP [.VBZ reads ] [.NP [.NNP CONSUMER ] [.NNPS REPORTS ]  ]  ]  ] [.RB so ] [.S [.NP-SBJ-6 [.NP [.DT the ] [.NNS people ]  ] [.SBAR [.WHNP-4 [.WP who ]  ] [.S [.NP-SBJ [.-NONE- *T*-4 ]  ] [.VP [.VBP do ]  ]  ]  ]  ] [.VP [.VP [.VB feel ] [.SBAR [.IN that ]

In [7]:
navdepth_examples = map(lambda x: (x[1], Tree.fromstring(x[2])), results['normalized average depth'])
navdepth_examples = filter(lambda x: complexity.length(x[1]) == 15, navdepth_examples)

balanced2_examples = map(lambda x: (x[1], Tree.fromstring(x[2])), results['balanced depth*width'])
balanced2_examples = filter(lambda x: complexity.length(x[1]) == 15, balanced2_examples)

nwidth_examples = map(lambda x: (x[1], Tree.fromstring(x[2])), results['normalized width'])
nwidth_examples = filter(lambda x: complexity.length(x[1]) == 15, nwidth_examples)

for value, example in navdepth_examples[:10]:
    print "{} \n {}".format(value, example)
    print nltk_to_qtree(example)
for value, example in navdepth_examples[-10:]:
    print "{} \n {}".format(value, example)
    print nltk_to_qtree(example)

for value, example in balanced2_examples[:10]:
    print "{} \n {}".format(value, example)
    print nltk_to_qtree(example)
for value, example in balanced2_examples[-10:]:
    print "{} \n {}".format(value, example)
    print nltk_to_qtree(example)
    
for value, example in nwidth_examples[:10]:
    print "{} \n {}".format(value, example)
    print nltk_to_qtree(example)
for value, example in nwidth_examples[-10:]:
    print "{} \n {}".format(value, example)
    print nltk_to_qtree(example)

0.587558285197 
 (ROOT
  (S
    (ADVP (RB So))
    (, ,)
    (INTJ (UH um))
    (, ,)
    (NP (CC and))
    (, ,)
    (ADVP (CC and))
    (, ,)
    (CC and)
    (ADVP (PRP I))
    (, ,)
    (NP (PRP I))
    (VP (VBP do) (RB n't) (VP (VB know)))
    (. .)))
\Tree [.ROOT [.S [.ADVP [.RB So ]  ] [., , ] [.INTJ [.UH um ]  ] [., , ] [.NP [.CC and ]  ] [., , ] [.ADVP [.CC and ]  ] [., , ] [.CC and ] [.ADVP [.PRP I ]  ] [., , ] [.NP [.PRP I ]  ] [.VP [.VBP do ] [.RB n't ] [.VP [.VB know ]  ]  ] [.. . ]  ]  ] 
0.597688600459 
 (NP
  (NP (DT a) (JJ big) (NN science) (NN school))
  (, ,)
  (NP (DT no) (NN football) (NN team))
  (, ,)
  (CC but)
  (NP (DT a) (JJ big) (NN science) (NN school))
  (, ,)
  (-DFL- E_S))
\Tree [.NP [.NP [.DT a ] [.JJ big ] [.NN science ] [.NN school ]  ] [., , ] [.NP [.DT no ] [.NN football ] [.NN team ]  ] [., , ] [.CC but ] [.NP [.DT a ] [.JJ big ] [.NN science ] [.NN school ]  ] [., , ] [.-DFL- E_S ]  ] 
0.607818915721 
 (ROOT
  (S
    (CC and)
    (ADVP (RB then))


\Tree [.S [.CC and ] [.NP-SBJ [.PRP it ]  ] [.VP [.BES 's ] [.ADVP [.RB about ]  ] [.ADJP-PRD [.NP-ADV [.QP [.CD one ] [.IN by ] [.CD one ] [.CC and ] [.DT a ] [.NN half ]  ] [.NN foot ]  ] [.JJ wide ] [.CC and ] [.JJ long ]  ]  ] [.. . ] [.-DFL- E_S ]  ] 
1.10158326654 
 (S
  (NP-SBJ (PRP He))
  (VP
    (VBZ takes)
    (NP
      (NP (DT both) (NNP San) (NNP Antonio) (NNS papers))
      (, ,)
      (NP (DT an) (NN Austin) (NN paper))
      (, ,)
      (NP (DT both) (NNP Houston) (NNS papers))))
  (, ,)
  (-DFL- E_S))
\Tree [.S [.NP-SBJ [.PRP He ]  ] [.VP [.VBZ takes ] [.NP [.NP [.DT both ] [.NNP San ] [.NNP Antonio ] [.NNS papers ]  ] [., , ] [.NP [.DT an ] [.NN Austin ] [.NN paper ]  ] [., , ] [.NP [.DT both ] [.NNP Houston ] [.NNS papers ]  ]  ]  ] [., , ] [.-DFL- E_S ]  ] 
1.10158326654 
 (S
  (NP-SBJ (PRP it))
  (VP
    (VBD was)
    (NP-PRD
      (NP (DT a) (, ,) (INTJ (UH uh)) (, ,) (JJ big) (NN boat))
      (, ,)
      (NP (JJ big) (NN ski) (NN boat) (NN type) (NN thing))))
  (.

In [8]:
# code to find distinguishing sentences for each (sensible) pair of normalized measures

i = 0
measure_list = list(normalized_measures)
diff = {}

while i < len(measure_list):
    one = measure_list[i]
    data_one = [x[1] for x in ordered_results[one]]
    for other in measure_list[i + 1:]:
        data_other = [x[1] for x in ordered_results[other]]
        comp = zip(data_one, data_other)
        top10 = sorted(enumerate(comp), key=lambda x: abs(x[1][0] - x[1][1]))[-10:]
        # sanity check: Are these actually values for the same tree?
        #               Do they actually have these values?
        diff[(one, other)] = []
        for index, values in top10:
            tree = ordered_results[one][index][2]
            diff[(one, other)].append((tree, values))
    i += 1
    
for pair in diff:
    print 'Largest differences in value between {} and {}:'.format(*pair)
    for tree, values in diff[pair]:
        print 'Value for {pair[0]}: {values[0]}, value for {pair[1]}: {values[1]}'.format(pair=pair, values=values)
        print Tree.fromstring(tree)

Largest differences in value between normalized depth and normalized average depth:
Value for normalized depth: 1.92007791687, value for normalized average depth: 1.41064519367
(ROOT (S (VP (MD Would) (VP (NP (PRP you)))) (. ?)))
Value for normalized depth: 1.92007791687, value for normalized average depth: 1.41064519367
(ROOT (S (VP (MD Would) (VP (NP (PRP you)))) (. ?)))
Value for normalized depth: 1.86819546229, value for normalized average depth: 1.35023872679
(ROOT
  (S
    (CC and)
    (, ,)
    (ADVP (RB so))
    (, ,)
    (NP (PRP he))
    (ADVP (BES 's))
    (VP
      (VBZ says)
      (SBAR
        (S
          (NP (PRP I))
          (VP
            (MD ought)
            (S
              (VP
                (TO to)
                (VP
                  (VB join)
                  (NP
                    (NP (CD one))
                    (PP
                      (IN of)
                      (NP
                        (NP (DT those))
                        (SBAR
           

Value for balanced depth*width: 3.3292716699, value for normalized average depth: 1.13995046861
(S
  (EDITED (RM (-DFL- \[)) (NP-SBJ (PRP I)) (, ,) (IP (-DFL- \+)))
  (INTJ (UH uh))
  (, ,)
  (NP-SBJ (PRP I))
  (RS (-DFL- \]))
  (VP
    (VBP think)
    (SBAR
      (IN that)
      (, ,)
      (INTJ (UH uh))
      (, ,)
      (S
        (PP-TMP (IN for) (ADVP (RB once)))
        (, ,)
        (EDITED
          (RM (-DFL- \[))
          (S (NP-SBJ (PRP we)) (VP-UNF (MD should)))
          (, ,)
          (IP (-DFL- \+)))
        (NP-SBJ (PRP we))
        (VP
          (MD should)
          (RB n't)
          (RS (-DFL- \]))
          (ADVP (RB even))
          (VP
            (VB be)
            (EDITED
              (RM (-DFL- \[))
              (PP-LOC-UNF (IN at))
              (, ,)
              (IP (-DFL- \+)))
            (PP-LOC-PRD
              (IN at)
              (RS (-DFL- \]))
              (NP (DT these) (NNP Mid) (NNP East) (NNS talks)))
            (SBAR-PRP
            

Value for normalized depth: 1.14779270633, value for balanced depth*width: 3.4
(S
  (CC and)
  (NP-SBJ (PRP I))
  (VP
    (VBP think)
    (SBAR
      (-NONE- 0)
      (S
        (NP-SBJ (PRP$ my) (NN feeling))
        (VP
          (VBD was)
          (SBAR-PRD
            (IN that)
            (S
              (NP-SBJ-3 (DT the) (RB very) (JJS best) (NNS teachers))
              (VP
                (MD should)
                (VP
                  (VB be)
                  (PP-PRD
                    (IN in)
                    (NP
                      (NNS grades)
                      (CD one)
                      (, ,)
                      (CD two)
                      (, ,)
                      (CC and)
                      (CD three)))
                  (. .)
                  (S-PRP
                    (NP-SBJ (-NONE- *-3))
                    (VP
                      (TO To)
                      (VP
                        (VP
                          (VB get)
        

Value for balanced depth*width: 3.4, value for normalized width: 1.03804899261
(S
  (CC and)
  (NP-SBJ (PRP I))
  (VP
    (VBP think)
    (SBAR
      (-NONE- 0)
      (S
        (NP-SBJ (PRP$ my) (NN feeling))
        (VP
          (VBD was)
          (SBAR-PRD
            (IN that)
            (S
              (NP-SBJ-3 (DT the) (RB very) (JJS best) (NNS teachers))
              (VP
                (MD should)
                (VP
                  (VB be)
                  (PP-PRD
                    (IN in)
                    (NP
                      (NNS grades)
                      (CD one)
                      (, ,)
                      (CD two)
                      (, ,)
                      (CC and)
                      (CD three)))
                  (. .)
                  (S-PRP
                    (NP-SBJ (-NONE- *-3))
                    (VP
                      (TO To)
                      (VP
                        (VP
                          (VB get)
        

In [None]:
# conversation number -> {'A': set, 'B': set}; every set element is the tuple
# of tree strings of one utterance by that caller, so utterances with
# identical trees are stored only once per speaker.
all_swbd_trees = {}

for conv in swbd.iter_transcripts(display_progress=False):
    by_speaker = {'A': set(), 'B': set()}
    for ut in conv.utterances:
        tree_strings = tuple(str(tree) for tree in ut.trees)
        by_speaker[ut.caller].add(tree_strings)
    all_swbd_trees[conv.conversation_no] = by_speaker

In [None]:
# code to check whether there are parts of either corpus for which Lu's measures make sense
# in the sense of being non-zero for more than half of each speaker's utterances that have distinct trees

lu_measure_list = ['MLC', 'MLT',
                   'C/S',
                   'C/T', 'CT/T', 'DC/C', 'DC/T',
                   'CP/C', 'CP/T', 'T/S',
                   'CN/C', 'CN/T', 'VP/T']
    
swbd_results = dict([(conv, dict([(measure, dict([(speaker, []) for speaker in ('A', 'B')])) for measure in lu_measure_list])) for conv in all_swbd_trees])
swbd_defined = dict([(conv, dict([(measure, dict([(speaker, []) for speaker in ('A', 'B')])) for measure in lu_measure_list])) for conv in all_swbd_trees])
  
good_convs = dict([(measure, []) for measure in lu_measure_list])
                    
for conv in all_swbd_trees:
    for speaker in all_swbd_trees[conv]:
        for trees in all_swbd_trees[conv][speaker]:
            Trees = [Tree.fromstring(tree) for tree in trees]
            measures = complexity.lus_measures(Trees)
            for measure in measures:
                swbd_results[conv][measure][speaker].append(measures[measure])
                if measures[measure] == 0.0 or measures[measure] == None:
                    swbd_defined[conv][measure][speaker].append(0.0)
                else:
                    swbd_defined[conv][measure][speaker].append(1.0)
        for measure in swbd_defined[conv]:
            swbd_defined[conv][measure][speaker] = np.mean(swbd_defined[conv][measure][speaker])
    for measure in good_convs:
        if swbd_defined[conv][measure]['A'] >= 0.5 and swbd_defined[conv][measure]['B'] >= 0.5:
            good_convs[measure].append(conv)

for measure, convs in good_convs.items():
    print '{}: {}'.format(measure, len(convs))
                    

                    
"""
all_bnc_trees = {}    
    
for conv in bnc.iter_transcripts(display_progess=False):
    trees = {'A': set(), 'B': set()}
    for ut in conv.utterances:
        trees[ut.caller].add(tuple([str(tree) for tree in ut.trees]))
    all_bnc_trees[conv.conversation_no] = trees
"""