In [11]:
from swda_time import CorpusReader
from random import randrange
import complexity
from nltk.tree import Tree
import numpy as np

# Build the corpus reader shared by the cells below.
# NOTE(review): the first argument is presumably the corpus directory and the
# second its metadata CSV -- confirm against swda_time.CorpusReader.
SWDA_DIR = 'swda'
SWDA_METADATA_CSV = 'swda/swda-metadata.csv'
corpus = CorpusReader(SWDA_DIR, SWDA_METADATA_CSV)

In [12]:
# Find a random example and compute the non-normalized measures.
iterator = corpus.iter_utterances()

# Advance a random number of utterances and keep the last one in `s`
# (the next cell reads it too). The lower bound is 1 rather than 0: with a
# draw of 0 the loop body would never run and `s` would be left undefined.
for _ in range(randrange(1, 1000)):
    s = next(iterator)

# An utterance can carry several parse trees; report every measure for each.
for tree in s.trees:
    print(tree)
    print('Length: {}'.format(complexity.length(tree)))
    print('Depth: {}'.format(complexity.depth(tree)))
    print('Width: {}'.format(complexity.width(tree)))
    print('Depth*width: {}'.format(complexity.balanced(tree)))
    print('Average depth: {}'.format(complexity.avdepth(tree)))

utterance 755

(S
  (EDITED
    (RM (-DFL- \[))
    (S (NP-SBJ (PRP It)) (VP-UNF (BES 's)))
    (, ,)
    (IP (-DFL- \+)))
  (INTJ (UH uh))
  (, ,)
  (NP-SBJ (PRP it))
  (RS (-DFL- \]))
  (VP
    (VP
      (VBZ saves)
      (NP (NP (DT a) (JJ third)) (ADVP (RP off)))
      (PP (IN on) (NP (NNS taxes))))
    (CC or)
    (NP-ETC (NN something)))
  (. .)
  (-DFL- E_S))
Length: 18
Depth: 7
Width: 1.48648648649
Depth*width: 10.4054054054
Average depth: 5.0


In [13]:
# Print the normalized measures for the random example `s` picked above.
# Each entry pairs an output template with the function that scores a tree;
# the order here is the order in which the lines are printed.
normalized_report = (
    ('Normalized depth: {}', complexity.ndepth),
    ('Normalized width: {}', complexity.nwidth),
    ('Normalized depth*width: {}', complexity.nbalanced),
    ('Normalized average depth: {}', complexity.n_avdepth),
)

for tree in s.trees:
    for template, score in normalized_report:
        print(template.format(score(tree)))

Normalized depth: 0.742479858893
Normalized width: 0.98451899703
Normalized depth*width: 0.733494619043
Normalized average depth: 0.821923933881


In [33]:
# Dicts with the measures for easy iteration: measure name -> function that
# maps a parse tree to a number.
# Add additional measures to ONE of the two groups below; `measures` is derived
# from them, so the three dicts cannot drift out of sync.
non_normalized_measures = {'length': complexity.length,
                           'depth': complexity.depth,
                           'width': complexity.width,
                           'depth*width': complexity.balanced,
                           # previously missing here, so no "closest to
                           # average" ranking was built for this measure
                           'average depth': complexity.avdepth}

normalized_measures = {'normalized depth': complexity.ndepth,
                       'normalized width': complexity.nwidth,
                       'normalized depth*width': complexity.nbalanced,
                       'normalized average depth': complexity.n_avdepth}

measures = dict(non_normalized_measures)
measures.update(normalized_measures)

# Dict of sets (to avoid duplicates) for the results. Each element is a
# (conversation_no, measure value, tree string) tuple.
results = dict((measure, set()) for measure in measures)

for conv in corpus.iter_transcripts(display_progress=False):
    for utt in conv.utterances:
        for tree in utt.trees:
            # Trees need to be cast to string to be hashable for duplicate removal
            string = str(tree)
            for measure in measures:
                results[measure].add((conv.conversation_no, measures[measure](tree), string))

# Turn each set into a list ordered by ascending measure value. Conversation
# number and tree text break ties so the ranking is the same on every run
# (set iteration order is arbitrary).
for measure in results:
    results[measure] = sorted(results[measure], key=lambda x: (x[1], x[0], x[2]))

# Same tuples, ordered by DESCENDING distance from a reference value, so the
# entries closest to the reference sit at the END of each list.
av_results = {}

# Non-normalized measures: the reference is the mean over the deduplicated results.
for measure in non_normalized_measures:
    mean = np.mean([x[1] for x in results[measure]])
    av_results[measure] = sorted(results[measure], key=lambda x: abs(x[1] - mean), reverse=True)
# Normalized measures: the reference is the constant 1.
# NOTE(review): presumably a normalized score of 1 means "as complex as
# expected" -- confirm against the complexity module.
for measure in normalized_measures:
    av_results[measure] = sorted(results[measure], key=lambda x: abs(x[1] - 1), reverse=True)

In [34]:
n = 10 #length of lists to be displayed

# Modifiers with associated functions: each takes a list sorted in ascending
# order and returns the n entries at the matching end of it.
mods = {'Most': lambda x: x[len(x)-n:], 'Least': lambda x: x[:n]}


def _print_ranked(items):
    """Print a numbered listing of result tuples.

    items -- iterable of (conversation_no, measure value, tree string) tuples.

    Each entry shows its rank (starting at 1), the conversation number, the
    words of the tree on a single line, and then the full bracketed tree.
    """
    for i, item in enumerate(items, 1):
        tree = Tree.fromstring(item[2])
        # str() wraps long flattened trees over several indented lines;
        # collapse every run of whitespace so the text is single-spaced
        # regardless of utterance length.
        text = ' '.join(str(tree.flatten()).split())
        print('{}.(from conv. no. {}) {} \n {}\n'.format(i, item[0], text, tree))


for measure in measures:
    for modifier in mods:
        print('{} complex trees by {}:\n'.format(modifier, measure))
        _print_ranked(mods[modifier](results[measure]))
        print('\n')

for measure in av_results:
    print('Closest to average trees by {}\n'.format(measure))
    # av_results lists are sorted by descending distance from the reference
    # value, so the closest trees are the last n entries -- the same slice
    # the 'Most' modifier takes.
    _print_ranked(mods['Most'](av_results[measure]))

Most complex trees by normalized depth:

1.(from conv. no. 2405) (S  but  it  's  running  about  seventy  which  *T*-1  is  still  ridiculous  *  to  have  *-3  to  make  changes  to  seventy  percent  of  everything  that  she  types  *T*-2  .  E_S) 
 (S
  (CC but)
  (NP-SBJ (PRP it))
  (VP
    (BES 's)
    (VP
      (VBG running)
      (NP
        (NP (RB about) (CD seventy))
        (SBAR
          (WHNP-1 (WDT which))
          (S
            (NP-SBJ (-NONE- *T*-1))
            (VP
              (VBZ is)
              (ADVP (RB still))
              (ADJP-PRD
                (JJ ridiculous)
                (S
                  (NP-SBJ-3 (-NONE- *))
                  (VP
                    (TO to)
                    (VP
                      (VB have)
                      (S
                        (NP-SBJ (-NONE- *-3))
                        (VP
                          (TO to)
                          (VP
                            (VB make)
                            (NP

7.(from conv. no. 3646) (S  Well  ,  I  had  one  that  *T*-1  was  needed  *-2  to  be  able  *-3  to  be  moved  *-4  along  \[  at  a  ,  \+  at  a  \]  pace  a  little  faster  than  he  was  going  *-5  to  be  able  *-6  to  be  moved  *-7  in  public  school  ,  E_S) 
 (S
  (INTJ (UH Well))
  (, ,)
  (NP-SBJ (PRP I))
  (VP
    (VBD had)
    (NP
      (NP (NN one))
      (SBAR
        (WHNP-1 (WDT that))
        (S
          (NP-SBJ-2 (-NONE- *T*-1))
          (EDITED (VP-UNF (VBD was)))
          (VP
            (VBN needed)
            (S
              (NP-SBJ-3 (-NONE- *-2))
              (VP
                (TO to)
                (VP
                  (VB be)
                  (ADJP-PRD
                    (JJ able)
                    (S
                      (NP-SBJ-4 (-NONE- *-3))
                      (VP
                        (TO to)
                        (VP
                          (VB be)
                          (VP
                            (VBN moved)
    

4.(from conv. no. 3040) (S  \[  \[  \[  I  ,  \+  I  'm  ,  \]  \+  uh  ,  I  think  0  I  'm  ,  \]  \+  I  think  0  I  'm  ,  \]  uh  ,  a  little  out  of  the  ordinary  in  \[  that  ,  \+  that  \]  \[  \[  I  ,  \+  I  ,  \]  \+  uh  ,  I  \]  think  0  I  'm  more  worried  about  \[  the  deficit  ,  \+  the  national  deficit  \]  \[  than  ,  \+  than  ,  \]  uh  ,  a  lot  of  other  people  are  *?*  and  think  that  \[  we  need  *-1  to  ,  \+  we  need  *-2  to  \]  either  raise  our  taxes  \[  or  ,  \+  or  \]  cutback  on  something  E_S) 
 (S
  (EDITED
    (RM (-DFL- \[))
    (EDITED
      (RM (-DFL- \[))
      (EDITED (RM (-DFL- \[)) (NP-SBJ (PRP I)) (, ,) (IP (-DFL- \+)))
      (S (NP-SBJ (PRP I)) (VP-UNF (VBP 'm)))
      (, ,)
      (RS (-DFL- \]))
      (IP (-DFL- \+)))
    (S
      (INTJ (UH uh))
      (, ,)
      (NP-SBJ (PRP I))
      (VP
        (VBP think)
        (SBAR (-NONE- 0) (S (NP-SBJ (PRP I)) (VP-UNF (VBP 'm))))))
    (, ,)
    (RS (-DFL- \]))
 

3.(from conv. no. 4736) (S  The  law  enforcement  community  ,  uh  ,  uh  ,  you  know  ,  \[  has  *  to  ,  \+  has  *-3  to  \]  separate  the  difference  between  somebody  who  *T*-1  is  being  set  *T*-1  up  in  which  ,  uh  ,  grievous  acts  are  done  *-2  *T*-6  *  \[  \[  \[  to  ,  \+  uh  ,  \]  \+  to  ,  \]  \+  you  know  ,  to  \]  get  somebody  into  \[  a  ,  \+  a  \]  situation  where  they  're  going  *-4  to  be  guilty  \[  of  ,  \+  of  \]  a  crime  *T*-5  .  E_S) 
 (S
  (NP-SBJ-3 (DT The) (NN law) (NN enforcement) (NN community))
  (, ,)
  (INTJ (UH uh))
  (, ,)
  (INTJ (UH uh))
  (PRN (, ,) (S (NP-SBJ (PRP you)) (VP (VBP know))) (, ,))
  (EDITED
    (RM (-DFL- \[))
    (VP (VBZ has) (S (NP-SBJ (-NONE- *)) (VP-UNF (TO to))))
    (, ,)
    (IP (-DFL- \+)))
  (VP
    (VBZ has)
    (S
      (NP-SBJ (-NONE- *-3))
      (VP
        (TO to)
        (RS (-DFL- \]))
        (VP
          (VB separate)
          (NP
            (NP (DT the) (NN difference))
 

9.(from conv. no. 3235) (S  and  I  think  0  it  's  going  *-1  to  have  *-2  to  swing  back  the  other  way  ,  because  I  think  0  we  're  going  *-3  to  have  an  awful  lot  of  kids  who  *T*-4  are  going  *-5  to  have  major  problems  from  all  this  .  E_S) 
 (S
  (CC and)
  (NP-SBJ (PRP I))
  (VP
    (VBP think)
    (SBAR
      (-NONE- 0)
      (S
        (NP-SBJ-1 (PRP it))
        (VP
          (BES 's)
          (VP
            (VBG going)
            (S
              (NP-SBJ-2 (-NONE- *-1))
              (VP
                (TO to)
                (VP
                  (VB have)
                  (S
                    (NP-SBJ (-NONE- *-2))
                    (VP
                      (TO to)
                      (VP
                        (VB swing)
                        (ADVP-DIR (RB back))
                        (NP-DIR (DT the) (JJ other) (NN way))
                        (, ,)
                        (SBAR-PRP
                          (IN because)
 


7.(from conv. no. 2109) (S-1  and  then  ,  you  know  ,  uh  ,  \[  \[  for  ,  \+  for  ,  \]  \+  for  \]  no  real  ,  you  know  ,  direct  reason  ,  I  guess  0  *T*-1  ,  uh  ,  \[  we  ,  \+  we  \]  get  into  ,  uh  ,  uh  ,  a  really  good  relationship  for  a  while  and  then  ,  you  know  ,  back  and  forth  E_S) 
 (S-1
  (CONJP (CC and) (RB then))
  (PRN (, ,) (S (NP-SBJ (PRP you)) (VP (VBP know))) (, ,))
  (INTJ (UH uh))
  (, ,)
  (EDITED
    (RM (-DFL- \[))
    (EDITED (RM (-DFL- \[)) (PP-UNF (IN for)) (, ,) (IP (-DFL- \+)))
    (PP-UNF (IN for))
    (, ,)
    (RS (-DFL- \]))
    (IP (-DFL- \+)))
  (PP-TPC
    (IN for)
    (RS (-DFL- \]))
    (NP
      (DT no)
      (ADJP
        (JJ real)
        (PRN (, ,) (S (NP-SBJ (PRP you)) (VP (VBP know))) (, ,))
        (JJ direct))
      (NN reason)))
  (PRN
    (, ,)
    (S
      (NP-SBJ (PRP I))
      (VP (VBP guess) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))))
    (, ,))
  (INTJ (UH uh))
  (, ,)
  (EDITED (RM (-DFL- \[)) (N


7.(from conv. no. 4736) (S  The  law  enforcement  community  ,  uh  ,  uh  ,  you  know  ,  \[  has  *  to  ,  \+  has  *-3  to  \]  separate  the  difference  between  somebody  who  *T*-1  is  being  set  *T*-1  up  in  which  ,  uh  ,  grievous  acts  are  done  *-2  *T*-6  *  \[  \[  \[  to  ,  \+  uh  ,  \]  \+  to  ,  \]  \+  you  know  ,  to  \]  get  somebody  into  \[  a  ,  \+  a  \]  situation  where  they  're  going  *-4  to  be  guilty  \[  of  ,  \+  of  \]  a  crime  *T*-5  .  E_S) 
 (S
  (NP-SBJ-3 (DT The) (NN law) (NN enforcement) (NN community))
  (, ,)
  (INTJ (UH uh))
  (, ,)
  (INTJ (UH uh))
  (PRN (, ,) (S (NP-SBJ (PRP you)) (VP (VBP know))) (, ,))
  (EDITED
    (RM (-DFL- \[))
    (VP (VBZ has) (S (NP-SBJ (-NONE- *)) (VP-UNF (TO to))))
    (, ,)
    (IP (-DFL- \+)))
  (VP
    (VBZ has)
    (S
      (NP-SBJ (-NONE- *-3))
      (VP
        (TO to)
        (RS (-DFL- \]))
        (VP
          (VB separate)
          (NP
            (NP (DT the) (NN difference))



5.(from conv. no. 2472) (NP  Sliced  tomatoes  and  then  green  pepper  strips  ,  just  over  it  ,  and  some  red  onion  slices  ,  and  just  a  little  bit  of  ,  um  ,  basil  ,  and  then  a  little  Italian  dressing  over  it  .  E_S) 
 (NP
  (NP (JJ Sliced) (NNS tomatoes))
  (CONJP (CC and) (RB then))
  (NP
    (NP (JJ green) (NN pepper) (NNS strips))
    (, ,)
    (PP-LOC (ADVP (RB just)) (IN over) (NP (PRP it))))
  (, ,)
  (CC and)
  (NP (DT some) (JJ red) (NN onion) (NNS slices))
  (, ,)
  (CC and)
  (NP
    (NP (RB just) (DT a) (JJ little) (NN bit))
    (PP (IN of) (, ,) (INTJ (UH um)) (, ,) (NP (NN basil))))
  (, ,)
  (CONJP (CC and) (RB then))
  (NP
    (NP (NP-ADV (DT a) (JJ little)) (JJ Italian) (NN dressing))
    (PP-LOC (IN over) (NP (PRP it))))
  (. .)
  (-DFL- E_S))

6.(from conv. no. 4316) (S  I  mean  ,  \[  \[  as  ,  \+  as  ,  \]  \+  you  know  ,  as  \]  poverty  has  gotten  worse  ,  as  ,  you  know  ,  education  has  gotten  worse  ,  as  there  's

8.(from conv. no. 3020) (S Kauai was nice) 
 (S (NP-SBJ (NNP Kauai)) (VP (VBD was) (ADJP-PRD (JJ nice))))

9.(from conv. no. 3399) (FRAG maybe some day) 
 (FRAG (ADVP (RB maybe)) (ADVP-TMP (TYPO (DT some) (NN day))))

10.(from conv. no. 3524) (PRN You know ,) 
 (PRN (S (NP-SBJ (PRP You)) (VP (VBP know))) (, ,))



Most complex trees by depth:

1.(from conv. no. 4056) (S  The  benefits  plan  is  different  \[  for  operating  ,  \+  for  what  we  call  *T*-1  operating  \]  staff  ,  which  *T*-2  are  considered  *-3  to  be  ,  uh  ,  clerical  personnel  ,  accountants  ,  things  like  that  as  opposed  to  what  they  call  *T*-4  the  P  A  T  staff  ,  which  I  belong  to  *T*-5  E_S) 
 (S
  (NP-SBJ (DT The) (NNS benefits) (NN plan))
  (VP
    (VBZ is)
    (ADJP-PRD (JJ different))
    (EDITED
      (RM (-DFL- \[))
      (PP (IN for) (NP-UNF (VBG operating)))
      (, ,)
      (IP (-DFL- \+)))
    (PP
      (IN for)
      (SBAR-NOM
        (WHNP-1 (WP what))
        (S
      

10.(from conv. no. 2749) (S  I  definitely  feel  like  we  need  *-1  to  keep  it  at  least  unanimous  E_S  because  ,  uh  ,  you  know  ,  there  's  the  classic  sort  of  *  TO  KILL  A  MOCKINGBIRD  kind  of  story  where  you  get  ,  you  know  ,  jury  which  *T*-2  can  have  all  kinds  of  prejudices  and  things  that  ,  a  judge  at  least  officially  is  n't  supposed  *-3  to  have  *T*-4  .  Though  ,  of  course  they  can  *?*  ,  too  *T*-5  .  E_S) 
 (S
  (NP-SBJ (PRP I))
  (ADVP (RB definitely))
  (VP
    (VBP feel)
    (SBAR
      (IN like)
      (S
        (NP-SBJ-1 (PRP we))
        (VP
          (VBP need)
          (S
            (NP-SBJ (-NONE- *-1))
            (VP
              (TO to)
              (VP
                (VB keep)
                (S
                  (NP-SBJ (PRP it))
                  (ADVP (IN at) (JJS least))
                  (ADJP-PRD (JJ unanimous)))
                (-DFL- E_S)
                (SBAR-PRP
                  (RB beca

1.(from conv. no. 4072) (S  \[  Also  ,  \+  also  \]  I  have  the  observation  *ICH*-6  \[  of  ,  \+  uh  ,  with  \]  the  press  \[  is  \+  \]  that  ,  like  \[  any  \+  any  \]  stories  that  I  had  any  first  hand  knowledge  of  *T*-1  that  I  see  *T*-2  in  the  press  ,  you  know  ,  which  *T*-3  only  happened  like  ten  times  in  my  life  you  know  first  thing  you  know  for  \[  anything  of  \+  you  know  \[  or  \+  \]  anything  that  \]  *T*-4  actually  \[  had  ,  \+  had  \]  more  than  just  a  blurb  about  it  .  \[  That  \+  \]  the  press  has  managed  *-5  to  just  mess  up  some  aspect  of  the  story  E_S) 
 (S
  (EDITED (RM (-DFL- \[)) (ADVP (RB Also)) (, ,) (IP (-DFL- \+)))
  (ADVP (RB also))
  (RS (-DFL- \]))
  (NP-SBJ (PRP I))
  (VP
    (VBP have)
    (NP (DT the) (NN observation) (SBAR (-NONE- *ICH*-6)))
    (EDITED (RM (-DFL- \[)) (PP-UNF (IN of)) (, ,) (IP (-DFL- \+)))
    (INTJ (UH uh))
    (, ,)
    (PP (IN with) (RS (-DFL- \]

9.(from conv. no. 3268) (S  Uh  ,  \[  my  ,  \+  my  \]  idea  I  guess  of  the  ideal  vacation  would  be  0  *  to  begin  with  ,  uh  ,  money  be  no  object  .  You  know  ,  so  just  absolutely  ,  you  know  ,  first  class  *  fly  where  I  want  *-1  to  *?*  *T*-2  ,  rent  a  car  if  I  want  *-3  to  *?*  .  Uh  ,  you  know  ,  just  \[  the  ,  \+  really  the  \]  nicest  hotels  and  stuff  like  that  \[  and  \+  and  \]  not  worry  if  somebody  says  ,  well  ,  *  let  's  take  this  little  side  trip  ,  you  know  ,  and  not  have  *  to  sit  there  and  figure  out  ,  um  ,  you  know  ,  again  \[  we  ,  \+  we  \]  fit  that  in  there  or  not  .  E_S) 
 (S
  (INTJ (UH Uh))
  (, ,)
  (EDITED
    (RM (-DFL- \[))
    (NP-SBJ-UNF (PRP$ my))
    (, ,)
    (IP (-DFL- \+)))
  (NP-SBJ
    (NP (PRP$ my) (RS (-DFL- \])) (NN idea))
    (PRN (S (NP-SBJ (PRP I)) (VP (VBP guess))))
    (PP (IN of) (NP (DT the) (JJ ideal) (NN vacation))))
  (VP
    (MD would)


6.(from conv. no. 3268) (S  Uh  ,  \[  my  ,  \+  my  \]  idea  I  guess  of  the  ideal  vacation  would  be  0  *  to  begin  with  ,  uh  ,  money  be  no  object  .  You  know  ,  so  just  absolutely  ,  you  know  ,  first  class  *  fly  where  I  want  *-1  to  *?*  *T*-2  ,  rent  a  car  if  I  want  *-3  to  *?*  .  Uh  ,  you  know  ,  just  \[  the  ,  \+  really  the  \]  nicest  hotels  and  stuff  like  that  \[  and  \+  and  \]  not  worry  if  somebody  says  ,  well  ,  *  let  's  take  this  little  side  trip  ,  you  know  ,  and  not  have  *  to  sit  there  and  figure  out  ,  um  ,  you  know  ,  again  \[  we  ,  \+  we  \]  fit  that  in  there  or  not  .  E_S) 
 (S
  (INTJ (UH Uh))
  (, ,)
  (EDITED
    (RM (-DFL- \[))
    (NP-SBJ-UNF (PRP$ my))
    (, ,)
    (IP (-DFL- \+)))
  (NP-SBJ
    (NP (PRP$ my) (RS (-DFL- \])) (NN idea))
    (PRN (S (NP-SBJ (PRP I)) (VP (VBP guess))))
    (PP (IN of) (NP (DT the) (JJ ideal) (NN vacation))))
  (VP
    (MD would

3.(from conv. no. 2035) (S  \[  we  have  ,  \+  we  try  \]  *-2  to  spend  like  Monday  nights  ,  we  call  it  family  home  evening  ,  and  ,  \[  have  our  kids  ,  \+  uh  ,  you  know  ,  have  little  lessons  with  them  ,  \]  and  ,  you  know  ,  just  \[  \[  see  ,  \+  count  ,  \]  \+  see  \]  how  things  are  going  *T*-3  in  our  family  and  ,  you  know  ,  teach  them  about  their  grandparents  or  something  like  that  .  Bring  out  pictures  of  them  so  they  get  *-4  to  know  them  E_S) 
 (S
  (EDITED
    (RM (-DFL- \[))
    (S (NP-SBJ (PRP we)) (VP-UNF (VBP have)))
    (, ,)
    (IP (-DFL- \+)))
  (NP-SBJ-2 (PRP we))
  (VP
    (VBP try)
    (RS (-DFL- \]))
    (S
      (NP-SBJ (-NONE- *-2))
      (VP
        (TO to)
        (VP
          (VP
            (VB spend)
            (INTJ (UH like))
            (NP (NNP Monday) (NNS nights)))
          (PRN
            (, ,)
            (S
              (NP-SBJ (PRP we))
              (VP
             

9.(from conv. no. 4796) (S  \[  if  some  ,  \+  if  it  *EXP*-1  were  possible  *  to  market  some  ,  \]  uh  ,  form  of  ,  uh  ,  space  technology  ,  you  know  ,  *  \[  to  ,  \+  uh  ,  uh  ,  to  \]  make  it  equitable  ,  it  might  severely  help  the  economy  ,  you  know  ,  \[  in  the  ,  \+  in  the  ,  \]  uh  ,  respect  that  ,  you  know  ,  there  's  endless  amounts  of  research  0  *T*-2  to  be  done  *-3  .  Endless  amounts  of  resources  \[  and  \+  and  \]  whatnot  ,  E_S) 
 (S
  (EDITED
    (RM (-DFL- \[))
    (SBAR-ADV (IN if) (S-UNF (NP-SBJ-UNF (DT some))))
    (, ,)
    (IP (-DFL- \+)))
  (SBAR-ADV
    (IN if)
    (S
      (NP-SBJ (NP (PRP it)) (S (-NONE- *EXP*-1)))
      (VP
        (VBD were)
        (ADJP-PRD (JJ possible))
        (S-1
          (NP-SBJ (-NONE- *))
          (VP
            (TO to)
            (VP
              (VB market)
              (NP
                (NP
                  (DT some)
                  (, ,)
           