In [11]:
from swda_time import CorpusReader
from random import randrange
import complexity
from nltk.tree import Tree

corpus = CorpusReader('swda', 'swda/swda-metadata.csv')

In [12]:
#find a random example and compute the non-normalized measures
iterator = corpus.iter_utterances()

for i in range(randrange(1000)):
    s = iterator.next()

for tree in s.trees:
    print tree
    print 'Length: {}'.format(complexity.length(tree))
    print 'Depth: {}'.format(complexity.depth(tree))
    print 'Width: {}'.format(complexity.width(tree))
    print 'Depth*width: {}'.format(complexity.balanced(tree))
    print 'Average depth: {}'.format(complexity.avdepth(tree))

utterance 755

(S
  (EDITED
    (RM (-DFL- \[))
    (S (NP-SBJ (PRP It)) (VP-UNF (BES 's)))
    (, ,)
    (IP (-DFL- \+)))
  (INTJ (UH uh))
  (, ,)
  (NP-SBJ (PRP it))
  (RS (-DFL- \]))
  (VP
    (VP
      (VBZ saves)
      (NP (NP (DT a) (JJ third)) (ADVP (RP off)))
      (PP (IN on) (NP (NNS taxes))))
    (CC or)
    (NP-ETC (NN something)))
  (. .)
  (-DFL- E_S))
Length: 18
Depth: 7
Width: 1.48648648649
Depth*width: 10.4054054054
Average depth: 5.0


In [13]:
#print the normalized measures for the random example
for tree in s.trees:
    print 'Normalized depth: {}'.format(complexity.ndepth(tree))
    print 'Normalized width: {}'.format(complexity.nwidth(tree))
    print 'Normalized depth*width: {}'.format(complexity.nbalanced(tree))
    print 'Normalized average depth: {}'.format(complexity.n_avdepth(tree))

Normalized depth: 0.742479858893
Normalized width: 0.98451899703
Normalized depth*width: 0.733494619043
Normalized average depth: 0.821923933881


In [None]:
# dict with the measures for easy iteration
measures = {'normalized depth': complexity.ndepth,
            'normalized width': complexity.nwidth,
            'normalized depth*width': complexity.nbalanced,
            'normalized average depth': complexity.n_avdepth}

# dict of sets (to avoid duplicates) for the results
results = dict([(measure, []) for measure in measures])

for conv in corpus.iter_transcripts(display_progress=False):
    for utt in conv.utterances:
        for tree in utt.trees:
            #Trees need to be cast to string to be hashable for duplicate removal
            string = str(tree)
            for measure in measures:
                results[measure].append((conv.conversation_no, measures[measure](tree), string))

for measure in results:
    results[measure] = list(set(results[measure]))
    results[measure].sort(key=lambda x: x[1])

av_results = {}
for measure in measures:
    av_results[measure] = sorted(results[measure], key=lambda x: abs(x[1] - 1))

In [21]:
# modifiers with associated functions
mods = {'Most': lambda x: x[len(x)-10:], 'Least': lambda x: x[:10]}

for measure in measures:
    for modifier in mods:
        print '{} complex trees by {}:\n'.format(modifier, measure)
        for i, item in enumerate(mods[modifier](results[measure]), 1):
            tree = Tree.fromstring(item[2])
            text = str(tree.flatten()).replace('\n', '')
            print '{}.(from conv. no. {}) {} \n {}'.format(i, item[0], text, tree)
        print '\n'

for measure in av_results:
    print 'Closest to average trees by {}'.format(measure)
    for i, item in enumerate(av_results[measure][:10], 1):
        tree = Tree.fromstring(item[2])
        text = str(tree.flatten()).replace('\n', '')
        print '{}.(from conv. no. {}) {} \n {}'.format(i, item[0], text, tree)

Most complex trees by normalized depth:

1.(from conv. no. 2405) (S  but  it  's  running  about  seventy  which  *T*-1  is  still  ridiculous  *  to  have  *-3  to  make  changes  to  seventy  percent  of  everything  that  she  types  *T*-2  .  E_S) 
 (S
  (CC but)
  (NP-SBJ (PRP it))
  (VP
    (BES 's)
    (VP
      (VBG running)
      (NP
        (NP (RB about) (CD seventy))
        (SBAR
          (WHNP-1 (WDT which))
          (S
            (NP-SBJ (-NONE- *T*-1))
            (VP
              (VBZ is)
              (ADVP (RB still))
              (ADJP-PRD
                (JJ ridiculous)
                (S
                  (NP-SBJ-3 (-NONE- *))
                  (VP
                    (TO to)
                    (VP
                      (VB have)
                      (S
                        (NP-SBJ (-NONE- *-3))
                        (VP
                          (TO to)
                          (VP
                            (VB make)
                            (NP

Least complex trees by normalized depth:

1.(from conv. no. 2836) (S  and  ,  uh  ,  you  know  ,  two  counts  of  kidnapping  ,  uh  you  know  ,  \[  the  ,  \+  the  \]  forty-five  to  the  head  ,  you  know  ,  just  \[  the  ,  \+  the  \]  mental  anguish  and  the  whole  nine  yards  uh  ,  you  know  ,  the  guy  only  got  five  years  .  E_S) 
 (S
  (CC and)
  (, ,)
  (INTJ (UH uh))
  (PRN (, ,) (S (NP-SBJ (PRP you)) (VP (VBP know))) (, ,))
  (NP-ADV
    (NP (NP (CD two) (NNS counts)) (PP (IN of) (NP (NN kidnapping))))
    (, ,)
    (INTJ (UH uh))
    (PRN (S (NP-SBJ (PRP you)) (VP (VBP know))) (, ,))
    (EDITED (RM (-DFL- \[)) (NP-UNF (DT the)) (, ,) (IP (-DFL- \+)))
    (NP
      (NP (DT the) (RS (-DFL- \])) (CD forty-five))
      (PP-LOC (IN to) (NP (DT the) (NN head))))
    (PRN (, ,) (S (NP-SBJ (PRP you)) (VP (VBP know))) (, ,))
    (ADVP (RB just))
    (EDITED (RM (-DFL- \[)) (NP-UNF (DT the)) (, ,) (IP (-DFL- \+)))
    (NP (DT the) (RS (-DFL- \])) (JJ mental) (NN 

  (-DFL- E_S))
7.(from conv. no. 2382) (S  I  think  0  they  feel  0  this  is  going  *-1  to  be  an  effortless  sweatless  way  ,  0  *  to  get  in  shape  ,  without  *  having  *-3  to  spend  anything  *T*-2  E_S) 
 (S
  (NP-SBJ (PRP I))
  (VP
    (VBP think)
    (SBAR
      (-NONE- 0)
      (S
        (NP-SBJ (PRP they))
        (VP
          (VBP feel)
          (SBAR
            (-NONE- 0)
            (S
              (NP-SBJ-1 (DT this))
              (VP
                (VBZ is)
                (VP
                  (VBG going)
                  (S
                    (NP-SBJ (-NONE- *-1))
                    (VP
                      (TO to)
                      (VP
                        (VB be)
                        (NP-PRD
                          (NP
                            (DT an)
                            (JJ effortless)
                            (JJ sweatless)
                            (NN way))
                          (, ,)
                      

4.(from conv. no. 2035) (S  \[  we  have  ,  \+  we  try  \]  *-2  to  spend  like  Monday  nights  ,  we  call  it  family  home  evening  ,  and  ,  \[  have  our  kids  ,  \+  uh  ,  you  know  ,  have  little  lessons  with  them  ,  \]  and  ,  you  know  ,  just  \[  \[  see  ,  \+  count  ,  \]  \+  see  \]  how  things  are  going  *T*-3  in  our  family  and  ,  you  know  ,  teach  them  about  their  grandparents  or  something  like  that  .  Bring  out  pictures  of  them  so  they  get  *-4  to  know  them  E_S) 
 (S
  (EDITED
    (RM (-DFL- \[))
    (S (NP-SBJ (PRP we)) (VP-UNF (VBP have)))
    (, ,)
    (IP (-DFL- \+)))
  (NP-SBJ-2 (PRP we))
  (VP
    (VBP try)
    (RS (-DFL- \]))
    (S
      (NP-SBJ (-NONE- *-2))
      (VP
        (TO to)
        (VP
          (VP
            (VB spend)
            (INTJ (UH like))
            (NP (NNP Monday) (NNS nights)))
          (PRN
            (, ,)
            (S
              (NP-SBJ (PRP we))
              (VP
             

3.(from conv. no. 4886) (X MUMBLEx) 
 (X (XX MUMBLEx))
4.(from conv. no. 2533) (X MUMBLEx) 
 (X (XX MUMBLEx))
5.(from conv. no. 2334) (NP that) 
 (NP (DT that))
6.(from conv. no. 2800) (X MUMBLEx) 
 (X (XX MUMBLEx))
7.(from conv. no. 2465) (INTJ so) 
 (INTJ (UH so))
8.(from conv. no. 4679) (X MUMBLEx) 
 (X (XX MUMBLEx))
9.(from conv. no. 2266) (X MUMBLEx) 
 (X (XX MUMBLEx))
10.(from conv. no. 4603) (X MUMBLEx) 
 (X (XX MUMBLEx))
Closest to average trees by normalized width
1.(from conv. no. 3086) (INTJ no) 
 (INTJ (UH no))
2.(from conv. no. 3763) (X MUMBLEx) 
 (X (XX MUMBLEx))
3.(from conv. no. 2260) (X MUMBLEx) 
 (X (XX MUMBLEx))
4.(from conv. no. 3728) (X MUMBLEx) 
 (X (XX MUMBLEx))
5.(from conv. no. 4071) (INTJ Right) 
 (INTJ (UH Right))
6.(from conv. no. 4320) (S  And  ,  uh  ,  nowadays  the  latest  thing  0  \[  I  've  ,  \+  I  've  \]  read  about  *T*-1  in  the  paper  here  that  they  're  doing  *T*-2  as  far  as  ,  uh  ,  crimes  involving  *  stealing  cars  is  that

7.(from conv. no. 4796) (S  \[  if  some  ,  \+  if  it  *EXP*-1  were  possible  *  to  market  some  ,  \]  uh  ,  form  of  ,  uh  ,  space  technology  ,  you  know  ,  *  \[  to  ,  \+  uh  ,  uh  ,  to  \]  make  it  equitable  ,  it  might  severely  help  the  economy  ,  you  know  ,  \[  in  the  ,  \+  in  the  ,  \]  uh  ,  respect  that  ,  you  know  ,  there  's  endless  amounts  of  research  0  *T*-2  to  be  done  *-3  .  Endless  amounts  of  resources  \[  and  \+  and  \]  whatnot  ,  E_S) 
 (S
  (EDITED
    (RM (-DFL- \[))
    (SBAR-ADV (IN if) (S-UNF (NP-SBJ-UNF (DT some))))
    (, ,)
    (IP (-DFL- \+)))
  (SBAR-ADV
    (IN if)
    (S
      (NP-SBJ (NP (PRP it)) (S (-NONE- *EXP*-1)))
      (VP
        (VBD were)
        (ADJP-PRD (JJ possible))
        (S-1
          (NP-SBJ (-NONE- *))
          (VP
            (TO to)
            (VP
              (VB market)
              (NP
                (NP
                  (DT some)
                  (, ,)
           