In [1]:
from swda_time import CorpusReader
from random import randrange
import complexity
from nltk.tree import Tree
import numpy as np

corpus = CorpusReader('swda', 'swda/swda-metadata.csv')

In [2]:
#find a random example and compute the non-normalized measures
iterator = corpus.iter_utterances()

for i in range(randrange(1000)):
    s = iterator.next()

for tree in s.trees:
    print tree
    print 'Length: {}'.format(complexity.length(tree))
    print 'Depth: {}'.format(complexity.depth(tree))
    print 'Width: {}'.format(complexity.width(tree))
    print 'Depth*width: {}'.format(complexity.balanced(tree))
    #print 'Balanced depth*width: {}'.format(complexity.balanced2(tree))
    #print 'Average depth: {}'.format(complexity.avdepth(tree))
print "Lu's measures:"
for item in complexity.lus_measures(s.trees).items():
    print'{}:{}'.format(*item)
    

utterance 751

(S
  (NP-SBJ (PRP we))
  (VP
    (VBP have)
    (NP
      (NP (CD one))
      (PP
        (IN of)
        (NP
          (NP
            (DT those)
            (S
              (NP-SBJ (-NONE- *))
              (VP
                (VP (VB use) (NP (PRP it)))
                (CC or)
                (VP (VB lose) (NP (PRP it)))))
            (NNS plans))
          (SBAR (-NONE- *ICH*-1)))))
    (, ,)
    (ADVP (RB too))
    (, ,)
    (SBAR-1
      (WHADVP-2 (WRB where))
      (, ,)
      (INTJ (UH uh))
      (, ,)
      (S
        (NP-SBJ (PRP they))
        (VP
          (MD 'll)
          (ADVP (RB basically))
          (VP
            (VB pay)
            (PP
              (IN for)
              (, ,)
              (INTJ (UH uh))
              (PRN
                (, ,)
                (S (NP-SBJ (PRP you)) (VP (VBP know)))
                (, ,))
              (NP
                (NN child)
                (NN care)
                (NN type)
                (, ,)
                (INTJ 

In [3]:
#print the normalized measures for the random example
for tree in s.trees:
    print 'Normalized depth: {}'.format(complexity.ndepth(tree))
    print 'Normalized width: {}'.format(complexity.nwidth(tree))
    print 'Normalized depth*width: {}'.format(complexity.nbalanced(tree))
    print 'Normalized balanced depth*width: {}'.format(complexity.nbalanced2(tree))
    print 'Normalized average depth: {}'.format(complexity.n_avdepth(tree))

Normalized depth: 1.09439666756
Normalized width: 1.01583685776
Normalized depth*width: 1.11694815806
Normalized balanced depth*width: 1.50833386101
Normalized average depth: 1.02421311312


In [5]:
# dicts with the measures for easy iteration
# add additional measures here
measures = {'length': complexity.length,
            'depth': complexity.depth,
            'width': complexity.width,
            'depth*width': complexity.balanced,
            'balanced depth*width': complexity.balanced2,
            'average depth': complexity.avdepth,
            'normalized depth': complexity.ndepth,
            'normalized width': complexity.nwidth,
            'normalized depth*width': complexity.nbalanced,
            'normalized average depth': complexity.n_avdepth,
            'normalized balanced depth*width': complexity.nbalanced2}

non_normalized_measures = {'length': complexity.length,
                           'depth': complexity.depth,
                           'width': complexity.width,
                           'depth*width': complexity.balanced,
                           'balanced depth*width': complexity.balanced2}

normalized_measures = {'normalized depth': complexity.ndepth,
                       'normalized width': complexity.nwidth,
                       'normalized depth*width': complexity.nbalanced,
                       'normalized average depth': complexity.n_avdepth,
                       'normalized balanced depth*width': complexity.nbalanced2}

# dict of sets (to avoid duplicates) for the results
ordered_results = dict([(measure, []) for measure in measures])

for conv in corpus.iter_transcripts(display_progress=False):
    for utt in conv.utterances:
        for tree in utt.trees:
            #Trees need to be cast to string to be hashable for duplicate removal
            string = str(tree)
            for measure in measures:
                ordered_results[measure].append((conv.conversation_no, measures[measure](tree), string))

results = {}
#remove duplicates from results            
for measure in ordered_results:
    results[measure] = list(set(ordered_results[measure]))
    results[measure].sort(key=lambda x: x[1])

#compute results closest to average
av_results = {}

for measure in non_normalized_measures:
    mean = np.mean([x[1] for x in results[measure]])
    av_results[measure] = sorted(results[measure], key=lambda x: abs(x[1] - mean), reverse=True)
for measure in normalized_measures:
    av_results[measure] = sorted(results[measure], key=lambda x: abs(x[1] - 1), reverse=True)

  return balanced2(tree)/averages['balanced2'][length(tree)]


In [6]:
# code to display top, bot and closest to average n trees per measure

n = 5 #length of lists to be displayed

# modifiers with associated functions
mods = {'Most': lambda x: x[len(x)-n:], 'Least': lambda x: x[:n]}

for measure in measures:
    for modifier in mods:
        print '{} complex trees by {}:\n'.format(modifier, measure)
        for i, item in enumerate(mods[modifier](results[measure]), 1):
            tree = Tree.fromstring(item[2])
            text = str(tree.flatten()).replace('\n', '')
            print '{}.(from conv. no. {}) {} \n {}\n'.format(i, item[0], text, tree)
        print '\n'

for measure in av_results:
    print 'Closest to average trees by {}\n'.format(measure)
    for i, item in enumerate(mods['Most'](av_results[measure]), 1):
        tree = Tree.fromstring(item[2])
        text = str(tree.flatten()).replace('\n', '')
        print '{}.(from conv. no. {}) {} \n {}\n'.format(i, item[0], text, tree)

Most complex trees by average depth:

1.(from conv. no. 4082) (S  I  know  of  cases  where  ,  uh  ,  we  have  one  television  media  where  they  will  show  clippings  from  one  event  and  describe  another  event  but  ,  with  \[  the  \+  the  \]  attempt  *  to  give  you  the  impression  that  what  they  're  talking  about  *T*-1  is  the  same  thing  0  they  are  showing  you  *T*-2  .  Which  *T*-4  is  sort  of  like  a  deliberate  bias  *T*-5  .  \[  Which  *T*-7  is  \+  which  *T*-3  is  \]  rather  disturbing  *T*-6  .  E_S) 
 (S
  (NP-SBJ (PRP I))
  (VP
    (VBP know)
    (PP
      (IN of)
      (NP
        (NP (NNS cases))
        (SBAR
          (WHADVP-6 (WRB where))
          (, ,)
          (INTJ (UH uh))
          (, ,)
          (S
            (NP-SBJ (PRP we))
            (VP
              (VBP have)
              (NP
                (NP (CD one) (NN television) (NN media))
                (SBAR
                  (WHADVP-5 (WRB where))
                


5.(from conv. no. 2969) (NP G o m p h r e n a . E_S) 
 (NP
  (SYM G)
  (SYM o)
  (SYM m)
  (SYM p)
  (SYM h)
  (SYM r)
  (SYM e)
  (SYM n)
  (SYM a)
  (. .)
  (-DFL- E_S))



Least complex trees by normalized width:

1.(from conv. no. 4765) (S-UNF I mean N_S) 
 (S-UNF (PRN (S (NP-SBJ (PRP I)) (VP (VBP mean)))) (-DFL- N_S))

2.(from conv. no. 4340) (S-UNF I mean ,) 
 (S-UNF (PRN (S (NP-SBJ (PRP I)) (VP (VBP mean)))) (, ,))

3.(from conv. no. 2784) (S they tell me) 
 (S (NP-SBJ (PRP they)) (VP-UNF (VBP tell) (NP (PRP me))))

4.(from conv. no. 2105) (S I tell you) 
 (S (NP-SBJ (PRP I)) (VP (VBP tell) (NP (PRP you))))

5.(from conv. no. 2008) (S it 's shorts) 
 (S (NP-SBJ (PRP it)) (VP (BES 's) (NP-PRD (NNS shorts))))



Most complex trees by normalized balanced depth*width:

1.(from conv. no. 4334) (S-UNF here , N_S) 
 (S-UNF (ADVP-LOC (RB here)) (, ,) (-DFL- N_S))

2.(from conv. no. 2041) (VP calculated down E_S) 
 (VP (VBN calculated) (PRT (RP down)) (-DFL- E_S))

3.(from conv. no. 334


4.(from conv. no. 2832) (S  I  figured  0  I  'm  going  *-1  to  have  *-2  to  hire  somebody  0  *T*-3  to  remove  our  windows  and  put  new  ones  *ICH*-6  in  that  *T*-4  are  easier  0  *  to  clean  *T*-5  .  E_S) 
 (S
  (NP-SBJ (PRP I))
  (VP
    (VBD figured)
    (SBAR
      (-NONE- 0)
      (S
        (NP-SBJ-1 (PRP I))
        (VP
          (VBP 'm)
          (VP
            (VBG going)
            (S
              (NP-SBJ-2 (-NONE- *-1))
              (VP
                (TO to)
                (VP
                  (VB have)
                  (S
                    (NP-SBJ (-NONE- *-2))
                    (VP
                      (TO to)
                      (VP
                        (VB hire)
                        (NP
                          (NP (NN somebody))
                          (SBAR-PRP
                            (WHNP-3 (-NONE- 0))
                            (S
                              (NP-SBJ (-NONE- *T*-3))
                              (V


2.(from conv. no. 4736) (S  The  law  enforcement  community  ,  uh  ,  uh  ,  you  know  ,  \[  has  *  to  ,  \+  has  *-3  to  \]  separate  the  difference  between  somebody  who  *T*-1  is  being  set  *T*-1  up  in  which  ,  uh  ,  grievous  acts  are  done  *-2  *T*-6  *  \[  \[  \[  to  ,  \+  uh  ,  \]  \+  to  ,  \]  \+  you  know  ,  to  \]  get  somebody  into  \[  a  ,  \+  a  \]  situation  where  they  're  going  *-4  to  be  guilty  \[  of  ,  \+  of  \]  a  crime  *T*-5  .  E_S) 
 (S
  (NP-SBJ-3 (DT The) (NN law) (NN enforcement) (NN community))
  (, ,)
  (INTJ (UH uh))
  (, ,)
  (INTJ (UH uh))
  (PRN (, ,) (S (NP-SBJ (PRP you)) (VP (VBP know))) (, ,))
  (EDITED
    (RM (-DFL- \[))
    (VP (VBZ has) (S (NP-SBJ (-NONE- *)) (VP-UNF (TO to))))
    (, ,)
    (IP (-DFL- \+)))
  (VP
    (VBZ has)
    (S
      (NP-SBJ (-NONE- *-3))
      (VP
        (TO to)
        (RS (-DFL- \]))
        (VP
          (VB separate)
          (NP
            (NP (DT the) (NN difference))


Most complex trees by normalized depth*width:

1.(from conv. no. 2521) (S  it  was  right  close  to  when  we  was  going  *-1  to  find  out  if  we  was  going  *-2  to  get  *-3  to  be  in  the  playoffs  ,  or  not  *T*-4  .  E_S) 
 (S
  (NP-SBJ (PRP it))
  (VP
    (VBD was)
    (ADVP-LOC-PRD
      (RB right)
      (RB close)
      (PP
        (IN to)
        (SBAR-TMP
          (WHADVP-4 (WRB when))
          (S
            (NP-SBJ-1 (PRP we))
            (VP
              (VBD was)
              (VP
                (VBG going)
                (S
                  (NP-SBJ (-NONE- *-1))
                  (VP
                    (TO to)
                    (VP
                      (VB find)
                      (PRT (RP out))
                      (SBAR
                        (IN if)
                        (S
                          (NP-SBJ-2 (PRP we))
                          (VP
                            (VBD was)
                            (VP
                        

In [7]:
for measure in results:
    print len(results[measure])
    
for i, item in enumerate(ordered_results['normalized depth']):
    assert item[2] == ordered_results['normalized average depth'][i][2]

88842
88842
88858
88842
88842
88842
88842
88842
88842
88842
88842


In [8]:
# code to find distinguishing sentences for each (sensible) pair of normalized measures

i = 0
measure_list = list(normalized_measures)
diff = {}

while i < len(measure_list):
    one = measure_list[i]
    data_one = [x[1] for x in ordered_results[one]]
    for other in measure_list[i + 1:]:
        data_other = [x[1] for x in ordered_results[other]]
        comp = zip(data_one, data_other)
        index, values = max(enumerate(comp), key=lambda x: abs(x[1][0] - x[1][1]))
        # sanity check: Are these actually values for the same tree?
        #               Do they actually have these values?
        assert ordered_results[one][index][2] == ordered_results[other][index][2]
        assert ordered_results[one][index][1] == values[0]
        assert ordered_results[other][index][1] == values[1]
        tree = ordered_results[one][index][2]
        diff[(one, other)] = (tree, values)
    i += 1
    
for pair in diff:
    print 'Largest difference in value between {} and {}:'.format(*pair)
    tree, values = diff[pair]
    print 'Value for {pair[0]}: {values[0]}, value for {pair[1]}: {values[1]}'.format(pair=pair, values=values)
    print Tree.fromstring(tree)

Largest difference in value between normalized depth and normalized average depth:
Value for normalized depth: 1.79264461283, value for normalized average depth: 1.26525281993
(S
  (EDITED (RM (-DFL- \[)) (CC and) (, ,) (IP (-DFL- \+)))
  (CC and)
  (, ,)
  (RS (-DFL- \]))
  (ADVP (RB consequently))
  (, ,)
  (NP-SBJ-1 (PRP they))
  (VP
    (VBP 're)
    (RB not)
    (ADJP-PRD
      (RB so)
      (JJ motivated)
      (S
        (NP-SBJ-2 (-NONE- *-1))
        (VP
          (TO to)
          (VP
            (VP (VB get) (VP (VBN involved) (NP (-NONE- *-2))))
            (CC and)
            (VP
              (VB make)
              (S
                (NP-SBJ (-NONE- *))
                (ADJP-PRD
                  (JJ sure)
                  (SBAR
                    (-NONE- 0)
                    (S
                      (NP-SBJ (NNS things))
                      (VP
                        (VBP happen)
                        (NP
                          (NP (DT the) (NN way))
      