# 11 - Managing Linguistic Data

Resource: https://www.nltk.org/book/ch11.html

In [1]:
import nltk

In [7]:
# Phone-level transcription of the TIMIT item 'dr1-fvmh0/sa1';
# display only the first ten phone symbols.
phonetic = nltk.corpus.timit.phones('dr1-fvmh0/sa1')
phonetic[:10]

['h#', 'sh', 'iy', 'hv', 'ae', 'dcl', 'y', 'ix', 'dcl', 'd']

In [None]:
# Each word of the same item paired with two integer offsets
# (presumably start/end positions in the audio -- confirm against the TIMIT reader docs).
nltk.corpus.timit.word_times('dr1-fvmh0/sa1')

[('she', 7812, 10610),
 ('had', 10610, 14496),
 ('your', 14496, 15791),
 ('dark', 15791, 20720),
 ('suit', 20720, 25647),
 ('in', 25647, 26906),
 ('greasy', 26906, 32668),
 ('wash', 32668, 37890),
 ('water', 38531, 42417),
 ('all', 43091, 46052),
 ('year', 46052, 50522)]

In [11]:
# Word -> pronunciation lookup; concatenate the phone lists of three
# consecutive words from the utterance.
timitdict = nltk.corpus.timit.transcription_dict()
timitdict['greasy'] + timitdict['wash'] + timitdict['water']

['g', 'r', 'iy1', 's', 'iy', 'w', 'ao1', 'sh', 'w', 'ao1', 't', 'axr']

In [12]:
# Slice of the phone transcription loaded earlier, for comparison with the
# dictionary pronunciations of 'greasy', 'wash' and 'water'.
phonetic[17:30]

['g', 'r', 'iy', 's', 'iy', 'w', 'aa', 'sh', 'epi', 'w', 'aa', 'dx', 'ax']

In [13]:
# Metadata record for the speaker identified by 'dr1-fvmh0'.
nltk.corpus.timit.spkrinfo('dr1-fvmh0')

SpeakerInfo(id='VMH0', sex='F', dr='1', use='TRN', recdate='03/11/86', birthdate='01/08/60', ht='5\'05"', race='WHT', edu='BS', comments='BEST NEW ENGLAND ACCENT SO FAR')

In [19]:
# Locate the XML source of The Merchant of Venice in the NLTK data directory
# and preview the first 163 characters of its raw text.
merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml')
# Read through a context manager so the file handle is closed as soon as the
# text has been loaded, instead of lingering until garbage collection.
with open(merchant_file) as merchant_fh:
  raw = merchant_fh.read()
print(raw[:163])

<?xml version="1.0"?>
<?xml-stylesheet type="text/css" href="shakes.css"?>
<!-- <!DOCTYPE PLAY SYSTEM "play.dtd"> -->

<PLAY>
<TITLE>The Merchant of Venice</TITLE>


In [17]:
from xml.etree.ElementTree import ElementTree
# parse() returns the root element of the document, so `merchant` is the
# root element itself rather than an ElementTree wrapper.
merchant = ElementTree().parse(merchant_file)
merchant

<Element 'PLAY' at 0x76f50085b650>

In [21]:
# Integer indexing on an element selects a child: this is the root's first child.
merchant[0]

<Element 'TITLE' at 0x76f50085b5b0>

In [29]:
# Text content of the root's first child.
merchant[0].text

'The Merchant of Venice'

In [25]:
# Negative indices count from the end: text of the first child of the
# root's second-to-last child.
merchant[-2][0].text

'ACT IV'

In [26]:
# Second child of the root's second-to-last child.
merchant[-2][1]

<Element 'SCENE' at 0x76f5008a6f70>

In [31]:
# Text of the first child of the element selected by merchant[-2][1].
merchant[-2][1][0].text

'SCENE I.  Venice. A court of justice.'

In [32]:
# Child at index 54 of the element selected by merchant[-2][1].
merchant[-2][1][54]

<Element 'SPEECH' at 0x76f50088ff60>

In [33]:
# First child of the element at merchant[-2][1][54].
merchant[-2][1][54][0]

<Element 'SPEAKER' at 0x76f50088ffb0>

In [34]:
# Text of the first child of the element at merchant[-2][1][54].
merchant[-2][1][54][0].text

'PORTIA'

In [35]:
# Second child of the element at merchant[-2][1][54].
merchant[-2][1][54][1]

<Element 'LINE' at 0x76f50088b830>

In [36]:
# Text of the second child of the element at merchant[-2][1][54].
merchant[-2][1][54][1].text

"The quality of mercy is not strain'd,"

In [37]:
# Report every LINE whose text contains the substring 'music', labelled with
# its 1-based act, scene and speech position. This is a substring test, so
# words such as "musician" match as well; str() guards against a LINE whose
# text is None.
for act_no, act in enumerate(merchant.findall('ACT'), start=1):
  for scene_no, scene in enumerate(act.findall('SCENE'), start=1):
    for speech_no, speech in enumerate(scene.findall('SPEECH'), start=1):
      hits = [ln.text for ln in speech.findall('LINE') if 'music' in str(ln.text)]
      for hit in hits:
        print("Act %d Scene %d Speech %d: %s" % (act_no, scene_no, speech_no, hit))

Act 3 Scene 2 Speech 9: Let music sound while he doth make his choice;
Act 3 Scene 2 Speech 9: Fading in music: that the comparison
Act 3 Scene 2 Speech 9: And what is music then? Then music is
Act 5 Scene 1 Speech 23: And bring your music forth into the air.
Act 5 Scene 1 Speech 23: Here will we sit and let the sounds of music
Act 5 Scene 1 Speech 23: And draw her home with music.
Act 5 Scene 1 Speech 24: I am never merry when I hear sweet music.
Act 5 Scene 1 Speech 25: Or any air of music touch their ears,
Act 5 Scene 1 Speech 25: By the sweet power of music: therefore the poet
Act 5 Scene 1 Speech 25: But music for the time doth change his nature.
Act 5 Scene 1 Speech 25: The man that hath no music in himself,
Act 5 Scene 1 Speech 25: Let no such man be trusted. Mark the music.
Act 5 Scene 1 Speech 29: It is your music, madam, of the house.
Act 5 Scene 1 Speech 32: No better a musician than the wren.


In [38]:
from collections import Counter
# Speaker names in document order, one entry per SPEAKER element found under
# ACT/SCENE/SPEECH.
speaker_seq = [s.text for s in merchant.findall('ACT/SCENE/SPEECH/SPEAKER')]
speaker_freq = Counter(speaker_seq)
# The five most frequent speakers as (name, count) pairs.
top5 = speaker_freq.most_common(5)
top5

[('PORTIA', 117),
 ('SHYLOCK', 79),
 ('BASSANIO', 73),
 ('GRATIANO', 48),
 ('ANTONIO', 47)]

In [39]:
from collections import defaultdict

# Abbreviate the five most frequent speakers to their first four letters;
# any other speaker falls back to the default 'OTH'.
# The mapping is built with a comprehension rather than a top-level
# `for speaker, _ in top5:` loop so the cell does not rebind `_` in the
# notebook namespace (IPython uses `_` for the previous cell's output and
# stops updating it once user code assigns to it).
abbreviate = defaultdict(lambda: 'OTH',
                         {name: name[:4] for name, _count in top5})

# Replace each speaker by its abbreviation, then count who follows whom.
speaker_seq2 = [abbreviate[speaker] for speaker in speaker_seq]
cfd = nltk.ConditionalFreqDist(nltk.bigrams(speaker_seq2))
cfd.tabulate()

     ANTO BASS GRAT  OTH PORT SHYL 
ANTO    0   11    4   11    9   12 
BASS   10    0   11   10   26   16 
GRAT    6    8    0   19    9    5 
 OTH    8   16   18  153   52   25 
PORT    7   23   13   53    0   21 
SHYL   15   15    2   26   21    0 


In [41]:
from nltk.corpus import toolbox
# Load the Rotokas dictionary as an element tree; lexicon[3][0] is the first
# field of the fourth entry.
lexicon = toolbox.xml('rotokas.dic')
lexicon[3][0]

<Element 'lx' at 0x76f500437ce0>

In [42]:
# Tag name of the first field of the fourth entry.
lexicon[3][0].tag

'lx'

In [43]:
# Text content of the first field of the fourth entry.
lexicon[3][0].text

'kaa'

In [45]:
# Lower-cased text of every record/lx field, sampled at every fifth item
# among the first fifty.
[lexeme.text.lower() for lexeme in lexicon.findall('record/lx')][:50:5]

['kaa',
 'kaakaavo',
 'kaakito',
 'kaapie',
 'kaapo',
 'kaarekopie',
 'kaaveaka',
 'kae',
 'kaekaesoto',
 'kaepie']

In [46]:
import sys
from nltk.util import elementtree_indent
from xml.etree.ElementTree import ElementTree
# Indent the lexicon in place (the return value is not used), then wrap the
# fourth entry in its own ElementTree so it can be serialized to stdout.
elementtree_indent(lexicon)
tree = ElementTree(lexicon[3])
# encoding='unicode' makes write() emit str, which sys.stdout expects.
tree.write(sys.stdout, encoding='unicode')

<record>
    <lx>kaa</lx>
    <ps>N</ps>
    <pt>MASC</pt>
    <cl>isi</cl>
    <ge>cooking banana</ge>
    <tkp>banana bilong kukim</tkp>
    <pt>itoo</pt>
    <sf>FLORA</sf>
    <dt>12/Aug/2005</dt>
    <ex>Taeavi iria kaa isi kovopaueva kaparapasia.</ex>
    <xp>Taeavi i bin planim gaden banana bilong kukim tasol long paia.</xp>
    <xe>Taeavi planted banana in order to cook it.</xe>
  </record>

In [47]:
# Render entries 70-79 of the lexicon as an HTML table with one row per entry
# holding its first lx, ps and ge field texts. findtext() returns None when a
# field is absent, which "%s" renders as the string "None".
rows = []
for entry in lexicon[70:80]:
  cells = (entry.findtext('lx'), entry.findtext('ps'), entry.findtext('ge'))
  rows.append("  <tr><td>%s</td><td>%s</td><td>%s</td></tr>\n" % cells)
html = "<table>\n" + ''.join(rows) + "</table>"
print(html)

<table>
  <tr><td>kakae</td><td>???</td><td>small</td></tr>
  <tr><td>kakae</td><td>CLASS</td><td>child</td></tr>
  <tr><td>kakaevira</td><td>ADV</td><td>small-like</td></tr>
  <tr><td>kakapikoa</td><td>???</td><td>small</td></tr>
  <tr><td>kakapikoto</td><td>N</td><td>newborn baby</td></tr>
  <tr><td>kakapu</td><td>V</td><td>place in sling for purpose of carrying</td></tr>
  <tr><td>kakapua</td><td>N</td><td>sling for lifting</td></tr>
  <tr><td>kakara</td><td>N</td><td>arm band</td></tr>
  <tr><td>Kakarapaia</td><td>N</td><td>village name</td></tr>
  <tr><td>kakarau</td><td>N</td><td>frog</td></tr>
</table>


In [48]:
from nltk.corpus import toolbox
# Reload a fresh copy of the lexicon, then compute the mean number of fields
# per entry (len() of an element is its number of children).
lexicon = toolbox.xml('rotokas.dic')
sum(len(entry) for entry in lexicon) / len(lexicon)

13.635955056179775

In [49]:
import re
from xml.etree.ElementTree import SubElement

def cv(s):
  """Return the consonant/vowel skeleton of the string ``s``.

  The input is lower-cased, then every character is mapped independently:
  anything outside ``a``-``z`` becomes ``_``, each of ``a e i o u`` becomes
  ``V``, and every remaining letter becomes ``C``.
  """
  def classify(ch):
    # Characters outside the ASCII lower-case range are placeholders.
    if not ('a' <= ch <= 'z'):
      return '_'
    return 'V' if ch in 'aeiou' else 'C'

  return ''.join(classify(ch) for ch in s.lower())

def add_cv_field(entry):
  """Append a ``cv`` field to ``entry`` for each of its ``lx`` fields.

  For every direct child of ``entry`` tagged ``lx``, a new ``cv`` child is
  appended to ``entry`` whose text is ``cv()`` applied to the ``lx`` text.
  ``entry`` is modified in place; nothing is returned.
  """
  # findall() returns a list, so the loop walks a snapshot of the matching
  # children instead of iterating over `entry` while SubElement() appends to it.
  for field in entry.findall('lx'):
    cv_field = SubElement(entry, 'cv')
    # An lx element with no content has text None; treat it as an empty
    # string rather than letting cv() fail on None.lower().
    cv_field.text = cv(field.text or '')

In [50]:
# Reload the lexicon so the added field starts from a clean copy, add a cv
# field to entry 53, and print that entry in Toolbox backslash-marker form.
lexicon = toolbox.xml('rotokas.dic')
add_cv_field(lexicon[53])
print(nltk.toolbox.to_sfm_string(lexicon[53]))

\lx kaeviro
\ps V
\pt A
\ge lift off
\ge take off
\tkp go antap
\sc MOTION
\vx 1
\nt used to describe action of plane
\dt 03/Jun/2005
\ex Pita kaeviroroe kepa kekesia oa vuripierevo kiuvu.
\xp Pita i go antap na lukim haus win i bagarapim.
\xe Peter went to look at the house that the wind destroyed.
\cv CVVCVCV



In [None]:
from collections import Counter
# Summarise each entry as the colon-joined sequence of its field tags and
# count how often each distinct sequence occurs across the lexicon.
field_sequences = Counter(':'.join(field.tag for field in entry) for entry in lexicon)
field_sequences.most_common(15)

[('lx:ps:pt:ge:tkp:dt:ex:xp:xe', 41),
 ('lx:rt:ps:pt:ge:tkp:dt:ex:xp:xe', 37),
 ('lx:rt:ps:pt:ge:tkp:dt:ex:xp:xe:ex:xp:xe', 27),
 ('lx:ps:pt:ge:tkp:nt:dt:ex:xp:xe', 20),
 ('lx:ps:pt:ge:tkp:nt:dt:ex:xp:xe:ex:xp:xe', 17),
 ('lx:ps:pt:ge:tkp:dt:ex:xp:xe:ex:xp:xe', 16),
 ('lx:rt:ps:pt:ge:ge:tkp:dt:ex:xp:xe:ex:xp:xe', 12),
 ('lx:ps:pt:ge:tkp:nt:sf:dt:ex:xp:xe', 9),
 ('lx:ps:pt:ge:ge:tkp:dt:ex:xp:xe', 9),
 ('lx:rt:ps:pt:ge:tkp:dt:ex:xp:xe:ex:xp:xe:ex:xp:xe', 9),
 ('lx:ps:ge:tkp:dt:ex:xp:xe', 8),
 ('lx:ps:pt:ge:ge:tkp:dt:ex:xp:xe:ex:xp:xe', 8),
 ('lx:rt:ps:pt:ge:ge:tkp:dt:ex:xp:xe', 8),
 ('lx:alt:rt:ps:pt:ge:tkp:dt:ex:xp:xe:ex:xp:xe', 7),
 ('lx:alt:rt:ps:pt:ge:tkp:dt:ex:xp:xe', 7)]

In [53]:
# Context-free grammar over Toolbox field markers describing the expected
# order of fields within a lexical entry. A production ending in a bare "|"
# has an empty alternative, i.e. that constituent may be absent.
# NOTE(review): Sem_Field has no empty alternative, so an "sf" field is
# required between the date and the examples -- confirm this is intended,
# since entries lacking "sf" in that position cannot be parsed.
grammar = nltk.CFG.fromstring('''
  S -> Head PS Glosses Comment Date Sem_Field Examples
  Head -> Lexeme Root
  Lexeme -> "lx"
  Root -> "rt" |
  PS -> "ps"
  Glosses -> Gloss Glosses |
  Gloss -> "ge" | "tkp" | "eng"
  Date -> "dt"
  Sem_Field -> "sf"
  Examples -> Example Ex_Pidgin Ex_English Examples |
  Example -> "ex"
  Ex_Pidgin -> "xp"
  Ex_English -> "xe"
  Comment -> "cmt" | "nt" |
  ''')

In [54]:
def validate_lexicon(grammar, lexicon, ignored_tags):
  """Print one line per entry showing whether its field order parses.

  For each entry in ``lexicon`` the tags of its fields are collected,
  skipping any tag in ``ignored_tags``, and the resulting sequence is parsed
  with a recursive descent parser built from ``grammar``. The colon-joined
  tag sequence is printed, prefixed with "+" if at least one parse exists
  and "-" otherwise.
  """
  parser = nltk.RecursiveDescentParser(grammar)
  for entry in lexicon:
    markers = [child.tag for child in entry if not child.tag in ignored_tags]
    parses = list(parser.parse(markers))
    status = "+" if parses else "-"
    print(status, ':'.join(markers))

In [55]:
# Validate entries 10-19 of a freshly loaded lexicon, leaving the listed
# markers out of each entry's tag sequence before parsing.
lexicon = toolbox.xml('rotokas.dic')[10:20]
ignored_tags = ['arg', 'dcsv', 'pt', 'vx']
validate_lexicon(grammar, lexicon, ignored_tags)

- lx:ps:ge:tkp:sf:nt:dt:ex:xp:xe:ex:xp:xe:ex:xp:xe
- lx:rt:ps:ge:tkp:nt:dt:ex:xp:xe:ex:xp:xe
- lx:ps:ge:tkp:nt:dt:ex:xp:xe:ex:xp:xe
- lx:ps:ge:tkp:nt:sf:dt
- lx:ps:ge:tkp:dt:cmt:ex:xp:xe:ex:xp:xe
- lx:ps:ge:ge:ge:tkp:cmt:dt:ex:xp:xe
- lx:rt:ps:ge:ge:tkp:dt
- lx:rt:ps:ge:eng:eng:eng:ge:tkp:tkp:dt:cmt:ex:xp:xe:ex:xp:xe:ex:xp:xe:ex:xp:xe:ex:xp:xe
- lx:rt:ps:ge:tkp:dt:ex:xp:xe
- lx:ps:ge:ge:tkp:dt:ex:xp:xe:ex:xp:xe
