# Parse texts

In [None]:
!pip install stanza
!pip install lxml

In [40]:
import re
import os 
from collections import defaultdict
from time import sleep
from io import StringIO

import requests
import stanza
from lxml import etree

import logging
logging.disable()

In [3]:
# Base URL of the DraCor REST API; play endpoints are built on top of it.
API = 'https://dracor.org/api'

# Maps DraCor corpus names (the keys of `plays`) to the language codes
# passed to `stanza.Pipeline(lang=...)`.
DRACOR_TO_STANZA = {
    'span': 'es',
    'rus': 'ru'
}

In [41]:
# Plays to process, grouped by DraCor corpus name.
# Each entry is a play identifier that is substituted into the TEI download URL.
plays = {
    'rus': [
        'bulgakov-beg',
        'andreyev-mysl',
        'bulgakov-zojkina-kvartira',
        'chekhov-vishnevyi-sad',
        'ostrovsky-bespridannitsa',
        'chekhov-tri-sestry',
        'ostrovsky-groza',
        'turgenev-holostjak',
        'gogol-revizor',
        'ostrovsky-beshenye-dengi'
    ],
    'span': [
        'clarin-teresa',
        'dicenta-juan-jose',
        'echegaray-arrastrarse',
        'echegaray-mancha',
        'galdos-casandra',
        'galdos-electra',
        'galdos-perfecta',
        'lorca-bernarda',
        'lorca-bodas',
        'lorca-rosita',
        'lorca-yerma',
        'lorca-zapatera',
        'munoz-conferencia',
        'munoz-ortiz',
        'munoz-pergaminos',
        'munoz-refugio',
        'unamuno-esfinge',
        'unamuno-fedra',
        'valera-asclepigenia',
        'valera-atahualpa',
        'valle-aguila',
        'valle-cara',
        'valle-divinas-palabras',
        'valle-luces',
        'valle-romance'
    ]
}

In [42]:
# Download the TEI XML of every selected play.
# Resulting layout: {corpus name: {play name: XML text}}.
playtexts = defaultdict(dict)

for corpusname, playlist in plays.items():
  for playname in playlist:
    endpoint = f'{API}/corpora/{corpusname}/play/{playname}/tei'
    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(endpoint, timeout=60)
    # Fail loudly on an HTTP error instead of storing an error page as play text.
    response.raise_for_status()
    text = response.text
    playtexts[corpusname][playname] = text
    sleep(1)  # pause between requests to go easy on the server

In [43]:
# Prefix-to-URI mapping passed as `namespaces=` to the XPath queries.
NAMESPACE = {'ns': 'http://www.tei-c.org/ns/1.0'}
# Every <sp> element anywhere in the document (used to collect utterances).
UTTERANCE_XPATH = '//ns:sp'
# <p> elements that are direct children of the context node.
TEXT_XPATH = './ns:p'
# Every <loc> element without a namespace (used to count inserted locations).
LOC_XPATH = '//loc'

In [44]:
# Matches any run of whitespace; used to collapse or strip whitespace.
SPACE_REGEX = re.compile(r'\s+')

In [45]:
# Prepare the text for NER parsing and save all other XML elements.

def get_text_parts(p_tag):
  """
  In <p> tag, separate raw text and XML elements.

  Returns a list, in document order, of the tag's leading text (if non-empty),
  each child element, and each child's tail text (if non-empty).
  Anything nested inside a child stays inside that child element.
  """
  parts = []

  if p_tag.text:
    parts.append(p_tag.text)

  for child in p_tag:
    parts.append(child)
    if child.tail:
      parts.append(child.tail)

  return parts


def compile_relevant_text(parts):
  """
  From all parts of <p> tag (got from get_text_parts),
  compile the text for NER parsing and save information
  about relevant parts.

  Every run of whitespace is collapsed to a single space before the pieces
  are joined. Raw strings are the "relevant" parts: only they are recorded
  as places where locations may be annotated. Text of child elements is
  included in the compiled string but not recorded; elements without text
  contribute nothing.

  Returns a tuple of
    - the compiled string,
    - indices (into `parts`) of the relevant parts,
    - (start, end) character spans of those parts in the compiled string,
      with `end` exclusive.
  """
  selected_parts = []  # parts of text for NER parsing
  selected_part_idxs = []  # idxs of parts where the locations will be annotated
  selected_char_idxs = []  # starts and ends of parts in the string

  prev_end_idx = 0
  for i, part in enumerate(parts):
    is_raw_text = isinstance(part, str)
    # An element without text (e.g. a self-closing tag) has `.text` of None,
    # which the regex cannot process; treat it as empty text.
    text = part if is_raw_text else (part.text or '')
    preproc = SPACE_REGEX.sub(' ', text)
    end_idx = prev_end_idx + len(preproc)

    if not is_raw_text:
      # replace all `\n` inside <stage> tag for convenience
      # but not include it into parsing
      # NOTE(review): a parser that reports namespace-qualified tags
      # (`{uri}stage`) never matches this bare name, so such <stage> text
      # takes the `else` branch below -- confirm this is intended.
      if part.tag == 'stage':
        if part.text is not None:  # leave text-less elements untouched
          part.text = preproc
      else:
        # include tag text into the string
        # but not to the list of parts where locations will be annotated
        # (because if it is in some tag, it's probably another entity)
        selected_parts.append(preproc)
        prev_end_idx = end_idx
      continue

    selected_parts.append(preproc)
    selected_part_idxs.append(i)
    selected_char_idxs.append((prev_end_idx, end_idx))
    prev_end_idx = end_idx

  return (
    ''.join(selected_parts),
    selected_part_idxs,
    selected_char_idxs
  )

In [46]:
# Insert parsed locations into the <p> tag as XML elements.

def group_locations_by_relevant_parts(locations, selected_char_idxs):
  """
  Save the list of locations for each relevant part.

  `locations` must be ordered by `start_char`; `selected_char_idxs` holds the
  (start, end) span of every relevant part, in order, with `end` exclusive.
  Entity offsets are treated the same way (`end_char` is exclusive).

  Returns one list per relevant part. A location is kept only if it lies
  completely inside a single part; locations that start in text outside the
  relevant parts, or that run over a part boundary, are dropped.
  """
  locations_by_part = [[] for _ in selected_char_idxs]
  n_parts = len(selected_char_idxs)

  curr_part_idx = 0
  for location in locations:
    # find corresponding part for a location
    while (
      curr_part_idx < n_parts
      and location.start_char >= selected_char_idxs[curr_part_idx][1]
    ):
      curr_part_idx += 1

    # this location starts after the last relevant part,
    # and so will every later one
    if curr_part_idx == n_parts:
      break

    part_start, part_end = selected_char_idxs[curr_part_idx]
    # location should be entirely inside the part
    # (both ends are exclusive, so ending exactly at the part end is fine)
    if part_start <= location.start_char and location.end_char <= part_end:
      locations_by_part[curr_part_idx].append(location)

  return locations_by_part


def create_elems_from_locations(location_lst, part_text, part_start):
  """
  Create XML elements for each location found in the string part.
  If no locations are found, raw text is the only element in the part.

  `location_lst` holds entities with `text`, `start_char` and `end_char`,
  the offsets being positions in the whole compiled string; `part_start` is
  the offset of `part_text` in that string and makes them relative.

  Returns a list: the text before the first location (possibly empty)
  followed by <loc> elements whose `tail` carries the text after them,
  or `[part_text]` when there are no locations.
  """
  parsed_elems = []
  prev_end_idx = 0

  for ent in location_lst:
    # create XML element for a location
    ent_start = ent.start_char - part_start
    ent_end = ent.end_char - part_start
    # Build the element through the API rather than parsing a formatted
    # string, so entity text with `&` or `<` cannot break or alter the markup.
    loc_elem = etree.Element('loc')
    loc_elem.text = ent.text

    # Account for all text before the entity
    # (put it as a starting text of the part or join it to previous element)
    starting_text = part_text[prev_end_idx:ent_start]
    if not parsed_elems:
      parsed_elems.append(starting_text)
    else:
      parsed_elems[-1].tail = starting_text

    parsed_elems.append(loc_elem)
    prev_end_idx = ent_end

  if parsed_elems:  # join the remaining text to the last entity
    parsed_elems[-1].tail = part_text[prev_end_idx:]
  else:  # if there were no locations, raw text is the only element
    parsed_elems = [part_text]

  return parsed_elems


def create_elems_from_ner(
  text,
  ner_output,
  all_parts,
  selected_part_idxs,
  selected_char_idxs
):
  """
  Swap every relevant part of a <p> tag for its parsed version.

  Only entities of type 'LOC' from `ner_output.ents` are used. Each relevant
  part (a raw string in `all_parts`) is replaced, in place, by the list of
  elements built for it; all other entries of `all_parts` are left as they
  are. The same `all_parts` list is returned.
  """
  loc_entities = []
  for entity in ner_output.ents:
    if entity.type == 'LOC':
      loc_entities.append(entity)

  grouped = group_locations_by_relevant_parts(loc_entities, selected_char_idxs)

  # walk the three per-part sequences in lockstep
  per_part = zip(selected_part_idxs, selected_char_idxs, grouped)
  for part_idx, (span_start, span_end), part_locations in per_part:
    all_parts[part_idx] = create_elems_from_locations(
      part_locations,
      text[span_start:span_end],
      span_start
    )

  return all_parts

In [47]:
# Parse locations in the play, insert them as XML elements, save new XML into the file.

def flatten_parts(all_parts):
  """
  Turn the per-part collections into one flat list
  that can be written into <p>.

  An entry of `all_parts` is either a list of elements produced by parsing
  or a single original element (e.g. <stage> or <rs>) not touched by parsing.
  A string that opens an entry becomes the leading text of <p> when nothing
  has been collected yet; otherwise it is stored as the `tail` of the
  element collected last.
  """
  flat_parts = []

  for part in all_parts:
    elements = part if isinstance(part, list) else [part]

    for position, element in enumerate(elements):
      opens_with_text = position == 0 and isinstance(element, str)

      if not opens_with_text:
        # anything else is expected to be an XML element
        flat_parts.append(element)
      elif flat_parts:
        # text following an already collected element becomes its tail
        flat_parts[-1].tail = element
      else:
        # very first piece of the tag: keep it as the starting text
        flat_parts.append(element)

  return flat_parts


def parse_ner_in_utterance(p_tags, stanza_obj):
  """
  Parse locations in all <p> tags inside one speaker's utterance
  and build the new contents of each tag.

  `stanza_obj` is called with the compiled text of a <p>; the entities of
  its result (`.ents`) are used to find locations.

  Returns one flat list per <p> tag, in the order of `p_tags`: an optional
  leading string followed by XML elements.
  """
  utterance_parsed_elems = []

  for p_tag in p_tags:
    # get the text for parsing and save other elements inside <p> tag
    all_parts = get_text_parts(p_tag)
    text, selected_part_idxs, selected_char_idxs = compile_relevant_text(all_parts)

    # if there's no relevant text, leave <p> as it was
    if not text:
      # `list(tag)` replaces the deprecated `getchildren()`
      unchanged = list(p_tag)
      # a tag without leading text has `.text` of None, which must not
      # end up in the list of things to insert
      if p_tag.text is not None:
        unchanged.insert(0, p_tag.text)
      utterance_parsed_elems.append(unchanged)
      continue

    # parse locations and insert them to <p> as XML elements.
    ner_output = stanza_obj(text)
    all_parts = create_elems_from_ner(
        text,
        ner_output,
        all_parts,
        selected_part_idxs,
        selected_char_idxs
    )

    # create a flat list of all parts for <p> tag
    all_parts = flatten_parts(all_parts)
    utterance_parsed_elems.append(all_parts)

  return utterance_parsed_elems


def replace_text_in_tag(parsed_p_content, p_tags):
  """
  Replace everything inside each <p> tag of an utterance
  with new parts with parsed locations.

  `parsed_p_content` and `p_tags` are paired by position. In each list of
  new parts, a string in the first position becomes the text of the tag and
  every other item is appended as a child element; a `None` item (a missing
  leading text) is skipped.

  The tags are modified in place; nothing is returned.
  """
  for parsed_elems, p_tag in zip(parsed_p_content, p_tags):
    # Clean contents of <p>. Iterate over a snapshot because the tag is
    # modified while looping (`list(tag)` replaces the deprecated `getchildren()`).
    for child in list(p_tag):
      p_tag.remove(child)
    p_tag.text = ''

    # write new elements to <p>
    for i, elem in enumerate(parsed_elems):
      if elem is None:  # absent leading text, nothing to write
        continue
      if not i and isinstance(elem, str):
        p_tag.text = elem
      else:
        p_tag.append(elem)


def parse_ner_in_play(xml_path, xml_text, stanza_obj):
  """
  The main function.
  Take XML of a play, find all utterances,
  find <p> tags in each utterance and process them,
  count utterances and parsed locations,
  write new XML into a file.

  `xml_text` is the XML source as a string, `xml_path` is where the
  annotated XML is written, and `stanza_obj` is the NER callable handed to
  parse_ner_in_utterance. The numbers of utterances and of <loc> elements
  are printed. Nothing is returned.
  """
  tree = etree.parse(StringIO(xml_text))
  utterances = tree.xpath(UTTERANCE_XPATH, namespaces=NAMESPACE)
  print('N utterances', len(utterances))

  for utterance in utterances:
    p_tags = utterance.xpath(TEXT_XPATH, namespaces=NAMESPACE)
    parsed_texts = parse_ner_in_utterance(p_tags, stanza_obj)
    replace_text_in_tag(parsed_texts, p_tags)

  locs = tree.xpath(LOC_XPATH, namespaces=NAMESPACE)
  print('N locations', len(locs))

  xml_str = etree.tostring(
      tree,
      pretty_print=True,
      encoding='utf-8',
      xml_declaration=True
  ).decode('utf-8')

  # The XML declaration announces utf-8, so the file must be written as utf-8
  # regardless of the platform's default text encoding.
  with open(xml_path, 'w', encoding='utf-8') as f:
    f.write(xml_str)


In [48]:
# Annotate every downloaded play and store the result as corpus/<corpus>/<play>.xml
os.makedirs('corpus', exist_ok=True)

for corpusname, corpus in playtexts.items():
  print(corpusname)

  corpus_dir = os.path.join('corpus', corpusname)
  os.makedirs(corpus_dir, exist_ok=True)

  # one pipeline per corpus, in the language of that corpus
  stanza_obj = stanza.Pipeline(
      lang=DRACOR_TO_STANZA[corpusname],
      processors='tokenize,ner'
  )

  for playname, play_xml in corpus.items():
    print(playname)
    xml_path = os.path.join(corpus_dir, f'{playname}.xml')
    parse_ner_in_play(xml_path, play_xml, stanza_obj)

rus


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

bulgakov-beg
N utterances 821
N locations 176
andreyev-mysl
N utterances 630
N locations 40
bulgakov-zojkina-kvartira
N utterances 1406
N locations 214
chekhov-vishnevyi-sad
N utterances 634
N locations 62
ostrovsky-bespridannitsa
N utterances 1242
N locations 86
chekhov-tri-sestry
N utterances 758
N locations 88
ostrovsky-groza
N utterances 784
N locations 35
turgenev-holostjak
N utterances 883
N locations 47
gogol-revizor
N utterances 927
N locations 63
ostrovsky-beshenye-dengi
N utterances 1214
N locations 74
span


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

clarin-teresa
N utterances 320
N locations 9
dicenta-juan-jose
N utterances 863
N locations 7
echegaray-arrastrarse
N utterances 1599
N locations 51
echegaray-mancha
N utterances 1313
N locations 8
galdos-casandra
N utterances 997
N locations 20
galdos-electra
N utterances 1588
N locations 36
galdos-perfecta
N utterances 1165
N locations 34
lorca-bernarda
N utterances 906
N locations 3
lorca-bodas
N utterances 876
N locations 0
lorca-rosita
N utterances 749
N locations 32
lorca-yerma
N utterances 658
N locations 2
lorca-zapatera
N utterances 532
N locations 11
munoz-conferencia
N utterances 251
N locations 9
munoz-ortiz
N utterances 1218
N locations 61
munoz-pergaminos
N utterances 1508
N locations 43
munoz-refugio
N utterances 1361
N locations 98
unamuno-esfinge
N utterances 817
N locations 3
unamuno-fedra
N utterances 663
N locations 1
valera-asclepigenia
N utterances 193
N locations 34
valera-atahualpa
N utterances 371
N locations 58
valle-aguila
N utterances 1206
N locations 27
val

In [None]:
!zip -r corpus.zip corpus

# Test that only \<loc\> tags have been added

In [49]:
from difflib import SequenceMatcher

# For every play, compare the downloaded XML with the annotated file after
# removing whitespace, the XML declaration / stylesheet instruction and the
# <loc> tags: the two must then be identical.
for corpusname, corpus in playtexts.items():
  for playname, play_xml in corpus.items():
    xml_path = os.path.join('corpus', corpusname, f'{playname}.xml')
    # the annotated files carry a utf-8 XML declaration, so read them as utf-8
    # instead of relying on the platform's default encoding
    with open(xml_path, encoding='utf-8') as f:
      parsed_play = f.read()
    
    parsed_play = parsed_play.replace('<?xml version=\'1.0\' encoding=\'utf-8\'?>', '')
    parsed_play = SPACE_REGEX.sub('', parsed_play)
    parsed_play = re.sub(r'</?loc>', '', parsed_play)

    play_xml = play_xml.replace(r'<?xml-stylesheet type="text/css" href="../css/tei.css"?>', '')
    play_xml = SPACE_REGEX.sub('', play_xml)
    print(playname, play_xml == parsed_play)

    # Optional debugging aid: show the surroundings of the differences.
    # sm = SequenceMatcher(None, play_xml, parsed_play)
    # prev_a = 0
    # prev_b = 0
    # for match in sm.get_matching_blocks():
    #   # print(match.size)
    #   if match.size:
    #     print(play_xml[prev_a-10:match.a+10], parsed_play[prev_b-10:match.b+10])
    #     prev_a += match.size
    #     prev_b += match.size

bulgakov-beg True
andreyev-mysl True
bulgakov-zojkina-kvartira True
chekhov-vishnevyi-sad True
ostrovsky-bespridannitsa True
chekhov-tri-sestry True
ostrovsky-groza True
turgenev-holostjak True
gogol-revizor True
ostrovsky-beshenye-dengi True
clarin-teresa True
dicenta-juan-jose True
echegaray-arrastrarse True
echegaray-mancha True
galdos-casandra True
galdos-electra True
galdos-perfecta True
lorca-bernarda True
lorca-bodas True
lorca-rosita True
lorca-yerma True
lorca-zapatera True
munoz-conferencia True
munoz-ortiz True
munoz-pergaminos True
munoz-refugio True
unamuno-esfinge True
unamuno-fedra True
valera-asclepigenia True
valera-atahualpa True
valle-aguila True
valle-cara True
valle-divinas-palabras True
valle-luces True
valle-romance True
