# Parse texts

In [None]:
!pip install stanza
!pip install lxml

In [2]:
import re
import os 
from collections import defaultdict
from time import sleep
from io import StringIO

import requests
import stanza
from lxml import etree

In [None]:
STANZA_OBJ = stanza.Pipeline(lang='es', processors='tokenize,ner')

In [4]:
API = 'https://dracor.org/api'

In [25]:
plays = {
    # 'rus': [
    #     'bulgakov-beg',
    #     'andreyev-mysl',
    #     'bulgakov-zojkina-kvartira',
    #     'chekhov-vishnevyi-sad',
    #     'ostrovsky-bespridannitsa',
    #     'chekhov-tri-sestry',
    #     'ostrovsky-groza',
    #     'turgenev-holostjak',
    #     'gogol-revizor',
    #     'ostrovsky-beshenye-dengi'
    # ],
    'span': [
        'clarin-teresa',
        'dicenta-juan-jose',
        'echegaray-arrastrarse',
        'echegaray-mancha',
        'galdos-casandra',
        'galdos-electra',
        'galdos-perfecta',
        'lorca-bernarda',
        'lorca-bodas',
        'lorca-rosita'
    ]
}

In [26]:
playtexts = defaultdict(dict)

for corpusname, playlist in plays.items():
  for playname in playlist:
    endpoint = f'{API}/corpora/{corpusname}/play/{playname}/tei'
    text = requests.get(endpoint).text
    playtexts[corpusname][playname] = text
    sleep(1)

In [50]:
NAMESPACE = {'ns': 'http://www.tei-c.org/ns/1.0'}
TURN_XPATH = '//ns:sp'
TEXT_XPATH = './ns:p'
LOC_XPATH = '//loc'

SPACE_REGEX = re.compile(r'\s+')

def get_text_parts(p_tag):
  parts = []
  part_idxs = []

  if p_tag.text:
    parts.append(p_tag.text)
  
  for child in p_tag:
    parts.append(child)
    if child.tail:
      parts.append(child.tail)
  
  return parts


def compile_relevant_text(parts):
    selected_parts = []
    selected_part_idxs = []
    selected_idxs_recalc = []
    prev_end_idx = 0

    for i, part in enumerate(parts):
      if not isinstance(part, str):
        if part.tag != 'stage':
          preproc = SPACE_REGEX.sub(' ', part.text)
          selected_parts.append(preproc)
          prev_end_idx = prev_end_idx + len(preproc)
        continue

      preproc = SPACE_REGEX.sub(' ', part)
      selected_parts.append(preproc)
      selected_part_idxs.append(i)
      selected_idxs_recalc.append((prev_end_idx, prev_end_idx + len(preproc)))
      prev_end_idx = prev_end_idx + len(preproc)

    return ''.join(selected_parts), selected_part_idxs, selected_idxs_recalc


def create_elems_from_ner(text, ner_output, all_parts, selected_part_idxs, selected_idxs_recalc):
  locations = [ent for ent in ner_output.ents if ent.type == 'LOC']

  locations_by_part = [[] for _ in selected_idxs_recalc]
  curr_part_idx = 0

  for location in locations:
    while location.start_char >= selected_idxs_recalc[curr_part_idx][1]:
      curr_part_idx += 1

    part_start, part_end = selected_idxs_recalc[curr_part_idx]
    if part_start <= location.start_char and location.end_char < part_end:
      locations_by_part[curr_part_idx].append(location)

  for location_lst, (part_start, part_end), part_idx in zip(locations_by_part, selected_idxs_recalc, selected_part_idxs):
    part_text = text[part_start:part_end]
    first_tail = part_text  # for the case with no locations
    parsed_elems = []
    prev_end_idx = 0

    for ent in location_lst:
      ent_start = ent.start_char - part_start
      ent_end = ent.end_char - part_start
      loc_elem = etree.fromstring(f'<loc>{ent.text}</loc>')

      if not parsed_elems:
        first_tail = part_text[:ent_start]
      else:
        parsed_elems[-1].tail = part_text[prev_end_idx:ent_start]
      
      parsed_elems.append(loc_elem)
      prev_end_idx = ent_end

    if parsed_elems:
      parsed_elems[-1].tail = part_text[prev_end_idx:]

    if first_tail:
      parsed_elems.insert(0, first_tail)
    all_parts[part_idx] = parsed_elems

  return all_parts


def recompile_parts(all_parts):
  flat_parts = []
  for partgroup in all_parts:
    if not isinstance(partgroup, list):
      partgroup = [partgroup]

    for i, part in enumerate(partgroup):
      if not i and isinstance(part, str):
        if not flat_parts:
          flat_parts.append(part)
        else:
          flat_parts[-1].tail = part
  
      else:
        flat_parts.append(part)

  return flat_parts


def parse_ner_in_turn(p_tags):
  turn_parsed_elems = []
  for p_tag in p_tags:
    all_parts = get_text_parts(p_tag)
    text, selected_part_idxs, selected_idxs_recalc = compile_relevant_text(all_parts)

    if not text:
      turn_parsed_elems.append([text])
      continue

    ner_output = STANZA_OBJ(text)
    all_parts = create_elems_from_ner(text, ner_output, all_parts, selected_part_idxs, selected_idxs_recalc)
    all_parts = recompile_parts(all_parts)
    turn_parsed_elems.append(all_parts)

  return turn_parsed_elems


def replace_text_in_tag(parsed_texts, p_tags):
  for parsed_elems, p_tag in zip(parsed_texts, p_tags):
    old_children = p_tag.getchildren()
    for child in old_children:
      p_tag.remove(child)
    p_tag.text = ''

    elem_start_idx = 0
    if isinstance(parsed_elems[0], str):
      p_tag.text = parsed_elems[0]
      elem_start_idx = 1
    
    for elem in parsed_elems[elem_start_idx:]:
      p_tag.append(elem)


def parse_ner_in_play(xml_path, xml_text):
  tree = etree.parse(StringIO(xml_text))
  turns = tree.xpath(TURN_XPATH, namespaces=NAMESPACE)
  print('N turns', len(turns))

  for turn in turns:
    p_tags = turn.xpath(TEXT_XPATH, namespaces=NAMESPACE)
    parsed_texts = parse_ner_in_turn(p_tags)
    replace_text_in_tag(parsed_texts, p_tags)
  
  locs = tree.xpath(LOC_XPATH, namespaces=NAMESPACE)
  print('N locations', len(locs))
  
  xml_str = etree.tostring(
      tree,
      pretty_print=True, 
      encoding='utf-8', 
      xml_declaration=True
  ).decode('utf-8')
  
  with open(xml_path, 'w') as f:
    f.write(xml_str)


In [51]:
os.makedirs('corpus', exist_ok=True)

for corpusname, corpus in playtexts.items():
  print(corpusname)
  os.makedirs(os.path.join('corpus', corpusname), exist_ok=True)
  for playname, play_xml in corpus.items():
    print(playname)
    xml_path = os.path.join('corpus', corpusname, f'{playname}.xml')
    parse_ner_in_play(xml_path, play_xml)

span
clarin-teresa
N turns 320
N locations 9
dicenta-juan-jose
N turns 863
N locations 7
echegaray-arrastrarse
N turns 1599
N locations 51
echegaray-mancha
N turns 1313
N locations 8
galdos-casandra
N turns 997
N locations 20
galdos-electra
N turns 1588
N locations 36
galdos-perfecta
N turns 1165
N locations 34
lorca-bernarda
N turns 906
N locations 3
lorca-bodas
N turns 876
N locations 0
lorca-rosita
N turns 749
N locations 32


# Test that only \<loc\> tags have been added

In [52]:
from difflib import SequenceMatcher

for corpusname, corpus in playtexts.items():
  for playname, play_xml in corpus.items():
    xml_path = os.path.join('corpus', corpusname, f'{playname}.xml')
    with open(xml_path) as f:
      parsed_play = f.read()
    
    parsed_play = parsed_play.replace('<?xml version=\'1.0\' encoding=\'utf-8\'?>', '')
    parsed_play = SPACE_REGEX.sub('', parsed_play)
    parsed_play = re.sub(r'</?loc>', '', parsed_play)

    play_xml = play_xml.replace(r'<?xml-stylesheet type="text/css" href="../css/tei.css"?>', '')
    play_xml = SPACE_REGEX.sub('', play_xml)
    print(playname, play_xml == parsed_play)

    # sm = SequenceMatcher(None, play_xml, parsed_play)
    # prev_a = 0
    # prev_b = 0
    # for match in sm.get_matching_blocks():
    #   # print(match.size)
    #   if match.size:
    #     print(play_xml[prev_a-10:match.a+10], parsed_play[prev_b-10:match.b+10])
    #     prev_a += match.size
    #     prev_b += match.size

clarin-teresa True
dicenta-juan-jose True
echegaray-arrastrarse True
echegaray-mancha True
galdos-casandra True
galdos-electra True
galdos-perfecta True
lorca-bernarda True
lorca-bodas True
lorca-rosita True
