## Installation and downloads
The original debugging notebook is available [here](https://colab.research.google.com/drive/1Lx9pIxX0JYOGFG0Aoe39qW4N5vmOV74G?usp=sharing).



In [None]:
! wget https://dl.fbaipublicfiles.com/GENRE/fairseq_multilingual_entity_disambiguation.tar.gz
! tar -xvf fairseq_multilingual_entity_disambiguation.tar.gz

! wget http://dl.fbaipublicfiles.com/GENRE/titles_lang_all105_marisa_trie_with_redirect.pkl

In [None]:
# remove locally installed `examples` package to avoid import error
# NOTE(review): the path hard-codes the python3.7 dist-packages directory;
# adjust it if the runtime uses a different Python version.
! rm -rf /usr/local/lib/python3.7/dist-packages/examples

# Clone the fairseq fork on the `fixing_prefix_allowed_tokens_fn` branch.
! git clone --branch fixing_prefix_allowed_tokens_fn https://github.com/nicola-decao/fairseq

# remove the bugged lines in this fairseq version
# NOTE(review): this deletes lines 26-27 of registry.py by position, so it is
# only correct for the exact revision that was cloned above.
! sed -i -e '26,27d' /content/fairseq/fairseq/registry.py

In [None]:
# Install the cloned (and patched) fairseq checkout from source.
! cd fairseq && pip install  ./

In [None]:
# further path fixes

# Copy fairseq's `examples` directory into /content/examples_dir, which is
# appended to `sys.path` further down in this notebook.
! mkdir -p examples_dir
! cp -r /content/fairseq/examples/ /content/examples_dir
! ls /content/examples_dir

# Copy the speech_to_text `modules` directory from the checkout into the
# installed fairseq package.
# NOTE(review): the destination hard-codes the python3.7 dist-packages path.
! cp -r /content/fairseq/fairseq/models/speech_to_text/modules /usr/local/lib/python3.7/dist-packages/fairseq/models/speech_to_text

In [None]:
# Start from a clean checkout of GENRE so the cell can be re-run safely.
!rm -rf GENRE
!git clone https://github.com/facebookresearch/GENRE.git

In [None]:
# Install the GENRE checkout from source.
! cd GENRE && pip install ./

In [None]:
# Extra packages; `marisa_trie` backs the pickled title trie loaded below.
# NOTE(review): versions are not pinned.
! pip install sentencepiece marisa_trie

In [None]:
# spaCy itself is assumed to be preinstalled in the runtime (install line kept
# for reference); only the Spanish and Russian pipelines are downloaded here.
# ! pip install spacy
! python -m spacy download es_core_news_sm
! python -m spacy download ru_core_news_sm
! python -m spacy download ru_core_news_sm

# Parse

In [None]:
# Mount Google Drive so the annotated plays can be copied into the runtime.
from google.colab import drive
drive.mount('/content/drive')

In [11]:
# Copy the corpus directory `final/` from Drive into the working directory;
# the run cell below iterates over its sub-directories.
!cp -r /content/drive/MyDrive/SWT/final/ .

In [12]:
import sys
# Make `fairseq_model` (inside the GENRE checkout) and the copied fairseq
# `examples` package importable.
sys.path.append('/content/GENRE/genre')
sys.path.append('/content/examples_dir')

In [13]:
import re
import os
import pickle
from collections import defaultdict
from time import sleep

import spacy
import requests
from lxml import etree

from fairseq_model import mGENRE
from genre.trie import Trie, MarisaTrie

In [14]:
# XML namespace used by every XPath query and tag comparison in this notebook.
NAMESPACE = {'ns': 'http://www.tei-c.org/ns/1.0'}
# Every <sp> element in the document is treated as one utterance.
UTTERANCE_XPATH = '//ns:sp'
# Direct <p> children of an utterance.
TEXT_XPATH = './ns:p'
# Any <loc> descendant of an utterance.
LOC_XPATH = './/ns:loc'

# Runs of whitespace; collapsed to one space when a linking text is assembled.
SPACE_REGEX = re.compile(r'\s+')

MAX_CONTEXT_CHARS = 1000  # context limit from one side of location

# Wikipedia API endpoint template; `%s` is filled with the wiki language code.
REQUEST_URL = 'https://%s.wikipedia.org/w/api.php'
# Headers sent with every Wikipedia API request.
USER_AGENT = {'User-Agent': 'Location extractor (e.garanina@student.rug.nl)'}
# Number of titles joined into a single Wikipedia API request.
N_TITLES_PER_REQUEST = 50

# Splits a GENRE prediction of the form "<title> >> <lang>" into its two parts.
TITLE_REGEX = re.compile(r'^(.+?) >> (.+)$')
# A prediction is kept only if its score is strictly greater than this value.
GENRE_THRESHOLD = -0.65

# Wikipedia language codes whose predictions are kept.
WIKILANGS = {'ru', 'es'}

In [15]:
# Load the checkpoint directory unpacked in the download cell and switch the
# model to evaluation mode.
GENRE_MODEL = mGENRE.from_pretrained("fairseq_multilingual_entity_disambiguation").eval()

In [16]:
# memory efficient but slower prefix tree (trie) -- it is implemented with `marisa_trie`
# SECURITY NOTE: `pickle.load` executes arbitrary code contained in the file;
# only load this pickle if it came from a trusted source.
with open("titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
    TRIE = pickle.load(f)

In [17]:
# spaCy pipelines used for sentence splitting, looked up by the `lang` value
# passed to `sentence_split`. In this notebook that value is the name of a
# sub-directory of `final/`, so the keys must match those directory names
# (presumably 'span' for Spanish and 'rus' for Russian -- verify against the corpus).
SPACY_OBJS = {
    'span': spacy.load("es_core_news_sm"),
    'rus': spacy.load("ru_core_news_sm")
}

In [40]:
# Cut context of the location so that it fits into the model.

def sentence_split(lang, text):
    """
    Segment `text` into sentences with the spaCy pipeline registered for `lang`.

    Returns a pair `(sentences, sentence_idxs)`: the sentence strings sliced
    from the original `text`, and their `(start_char, end_char)` offsets.
    """
    # Hide the location markers behind filler of identical length (8 and 6
    # characters), so the splitter cannot break a sentence at a marker while
    # all character offsets stay valid for the original text.
    masked_text = text.replace('[START] ', 8 * 'a').replace(' [END]', 6 * 'a')

    doc = SPACY_OBJS[lang](masked_text)
    assert doc.has_annotation("SENT_START")

    sentences = []
    sentence_idxs = []
    for sent in doc.sents:
        start, end = sent.start_char, sent.end_char
        sentence_idxs.append((start, end))
        # slice the unmasked text so the markers are preserved in the output
        sentences.append(text[start:end])

    return sentences, sentence_idxs


def get_location_sentence_id(loc_start, loc_end, sentence_idxs):
    """
    Find the index of the sentence that contains the marked location.

    Args:
        loc_start: offset of the first character of the location span.
        loc_end: offset one past the last character of the location span
            (exclusive, as returned by `re.Match.end()`).
        sentence_idxs: `(start_char, end_char)` pairs of the sentences in
            text order; `end_char` is exclusive as well.

    Returns:
        Index into `sentence_idxs` of the sentence holding the location.

    Raises:
        IndexError: if `loc_start` lies behind the end of the last sentence.
        AssertionError: if the location runs past the end of its sentence.
    """
    loc_sentence_id = 0

    # skip every sentence that ends at or before the start of the location
    while loc_start >= sentence_idxs[loc_sentence_id][1]:
        loc_sentence_id += 1

    # The location must end inside the same sentence. Both ends are exclusive,
    # so a location that closes the sentence has loc_end == sentence end and
    # must be accepted (a strict `<` would reject it).
    assert loc_end <= sentence_idxs[loc_sentence_id][1]
    return loc_sentence_id


def find_context(sentences, loc_sentence_id, loc_span, loc_sentence_start, max_chars=None):
    """
    Decide how many sentences around the location are kept as context
    (due to the sequence length restriction in the model).

    Starting from the sentence with the location, sentences are added to the
    left and to the right independently until the number of context characters
    on that side reaches the limit; the sentence that makes a side reach the
    limit is not included.

    Args:
        sentences: list of sentence strings.
        loc_sentence_id: index of the sentence containing the location.
        loc_span: `(start, end)` offsets of the location in the whole text.
        loc_sentence_start: offset of the location's sentence in the whole text.
        max_chars: per-side character limit; defaults to `MAX_CONTEXT_CHARS`.

    Returns:
        Dict with the keys 'left' and 'right'. Each value is a dict with the
        bookkeeping fields 'offset', 'n_chars', 'increment', 'start' and the
        result field 'idx': the index of the outermost sentence to keep on
        that side.
    """
    if max_chars is None:
        max_chars = MAX_CONTEXT_CHARS

    loc_sentence = sentences[loc_sentence_id]
    n_sentences = len(sentences)

    context = {
        'left': {
            'offset': 0,  # absolute diff between current sentence idx and loc_sentence_id
            'n_chars': 0,  # number of characters in already taken sentences
            'increment': -1,  # "direction" of incrementing the indices from loc_sentence_id
            'start': loc_span[0] - loc_sentence_start  # start idx of location inside its sentence
        },
        'right': {
            'offset': 0,
            'n_chars': 0,
            'increment': 1,
            'start': loc_span[1] - loc_sentence_start  # end idx of location inside its sentence
        }
    }

    # consider right and left context separately
    for key, info in context.items():
        idx = loc_sentence_id
        stable_idx = loc_sentence_id

        # take new sentences until n_chars reaches the limit
        while info['n_chars'] < max_chars:
            stable_idx = idx  # last idx that was accepted without reaching the limit

            if info['offset'] == 0:
                # First step: count the part of the location's own sentence
                # lying on this side -- the text before the location for the
                # left context, the text after it for the right context.
                if key == 'left':
                    info['n_chars'] += len(loc_sentence[:info['start']])
                else:
                    info['n_chars'] += len(loc_sentence[info['start']:])
            else:
                # next sentence to try in this direction
                idx = loc_sentence_id + (info['offset'] * info['increment'])

                # no more sentences on this side
                if idx < 0 or idx >= n_sentences:
                    break

                info['n_chars'] += len(sentences[idx])

            # go one sentence further
            info['offset'] += 1

        # Either we ran out of sentences or the limit was reached; in both
        # cases the last accepted index is the border of the context.
        info['idx'] = stable_idx

    return context


def cut_context(text, lang):
    """
    Trim the text around the marked location to what fits the context limit.

    Texts of at most MAX_CONTEXT_CHARS characters are returned unchanged.
    Longer texts are split into sentences with spaCy, and only the sentences
    selected by `find_context` around the `[START] ... [END]` span are kept,
    joined with single spaces.
    """
    # short texts need no trimming
    if len(text) <= MAX_CONTEXT_CHARS:
        return text

    sentences, sentence_idxs = sentence_split(lang, text)

    # character span of the marked location in the text
    loc_span = re.search(r'\[START\].+?\[END\]', text).span()

    # sentence holding the location, and where that sentence starts
    loc_sentence_id = get_location_sentence_id(loc_span[0], loc_span[1], sentence_idxs)
    loc_sentence_start = sentence_idxs[loc_sentence_id][0]

    # border sentence indices of the limited context
    borders = find_context(sentences, loc_sentence_id, loc_span, loc_sentence_start)
    first_kept = borders['left']['idx']
    last_kept = borders['right']['idx']

    return ' '.join(sentences[first_kept:last_kept + 1])

In [51]:
# Get texts for locations and run GENRE inference.

def get_text_parts(p_tag, text_only_for_loc=False):
  """
  Flatten a <p> tag into a list of raw text strings and XML child elements,
  in document order.

  <note> children are expanded recursively as if they were nested <p> tags.
  If `text_only_for_loc` is True, <loc> children are replaced by their raw
  text; all other children are kept as elements.

  The returned list always starts with a single-space string, which keeps
  adjacent tags apart when the parts are concatenated.
  """
  note_tag = f'{{{NAMESPACE["ns"]}}}note'
  loc_tag = f'{{{NAMESPACE["ns"]}}}loc'

  parts = [' ']

  if p_tag.text:
    parts.append(p_tag.text)

  for child in p_tag:
    if child.tag == note_tag:
      parts += get_text_parts(child, text_only_for_loc)
    elif text_only_for_loc and child.tag == loc_tag:
      # an empty <loc/> has text None; keep the list free of None values
      parts.append(child.text or '')
    else:
      parts.append(child)

    # The tail is the text that follows the child inside the parent tag, so it
    # must be kept for every child -- including <note>, whose tail would
    # otherwise be silently dropped.
    if child.tail:
      parts.append(child.tail)

  return parts


def create_text_for_location(loc_idx, selected_parts, lang=None):
    """
    Given text parts, create the GENRE input for one location.

    The part at position `loc_idx` is wrapped in `[START] ... [END]`, all
    parts are concatenated, whitespace is normalized and the result is
    trimmed with `cut_context`.

    Args:
        loc_idx: index in `selected_parts` of the location to mark.
        selected_parts: list of text strings.
        lang: language key forwarded to `cut_context`. When omitted, the
            module-level variable `lang` is used, which is what this function
            silently relied on before it had the parameter.
    """
    if lang is None:
        # legacy call style: fall back to the notebook-level `lang` variable
        lang = globals()['lang']

    final_parts = [
        f'[START] {part} [END] ' if i == loc_idx else part
        for i, part in enumerate(selected_parts)
    ]
    whole_text = SPACE_REGEX.sub(' ', ''.join(final_parts)).strip()
    return cut_context(whole_text, lang)


def compile_relevant_texts(parts, lang):
    """
    From all parts of <p> tags (as returned by get_text_parts),
    compile a separate text for GENRE inference for each location.

    String parts are used as they are; XML elements contribute their `.text`
    (an element without text contributes an empty string). <stage> elements
    are left out, <loc> elements are the locations to link.

    Returns a list with one text per <loc> element, in document order.
    """
    stage_tag = f'{{{NAMESPACE["ns"]}}}stage'
    loc_tag = f'{{{NAMESPACE["ns"]}}}loc'

    selected_parts = []
    loc_idxs = []

    for part in parts:
        if isinstance(part, str):
            selected_parts.append(part)
            continue

        # leave out <stage> tag
        if part.tag == stage_tag:
            continue

        # remember the position of every target location
        if part.tag == loc_tag:
            loc_idxs.append(len(selected_parts))

        # elements without text have `.text is None`, which cannot be joined
        selected_parts.append(part.text or '')

    # create separate texts for each location
    return [
        create_text_for_location(loc_idx, selected_parts, lang)
        for loc_idx in loc_idxs
    ]


def get_location_texts_for_utterance(prev_p_tag, p_tags, lang):
  """
  Build the inference texts for all locations of one speaker's utterance.

  The parts of every <p> tag of the utterance are flattened into one list.
  If given, the last <p> tag of the previous utterance is put in front of it
  (with its locations reduced to plain text) to provide more left context.
  """
  if prev_p_tag is None:
    context_parts = []
  else:
    context_parts = get_text_parts(prev_p_tag, text_only_for_loc=True)

  # one flat list of parts for the whole utterance (multiple <p> tags)
  utterance_parts = []
  for p_tag in p_tags:
    utterance_parts.extend(get_text_parts(p_tag))

  return compile_relevant_texts(context_parts + [' '] + utterance_parts, lang)


def get_location_texts_for_play(lang, tree):
  """
  Collect the GENRE inference texts for every location in a play.

  Walks over all utterances of the parsed XML `tree`. For each utterance that
  contains a location, the texts are built with the last <p> tag of the
  preceding utterance as additional left context. An utterance without <p>
  tags resets that context.
  """
  utterances = tree.xpath(UTTERANCE_XPATH, namespaces=NAMESPACE)
  print('N utterances', len(utterances))

  linking_texts = []
  prev_p = None

  for utterance in utterances:
    p_tags = utterance.xpath(TEXT_XPATH, namespaces=NAMESPACE)

    if p_tags:
      has_locations = utterance.xpath(LOC_XPATH, namespaces=NAMESPACE)
      if has_locations:
        linking_texts.extend(
            get_location_texts_for_utterance(prev_p, p_tags, lang)
        )
      prev_p = p_tags[-1]
    else:
      # nothing to use as left context for the next utterance
      prev_p = None

  return linking_texts


def link_locations_in_play(linking_texts):
  """
  Run GENRE inference, return texts with linked locations.

  Args:
      linking_texts: list of input texts, one per location.

  Returns:
      List of `[text, predictions]` pairs in input order. `predictions` is
      what the model returned for that text, with every 'score' converted to
      a plain float so the result can be serialized to JSON.
      An empty input yields an empty list without calling the model.
  """
  # plays without locations: nothing to infer
  if not linking_texts:
    return []

  # constant during sampling, so compute it once instead of per decoding step
  vocab_size = len(GENRE_MODEL.task.target_dictionary)

  def allowed_tokens(batch_id, sent):
    # continuations permitted by the title trie, restricted to known tokens
    return [e for e in TRIE.get(sent.tolist()) if e < vocab_size]

  # run inference
  linked_locations = GENRE_MODEL.sample(
    linking_texts,
    prefix_allowed_tokens_fn=allowed_tokens
  )

  # prepare links for json serialization
  for loc_group in linked_locations:
    for loc in loc_group:
      loc['score'] = float(loc['score'])

  return [
      [text, locs]
      for text, locs in zip(linking_texts, linked_locations)
  ]


def process_play(lang, xml_path):
  """
  The main function.
  Get texts for each location in the play
  and run GENRE on them.

  Args:
      lang: language key of the play, forwarded to the text extraction.
      xml_path: path to the XML file of the play.

  Returns:
      The result of `link_locations_in_play` for the play.
  """
  # Open in binary mode so the XML parser determines the encoding itself
  # instead of receiving text decoded with the platform default encoding.
  with open(xml_path, 'rb') as f:
    tree = etree.parse(f)

  linking_texts = get_location_texts_for_play(lang, tree)
  linking_results = link_locations_in_play(linking_texts)
  return linking_results

In [44]:
def parse_title(raw_title):
    """Split a GENRE prediction string into its (title, language) pair."""
    match = TITLE_REGEX.search(raw_title)
    return match.group(1), match.group(2)


def get_unique_titles(location_candidates):
    """
    Gather the distinct wiki titles predicted by GENRE, grouped by language.

    Only predictions whose language is in WIKILANGS are collected.
    Returns a defaultdict mapping language -> set of titles.
    """
    titles_by_lang = defaultdict(set)

    # flatten plays -> (text, predictions) pairs -> single predictions
    predictions = (
        prediction
        for play_locations in location_candidates.values()
        for _text, loc_group in play_locations
        for prediction in loc_group
    )

    for prediction in predictions:
        title, lang = parse_title(prediction['text'])
        if lang not in WIKILANGS:
            continue
        titles_by_lang[lang].add(title)

    return titles_by_lang


def run_request(lang, titles, timeout=30):
  """
  Request Wikipedia API by Wikipedia titles.

  Args:
      lang: language code inserted into REQUEST_URL.
      titles: the titles to look up, joined with '|'.
      timeout: seconds to wait for the server before giving up, so that a
          stalled connection cannot hang the notebook forever.

  Returns:
      The decoded JSON response.

  Raises:
      requests.HTTPError: if the server answers with an error status.
      requests.Timeout: if the server does not answer within `timeout`.
  """
  response = requests.get(
      REQUEST_URL % lang,
      params={
          'action': 'query',
          'prop': 'pageprops|info',
          'ppprop': 'wikibase_item',
          'inprop': 'url',
          'redirects': 1,
          'titles': titles,
          'format': 'json'
      },
      headers=USER_AGENT,
      timeout=timeout
  )
  # fail with a clear HTTP error instead of an obscure JSON decoding error
  response.raise_for_status()
  return response.json()


def query_wikipedia_unique(titles):
    """
    Run Wikipedia API requests for all unique titles returned by GENRE.
    Retrieve Wikidata ID and Wikipedia URL.

    Args:
        titles: mapping language code -> collection of titles.

    Returns:
        Dict mapping title -> {'wikidata_id': ..., 'url': ...}. Besides the
        titles of the found pages, it also contains the requested titles that
        the API reported as normalized or redirected to a found page, so a
        lookup by the originally requested title succeeds. Pages that are
        missing or have no Wikidata item are left out.

    NOTE(review): the result is keyed by title only, so identical titles in
    two languages share one entry.
    """
    links_by_title = {}

    for lang, title_set in titles.items():
        title_list = list(title_set)

        # query by multiple titles at once
        for i in range(0, len(title_list), N_TITLES_PER_REQUEST):
            curr_titles = title_list[i:i + N_TITLES_PER_REQUEST]
            response = run_request(lang, '|'.join(curr_titles))
            query = response.get('query', {})

            # pages found in this batch, keyed by their canonical title
            batch_links = {}
            for page_id, page in query.get('pages', {}).items():
                # negative ids mark missing / invalid titles
                if page_id.startswith('-'):
                    continue

                wikidata_id = page.get('pageprops', {}).get('wikibase_item')
                # pages without a Wikidata item cannot be linked
                if wikidata_id is None or 'fullurl' not in page:
                    continue

                batch_links[page['title']] = {
                    'wikidata_id': wikidata_id,
                    'url': page['fullurl']
                }

            # Map requested titles onto the pages they resolved to. Redirects
            # are handled before normalizations because a normalized title may
            # itself be the source of a redirect.
            batch_aliases = {}
            for mapping_key in ('redirects', 'normalized'):
                for mapping in query.get(mapping_key, []):
                    target = mapping['to']
                    link = batch_links.get(target) or batch_aliases.get(target)
                    if link is not None:
                        batch_aliases[mapping['from']] = link

            links_by_title.update(batch_links)
            for alias, link in batch_aliases.items():
                # never replace the entry of a real page with an alias
                links_by_title.setdefault(alias, link)

            # be polite to the API
            sleep(1)

    return links_by_title


def reformat_location_candidates(location_candidates, links_by_title):
    """
    Filter the GENRE predictions and attach Wikidata ID and Wikipedia URL.

    A prediction is kept when its score is above GENRE_THRESHOLD, its language
    is in WIKILANGS and its title is present in `links_by_title`. Every play
    entry of `location_candidates` is replaced in place by a list of
    {'text': ..., 'scores': [(wikidata_id, score, url), ...]} dicts; the same
    mapping is returned.
    """
    for play, play_locations in list(location_candidates.items()):
        reformatted = []

        for text, loc_group in play_locations:
            scores = []

            for loc in loc_group:
                title, lang = parse_title(loc['text'])
                keep = (
                    loc['score'] > GENRE_THRESHOLD
                    and lang in WIKILANGS
                    and title in links_by_title
                )
                if not keep:
                    continue

                link = links_by_title[title]
                scores.append((link['wikidata_id'], loc['score'], link['url']))

            reformatted.append({'text': text, 'scores': scores})

        location_candidates[play] = reformatted

    return location_candidates


def get_wikidata_info(locations):
    """
    Enrich the GENRE predictions with Wikipedia / Wikidata information.

    Looks up all unique predicted titles, then filters and reformats
    the predictions using the retrieved links.
    """
    links_by_title = query_wikipedia_unique(get_unique_titles(locations))
    return reformat_location_candidates(locations, links_by_title)

In [52]:
# Run the linking pipeline over the corpus laid out as `final/<lang>/<play>.xml`.
langs = os.listdir('final')
# play name (file name without '.xml') -> linking results of that play
locations = {}

# NOTE: keep the loop variable named `lang` -- helper code in this notebook
# reads a module-level variable of that name.
for lang in langs:
  # skip hidden entries
  if lang.startswith('.'):
    continue

  corpus_dir = os.path.join('final', lang)
  for playname in os.listdir(corpus_dir):
    # only XML files are plays
    if not playname.endswith('.xml'):
    # if not playname == 'petrov-ostrov-mira.xml':
      continue

    print(playname)
    xml_path = os.path.join(corpus_dir, playname)
    play_locations = process_play(lang, xml_path)
    # strip the '.xml' suffix for the key
    locations[playname[:-4]] = play_locations
    # number of location texts processed for this play
    print(len(play_locations))

# Replace raw GENRE predictions by filtered (wikidata_id, score, url) entries.
locations = get_wikidata_info(locations)

valle-luces.xml
N utterances 1174


  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


71
valle-romance.xml
N utterances 950
14
munoz-ortiz.xml
N utterances 1218
50
galdos-electra.xml
N utterances 1588
20
valle-cara.xml
N utterances 1186
25
valera-atahualpa.xml
N utterances 371
50
galdos-perfecta.xml
N utterances 1165
28
galdos-casandra.xml
N utterances 997
10
munoz-refugio.xml
N utterances 1361
82
echegaray-arrastrarse.xml
N utterances 1599
43
ostrovsky-beshenye-dengi.xml
N utterances 1214
54
bulgakov-zojkina-kvartira.xml
N utterances 1406
104
chekhov-tri-sestry.xml
N utterances 758
67
ostrovsky-groza.xml
N utterances 784
20
ostrovsky-bespridannitsa.xml
N utterances 1242
52
turgenev-holostjak.xml
N utterances 883
33
bulgakov-beg.xml
N utterances 821
150
gogol-revizor.xml
N utterances 927
45
chekhov-vishnevyi-sad.xml
N utterances 634
36
petrov-ostrov-mira.xml
N utterances 636
133


In [53]:
import json

# `ensure_ascii=False` writes non-ASCII characters verbatim, so the file
# encoding is fixed to UTF-8 instead of depending on the platform default.
with open('genre_ranking.json', 'w', encoding='utf-8') as f:
  json.dump(locations, f, ensure_ascii=False, indent=2)

In [54]:
# Copy the results file to Google Drive.
!cp genre_ranking.json /content/drive/MyDrive/SWT/

In [55]:
# Flush pending writes to Drive and unmount it.
drive.flush_and_unmount()