Parameter documentation: http://babelfy.org/guide

In [1]:
import json
from collections import defaultdict

import pandas as pd

from secrets import babelfy_key

In [3]:
import urllib
import json 

# a simple wrapper around Babelfy's HTTP API
def babelfy_request(text, key, lang='ES', annotation_type="NAMED_ENTITIES", matching='EXACT_MATCHING'):
    """POST ``text`` to Babelfy's disambiguation endpoint and return the parsed JSON.

    Args:
        text: The document to annotate (sent as the ``text`` parameter).
        key: Babelfy API key (sent as the ``key`` parameter).
        lang: Language code sent as the ``lang`` parameter.
        annotation_type: Value for the ``annType`` parameter. Only
            ``'NAMED_ENTITIES'`` is accepted by this wrapper.
        matching: Value for the ``match`` parameter; either
            ``'PARTIAL_MATCHING'`` or ``'EXACT_MATCHING'``.

    Returns:
        The JSON-decoded response body.

    Raises:
        ValueError: If ``annotation_type`` or ``matching`` is not one of the
            supported values. Raised before any network access happens.

    Errors raised by ``urlopen`` or ``json.loads`` propagate unchanged.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` and ``urllib.request`` are loaded.
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    service_url = 'https://babelfy.io/v1/disambiguate'
    matching_strategies = ('PARTIAL_MATCHING', 'EXACT_MATCHING')
    # 'ALL' and 'CONCEPTS' are deliberately left disabled in this wrapper.
    annotation_types = ('NAMED_ENTITIES',)

    # Validate with real exceptions; ``assert`` is stripped under ``python -O``.
    if annotation_type not in annotation_types:
        raise ValueError(
            f"unsupported annotation_type {annotation_type!r}; "
            f"expected one of {annotation_types}"
        )
    if matching not in matching_strategies:
        raise ValueError(
            f"unsupported matching {matching!r}; "
            f"expected one of {matching_strategies}"
        )

    params = {
        'text': text,
        'lang': lang,
        'key': key,
        'annType': annotation_type,
        'match': matching,
    }

    # POST data must be bytes.
    payload = urlencode(params).encode('utf8')
    req = Request(service_url, data=payload, method='POST')

    # Close the HTTP response deterministically once the body has been read.
    with urlopen(req) as resp:
        return json.loads(resp.read().decode('utf8'))

In [4]:
# print out Babelfy's output in a user friendly way
def display_babelfy_output(input_doc, b_output):
    """Print a numbered, minimally formatted listing of Babelfy's links.

    Args:
        input_doc: The text that was sent to Babelfy; used to recover the
            surface form of each link from its character offsets.
        b_output: Sequence of Babelfy annotation dicts, each providing
            ``charFragment`` (with ``start`` and ``end``), ``DBpediaURL``
            and ``BabelNetURL``.

    Returns:
        None. The listing is written to standard output.
    """
    print(f"Babelfy found {len(b_output)} links:")

    for position, link in enumerate(b_output, start=1):
        fragment = link['charFragment']
        start_offset = fragment['start']
        # The code adds 1 to the reported end so it can be used as a slice bound.
        end_offset = fragment['end'] + 1
        mention = input_doc[start_offset:end_offset]
        # Fall back to the BabelNet URL when the DBpedia one is empty.
        target = link['DBpediaURL'] or link['BabelNetURL']

        print(f"[{position}] {mention}，({start_offset}, {end_offset}) -> {target}")

In [5]:
# Read the corpus explicitly as UTF-8: without ``encoding`` open() falls back
# to the platform's locale encoding, which may not decode non-ASCII text.
with open('location_idxs.json', encoding='utf-8') as f:
    texts = json.load(f)

In [6]:
# Maps each top-level key of ``texts`` to the language code passed to Babelfy.
langs = dict(span='ES', rus='RU')

In [7]:
# Query Babelfy once per play and keep the raw responses, keyed first by
# corpus and then by play name.
babelfy_outputs = defaultdict(dict)
for corpus, corpus_plays in texts.items():
    for title, (play_text, _gold_locations) in corpus_plays.items():
        print(title)  # progress indicator, printed before the request is sent
        babelfy_outputs[corpus][title] = babelfy_request(
            play_text, babelfy_key, lang=langs[corpus]
        )

galdos-perfecta
valle-cara
valera-atahualpa
echegaray-arrastrarse
munoz-ortiz
munoz-refugio
valle-romance
galdos-electra
valle-luces
galdos-casandra
turgenev-holostjak
bulgakov-zojkina-kvartira
gogol-revizor
ostrovsky-beshenye-dengi
chekhov-vishnevyi-sad
ostrovsky-bespridannitsa
chekhov-tri-sestry
ostrovsky-groza
petrov-ostrov-mira
bulgakov-beg


In [8]:
# ``ensure_ascii=False`` emits non-ASCII characters verbatim, so the file must
# be opened as UTF-8; the locale default encoding may be unable to encode them.
with open('babelfy_outputs.json', 'w', encoding='utf-8') as f:
    json.dump(babelfy_outputs, f, ensure_ascii=False)

In [12]:
# Compare Babelfy's DBpedia-linked spans against the annotated location spans
# and collect one CSV row per Babelfy span, with 200 characters of context on
# each side.
rows = []

for lang, plays in texts.items():
    # gs: number of annotated location spans; tp: Babelfy spans whose
    # (start, end) offsets exactly match an annotated span.
    tp = gs = 0

    for playname, (text, locations) in plays.items():
        # (start, end) span -> third field of the annotation (stored below
        # under 'wikilink').
        location_spans = dict((tuple(loc[1]), loc[2]) for loc in locations)
        gs += len(location_spans)

        # Only annotations that carry a DBpedia URL are evaluated.
        babelfy_output = [
            x for x in babelfy_outputs[lang][playname] if x['DBpediaURL']
        ]

        for output in babelfy_output:
            st = output['charFragment']['start']
            # Add 1 to the reported end so it works as a slice bound.
            end = output['charFragment']['end'] + 1
            matched = (st, end) in location_spans
            if matched:
                tp += 1

            # Key order fixes the CSV column order.
            row = {
                'lang': lang,
                'play': playname,
                # max() keeps a negative start from wrapping around the text.
                'left': text[max(0, st - 200):st],
                'loc_text': text[st:end],
                # Slicing already clamps the upper bound to len(text).
                'right': text[end:end + 200],
                'dblink': output['DBpediaURL'],
                'is_correct_location': 1 if matched else 0,
                'is_correct_link': '' if matched else 'null',
                'is_predicted_as_location': 'null' if matched else '',
                'wikilink': location_spans[(st, end)] if matched else '',
            }
            rows.append(row)

    fn = gs - tp
    print(f'{lang}: GS {gs}, TP {tp}, FN {fn}')


df = pd.DataFrame(rows)
df.to_csv('babelfy_output.csv', index=False)

span: GS 393, TP 50, FN 343
rus: GS 694, TP 26, FN 668
