Papers: [GENRE](https://arxiv.org/abs/2010.00904), [mGENRE](https://arxiv.org/abs/2103.12528)

Code: [GitHub](https://github.com/facebookresearch/GENRE)

# Download models

In [None]:
! wget https://dl.fbaipublicfiles.com/GENRE/fairseq_multilingual_entity_disambiguation.tar.gz
! tar -xvf fairseq_multilingual_entity_disambiguation.tar.gz

! wget http://dl.fbaipublicfiles.com/GENRE/fairseq_e2e_entity_linking_aidayago.tar.gz
! tar -xvf fairseq_e2e_entity_linking_aidayago.tar.gz

! wget http://dl.fbaipublicfiles.com/GENRE/titles_lang_all105_marisa_trie_with_redirect.pkl

# Install and debug packages

The fixes applied below are described in [this GitHub issue](https://github.com/facebookresearch/GENRE/issues/66).

In [None]:
# Environment setup. The commands mix relative paths (`fairseq`, `GENRE`,
# `examples_dir`) with absolute `/content/...` paths, so they only line up when
# the notebook's working directory is /content.
# NOTE(review): the `python3.7` dist-packages paths below are hard-coded --
# adjust them if the runtime uses a different Python version.

# remove locally installed `examples` package to avoid import error
! rm -rf /usr/local/lib/python3.7/dist-packages/examples

# Clone a specific branch of a fairseq fork.
# NOTE(review): the branch name suggests it carries a `prefix_allowed_tokens_fn`
# fix -- see the linked GitHub issue to confirm.
! git clone --branch fixing_prefix_allowed_tokens_fn https://github.com/nicola-decao/fairseq

# remove the bugged lines in this fairseq version
# (deletes lines 26-27 of registry.py in place; the line numbers are tied to the
# current state of the branch -- re-check them if the branch changes)
! sed -i -e '26,27d' /content/fairseq/fairseq/registry.py

# Install the patched checkout. Each `!` line runs in its own subshell, so the
# `cd` only affects this one line.
! cd fairseq && pip install  ./

# further path fixes
# Copy fairseq's `examples` directory into /content/examples_dir (created here and
# appended to sys.path in a later cell).
! mkdir -p examples_dir
! cp -r /content/fairseq/examples/ /content/examples_dir
# Copy the speech_to_text `modules` directory from the checkout into the installed
# fairseq package.
# NOTE(review): presumably the pip install above does not ship it -- confirm.
! cp -r /content/fairseq/fairseq/models/speech_to_text/modules /usr/local/lib/python3.7/dist-packages/fairseq/models/speech_to_text

# Clone and install GENRE itself.
!git clone https://github.com/facebookresearch/GENRE.git
! cd GENRE && pip install ./

# Extra packages installed for this notebook.
# NOTE(review): presumably sentencepiece is for tokenization and marisa_trie for
# the pickled trie -- confirm.
! pip install sentencepiece marisa_trie

In [3]:
import sys

# Make two directories importable, in this order:
#   1. the cloned GENRE sources (a later cell imports `fairseq_model` from here)
#   2. the directory holding the copied fairseq `examples` tree
sys.path.extend(['/content/GENRE/genre', '/content/examples_dir'])

# Run disambiguation model

In [4]:
import pickle

from fairseq_model import mGENRE
from genre.trie import Trie, MarisaTrie

In [5]:
# Load the multilingual entity-disambiguation checkpoint from the directory
# extracted by the download cell, then call `.eval()` on it before inference.
MGENRE_MODEL = (
    mGENRE
    .from_pretrained("fairseq_multilingual_entity_disambiguation")
    .eval()
)

In [6]:
# memory efficient but slower prefix tree (trie) -- it is implemented with `marisa_trie`
# SECURITY NOTE: `pickle.load` can execute arbitrary code while deserializing.
# Only load this file if it is the one fetched by the download cell from a source
# you trust; never point this at a pickle of unknown origin.
# NOTE(review): unpickling presumably needs the trie classes from `genre.trie`
# to be importable -- confirm before removing that import.
with open("titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
    TRIE = pickle.load(f)

In [7]:
# The same Dutch passage three times; each copy wraps a different mention in
# `[START] ... [END]` markers, giving one disambiguation query per list entry.
texts = [
    '[START] Alexander [END] werd geboren in Pella, in Macedonië, 21 juli 356 v.Chr. Hij was de zoon van de Macedonische koning Philippus II, van de dynastie van de Argeaden.',
    'Alexander werd geboren in Pella, in [START] Macedonië [END] , 21 juli 356 v.Chr. Hij was de zoon van de Macedonische koning Philippus II, van de dynastie van de Argeaden.',
    'Alexander werd geboren in Pella, in Macedonië, 21 juli 356 v.Chr. Hij was de zoon van de Macedonische koning [START] Philippus II [END] , van de dynastie van de Argeaden.'
]

In [8]:
%%time

predictions = MGENRE_MODEL.sample(
  texts,
  prefix_allowed_tokens_fn=lambda batch_id, sent: [
      e for e in TRIE.get(sent.tolist()) 
      if e < len(MGENRE_MODEL.task.target_dictionary)
  ]
)

  beams_buf = indices_buf // vocab_size
  unfin_idx = idx // beam_size


CPU times: user 8.89 s, sys: 142 ms, total: 9.04 s
Wall time: 9.21 s


In [9]:
# One inner list per input text; each entry holds a candidate 'text' and its 'score'.
predictions

[[{'text': 'Alexander de Grote >> nl', 'score': tensor(-0.1351)},
  {'text': 'Alexander I van Macedonië >> nl', 'score': tensor(-0.4823)},
  {'text': 'Alexander van Macedonië >> nl', 'score': tensor(-0.5138)},
  {'text': 'Alexander II van Macedonië >> nl', 'score': tensor(-0.6219)},
  {'text': 'Alexander I van Makedonië >> af', 'score': tensor(-2.4315)}],
 [{'text': 'Macedonië (oudheid) >> nl', 'score': tensor(-0.0815)},
  {'text': 'Macedonië (Romeinse provincie) >> nl', 'score': tensor(-0.4778)},
  {'text': 'Macedonië (regio) >> nl', 'score': tensor(-0.4992)},
  {'text': 'Macedonië (Griekenland) >> nl', 'score': tensor(-0.5649)},
  {'text': 'Noord-Macedonië >> nl', 'score': tensor(-0.8881)}],
 [{'text': 'Philippus II van Macedonië >> nl', 'score': tensor(-0.0894)},
  {'text': 'Philippos II van Macedonië >> nl', 'score': tensor(-0.6956)},
  {'text': 'Philippus I van Macedonië >> nl', 'score': tensor(-0.8157)},
  {'text': 'Philippe II de Macédoine >> fr', 'score': tensor(-0.8543)},
  {'

# Run end-to-end EL model (for English only)

In [10]:
from fairseq_model import GENRE
from genre.entity_linking import get_end_to_end_prefix_allowed_tokens_fn_fairseq as get_prefix_allowed_tokens_fn
from genre.utils import get_entity_spans_fairseq as get_entity_spans

In [None]:
# Load the end-to-end entity-linking checkpoint from the directory extracted by
# the download cell, then call `.eval()` on it before inference.
GENRE_MODEL = (
    GENRE
    .from_pretrained("fairseq_e2e_entity_linking_aidayago")
    .eval()
)

In [12]:
%%time

sentences = [
    "Alexander III was born in Pella, the capital of the Kingdom of Macedon, on the sixth day of the ancient Greek month of Hekatombaion."
]

# bug: need to prepend a space before a sentence
sentences = [f' {sent}' for sent in sentences]

prefix_allowed_tokens_fn = get_prefix_allowed_tokens_fn(GENRE_MODEL, sentences)

predictions = GENRE_MODEL.sample(
    sentences,
    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
)

  beams_buf = indices_buf // vocab_size


CPU times: user 12.1 s, sys: 91.2 ms, total: 12.1 s
Wall time: 12.1 s


  unfin_idx = idx // beam_size


In [13]:
# Each candidate 'text' repeats the input with mentions annotated as
# `{ mention } [ entity ]`, alongside its 'score'.
predictions

[[{'text': ' { Alexander III } [ Alexander III of Macedon ] was born in { Pella } [ Pella ], the capital of the { Kingdom of Macedon } [ Macedonia (ancient kingdom) ], on the sixth day of the ancient { Greek } [ Greece ] month of Hekatombaion.',
   'score': tensor(-0.6428)},
  {'text': ' { Alexander III } [ Alexander III of Macedon ] was born in { Pella } [ Pella ], the capital of the { Kingdom of Macedon } [ Macedonia (ancient kingdom) ], on the sixth day of the { ancient Greek } [ Ancient Greece ] month of Hekatombaion.',
   'score': tensor(-0.6429)},
  {'text': ' { Alexander } [ Alexander III of Macedon ] III was born in { Pella } [ Pella ], the capital of the { Kingdom of Macedon } [ Macedonia (ancient kingdom) ], on the sixth day of the ancient { Greek } [ Greece ] month of Hekatombaion.',
   'score': tensor(-0.6510)},
  {'text': ' { Alexander } [ Alexander III of Macedon ] III was born in { Pella } [ Pella ], the capital of the { Kingdom of Macedon } [ Macedonia (ancient kingdom)