In [41]:
import json
import re
import os
import itertools
from transformers import ReformerModelWithLMHead

import torch
import random
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Encoding
def encode(list_of_strings, pad_token_id=0, device="cpu"):
    max_length = max([len(string) for string in list_of_strings])

    # create emtpy tensors
    attention_masks = torch.zeros((len(list_of_strings), max_length), dtype=torch.long)
    input_ids = torch.full((len(list_of_strings), max_length), pad_token_id, dtype=torch.long)

    for idx, string in enumerate(list_of_strings):
        # make sure string is in byte format
        if not isinstance(string, bytes):
            string = str.encode(string)

        input_ids[idx, :len(string)] = torch.tensor([x + 2 for x in string])
        attention_masks[idx, :len(string)] = 1

    return input_ids.to(device), attention_masks.to(device)
    
# Decoding
def decode(outputs_ids):
    decoded_outputs = []
    for output_ids in outputs_ids.tolist():
        # transform id back to char IDs < 2 are simply transformed to ""
        decoded_outputs.append("".join([chr(x - 2) if x > 1 else "" for x in output_ids]))
    return decoded_outputs

model = ReformerModelWithLMHead.from_pretrained("google/reformer-enwik8").to(device)
prompt = "Something was created in the year [["
prompt = "The editor is named [["

encoded, attention_masks = encode([prompt], device=device)
decode(model.generate(encoded, do_sample=True, num_return_sequences=10, max_length=50))

['The editor is named [[Samuel Howard Taylor]]. On S',
 'The editor is named [[Herbert Brown (author)|Herbe',
 'The editor is named [[Yongdan Chenguon]].\n\n== See ',
 'The editor is named [[Merril Paul]] after him. A f',
 'The editor is named [[Walter Walter]]. Prihe died ',
 'The editor is named [[Alexandra Bulisc (composer)|',
 'The editor is named [[Prince Charles Concord, Duke',
 'The editor is named [[Richard RÃ©ditor (class war ',
 'The editor is named [[Justine Debbie]] and 26 of t',
 'The editor is named [[Robert Grouse]].\n\n* Several ']

In [3]:
def set_seed(seed=0):
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    

In [30]:
def extract_entity(text):
    regex_pattern = r'\[\[([^\[|\]]+?)(?:\([^)]*\))?(?:\|[^|\]]+)*\]\]'
    # regex_pattern = r'\[\[([^\[|\]]+?)(?:\([^)]*\))?(?:\|([^|\]#]+))*\]\]'
    # regex_pattern = r'\[\[([^|\]#]+?)(?:\([^)]*\))?(?:\|([^|\]#]+))*\]\]'

    match = re.search(regex_pattern, text)

    if match:
        result = match.group(1).strip()
        return result
    
    return None

In [34]:
print(extract_entity("This event is known as [[Hansenschaft al Goten#sdf|Hansenschaft]].\n\n==History==\n''Main article: [[Histor"))
print(extract_entity("This event is known as [[Hansenschaft al Goten(disa)|Hansenschaft]].\n\n==History==\n''Main article: [[Histor"))

Hansenschaft al Goten#sdf
Hansenschaft al Goten


In [6]:
entity_types_to_prompts = {
    "http://schema.org/CreativeWork": "The creative work called [[",
    "http://schema.org/Event": "The event called [[",
    "http://schema.org/Intangible": "The concept called [[",
    "http://schema.org/Organization": "The organization is called [[",
    "http://schema.org/Person": "The person named [[",
    "http://schema.org/Place": "The place is named [[",
    "http://schema.org/Product": "The product is called [[",
    "http://schema.org/Taxon": "The taxon named [[",
    "http://schema.org/FictionalEntity": "The fictional entity named [[",
}
entity_types_to_fake_entities = {}

In [7]:
for et, p in entity_types_to_prompts.items():
    set_seed(0)
    print(f"***{et}***")
    encoded, attention_masks = encode([p], device=device)
    res = decode(model.generate(encoded, do_sample=True, num_return_sequences=1100, max_length=100))
    # print(json.dumps(res, indent=4))
    extracted_res = [extract_entity(s) for s in res if extract_entity(s) is not None]
    extracted_res = random.sample(extracted_res, 1000)
    entity_types_to_fake_entities[et] = extracted_res
    # print(json.dumps(extracted_res, indent=4))

***http://schema.org/CreativeWork***
***http://schema.org/Event***
***http://schema.org/Intangible***
***http://schema.org/Organization***
***http://schema.org/Person***
***http://schema.org/Place***
***http://schema.org/Product***
***http://schema.org/Taxon***
***http://schema.org/FictionalEntity***


In [39]:
DATA_ROOT = "."
PRED_URI_TO_SO_PAIRS_PATH = os.path.join(DATA_ROOT, "yago_pred_uri_to_so_pairs_randomized_1k.json")
YAGO_QEC_PATH = os.path.join(DATA_ROOT, "yago_qec.json") 
YAGO_FAKE_ENTITIES_PATH = os.path.join(DATA_ROOT, "fake_entities.json") 
with open(YAGO_QEC_PATH, "r") as fp:
    yago_qec = json.load(fp)

In [49]:
print(json.dumps({k: v["entity_types"] for k, v in yago_qec.items()}, indent=4))

{
    "http://schema.org/about": [
        "http://schema.org/Event",
        "http://schema.org/CreativeWork",
        "http://schema.org/Product"
    ],
    "http://schema.org/children": [
        "http://schema.org/Person"
    ],
    "http://schema.org/director": [
        "http://schema.org/CreativeWork",
        "http://schema.org/Product"
    ],
    "reverse-http://schema.org/homeLocation": [
        "http://schema.org/Organization",
        "http://schema.org/Place"
    ],
    "reverse-http://schema.org/founder": [
        "http://schema.org/Person"
    ],
    "http://schema.org/leader": [
        "http://schema.org/Organization",
        "http://schema.org/Place"
    ],
    "reverse-http://schema.org/lyricist": [
        "http://schema.org/Person"
    ],
    "http://schema.org/manufacturer": [
        "http://schema.org/CreativeWork",
        "http://schema.org/Product"
    ],
    "http://schema.org/influencedBy": [
        "http://schema.org/Person"
    ],
    "http://schema.o

In [10]:
entity_types_to_fake_entities

{'http://schema.org/CreativeWork': ['Devorage and Planning',
  'Ptolemy',
  'checkers',
  'historical revisionism',
  'Dialectic',
  'Moon',
  'funnies',
  'meter work',
  'Da Noroada',
  'Gales Gales',
  'The element of reality',
  'time:service.',
  'Autreas Smell',
  'pregapassirin',
  'Dr. Marginery',
  'shaological creationism',
  'Hilbert',
  'odiana nature',
  'Draging',
  'Francis Grayson-Guler',
  'Neu-O Day',
  'Indonesia',
  'dualism',
  'Wiccan',
  "Freeman's Globe",
  'The Hubble White World',
  'the Seven Awakes of the Discovery of Excellence',
  'The Creative Works of Arube the Strauss',
  'Dr. Martin Boom',
  'Jerem Lowe',
  'creative complex',
  'denset metalogy',
  'Vertebra',
  'Blue the Apple',
  'Creative Communication',
  'Mineralism',
  'Mithah',
  'Robert G. Gordon',
  'Process',
  'Darwin Forkes',
  'Valkyrie',
  'The Market Whise Fire',
  'blending',
  "Devil's Feathers",
  'Libsations',
  'Creative Heavenly Forces',
  'Huffman zovigina tovigina',
  'logs',
  

In [37]:
with open(YAGO_FAKE_ENTITIES_PATH, "w", encoding='utf-8') as fp:
    json.dump(entity_types_to_fake_entities, fp, ensure_ascii=False, indent=4)

In [42]:
for k, v in yago_qec.items():
    entity_types = yago_qec[k]["entity_types"]
    eligible_fake_entities = list(itertools.chain.from_iterable([entity_types_to_fake_entities[et] for et in entity_types]))
    yago_qec[k]["fake_entities"] = random.sample(eligible_fake_entities, len(yago_qec[k]["entities"]))

In [50]:
yago_qec['http://schema.org/editor']["fake_entities"]

['old contrast',
 'Flasderster',
 'Motivation',
 'time',
 'Compact Disc',
 'Hermitima',
 'amphorescents',
 'Poaceae',
 'analogue testing',
 'free motion',
 'Gratiful Depression',
 'Death of the Murder',
 'Jeb Callaghan',
 'Pritching the Telephone',
 'product butter',
 'syntnesis',
 'Adolf Hitler',
 'peer-to-peer',
 'Librevians',
 'powder',
 'Callaf',
 'nuclear arrival',
 'History of the Americas#History of the Americas',
 'Vita Crevi',
 'PPP',
 'porces',
 'connectivity',
 'Agagiwa Thermodynamic',
 'Russian Russia',
 'Lecies',
 'well-worthiness',
 'Valkyrie',
 'Tequin',
 'Tilturgun',
 'Diemen Betham atatii',
 'William Wilkinsot Mikhailov',
 'Don an Award',
 'Blue Works',
 'Andromon',
 'voiceless consonant',
 'photous',
 'Connecticut',
 'Pelry and the Travelsable Works',
 'the fifth film',
 'fuzzy fuzzy',
 'The Southern Participation of the Barbarian',
 'Compact Disc',
 'deciding sheet',
 'Stormsryke',
 'Service aligning',
 'equity show',
 'most buyed organization',
 'Divinity Divinity',

In [51]:
with open(YAGO_QEC_PATH, "w", encoding='utf-8') as fp:
    json.dump(yago_qec, fp, ensure_ascii=False, indent=4)