# Entity Linking with Wikidata

## Experiment with Wikidata API

See: 

* Pywikibot: https://github.com/wikimedia/pywikibot
* Member of political party (P102): https://www.wikidata.org/wiki/Property:P102

In [4]:
import pywikibot
import time

In [19]:
# QID of the Wikidata item inspected in this section.
person_qid = 'Q22686' # Donald Trump

# Open the Wikidata site, obtain its data repository, and wrap the QID in an ItemPage.
site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()
person = pywikibot.ItemPage(repo, person_qid)
# `data` is whatever ItemPage.get() returns; the following cells read its 'labels' entry.
data = person.get()

In [22]:
# NOTE: despite its name, this variable holds the item's English *label*
# (a single string, see the cell output), not its aliases.
person_aliases = data['labels']['en']
person_aliases

'Donald Trump'

In [6]:
# Dump the raw item data returned by person.get() for inspection.
print(data)

{'labels': <class 'pywikibot.page._collections.LanguageDict'>({'zh': '唐納·川普', 'pl': 'Donald Trump', 'gd': 'Donald Trump', 'es': 'Donald Trump', 'ta': 'டோனால்ட் டிரம்ப்', 'ms': 'Donald Trump', 'hu': 'Donald Trump', 'pdc': 'Donald Trump', 'sq': 'Donald Trump', 'bcl': 'Donald Trump', 'sv': 'Donald Trump', 'nl': 'Donald Trump', 'ar': 'دونالد ترامب', 'pt': 'Donald Trump', 'yi': 'דאנאלד טראמפ', 'ru': 'Дональд Трамп', 'sr': 'Доналд Трамп', 'tr': 'Donald John Trump', 'mk': 'Доналд Трамп', 'fi': 'Donald Trump', 'pfl': 'Donald Trump', 'uk': 'Дональд Трамп', 'hr': 'Donald Trump', 'da': 'Donald Trump', 'he': 'דונלד טראמפ', 'fr': 'Donald Trump', 'ko': '도널드 트럼프', 'lv': 'Donalds Tramps', 'it': 'Donald Trump', 'gl': 'Donald Trump', 'id': 'Donald Trump', 'de': 'Donald Trump', 'ja': 'ドナルド・トランプ', 'vi': 'Donald Trump', 'en': 'Donald Trump', 'sh': 'Donald Trump', 'hi': 'डोनाल्ड ट्रम्प', 'sk': 'Donald Trump', 'fy': 'Donald Trump', 'th': 'ดอนัลด์ ทรัมป์', 'ro': 'Donald Trump', 'ca': 'Donald Trump', 'fo': 'Do

In [7]:
# Check if the person has a 'member of political party' (P102) property and
# print the English label of every party that is set to a concrete item.
if 'P102' in person.claims:
    party_memberships = person.claims['P102']
    for membership in party_memberships:
        party = membership.getTarget()
        # getTarget() may yield a falsy value (no concrete party); skip those.
        if party:
            # Use .get() so a party without an English label does not raise
            # KeyError; the fallback text matches get_party_memberships().
            print(party.labels.get('en', 'No English label'))

Republican Party
Independence Party of America
Democratic Party
Republican Party
Republican Party
Reform Party of the United States of America


In [8]:
from datetime import datetime

# Function to convert pywikibot.WbTime to datetime.datetime
def wbtime_to_datetime(wbtime):
    """Convert a WbTime-like object to a naive ``datetime`` at midnight.

    Args:
        wbtime: Object exposing ``year``, ``month`` and ``day`` attributes,
            or a falsy value such as ``None``.

    Returns:
        ``datetime(year, month, day)`` where a falsy month or day (``0`` or
        ``None``) is replaced by ``1``. ``None`` is returned when `wbtime` is
        falsy or when the date cannot be represented by ``datetime`` (for
        example a year outside 1..9999).
    """
    if not wbtime:
        return None
    try:
        # A month/day of 0 would be rejected by datetime(), so default it to 1.
        return datetime(wbtime.year, wbtime.month or 1, wbtime.day or 1)
    except ValueError:
        # datetime only supports years 1..9999 and valid calendar days; treat
        # anything else as "unknown" instead of aborting the caller's loop.
        return None

# Function to get party membership data
def get_party_memberships(person):
    """Collect the 'member of political party' (P102) claims of an item.

    Args:
        person: Object with a ``claims`` mapping whose 'P102' entry is a list
            of claims; each claim offers ``getTarget()`` and ``qualifiers``.

    Returns:
        One dict per P102 claim with the keys ``'from'`` (P580 qualifier),
        ``'to'`` (P582 qualifier) and ``'party'`` (English label of the
        target). Missing qualifiers yield ``None``; a target without an
        English label yields ``'No English label'`` and a falsy target
        yields ``"Unknown Party"``. An item without P102 yields ``[]``.
    """
    if 'P102' not in person.claims:
        return []

    def qualifier_date(qualifiers, prop):
        # Only the first qualifier of the given property is considered.
        snak = qualifiers.get(prop, [None])[0]
        if not snak:
            return None
        return wbtime_to_datetime(snak.getTarget())

    records = []
    for claim in person.claims['P102']:
        target = claim.getTarget()
        claim_qualifiers = claim.qualifiers

        started = qualifier_date(claim_qualifiers, 'P580')  # start time
        ended = qualifier_date(claim_qualifiers, 'P582')    # end time

        if target:
            party_name = target.labels.get('en', 'No English label')
        else:
            party_name = "Unknown Party"

        records.append({'from': started, 'to': ended, 'party': party_name})

    return records

In [9]:
# Collect the party memberships of the item loaded above and show them.
memberships = get_party_memberships(person)
print(memberships)

[{'from': datetime.datetime(1987, 7, 1, 0, 0), 'to': datetime.datetime(1999, 10, 1, 0, 0), 'party': 'Republican Party'}, {'from': datetime.datetime(1999, 10, 1, 0, 0), 'to': datetime.datetime(2001, 8, 1, 0, 0), 'party': 'Independence Party of America'}, {'from': datetime.datetime(2001, 8, 1, 0, 0), 'to': datetime.datetime(2009, 9, 1, 0, 0), 'party': 'Democratic Party'}, {'from': datetime.datetime(2009, 9, 1, 0, 0), 'to': datetime.datetime(2011, 12, 1, 0, 0), 'party': 'Republican Party'}, {'from': datetime.datetime(2011, 12, 1, 0, 0), 'to': datetime.datetime(2012, 4, 1, 0, 0), 'party': 'Unknown Party'}, {'from': datetime.datetime(2012, 4, 1, 0, 0), 'to': None, 'party': 'Republican Party'}, {'from': None, 'to': None, 'party': 'Reform Party of the United States of America'}]


## Experimenting with mGENRE

See:

* https://huggingface.co/facebook/mgenre-wiki

In [99]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [100]:
# OPTIONAL: load the prefix tree (trie). This requires two additional downloads:
# https://huggingface.co/facebook/mgenre-wiki/blob/main/trie.py and
# https://huggingface.co/facebook/mgenre-wiki/blob/main/titles_lang_all105_trie_with_redirect.pkl
# This variant is a fast but memory-inefficient prefix tree implemented with nested python `dict`s.
# NOTE: loading this map may take up to 10 minutes and occupy a lot of RAM!
# SECURITY: pickle.load() can execute arbitrary code while unpickling; only load
# files obtained from a source you trust.
import pickle
from utils.trie import Trie
with open("../data/titles_lang_all105_trie_with_redirect.pkl", "rb") as f:
     trie = Trie.load_from_dict(pickle.load(f))

# Alternative: a memory-efficient but slightly slower prefix tree implemented with `marisa_trie`, from
# https://huggingface.co/facebook/mgenre-wiki/blob/main/titles_lang_all105_marisa_trie_with_redirect.pkl
# from genre.trie import MarisaTrie
# with open("titles_lang_all105_marisa_trie_with_redirect.pkl", "rb") as f:
#     trie = pickle.load(f)




In [None]:
# Load the lookup that the `text_to_id` callbacks below index with a
# (lang, title) tuple to obtain candidate Wikidata IDs.
# SECURITY: pickle.load() can execute arbitrary code; only load trusted files.
with open("../data/lang_title2wikidataID-normalized_with_redirect.pkl", "rb") as f:
    lang_title2wikidataID = pickle.load(f)

In [None]:
# Re-import utils.genre.hf_model so that the mGENRE class bound below comes
# from the freshly reloaded module.
import importlib
import utils.genre.hf_model  # Import the module from which mGENRE originates

# Reload the module
importlib.reload(utils.genre.hf_model)

# Now, you can re-import mGENRE from the reloaded module
from utils.genre.hf_model import mGENRE


In [None]:
# Load the pretrained "facebook/mgenre-wiki" checkpoint and call .eval() on it.
model = mGENRE.from_pretrained("facebook/mgenre-wiki").eval()

In [None]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    # Keep using the second GPU (index 1) when there is one, but fall back to
    # index 0 on single-GPU machines where "cuda:1" does not exist.
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{gpu_index}")
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    # Report the name of the device that is actually selected.
    print("Device name:", torch.cuda.get_device_name(device))

# Otherwise use the CPU
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")

# Move the model to the selected device.
model = model.to(device)

There are 2 GPU(s) available.
Device name: NVIDIA A40


In [None]:
# Smoke test: run mGENRE on an Italian input with the mention wrapped in [START] ... [END].
# NOTE(review): `* 1000` applies to the string, so this is ONE input consisting of
# the sentence repeated 1000 times (not 1000 inputs) — presumably a long-input test; confirm.
sentences = ["[START] Einstein [END] era un fisico tedesco."* 1000]

model.sample(
    sentences,
    # Unfiltered variant, kept for reference:
    #prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()),
    # Constrain decoding to the continuations returned by the trie, dropping
    # token ids that are not below len(model.tokenizer) - 1.
    prefix_allowed_tokens_fn=lambda batch_id, sent: [
        e for e in trie.get(sent.tolist())
        if e < len(model.tokenizer) - 1
    ],
    # Turn a generated "title >> lang" string into the (lang, title) lookup key
    # and pick, among the IDs stored for it, the one with the largest numeric part.
    text_to_id=lambda x: max(lang_title2wikidataID[tuple(reversed(x.split(" >> ")))], key=lambda y: int(y[1:])),
)


[[{'text': 'Albert Einstein >> it',
   'score': tensor(-0.3944, device='cuda:1'),
   'id': 'Q937'},
  {'text': 'Alfred Einstein >> it',
   'score': tensor(-0.6906, device='cuda:1'),
   'id': 'Q60197'},
  {'text': 'Walter Ernsting >> en',
   'score': tensor(-1.8036, device='cuda:1'),
   'id': 'Q71309'},
  {'text': 'Walter Ernsting >> de',
   'score': tensor(-1.8232, device='cuda:1'),
   'id': 'Q71309'},
  {'text': 'Alfred Einstein >> en',
   'score': tensor(-1.8779, device='cuda:1'),
   'id': 'Q60197'}]]

In [None]:
#tokenizer = AutoTokenizer.from_pretrained("facebook/mgenre-wiki")
#model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mgenre-wiki").eval()

In [None]:
# sentences = ["[START] Einstein [END] era un fisico tedesco."]
# # Italian for "[START] Einstein [END] was a German physicist."

# outputs = model.generate(
#     **tokenizer(sentences, return_tensors="pt"),
#     num_beams=5,
#     num_return_sequences=5,
#     # OPTIONAL: use constrained beam search
#     prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()),
# )

# tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
def disambEntity(model, prompts, lang_title2wikidataID, trie):
    """Run ``model.sample`` on `prompts` with trie-constrained decoding.

    Args:
        model: Object providing ``sample(...)`` and a sized ``tokenizer``.
        prompts: Inputs forwarded unchanged to ``model.sample``.
        lang_title2wikidataID: Mapping from ``(lang, title)`` tuples to a
            collection of ID strings whose numeric part starts at index 1.
        trie: Object whose ``get(token_ids)`` returns the allowed next tokens.

    Returns:
        Whatever ``model.sample`` returns.
    """

    def allowed_tokens(batch_id, sent):
        # Keep only the trie continuations below len(tokenizer) - 1.
        continuations = trie.get(sent.tolist())
        upper_bound = len(model.tokenizer) - 1
        return [token for token in continuations if token < upper_bound]

    def title_to_id(text):
        # "title >> lang" -> (lang, title), then pick the ID with the largest number.
        lookup_key = tuple(reversed(text.split(" >> ")))
        candidate_ids = lang_title2wikidataID[lookup_key]
        return max(candidate_ids, key=lambda qid: int(qid[1:]))

    return model.sample(
        prompts,
        prefix_allowed_tokens_fn=allowed_tokens,
        text_to_id=title_to_id,
    )

In [None]:
# Prompt layout: a short prefix that marks the entity mention, followed by the text.
# NOTE(review): the markers here are [START_ENT]/[END_ENT], whereas the mGENRE
# smoke test above uses [START]/[END] with the same model — confirm which
# markers the mGENRE checkpoint expects.
PROMPT_TEMPLATE_prefix = "Discussing [START_ENT] {entity} [END_ENT]"
PROMPT_TEMPLATE = "{prefix} . {article}"

# Test the template with a dummy text
prefix = PROMPT_TEMPLATE_prefix.format(entity='Donald Trump')
example_prompt = PROMPT_TEMPLATE.format(prefix=prefix,
      article='Donald John Trump (* 14. Juni 1946 in Queens, New York City, New York) ist ein US-amerikanischer Unternehmer, Entertainer und Politiker der Republikanischen Partei, der von 2017 bis 2021 der 45. Präsident der Vereinigten Staaten war. Er gilt als einer der umstrittensten Politiker der US-Geschichte. Er ist außerdem der erste Präsident, gegen den zwei Amtsenthebungsverfahren angestrengt wurden, und der sich nach seiner Präsidentschaft mit mehreren Strafprozessen - unter anderem wegen Verschwörung gegen die USA - konfrontiert sieht.')
print(example_prompt)

Discussing [START_ENT] Donald Trump [END_ENT] . Donald John Trump (* 14. Juni 1946 in Queens, New York City, New York) ist ein US-amerikanischer Unternehmer, Entertainer und Politiker der Republikanischen Partei, der von 2017 bis 2021 der 45. Präsident der Vereinigten Staaten war. Er gilt als einer der umstrittensten Politiker der US-Geschichte. Er ist außerdem der erste Präsident, gegen den zwei Amtsenthebungsverfahren angestrengt wurden, und der sich nach seiner Präsidentschaft mit mehreren Strafprozessen - unter anderem wegen Verschwörung gegen die USA - konfrontiert sieht.


In [None]:
# Disambiguate the example prompt; output[0] holds the candidates for that single prompt.
output = disambEntity(model, [example_prompt], lang_title2wikidataID, trie)
print(output[0])

[{'text': 'Donald Trump >> de', 'score': tensor(-0.1125, device='cuda:1'), 'id': 'Q22686'}, {'text': 'Donald Trumps Präsidentschaft >> de', 'score': tensor(-0.6292, device='cuda:1'), 'id': 'Q27809653'}, {'text': 'Donald Trumps Präsidentschaftswahlkampf 2015/16 >> de', 'score': tensor(-0.7457, device='cuda:1'), 'id': 'Q20121517'}, {'text': 'Donald Trumps Amtseinführung >> de', 'score': tensor(-0.8203, device='cuda:1'), 'id': 'Q27824398'}, {'text': 'Donald J. Trump >> de', 'score': tensor(-1.1120, device='cuda:1'), 'id': 'Q22686'}]


## Experimenting with GENRE

**Background:**

See:

- Model-Card: https://huggingface.co/facebook/genre-kilt
- Repository: https://github.com/facebookresearch/GENRE

**Setup:**

To load the prefix tree (trie), you additionally need to download:

- https://huggingface.co/facebook/genre-kilt/blob/main/trie.py and
- https://huggingface.co/facebook/genre-kilt/blob/main/kilt_titles_trie_dict.pkl

In [None]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# from utils.trie import Trie
# import pickle
# import torch
# import time

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(
#     "facebook/genre-kilt", add_prefix_space=True)
# model = AutoModelForSeq2SeqLM.from_pretrained("facebook/genre-kilt").eval()

# # Check if GPU is available
# if torch.cuda.is_available():
#     device = torch.device("cuda:1")
#     print(f"There are {torch.cuda.device_count()} GPU(s) available.")
#     print("Device name:", torch.cuda.get_device_name(0))

# # Otherwise use the CPU
# else:
#     print("No GPU available, using the CPU instead.")
#     device = torch.device("cpu")

# model = model.to(device)
# #print(f"Model loaded: {model.config}")

There are 2 GPU(s) available.
Device name: NVIDIA A40


In [None]:
# ------------------- Load Prefix Tree -------------------
# NOTE(review): this rebinds the global `trie` that the mGENRE cells and the
# disambiguation loop further down also use. If this cell runs before that loop,
# the loop would constrain mGENRE with the KILT trie — confirm this is intended.
# SECURITY: pickle.load() can execute arbitrary code; only load trusted files.

with open("../data/kilt_titles_trie_dict.pkl", "rb") as f:
    trie = Trie.load_from_dict(pickle.load(f))

In [None]:
# # ------------------- Make Prediction -------------------

# sentences = ["[START_ENT]", "Einstein",
#              "[END_ENT]", "was", "a", "German", "physicist."]
# input_sequence = tokenizer(
#     sentences, return_tensors="pt", is_split_into_words=True)
# input_sequence.to(device)

# # Pass input sequence as list into the model
# outputs = model.generate(
#     **input_sequence,
#     num_beams=5,
#     num_return_sequences=5,
#     max_new_tokens=100,
#     prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()),
# )

# # Decode model output to obtain entity candidates
# candidates = tokenizer.batch_decode(outputs, skip_special_tokens=True)
# print(f"Input Sequence: {sentences}")
# print(f"Candidates: {candidates}")

Input Sequence: ['[START_ENT]', 'Einstein', '[END_ENT]', 'was', 'a', 'German', 'physicist.']
Candidates: ['Albert Einstein', 'Albert Einstein (disambiguation)', 'Albert Einstein in popular culture', 'Edwin Einstein', 'Albert Newton']


## Fetch Articles

In [1]:
from utils.preprocessing import *
from utils.accelerators import *
from utils.multithreading import *
from utils.database import *
from utils.files import *
from datasets import Dataset
import random

  from .autonotebook import tqdm as notebook_tqdm


### Connect to Database

Credentials are sourced from the `.env` file.

In [2]:
# getConnection returns a pair; only the second element (bound to `db`) is used here.
_, db = getConnection(use_dotenv=True)

### Query Database

Fetches a limited number of articles from the database that already contain extracted triplets, 
returning the specified fields: url, title, parsing result text, processing result, and triplets.

In [3]:
# Collection to read from and the fields to request for each article.
collection = "articles.sampled.triplets"
fields = {"url": 1, "title": 1, "parsing_result.text": 1, 'processing_result': 1, 'triplets': 1}
# Only articles that already have a "triplets" field.
query = {"triplets": {"$exists": True}}
# NOTE(review): 100 and 0 are presumably limit and offset — confirm against utils.database.
articles = fetchArticleTexts(db, 100, 0, fields, query, collection)

Example article:

In [None]:
# example_article = random.choice(articles)
# title = example_article.get("title")
# text = example_article.get("parsing_result").get("text")
# print(f"Title: {title}\nText: {text}")
# print(f"Processing Result: {example_article.get('processing_result')}")



Defines a helper that writes results back to an article, then disambiguates the hero, villain, and victim 
entities of every triplet with mGENRE and stores the enriched triplets under `triplets_disamb`.


In [None]:
def updateArticle(db, id: str, values: dict = {}, collection="articles.sampled.triplets"):
    """Set the given fields on one article document.

    Args:
        db: Database handle indexable by collection name.
        id: Identifier of the article; it is wrapped in ``ObjectId`` and
            matched against ``_id``.
        values: Field/value pairs to write with ``$set``.
        collection: Name of the collection holding the article.

    Returns:
        The result of ``update_one``.
    """
    selector = {"_id": ObjectId(id)}
    # Copy the mapping so the caller's dict is not shared with the update document.
    update_doc = {"$set": {**values}}
    return db[collection].update_one(selector, update_doc)

In [None]:
roles = ["hero", "villain", "victim"]


def _disambiguate_entity(entity, text):
    """Link one entity mention to Wikidata and return its result record.

    Builds the prompt from `entity` and the chunk `text`, queries the model
    through `disambEntity`, and keeps only the first (top-ranked) candidate.
    """
    prefix = PROMPT_TEMPLATE_prefix.format(entity=entity)
    prompt = PROMPT_TEMPLATE.format(prefix=prefix, article=text)
    output = disambEntity(model, [prompt], lang_title2wikidataID, trie)

    best = output[0][0]  # first candidate for the single prompt
    entity_disamb = best.get("text").split(" >> ")[0]  # drop the " >> lang" suffix
    entity_qid = best.get("id")
    score = best.get("score").item()

    return {"entity": entity,
            "entity_disamb": entity_disamb,
            "entity_qid": entity_qid,
            "score": round(float(score), 4)}


# Replace every role value of every triplet by a disambiguation record and
# store the enriched triplets on the article under "triplets_disamb".
for article in tqdm(articles):

    triplets = article.get("triplets", [])

    try:
        # Chunks
        for chunk in triplets:
            text = chunk.get("chunk", [])

            # Answers (a chunk without answers contributes nothing)
            for answer in chunk.get("answers") or []:
                triplet = answer.get("triplet")
                if not triplet:
                    continue

                # Roles (hero, villain, victim)
                for role in roles:
                    entity = triplet.get(role)

                    if not entity or entity == "None":
                        # No entity for this role: store an empty record.
                        triplet[role] = {"entity": "None",
                                         "entity_disamb": None,
                                         "entity_qid": None,
                                         "score": None,
                                         "output": None}
                        continue

                    try:
                        triplet[role] = _disambiguate_entity(entity, text)
                    except Exception as e:
                        # A single failing entity must not abort the rest of the
                        # article; keep the mention and mark it as unresolved so
                        # every role still holds a record.
                        print(f"Disambiguation failed for {entity!r} "
                              f"(article {article.get('_id')}): {e}")
                        triplet[role] = {"entity": entity,
                                         "entity_disamb": None,
                                         "entity_qid": None,
                                         "score": None}

    except Exception as e:
        # Unexpected article structure: roles may be only partly converted, so
        # do not persist a mix of raw strings and records.
        print(e)
        continue

    updateArticle(db, article.get("_id"), {"triplets_disamb": triplets})

100%|██████████| 100/100 [04:48<00:00,  2.89s/it]


## Retrieve Data from Wikidata 

In [10]:
# Re-fetch the articles, this time only those that already carry disambiguated triplets.
collection = "articles.sampled.triplets"
fields = {"_id": 1, 'triplets_disamb': 1}
query = {"triplets_disamb": {"$exists": True}}
# NOTE(review): 100 and 0 are presumably limit and offset — confirm against utils.database.
articles = fetchArticleTexts(db, 100, 0, fields, query, collection)

In [11]:
#articles[0]

In [12]:
def updateWikidataEntity(db, id: str, values: dict = {}, collection="wikidata.entities"):
    """Create or update the entity document whose ``qid`` equals `id`.

    Args:
        db: Database handle indexable by collection name.
        id: Wikidata QID used as the lookup key.
        values: Field/value pairs to write with ``$set``.
        collection: Name of the collection holding the entities.

    Returns:
        The result of ``update_one`` (called with ``upsert=True``).
    """
    selector = {"qid": id}
    # Copy the mapping so the caller's dict is not shared with the update document.
    update_doc = {"$set": {**values}}
    return db[collection].update_one(selector, update_doc, upsert=True)

def checkWikidataEntity(db, id: str, collection="wikidata.entities"):
    """Look up the entity document whose ``qid`` equals `id`.

    Returns:
        Whatever ``find_one`` returns for the ``{"qid": id}`` filter.
    """
    return db[collection].find_one({"qid": id})

In [13]:
# (Re)create the Wikidata site and data repository handles used by the retrieval loop.
site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()

In [25]:
roles = ["hero", "villain", "victim"]


def _store_wikidata_entity(qid):
    """Fetch item `qid` from Wikidata and upsert it into the entity collection.

    Stores the English label, the party memberships and the item's JSON form.
    """
    item = pywikibot.ItemPage(repo, qid)
    name = item.labels.get('en', 'No English label') if item else "Unknown Person"
    party_membership = get_party_memberships(item)

    item_json = item.toJSON()
    time.sleep(0.1)  # short pause between consecutive Wikidata fetches
    updateWikidataEntity(db, qid, {
        "name": name,
        "party_membership": party_membership,
        "data": item_json}
    )


# For every disambiguated entity, fetch its Wikidata item once and cache it in
# the database; entities that are already stored are skipped.
for article in tqdm(articles):

    try:
        triplets = article.get("triplets_disamb", [])

        # Chunks
        for chunk in triplets:

            # Answers (a chunk without answers contributes nothing)
            for answer in chunk.get("answers") or []:
                triplet = answer.get("triplet") or {}

                # Roles (hero, villain, victim)
                for role in roles:
                    entity = triplet.get(role)

                    # Skip roles that do not hold a record (e.g. a raw string
                    # or a missing value) and records without a QID.
                    qid = entity.get("entity_qid") if isinstance(entity, dict) else None
                    if not qid:
                        continue

                    try:
                        if not checkWikidataEntity(db, qid):
                            _store_wikidata_entity(qid)
                    except Exception as e:
                        # One entity that cannot be fetched or stored must not
                        # abort the remaining entities of the article.
                        print(f"Failed to retrieve {qid}: {e}")

    except Exception as e:
        print(e)

100%|██████████| 100/100 [00:15<00:00,  6.54it/s]
