In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/kensho-derived-wikimedia-data/statements.csv
/kaggle/input/kensho-derived-wikimedia-data/page.csv
/kaggle/input/kensho-derived-wikimedia-data/item.csv
/kaggle/input/kensho-derived-wikimedia-data/property_aliases.csv
/kaggle/input/kensho-derived-wikimedia-data/property.csv
/kaggle/input/kensho-derived-wikimedia-data/item_aliases.csv
/kaggle/input/kensho-derived-wikimedia-data/link_annotated_text.jsonl


In [2]:
!pip install spacy
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [68]:
!pip install spacy-lookups-data



In [2]:
import pandas as pd
import json
from pathlib import Path
import spacy

In [3]:
import random

In [4]:
nlp = spacy.load("en_core_web_lg")

<h1>Preparing Knowledge Base for Disambiguities

Each item represents a page on the wikipedia

In [11]:
item_df = pd.read_csv('/kaggle/input/kensho-derived-wikimedia-data/item.csv')

In [12]:
item_df.head(5)

Unnamed: 0,item_id,en_label,en_description
0,1,Universe,totality of space and all contents
1,2,Earth,third planet from the Sun in the Solar System
2,3,life,matter capable of extracting energy from the e...
3,4,death,permanent cessation of vital functions
4,5,human,"common name of Homo sapiens, unique extant spe..."


In [13]:
def load_entities(df):
    names = dict()
    descriptions = dict()
    
    for index, row in df.iterrows():
        qid = str(row['item_id'])
        name = str(row['en_label'])
        desc = str(row['en_description'])
        names[qid] = name
        descriptions[qid] = desc
    
    return names, descriptions

In [16]:
name_dict, desc_dict = load_entities(item_df)

Loading spacy knowledge base

In [19]:
from spacy.kb import InMemoryLookupKB
vocab = nlp.vocab
kb = InMemoryLookupKB(vocab=vocab, entity_vector_length=300)

Inserting our entities into the knowledge base

In [20]:
for qid, desc in desc_dict.items():
    desc_doc = nlp(desc)
    desc_enc = desc_doc.vector
    kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)   # 342 is an arbitrary value here

In [21]:
for qid, name in name_dict.items():
    kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)

  kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])   # 100% prior probability P(entity|alias)


Loading the aliases for the entities

In [22]:
alias_df = pd.read_csv('/kaggle/input/kensho-derived-wikimedia-data/item_aliases.csv')

Inserting the aliases into the knowledge base with respect to the corresponding item id

In [61]:
alias_df.head()

Unnamed: 0,item_id,en_alias
0,1,Our Universe
1,1,The Universe
2,1,The Cosmos
3,1,cosmos
4,2,Blue Planet


In [24]:
alias = {}
for index, row in alias_df.iterrows():
        qid = str(row['item_id'])
        name = str(row['en_alias'])
        alias[qid] = name

In [25]:
for qid, name in alias.items():
    kb.add_alias(alias=name, entities=[qid], probabilities=[1])

  kb.add_alias(alias=name, entities=[qid], probabilities=[1])
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])
  kb.add_alias(alias=name, entities=[qid], probabilities=[1])


In [26]:
# change the directory and file names to whatever you like
import os
output_dir = Path.cwd().parent / "my_output"
if not os.path.exists(output_dir):
    os.mkdir(output_dir) 
kb.to_disk(output_dir / "my_kb")

<h1>Preparing Training Data

In [27]:
page_df = pd.read_csv('/kaggle/input/kensho-derived-wikimedia-data/page.csv')

In [60]:
page_df.head()

Unnamed: 0,page_id,item_id,title,views
0,12,6199,Anarchism,31335
1,25,38404,Autism,49693
2,39,101038,Albedo,14573
3,290,9659,A,25859
4,303,173,Alabama,52765


In [28]:
page_to_item = {}
for index, row in page_df.iterrows():
        page_id = str(row['page_id'])
        item_id = str(row['item_id'])
        page_to_item[page_id] = item_id

In [29]:
counter = 0
dataset = []
json_loc = Path("/kaggle/input/kensho-derived-wikimedia-data/link_annotated_text.jsonl")
with json_loc.open("r", encoding="utf8") as jsonfile:
    for line in jsonfile:
        example = json.loads(line)
        sections = example["sections"]
        for section in sections:
            text = section['text']
            link_lengths = section["link_lengths"]
            link_offset = section["link_offsets"]
            target_page_ids = section["target_page_ids"]
            
            for length, offset, page_id in zip(link_lengths, link_offset, target_page_ids):
                QID = page_to_item[str(page_id)]
                offset = (offset, offset + length)
                entity_label = text[offset[0]:offset[1]]
                entities = [(offset[0], offset[1], entity_label)]
                links_dict = {QID: 1.0}
                dataset.append((text, {"links": {offset: links_dict}, "entities": entities}))
                counter +=1
        if counter >= 100000:
            break

In [30]:
dataset[0]

("Anarchism is an anti-authoritarian political and social philosophy that rejects hierarchies deemed unjust and advocates their replacement with self-managed, self-governed societies based on voluntary, cooperative institutions. These institutions are often described as stateless societies, although several authors have defined them more specifically as distinct institutions based on non-hierarchical or free associations. Anarchism's central disagreement with other ideologies is that it holds the state to be undesirable, unnecessary, and harmful. Anarchism is usually placed on the far-left of the political spectrum, and much of its economics and legal philosophy reflect anti-authoritarian interpretations of communism, collectivism, syndicalism, mutualism, or participatory economics. As anarchism does not offer a fixed body of doctrine from a single particular worldview, many anarchist types and traditions exist and varieties of anarchy diverge widely. Anarchist schools of thought can d

In [31]:
from spacy.training import Example

TRAIN_EXAMPLES = []
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")
sentencizer = nlp.get_pipe("sentencizer")

for text, annotation in dataset:
    try:
        example = Example.from_dict(nlp.make_doc(text), annotation)
        example.reference = sentencizer(example.reference)
        TRAIN_EXAMPLES.append(example)
    except: 
        pass
    

<h1>Entity Linker being linked to the knowledge base</h1>

In [51]:
from spacy.ml.models import load_kb

entity_linker = nlp.add_pipe("entity_linker", config={"incl_prior": False}, last=True)
#entity_linker = nlp.get_pipe("entity_linker")
entity_linker.initialize(get_examples=lambda: TRAIN_EXAMPLES, kb_loader=load_kb(output_dir / "my_kb"))

<h1> Training the entity linker</h1>

In [52]:
from spacy.util import minibatch, compounding

with nlp.select_pipes(enable=["entity_linker"]):   # train only the entity_linker
    optimizer = nlp.resume_training()
    for itn in range(500):
        random.shuffle(TRAIN_EXAMPLES)
        batches = minibatch(TRAIN_EXAMPLES, size=4)  
        losses = {}
        for batch in batches:
            nlp.update(
                batch,   
                drop=0.2,      # prevent overfitting
                losses=losses,
                sgd=optimizer,
            )

<h1> Testing the Entity Linker <h1>

In [53]:
text = "Our Earth has a  continet named North America"
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_)

Earth LOC 2
North America LOC 49


In [57]:
import spacy
from spacy import displacy



# Example text
text = "Our Earth has a continent named North America"

# Process the text
doc = nlp(text)

# Print the entities, their labels, and knowledge base IDs
for ent in doc.ents:
    print(ent.text, ent.label_,ent.kb_id_,f"https://www.wikidata.org/wiki/Q{ent.kb_id_}")


Earth LOC 2 https://www.wikidata.org/wiki/Q2
North America LOC 49 https://www.wikidata.org/wiki/Q49


In [59]:
# Visualize the entities
displacy.render(doc, style="ent", jupyter=True)