## Initial imports and download files

In [1]:
from tqdm.auto import tqdm #For progress bars
import random
from IPython.display import clear_output #For clearing outputs of installs
import nltk
nltk.download('punkt')
from itertools import product
clear_output()

In [2]:
!pip install --upgrade --no-cache-dir gdown

! gdown 1iIX-vkOv8eJrg4i8Fib5C2r6eFmRE89_ #WebNLG files
! unzip /content/webnlg.zip -d /content

! gdown 1n1rDjtdc9Tr4NaNna52burAt9H4B0Eb5 #Wikipedia files
! unzip /content/wikipedia.zip -d /content

clear_output()

## Setup datasets and tools

#### Paths

In [3]:
path = '/content/'

In [4]:
wikipedia_path = path + 'wikipedia'

### Wikipedia corpora

In [None]:
!pip install apache_beam mwparserfromhell
!pip install datasets
clear_output()
print('Huggingface "datasets" installed!')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting apache_beam
  Downloading apache_beam-2.48.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.3/14.3 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mwparserfromhell
  Downloading mwparserfromhell-0.6.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting crcmod<2.0,>=1.7 (from apache_beam)
  Downloading crcmod-1.7.tar.gz (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting orjson<4.0 (from apache_beam)
  Downloading orjson-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.

#### 1. Load wikipedia dataset (huggingface)

In [None]:
from datasets import load_from_disk

print('Loading Wikipedia dataset (2-3 minutes)...', end=' ')
wikipedia = load_from_disk(wikipedia_path)
print('Done :)')

#### 2. Extracting information of our interest from the wikipedia dataset and functions to search articles quickly in the dataset

In [None]:
def get_sentences_from_text(text):
    paragraphs = [p for p in text.split('\n') if p]
    article_sentences = []
    for paragraph in paragraphs:
        sentences = nltk.sent_tokenize(paragraph)
        sentences = [sentence.replace(' ()','').lstrip(' ') for sentence in sentences]
        article_sentences.append(sentences)
    article_sentences = [sentence for sentences in article_sentences for sentence in sentences]
    return article_sentences

Article indexes dictionary, with article titles as keys and indexes in wikipedia dataset as value (useful to search articles)

In [None]:
def get_article_indexes(titles):
    article_indexes = dict()
    for i, title in enumerate(titles):
        article_indexes[title] = i
    return article_indexes

In [None]:
titles = wikipedia['train']['title']

article_indexes = get_article_indexes(titles)

In [None]:
def duplicates(list): #Check if there are 2 articles with the same title (there is only one, not important)
    dup = []
    seen = set()
    for i in list:
        if i in seen:
            dup.append(i)
        seen.add(i)

    return dup

duplicates(titles)

Search article function, it returns an article given a string title if it is found (thanks to the article_indexes dictionary)

In [None]:
def wikipedia_searchArticle(article, article_indexes=article_indexes):
    try:
        index = article_indexes[article]
        return wikipedia['train'][index]['text']
    except:
        return None

In [None]:
random_titles = random.sample(titles, len(titles)) #List of unsorted titles

### Setting up DBpedia Spotlight, DBpedia SparQL and Stanford Stanza parser

#### DBpedia Spotlight

In [None]:
!pip install spacy_dbpedia_spotlight
clear_output()
print('DBpedia spotlight installed!')

In [None]:
import spacy_dbpedia_spotlight

spotlight = spacy_dbpedia_spotlight.create('en')

#### DBpedia SparQL (+ functions with queries)

In [None]:
!pip install sparqlwrapper
clear_output()
print('SPARQLWrapper installed!')

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

def sparqlQuery(entity):
    query = '''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>

SELECT DISTINCT ?obj {
    dbr: rdf:type ?obj
    FILTER strstarts(str(?obj), str(dbo:))
}'''

    new_query = query[:177] + entity + query[177:]
    return new_query

def sparql_entityType(entity):
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)

    query = sparqlQuery(entity)
    sparql.setQuery(query)

    try:
        ret = sparql.queryAndConvert()
        return ret["results"]["bindings"][0]['obj']['value'].split('/')[-1]
    except Exception as e:
        return None
        #print(e)

In [None]:
def sparqlQuery2(entity):
    query = '''
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT DISTINCT ?property {
    { dbr: ?property ?o }
    union
    { ?s ?property dbr: }
}'''
    new_query = query[:83] + entity + query[83:132] + entity + query[132:]
    return new_query

def sparql_properties(entity):
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)

    query = sparqlQuery2(entity)
    sparql.setQuery(query)

    try:
        ret = sparql.queryAndConvert()

        properties = set()
        for element in ret['results']['bindings']:
            properties.add(element['property']['value'].split('/')[-1])
        return properties
    except Exception as e:
        return None

In [None]:
def sparqlQuery3(type, property, limit):
    query = '''
PREFIX prop: <http://dbpedia.org/property/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbo: <http://dbpedia.org/ontology/>

SELECT DISTINCT ?s {
    { ?s prop: ?o .
      ?s rdf:type dbo: }
} ORDER BY RAND() LIMIT 
'''
    
    new_query = query[:182] + property + query[182:210] + type + query[210:237] + str(limit) + query[237:]
    return new_query

def sparql_searchPages(type, property, limit=400, wexea=False):
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)

    query = sparqlQuery3(type, property, limit)
    sparql.setQuery(query)

    try:
        ret = sparql.queryAndConvert()

        titles = set()
        for element in ret['results']['bindings']:
            if wexea:
                titles.add(element['s']['value'].split('/')[-1])
            else:
                titles.add(element['s']['value'].split('/')[-1].replace('_',' '))
        return titles
    except Exception as e:
        print('Error:', e)
        return None

#### UD Parser (Stanza)

In [None]:
!pip install stanza
import stanza
stanza.download('en')
from stanza.models.common.doc import Document
clear_output()
print('Stanza installed!')

In [None]:
parser = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')

Function to print UDs from stanza document (first UDs = doc.sentences[0].to_dict())

In [None]:
def print_UDs(UDs):
    print ("{:<20} | {:<15} | {:<20} ".format('Token', 'Relation', 'Head'))
    print ("-" * 55)
    for word in UDs:
        print ("{:<20} | {:<15} | {:<20} "
            .format( '('+str(word['id'])+') ' + str(word['text']), str(word['deprel']), '('+str(UDs[word['head']-1]['id'])+') ' + str(UDs[word['head']-1]['text'] if word['head'] > 0 else 'ROOT')))
        

## Pipeline common functions

In [None]:
def get_triple(text_triple):
    firstBarIndex = text_triple.find('|')-1
    secondBarIndex = text_triple.rfind('|')+2

    sub = text_triple[:firstBarIndex]
    prop = text_triple[firstBarIndex + 3 : secondBarIndex-3]
    obj = text_triple[secondBarIndex:]

    triple = [sub, prop, obj]
    return triple

In [None]:
def modify_sentence(subject, object, entitySubject_info, entityObject_info, sentence, remove_underscores = False, put_OriginalEntities = True):
 
    entitySubject_str, subj_start, subj_end = entitySubject_info
    entityObject_str, obj_start, obj_end = entityObject_info

    sentence = str(sentence)
    if subj_start < obj_start:
        if put_OriginalEntities:
            if remove_underscores:
                modified_sentence = sentence[0:subj_start].replace('_',' ') + subject + sentence[subj_end:obj_start].replace('_',' ') + object + sentence[obj_end:].replace('_',' ')
            else:
                modified_sentence = sentence[0:subj_start] + subject + sentence[subj_end:obj_start] + object + sentence[obj_end:]

            sub_startChar = subj_start
            obj_startChar = len(sentence[0:subj_start] + subject + sentence[subj_end:obj_start])
        else:
            if remove_underscores:
                modified_sentence = sentence[0:subj_start].replace('_',' ') + entitySubject_str.replace(' ','_') + sentence[subj_end:obj_start].replace('_',' ') + entityObject_str.replace(' ','_') + sentence[obj_end:].replace('_',' ')
            else:
                modified_sentence = sentence[0:subj_start] + entitySubject_str.replace(' ','_') + sentence[subj_end:obj_start] + entityObject_str.replace(' ','_') + sentence[obj_end:]
            
            sub_startChar = subj_start
            obj_startChar = len(sentence[0:subj_start] + entitySubject_str.replace(' ','_') + sentence[subj_end:obj_start])
    else:
        if put_OriginalEntities:
            if remove_underscores:
                modified_sentence = sentence[0:obj_start].replace('_',' ') + object + sentence[obj_end:subj_start].replace('_',' ') + subject + sentence[subj_end:].replace('_',' ')
            else:
                modified_sentence = sentence[0:obj_start] + object + sentence[obj_end:subj_start] + subject + sentence[subj_end:]
            
            obj_startChar = obj_start
            sub_startChar = len(sentence[0:obj_start] + object + sentence[obj_end:subj_start])
        else:
            if remove_underscores:
                modified_sentence = sentence[0:obj_start].replace('_',' ') + entityObject_str.replace(' ','_') + sentence[obj_end:subj_start].replace('_',' ') + entitySubject_str.replace(' ','_') + sentence[subj_end:].replace('_',' ')
            else:
                modified_sentence = sentence[0:obj_start] + entityObject_str.replace(' ','_') + sentence[obj_end:subj_start] + entitySubject_str.replace(' ','_') + sentence[subj_end:]
            obj_startChar = obj_start
            sub_startChar = len(sentence[0:obj_start] + entityObject_str.replace(' ','_') + sentence[obj_end:subj_start])

    return modified_sentence, sub_startChar, obj_startChar

## Entity match with Spotlight

In [None]:
def get_article(title, articles_dict):
    if title in articles_dict:
        article = articles_dict[title]
    else:
        article = wikipedia_searchArticle(title)
        if article:
            articles_dict[title] = article
        else:
            articles_dict[title] = None
    return article

In [None]:
spotlightSentences_dict = dict()
articles_dict = dict()

In [54]:
def get_sentences_strictSearch(title, spotlightSentences_dict):
    if title in spotlightSentences_dict:
        spotlight_article_sentences = spotlightSentences_dict[title]
    else:
        if articles_dict[title]:
            article_sentences = get_sentences_from_text(articles_dict[title])
            spotlight_article_sentences = []
            print(title, 'sentences spotlight:')
            for sentence in tqdm(article_sentences):
                ner_sentence = spotlight(sentence)
                spotlight_article_sentences.append(ner_sentence)
            spotlightSentences_dict[title] = spotlight_article_sentences
        else:
            spotlight_article_sentences = None
            spotlightSentences_dict[title] = None
        
    return spotlight_article_sentences

In [None]:
from itertools import product

def spotlight_sentence_extraction(subject, object, spotlight_subject_sentences, spotlight_object_sentences, subSpotlightEnts, objSpotlightEnts):
    subTripleEnts = set(subTripleEnt.kb_id for subTripleEnt in subSpotlightEnts)
    objTripleEnts = set(objTripleEnt.kb_id for objTripleEnt in objSpotlightEnts)
    
    repeatedEntities = subTripleEnts & objTripleEnts
    if repeatedEntities:
        if len(subTripleEnts) > len(repeatedEntities):
            for repeatedEntity in repeatedEntities:
                subTripleEnts.discard(repeatedEntity)
        elif len(objTripleEnts) > len(repeatedEntities):
            for repeatedEntity in repeatedEntities:
                objTripleEnts.discard(repeatedEntity)

    spotlight_sub_extractedSentences = []
    spotlight_sub_startSubObj = []
    if spotlight_subject_sentences:
        for spotlight_subject_sentence in spotlight_subject_sentences:
            subSentenceEnts = set(subSentenceEnt.kb_id for subSentenceEnt in spotlight_subject_sentence.ents)
            subSentenceEnts_dict = {subSentenceEnt.kb_id: subSentenceEnt for subSentenceEnt in spotlight_subject_sentence.ents}
            
            sub_matches = subSentenceEnts & subTripleEnts
            obj_matches = subSentenceEnts & objTripleEnts

            subObj_combinations = [subObj_combination for subObj_combination in product(*[sub_matches, obj_matches])]

            for subObj_combination in subObj_combinations:
                entity_subject = subSentenceEnts_dict[subObj_combination[0]]
                entity_object = subSentenceEnts_dict[subObj_combination[1]]
                entitySubject_info = [entity_subject.text, entity_subject.start_char, entity_subject.end_char]
                entityObject_info = [entity_object.text, entity_object.start_char, entity_object.end_char]
                modified_spotlight_subject_sentence, subject_sub_startChar, subject_obj_startChar = modify_sentence(subject, object, entitySubject_info, entityObject_info, spotlight_subject_sentence)
                spotlight_sub_extractedSentences.append(modified_spotlight_subject_sentence)
                spotlight_sub_startSubObj.append([subject_sub_startChar, subject_obj_startChar])
    
    spotlight_obj_extractedSentences = []
    spotlight_obj_startSubObj = []
    if spotlight_object_sentences:
        for spotlight_object_sentence in spotlight_object_sentences:
            objSentenceEnts = set(objSentenceEnt.kb_id for objSentenceEnt in spotlight_object_sentence.ents)
            objSentenceEnts_dict = {objSentenceEnt.kb_id: objSentenceEnt for objSentenceEnt in spotlight_object_sentence.ents}

            sub_matches = objSentenceEnts & subTripleEnts
            obj_matches = objSentenceEnts & objTripleEnts

            subObj_combinations = [subObj_combination for subObj_combination in product(*[sub_matches, obj_matches])]

            for subObj_combination in subObj_combinations:
                entity_subject = objSentenceEnts_dict[subObj_combination[0]]
                entity_object = objSentenceEnts_dict[subObj_combination[1]]
                entitySubject_info = [entity_subject.text, entity_subject.start_char, entity_subject.end_char]
                entityObject_info = [entity_object.text, entity_object.start_char, entity_object.end_char]
                modified_spotlight_object_sentence, object_sub_startChar, object_obj_startChar = modify_sentence(subject, object, entitySubject_info, entityObject_info, spotlight_object_sentence)
                spotlight_obj_extractedSentences.append(modified_spotlight_object_sentence)
                spotlight_obj_startSubObj.append([object_sub_startChar, object_obj_startChar])

    return spotlight_sub_extractedSentences, spotlight_obj_extractedSentences, spotlight_sub_startSubObj, spotlight_obj_startSubObj

In [None]:
spotlightSubjectObject_dict = dict()

In [None]:
def subObj_spotlightEntities(subject, object, spotlightSubjectObject_dict):
    if subject in spotlightSubjectObject_dict:
        subSpotlightEnts = spotlightSubjectObject_dict[subject].ents
    else:
        cleaned_subject = subject.replace('_',' ')
        subSpotlightEnts = spotlight(cleaned_subject)
        spotlightSubjectObject_dict[subject] = subSpotlightEnts
        if subSpotlightEnts.ents:
            subSpotlightEnts = subSpotlightEnts.ents
        else:
            subSpotlightEnts = None

    if object in spotlightSubjectObject_dict:
        objSpotlightEnts = spotlightSubjectObject_dict[object].ents
    else:
        cleaned_object = object.replace('_',' ')
        objSpotlightEnts = spotlight(cleaned_object)
        spotlightSubjectObject_dict[object] = objSpotlightEnts
        if objSpotlightEnts.ents:
            objSpotlightEnts = objSpotlightEnts.ents
        else:
            objSpotlightEnts = None
    
    return subSpotlightEnts, objSpotlightEnts

Main function is divided in 3 parts (1st and 3rd in this colab, and 2nd one in "Coreference Resolution.ipynb" colab). Here are the steps to run in correctly:
1. Run the first part (extractSentencesFromTriple_EntityMatch_Wikipedia_1), with a triple as input. This part will save a file called "articles_dict.pickle" in the folder of the colab, download the file.
2. Open Coreference Resolution.ipynb and run the setup part. Then, import the previously downloaded file "articles_dict.pickle" in the colab folder, run the code, and download again "new_articles_dict.pickle" (to import it again in this colab).
3. Return to this colab, import the new "articles_dict.pickle" and run the final part (extractSentencesFromTriple_EntityMatch_Wikipedia_2), which will load the file imported and use the output of extractSentencesFromTriple_EntityMatch_Wikipedia_1 to finish returning the final sentence candidates (and the positions in the string where the subject and object start)

In [None]:
import pickle

In [55]:
'1 Triple function (part 1)'
def extractSentencesFromTriple_EntityMatch_Wikipedia_1(triple):
    [subject, property, object] = triple
    subSpotlightEnts, objSpotlightEnts = subObj_spotlightEntities(subject, object, spotlightSubjectObject_dict)
    if not subSpotlightEnts or not objSpotlightEnts:
        print('Entities not found for subject or object')
        print('Subject Ents:', subSpotlightEnts,'\t', 'Object Ents:', objSpotlightEnts)
        return [None, None]
    articles_subject = get_article(subject, articles_dict)
    articles_object = get_article(object, articles_dict)
    with open('articles_dict.pickle', 'wb') as file:
        pickle.dump(articles_dict, file)
    return [triple, subSpotlightEnts, objSpotlightEnts]

In [56]:
'1 Triple function (part 2)'
def extractSentencesFromTriple_EntityMatch_Wikipedia_2(triple, subSpotlightEnts, objSpotlightEnts):
    with open('/content/new_articles_dict.pickle', 'rb') as file:
        articles_dict = pickle.load(file)
    [subject, property, object] = triple
    sentences_subject = get_sentences_strictSearch(subject, spotlightSentences_dict)
    sentences_object = get_sentences_strictSearch(object, spotlightSentences_dict)
    if not sentences_subject or not sentences_object:
        return None, None
        
    spotlight_sub_extractedSentences, spotlight_obj_extractedSentences, spotlight_sub_startSubObj, spotlight_obj_startSubObj = spotlight_sentence_extraction(subject, object, sentences_subject, sentences_object, subSpotlightEnts, objSpotlightEnts)

    spotlight_extractedSentences = spotlight_sub_extractedSentences + spotlight_obj_extractedSentences
    spotlight_startSubObj = spotlight_sub_startSubObj + spotlight_obj_startSubObj

    return [spotlight_extractedSentences, spotlight_startSubObj]

In [70]:
'intermediate function'
def EntityMatch_Wikipedia_All1_0(triple): #Very similar to extractSentencesFromTriple_EntityMatch_Wikipedia_1 (intermediate)
    [subject, property, object] = triple
    subSpotlightEnts, objSpotlightEnts = subObj_spotlightEntities(subject, object, spotlightSubjectObject_dict)
    if not subSpotlightEnts or not objSpotlightEnts:
        print('Entities not found for subject or object')
        print('Subject Ents:', subSpotlightEnts,'\t', 'Object Ents:', objSpotlightEnts)
        return [None, None]
    articles_subject = get_article(subject, articles_dict)
    articles_object = get_article(object, articles_dict)
    return [triple, subSpotlightEnts, objSpotlightEnts]

In [72]:
'All Triples function (part 1)'
def EntityMatch_Wikipedia_Run_SaveAll1(triples):
    allVariables = []
    for triple in triples:
        variables = EntityMatch_Wikipedia_All1_0(triple) #[triple, subSpotlightEnts, objSpotlightEnts]
        allVariables.append(variables)
    with open('articles_dict.pickle', 'wb') as file:
        pickle.dump(articles_dict, file)
    return allVariables

In [71]:
'All Triples function (part 2)'

import os

def EntityMatch_Wikipedia_Run_SaveAll2(variables):
    outputFolder = 'ResultFiles_EntityMatchWikipedia/'
    if not os.path.exists(outputFolder):
        os.makedirs(outputFolder)

    with open('/content/new_articles_dict.pickle', 'rb') as file:
        articles_dict = pickle.load(file)
    for i, [triple, subSpotlightEnts, objSpotlightEnts] in enumerate(variables):
        [subject, property, object] = triple
        sentences_subject = get_sentences_strictSearch(subject, spotlightSentences_dict)
        sentences_object = get_sentences_strictSearch(object, spotlightSentences_dict)
        if not sentences_subject or not sentences_object:
            spotlight_extractedSentences, spotlight_startSubObj = None, None
        else:
            spotlight_sub_extractedSentences, spotlight_obj_extractedSentences, spotlight_sub_startSubObj, spotlight_obj_startSubObj = spotlight_sentence_extraction(subject, object, sentences_subject, sentences_object, subSpotlightEnts, objSpotlightEnts)

            spotlight_extractedSentences = spotlight_sub_extractedSentences + spotlight_obj_extractedSentences
            spotlight_startSubObj = spotlight_sub_startSubObj + spotlight_obj_startSubObj

        [sub, prop, obj] = triple
        fileName_triple = str(i) + '_triple('+sub+'-'+prop+'-'+obj+').pkl'
        fileName_sentences = str(i) + '_sentences.pkl'
        fileName_startSubObj = str(i) + '_startSubObj.pkl'
        with open(outputFolder + fileName_triple, 'wb') as file:
            pickle.dump(triple, file)
        with open(outputFolder + fileName_sentences, 'wb') as file:
            pickle.dump(spotlight_extractedSentences, file)
        with open(outputFolder + fileName_startSubObj, 'wb') as file:
            pickle.dump(spotlight_startSubObj, file)

## Test

In [73]:
spotlightSentences_dict = dict()
articles_dict = dict()

In [74]:
spotlightSubjectObject_dict = dict()

In [48]:
subject = 'Xavi'
property = 'birthPlace'
object = 'Terrassa'

triple = [subject, property, object]

In [None]:
triple

In [64]:
[subject, object, subSpotlightEnts, objSpotlightEnts] = extractSentencesFromTriple_EntityMatch_Wikipedia_1(triple)

In [69]:
spotlight_extractedSentences, spotlight_startSubObj = extractSentencesFromTriple_EntityMatch_Wikipedia_2(subject, object, subSpotlightEnts, objSpotlightEnts)

In [75]:
variables = EntityMatch_Wikipedia_Run_SaveAll1([triple])

In [76]:
EntityMatch_Wikipedia_Run_SaveAll2(variables)

Xavi sentences spotlight:


  0%|          | 0/293 [00:00<?, ?it/s]

Terrassa sentences spotlight:


  0%|          | 0/198 [00:00<?, ?it/s]