## Initial imports and download files

In [1]:
from tqdm.auto import tqdm #For progress bars
import random
from IPython.display import clear_output #For clearing outputs of installs
import nltk
nltk.download('punkt')
from itertools import product
clear_output()

## Setup datasets and tools

### Simple Wikipedia

In [2]:
!pip3 install wikipedia-api
clear_output()

In [3]:
import wikipediaapi
# Shared client created with the 'simple' language code; simpleWiki_searchArticle
# uses it to fetch page text.
# NOTE(review): wikipedia-api is installed unpinned in the cell above; newer
# releases appear to expect a user_agent argument in this constructor -- confirm
# against the installed version that 'simple' is still read as the language.
simpleWiki = wikipediaapi.Wikipedia('simple')

In [4]:
def simpleWiki_searchArticle(title):
    """Fetch the text of an article through the shared ``simpleWiki`` client.

    Args:
        title: Article title; spaces are converted to underscores before lookup.

    Returns:
        The article text, or None when the page has no text or the lookup fails.
    """
    try:
        article = simpleWiki.page(title.replace(' ', '_')).text
    except Exception:
        # Best-effort lookup: any ordinary failure means "no article". Catching
        # Exception rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long crawl can still be stopped.
        return None
    # Empty text is reported the same way as a missing page.
    return article or None

In [5]:
def get_sentences_from_text(text):
    """Split article text into a flat list of cleaned sentences.

    The text is cut on newlines, empty lines are dropped, and every remaining
    line is sentence-tokenized with NLTK. Each sentence has ' ()' removed and
    its leading spaces stripped.

    Args:
        text: Raw article text.

    Returns:
        List of sentence strings in document order.
    """
    cleaned_sentences = []
    for line in text.split('\n'):
        if not line:
            continue
        for raw_sentence in nltk.sent_tokenize(line):
            cleaned_sentences.append(raw_sentence.replace(' ()', '').lstrip(' '))
    return cleaned_sentences

### Setting up DBpedia SparQL, Spacy NER

#### DBpedia SparQL (+ functions with queries)

In [6]:
!pip install sparqlwrapper
clear_output()
print('SPARQLWrapper installed!')

SPARQLWrapper installed!


In [7]:
from SPARQLWrapper import SPARQLWrapper, JSON

def sparqlQuery(entity):
    """Build the SPARQL query listing the DBpedia ontology types of an entity.

    Args:
        entity: Local name of the DBpedia resource (the part after ``dbr:``),
            e.g. 'Cristiano_Ronaldo'. It is interpolated as text.

    Returns:
        The query string selecting every ``rdf:type`` of ``dbr:<entity>`` whose
        URI starts with the ``dbo:`` namespace.
    """
    # The entity is inserted through an explicit placeholder instead of a
    # hard-coded character offset, so editing the query text cannot silently
    # shift the insertion point. %-formatting is used because the query body
    # contains literal braces.
    template = '''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>

SELECT DISTINCT ?obj {
    dbr:%(entity)s rdf:type ?obj
    FILTER strstarts(str(?obj), str(dbo:))
}'''

    return template % {'entity': entity}

def sparql_entityType(entity):
    """Look up one DBpedia ontology type for an entity.

    Runs the query built by ``sparqlQuery`` against the public DBpedia endpoint
    and keeps only the first result row.

    Args:
        entity: Local name of the DBpedia resource.

    Returns:
        The last path segment of the first returned type URI, or None when the
        request fails or returns no rows.
    """
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setReturnFormat(JSON)
    endpoint.setQuery(sparqlQuery(entity))

    try:
        response = endpoint.queryAndConvert()
        first_type_uri = response["results"]["bindings"][0]['obj']['value']
        return first_type_uri.split('/')[-1]
    except Exception:
        # Network errors and empty result sets are both reported as "no type".
        return None

In [8]:
def sparqlQuery2(entity):
    """Build the SPARQL query listing every property attached to an entity.

    Args:
        entity: Local name of the DBpedia resource (the part after ``dbr:``).
            It is interpolated as text.

    Returns:
        The query string selecting each distinct property for which
        ``dbr:<entity>`` appears either as subject or as object.
    """
    # A named placeholder fills both occurrences of the entity, replacing the
    # hard-coded character offsets that had to be kept in sync with the text.
    # %-formatting is used because the query body contains literal braces.
    template = '''
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT DISTINCT ?property {
    { dbr:%(entity)s ?property ?o }
    union
    { ?s ?property dbr:%(entity)s }
}'''
    return template % {'entity': entity}

def sparql_properties(entity):
    """Collect the names of all properties linked to a DBpedia entity.

    Args:
        entity: Local name of the DBpedia resource.

    Returns:
        A set with the last path segment of every property URI returned by the
        ``sparqlQuery2`` query, or None when the request fails.
    """
    client = SPARQLWrapper("http://dbpedia.org/sparql")
    client.setReturnFormat(JSON)
    client.setQuery(sparqlQuery2(entity))

    try:
        rows = client.queryAndConvert()['results']['bindings']
        return {row['property']['value'].split('/')[-1] for row in rows}
    except Exception:
        return None

In [9]:
def sparqlQuery3(type, property, limit):
    """Build the SPARQL query sampling resources of a type that carry a property.

    Args:
        type: DBpedia ontology class local name (after ``dbo:``), e.g. 'Person'.
        property: DBpedia property local name (after ``prop:``), e.g. 'birthPlace'.
        limit: Maximum number of rows; rendered with ``str``.

    Returns:
        The query string selecting distinct subjects that have the property and
        the given ``rdf:type``, in random order, capped at ``limit`` rows.
    """
    # Named placeholders replace the three hard-coded character offsets, which
    # depended on the exact length (including trailing whitespace) of every line.
    # %-formatting is used because the query body contains literal braces.
    template = '''
PREFIX prop: <http://dbpedia.org/property/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbo: <http://dbpedia.org/ontology/>

SELECT DISTINCT ?s {
    { ?s prop:%(property)s ?o .
      ?s rdf:type dbo:%(type)s }
} ORDER BY RAND() LIMIT %(limit)s
'''

    return template % {'property': property, 'type': type, 'limit': limit}

def sparql_searchPages(type, property, limit=400, wexea=False):
    """Query DBpedia for resources of a type that carry a given property.

    Args:
        type: DBpedia ontology class local name.
        property: DBpedia property local name.
        limit: Maximum number of rows requested.
        wexea: When true, names keep their underscores; otherwise underscores
            are replaced by spaces.

    Returns:
        A set with the last path segment of every returned resource URI, or
        None (after printing the error) when the request fails.
    """
    client = SPARQLWrapper("http://dbpedia.org/sparql")
    client.setReturnFormat(JSON)
    client.setQuery(sparqlQuery3(type, property, limit))

    try:
        rows = client.queryAndConvert()['results']['bindings']
        resource_names = (row['s']['value'].split('/')[-1] for row in rows)
        if wexea:
            return set(resource_names)
        return {name.replace('_', ' ') for name in resource_names}
    except Exception as e:
        print('Error:', e)
        return None

#### NER (Spacy)

In [10]:
import spacy
# Shared spaCy pipeline ('en_core_web_sm'); the functions below call `ner(text)`
# and read the `.ents` of the result to obtain named-entity labels and offsets.
ner = spacy.load('en_core_web_sm')

## Pipeline common functions

In [11]:
def get_triple(text_triple):
    """Parse a 'subject | property | object' string into its three parts.

    The first and last '|' are used as separators and exactly one character of
    padding is skipped on each side of them; no validation is performed.

    Args:
        text_triple: Triple written as text, e.g. 'A | b | C'.

    Returns:
        A list ``[subject, property, object]``.
    """
    first_bar = text_triple.find('|')
    last_bar = text_triple.rfind('|')

    return [
        text_triple[:first_bar - 1],              # text before ' |'
        text_triple[first_bar + 2:last_bar - 1],  # text between '| ' and ' |'
        text_triple[last_bar + 2:],               # text after '| '
    ]

In [12]:
def modify_sentence(subject, object, entitySubject_info, entityObject_info, sentence, remove_underscores = False, put_OriginalEntities = True):
    """Replace two entity mentions in a sentence and report where they now start.

    Args:
        subject: Text inserted for the subject mention when ``put_OriginalEntities`` is true.
        object: Text inserted for the object mention when ``put_OriginalEntities`` is true.
        entitySubject_info: ``(mention text, start char, end char)`` of the subject mention.
        entityObject_info: ``(mention text, start char, end char)`` of the object mention.
        sentence: Sentence containing both mentions; converted with ``str``.
        remove_underscores: When true, underscores in the untouched parts of the
            sentence become spaces (the inserted texts are left as they are).
        put_OriginalEntities: When false, the mentions' own text (with spaces
            turned into underscores) is inserted instead of ``subject``/``object``.

    Returns:
        ``(modified_sentence, sub_startChar, obj_startChar)``: the rewritten
        sentence and the character offsets at which the subject and object
        texts begin in it.
    """
    mention_sub, subj_start, subj_end = entitySubject_info
    mention_obj, obj_start, obj_end = entityObject_info
    text = str(sentence)

    # Decide which texts are written into the sentence.
    if put_OriginalEntities:
        sub_insert, obj_insert = subject, object
    else:
        sub_insert = mention_sub.replace(' ', '_')
        obj_insert = mention_obj.replace(' ', '_')

    # Order the two mentions by position; ties are handled as "object first".
    subject_leads = subj_start < obj_start
    if subject_leads:
        lead_insert, lead_start, lead_end = sub_insert, subj_start, subj_end
        trail_insert, trail_start, trail_end = obj_insert, obj_start, obj_end
    else:
        lead_insert, lead_start, lead_end = obj_insert, obj_start, obj_end
        trail_insert, trail_start, trail_end = sub_insert, subj_start, subj_end

    before = text[0:lead_start]
    between = text[lead_end:trail_start]
    after = text[trail_end:]

    # The offset of the second insertion is measured on the untouched pieces.
    trail_startChar = len(before + lead_insert + between)

    if remove_underscores:
        before, between, after = (piece.replace('_', ' ') for piece in (before, between, after))
    modified_sentence = before + lead_insert + between + trail_insert + after

    if subject_leads:
        return modified_sentence, lead_start, trail_startChar
    return modified_sentence, trail_startChar, lead_start

## Type match with Spacy

In [13]:
def get_articles_typeMatch(type, property, n=10000):
    """Find DBpedia page titles of a given type that carry a given property.

    Args:
        type: DBpedia ontology class local name; a falsy value aborts the search.
        property: DBpedia property local name.
        n: Maximum number of titles requested from ``sparql_searchPages``.

    Returns:
        Whatever ``sparql_searchPages`` returns (a set of titles, or None on
        error), or None when no type was supplied.
    """
    if type:
        print('Finding articles with dbpedia type', type, '(and property "'+property+'")', end=' ')
        dbpTitles = sparql_searchPages(type, property, n)
        print('Done :)')
        return dbpTitles

    print('No type to find articles')
    return None

In [14]:
# Module-level caches keyed by article title:
#   nerSentences_dict: title -> list of NER-processed sentences (or None), filled by get_ner_sentences
#   articles_dict:     title -> article text (or None), filled by get_article
nerSentences_dict = dict()
articles_dict = dict()

In [15]:
def get_article(title, articles_dict):
    """Return the text of an article, downloading it only on a cache miss.

    Args:
        title: Article title.
        articles_dict: Cache mapping titles to article text (or None for
            articles that could not be fetched); updated in place.

    Returns:
        The article text, or None when the article is unavailable.
    """
    if title in articles_dict:
        # Previously the cached value was stored under an unrelated name, so
        # every cache hit ended in UnboundLocalError at the return statement.
        return articles_dict[title]

    article = simpleWiki_searchArticle(title) or None
    # Failed lookups are cached as None so they are not retried.
    articles_dict[title] = article
    return article

In [16]:
def get_ner_sentences(title, nerSentences_dict, articles=None):
    """Return the NER-processed sentences of an article, computing them once.

    Args:
        title: Article title.
        nerSentences_dict: Cache mapping titles to their list of processed
            sentences (or None); updated in place.
        articles: Optional mapping of titles to article text. Defaults to the
            module-level ``articles_dict``, which was the only source before.

    Returns:
        A list with one ``ner(sentence)`` result per sentence of the article,
        or None when there is no text for the title.
    """
    if title in nerSentences_dict:
        return nerSentences_dict[title]

    if articles is None:
        articles = articles_dict

    # .get(): a title that was never fetched is treated like an article without
    # text instead of raising KeyError in the middle of a long run.
    article_text = articles.get(title)
    if article_text:
        ner_article_sentences = [ner(sentence) for sentence in get_sentences_from_text(article_text)]
    else:
        ner_article_sentences = None

    nerSentences_dict[title] = ner_article_sentences
    return ner_article_sentences

In [17]:
def spacy_sentence_extraction(subject, object, ner_article_sentences, subSpacyTypes, objSpacyTypes, subObj_maxDistance=20):
    """Build candidate sentences by swapping type-matching entity pairs.

    For every sentence, each pair of entities whose labels match the subject
    types and the object types respectively is replaced by ``subject`` and
    ``object`` through ``modify_sentence``.

    Args:
        subject: Text written in place of the subject-like entity.
        object: Text written in place of the object-like entity.
        ner_article_sentences: Iterable of NER-processed sentences exposing
            ``.ents``; None or empty yields empty results.
        subSpacyTypes: Set of entity labels accepted for the subject.
        objSpacyTypes: Set of entity labels accepted for the object.
        subObj_maxDistance: Pairs are kept only when the difference between the
            entities' ``.start`` values is below this bound (presumably token
            offsets, as in spaCy's Span.start -- confirm).

    Returns:
        ``(sentences, starts)``: the modified sentences and, aligned with them,
        ``[sub_startChar, obj_startChar]`` pairs.
    """
    ner_article_extractedSentences = []
    ner_article_startSubObj = []
    if not ner_article_sentences:
        return ner_article_extractedSentences, ner_article_startSubObj

    print('Extracting NER sentences matching in type with subject and object:', end=' ')
    for ner_article_sentence in tqdm(ner_article_sentences):
        # Group the sentence's entities by label.
        ents_by_label = {}
        for ent in ner_article_sentence.ents:
            ents_by_label.setdefault(ent.label_, []).append(ent)

        labels_present = set(ents_by_label)
        sub_matches = labels_present & subSpacyTypes
        obj_matches = labels_present & objSpacyTypes

        for sub_label, obj_label in product(sub_matches, obj_matches):
            for entity_subject, entity_object in product(ents_by_label[sub_label], ents_by_label[obj_label]):
                # When the subject and object types overlap, the same mention
                # shows up on both sides; replacing one span with both texts
                # produced a garbled sentence, so such pairs are skipped.
                if (entity_subject.start_char == entity_object.start_char
                        and entity_subject.end_char == entity_object.end_char):
                    continue
                if abs(entity_subject.start - entity_object.start) >= subObj_maxDistance:
                    continue

                modified_sentence, sub_startChar, obj_startChar = modify_sentence(
                    subject,
                    object,
                    [str(entity_subject), entity_subject.start_char, entity_subject.end_char],
                    [str(entity_object), entity_object.start_char, entity_object.end_char],
                    ner_article_sentence,
                    put_OriginalEntities=True,
                )
                ner_article_extractedSentences.append(modified_sentence)
                ner_article_startSubObj.append([sub_startChar, obj_startChar])

    return ner_article_extractedSentences, ner_article_startSubObj

In [18]:
# Cache used by subObj_nerTypes: subject/object string -> its `ner(...)` result.
NERSubjectObject_dict = dict()

In [19]:
def subObj_nerTypes(subject, object, NERSubjectObject_dict):
    """Get the NER labels found in a triple's subject and object strings.

    Each string is run through ``ner`` (after turning underscores into spaces)
    unless its result is already cached.

    Args:
        subject: Subject string of the triple.
        object: Object string of the triple.
        NERSubjectObject_dict: Cache mapping strings to their ``ner`` result;
            updated in place on a miss.

    Returns:
        ``(subTypes, objTypes)``: two sets of entity labels.
    """
    def _labels_for(name):
        # Returns the label set of one string, consulting the cache first.
        if name in NERSubjectObject_dict:
            return {ent.label_ for ent in NERSubjectObject_dict[name].ents}
        parsed = ner(name.replace('_', ' '))
        labels = {ent.label_ for ent in parsed.ents}
        NERSubjectObject_dict[name] = parsed
        return labels

    # Subject first, so that an identical object string hits the cache.
    subTypes = _labels_for(subject)
    objTypes = _labels_for(object)
    return subTypes, objTypes

In [20]:
import pickle

The main function is divided into 3 parts (the 1st and 3rd are in this colab, and the 2nd is in the "Coreference Resolution.ipynb" colab). Here are the steps to run it correctly:
1. Run the first part (extractSentencesFromTriple_TypeMatch_Wikipedia_1), with a triple as input. This part will save a file called "articles_dict.pickle" in the folder of the colab, download the file.
2. Open Coreference Resolution.ipynb and run the setup part. Then, import the previously downloaded file "articles_dict.pickle" in the colab folder, run the code, and download again "articles_dict.pickle", which has now been modified.
3. Return to this colab, import the new "articles_dict.pickle" and run the final part (extractSentencesFromTriple_TypeMatch_Wikipedia_2), which will load the file imported and use the output of extractSentencesFromTriple_TypeMatch_Wikipedia_1 to finish returning the final sentence candidates (and the positions in the string where the subject and object start)

In [21]:
'1 Triple function (part 1)'

def extractSentencesFromTriple_TypeMatch_Wikipedia_1(triple):
    """Part 1 for a single triple: find related articles and cache their text.

    Determines the NER types of the subject and object, looks up the subject's
    DBpedia type, retrieves titles of pages of that type having the triple's
    property, downloads those articles into the module-level ``articles_dict``
    and saves that dictionary to 'articles_dict.pickle'.

    Args:
        triple: ``[subject, property, object]``.

    Returns:
        ``[n_dbpTitles, subTypes, objTypes]``, or ``[None, None, None]`` when
        no NER types or no titles were found.
    """
    [subject, property, object] = triple
    subTypes, objTypes = subObj_nerTypes(subject, object, NERSubjectObject_dict)
    if not subTypes or not objTypes:
        print('Types not found for subject or object')
        print('Subject Types:', subTypes,'\t', 'Object Types:', objTypes)
        return [None, None, None]

    subjectType = sparql_entityType(subject)
    n_dbpTitles = get_articles_typeMatch(subjectType, property)
    if not n_dbpTitles:
        return [None, None, None]

    try:
        for title in n_dbpTitles:
            get_article(title, articles_dict)
    finally:
        # The dictionary used to be re-pickled after every single title, which
        # rewrites an ever-growing file once per article. Dumping once in a
        # finally block still saves the progress if the loop is interrupted.
        with open('articles_dict.pickle', 'wb') as file:
            pickle.dump(articles_dict, file)

    return [n_dbpTitles, subTypes, objTypes]

In [22]:
'1 Triple function (part 2)'

def extractSentencesFromTriple_TypeMatch_Wikipedia_2(n_dbpTitles, subTypes, objTypes, triple=None, articles_path='/content/articles_dict.pickle'):
    """Part 2 for a single triple: extract candidate sentences from the articles.

    Args:
        n_dbpTitles: Titles returned by part 1; None or empty yields no candidates.
        subTypes: NER labels accepted for the subject (from part 1).
        objTypes: NER labels accepted for the object (from part 1).
        triple: Optional ``[subject, property, object]``. When omitted, the
            notebook-level ``subject`` and ``object`` variables are used, as before.
        articles_path: Pickle file holding the articles dictionary.

    Returns:
        ``(sentences, starts)`` as produced by ``spacy_sentence_extraction``.
    """
    # get_ner_sentences reads the module-level articles_dict. Without this
    # declaration the loaded pickle was bound to a local that nothing used, so
    # the file's contents were ignored.
    global articles_dict

    if not n_dbpTitles:
        # Part 1 returns None titles on failure; there is nothing to extract.
        return [], []

    if triple is not None:
        sub, _prop, obj = triple
    else:
        sub, obj = subject, object  # legacy: notebook-level variables

    # NOTE: pickle.load can execute arbitrary code; only load files you produced.
    with open(articles_path, 'rb') as file:
        articles_dict = pickle.load(file)

    allNer_article_sentences = []
    for title in tqdm(n_dbpTitles):
        ner_article_sentences = get_ner_sentences(title, nerSentences_dict)
        if ner_article_sentences:
            allNer_article_sentences.extend(ner_article_sentences)

    ner_article_extractedSentences, ner_article_startSubObj = spacy_sentence_extraction(sub, obj, allNer_article_sentences, subTypes, objTypes)
    return ner_article_extractedSentences, ner_article_startSubObj

In [33]:
'intermediate function'
def TypeMatch_Wikipedia_All1_0(triple):
    """Part 1 for one triple without saving to disk.

    Very similar to extractSentencesFromTriple_TypeMatch_Wikipedia_1, but the
    triple itself is included in the result and nothing is pickled here.

    Args:
        triple: ``[subject, property, object]``.

    Returns:
        ``[triple, n_dbpTitles, subTypes, objTypes]``, or four Nones when no
        NER types or no titles were found.
    """
    subject, property, object = triple

    subTypes, objTypes = subObj_nerTypes(subject, object, NERSubjectObject_dict)
    if not (subTypes and objTypes):
        print('Types not found for subject or object')
        print('Subject Types:', subTypes,'\t', 'Object Types:', objTypes)
        return [None, None, None, None]

    n_dbpTitles = get_articles_typeMatch(sparql_entityType(subject), property)
    if not n_dbpTitles:
        return [None, None, None, None]

    # Fill the module-level article cache; the texts themselves are not needed here.
    for title in n_dbpTitles:
        get_article(title, articles_dict)

    return [triple, n_dbpTitles, subTypes, objTypes]

In [24]:
'All Triples function (part 1)'
def TypeMatchWikipedia_Run_SaveAll1(triples):
    """Run part 1 for every triple and save the article cache once at the end.

    Args:
        triples: Iterable of ``[subject, property, object]`` triples.

    Returns:
        A list with one ``[triple, n_dbpTitles, subTypes, objTypes]`` entry per
        input triple, as returned by ``TypeMatch_Wikipedia_All1_0``.
    """
    allVariables = [TypeMatch_Wikipedia_All1_0(triple) for triple in triples]

    # Persist the module-level article cache for the coreference step.
    with open('articles_dict.pickle', 'wb') as file:
        pickle.dump(articles_dict, file)

    return allVariables

In [31]:
'All Triples function (part 2)'

import os

def TypeMatchWikipedia_Run_SaveAll2(variables, articles_path='/content/new_articles_dict.pickle'):
    """Run part 2 for every triple and write the results to pickle files.

    For entry number ``i`` three files are written into
    'ResultFiles_TypeMatchSimpleWikipedia/': the triple, the candidate
    sentences, and the subject/object start offsets.

    Args:
        variables: List of ``[triple, n_dbpTitles, subTypes, objTypes]`` entries
            as returned by ``TypeMatchWikipedia_Run_SaveAll1``. Entries for
            failed triples (all None) are skipped.
        articles_path: Pickle file holding the articles dictionary.

    Returns:
        None.
    """
    # get_ner_sentences reads the module-level articles_dict. Without this
    # declaration the loaded pickle was bound to a local that nothing used.
    global articles_dict

    outputFolder = 'ResultFiles_TypeMatchSimpleWikipedia/'
    os.makedirs(outputFolder, exist_ok=True)

    # NOTE: pickle.load can execute arbitrary code; only load files you produced.
    with open(articles_path, 'rb') as file:
        articles_dict = pickle.load(file)

    for i, [triple, n_dbpTitles, subTypes, objTypes] in enumerate(variables):
        if triple is None or not n_dbpTitles:
            # Part 1 found no types or no titles for this triple. The index is
            # still consumed so file numbering matches positions in `variables`.
            continue

        # Use this entry's own subject/object. Previously the notebook-level
        # `subject`/`object` variables were used for every triple.
        [sub, prop, obj] = triple

        allNer_article_sentences = []
        for title in tqdm(n_dbpTitles):
            ner_article_sentences = get_ner_sentences(title, nerSentences_dict)
            if ner_article_sentences:
                allNer_article_sentences.extend(ner_article_sentences)
        ner_article_extractedSentences, ner_article_startSubObj = spacy_sentence_extraction(sub, obj, allNer_article_sentences, subTypes, objTypes)

        outputs = [
            (str(i) + '_triple('+sub+'-'+prop+'-'+obj+').pkl', triple),
            (str(i) + '_sentences.pkl', ner_article_extractedSentences),
            (str(i) + '_startSubObj.pkl', ner_article_startSubObj),
        ]
        for fileName, payload in outputs:
            with open(outputFolder + fileName, 'wb') as file:
                pickle.dump(payload, file)

## Run

In [27]:
# Example triple used by the run cells below.
# NOTE(review): `property` and `object` shadow Python builtins at notebook level,
# and some functions above read `subject`/`object` as module-level names, so
# renaming these variables would need matching changes there.
subject = 'Cristiano_Ronaldo'
property = 'birthPlace'
object = 'Portugal'

triple = [subject, property, object]

In [28]:
triple

['Cristiano_Ronaldo', 'birthPlace', 'Portugal']

In [46]:
# Start the run with empty sentence and article caches.
nerSentences_dict = dict()
articles_dict = dict()

In [47]:
# Start the run with an empty subject/object NER cache.
NERSubjectObject_dict = dict()

In [48]:
# Part 1: collect titles/types for the example triple and write 'articles_dict.pickle'.
variables = TypeMatchWikipedia_Run_SaveAll1([triple])

Finding articles with dbpedia type Person (and property "birthPlace") Done :)


In [50]:
# Part 2: expects the articles pickle to have been uploaded to /content beforehand;
# results are written under 'ResultFiles_TypeMatchSimpleWikipedia/'.
TypeMatchWikipedia_Run_SaveAll2(variables)

  0%|          | 0/10000 [00:00<?, ?it/s]

Extracting NER sentences matching in type with subject and object: 

  0%|          | 0/393 [00:00<?, ?it/s]