## Initial imports and file downloads

In [1]:
from tqdm.auto import tqdm #For progress bars
import random
from IPython.display import clear_output #For clearing outputs of installs
import nltk
nltk.download('punkt')
from itertools import product
clear_output()

In [2]:
# Install gdown; the commands below use it to fetch files by Google Drive id.
!pip install --upgrade --no-cache-dir gdown

# Fetch and decompress the WEXEA corpus.
# NOTE(review): the trailing labels say "wexea_dict.txt", but the file that is
# gunzipped here (and later read through wexeaFile_path) is en_wexea.txt --
# presumably the labels are stale; confirm the name of the downloaded file.
! gdown 1_4wBU1-YHqCsi7aJus_h1cphW1qwLyJh #wexea_dict.txt.gz
! gunzip /content/en_wexea.txt.gz #unzip wexea_dict.txt

# Fetch the pickle that is presumably the one loaded later from wexeaDict_path.
! gdown 13sikzKkOtDogKMezxelEUnoWXcbB6elj #wexea_dict.pkl

# Hide the install/download logs.
clear_output()

## Setup datasets and tools

### DBpedia SPARQL (+ functions with queries)

In [3]:
# SPARQLWrapper is the client the functions below use to query DBpedia.
!pip install sparqlwrapper
# Drop the pip log so that only the confirmation line below remains visible.
clear_output()
print('SPARQLWrapper installed!')

SPARQLWrapper installed!


In [4]:
from SPARQLWrapper import SPARQLWrapper, JSON

def sparqlQuery(entity):
    """Build the SPARQL query that lists the DBpedia ontology types of an entity.

    Args:
        entity: Local name of a DBpedia resource (the part after
            ``http://dbpedia.org/resource/``), e.g. ``'Cristiano_Ronaldo'``.

    Returns:
        The query text selecting every ``rdf:type`` of ``dbr:<entity>`` whose
        IRI starts with the ``dbo:`` namespace.
    """
    # Commas are written backslash-escaped inside the prefixed name. The
    # backslash is spelled explicitly ('\\,') so the literal is not an invalid
    # escape sequence.
    escaped_entity = entity.replace(',', '\\,')

    # The entity is placed by name in the template (doubled braces are literal
    # braces) instead of being spliced in at a hard-coded character offset,
    # which would silently break on any edit to the query text.
    return f'''
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>

SELECT DISTINCT ?obj {{
    dbr:{escaped_entity} rdf:type ?obj
    FILTER strstarts(str(?obj), str(dbo:))
}}'''

def sparql_entityType(entity):
    """Return one DBpedia ontology type name for ``entity``, or None.

    Sends the query built by ``sparqlQuery`` to the public DBpedia endpoint and
    returns the last path segment of the first binding's type IRI.

    Returns:
        The type name as a string, or None when the request or the result
        lookup raises (for example when the result has no bindings).
    """
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setReturnFormat(JSON)
    endpoint.setQuery(sparqlQuery(entity))

    try:
        response = endpoint.queryAndConvert()
        first_binding = response["results"]["bindings"][0]
        type_iri = first_binding['obj']['value']
        return type_iri.split('/')[-1]
    except Exception:
        # Best effort: any failure is reported to the caller as "no type".
        return None

In [5]:
def sparqlQuery2(entity):
    """Build the SPARQL query that lists every property touching an entity.

    Args:
        entity: Local name of a DBpedia resource (the part after
            ``http://dbpedia.org/resource/``).

    Returns:
        The query text selecting each distinct property for which
        ``dbr:<entity>`` appears as subject or as object.
    """
    # Escape commas the same way sparqlQuery does, so both query builders
    # accept the same entity names.
    resource = entity.replace(',', '\\,')

    # Both occurrences are placed by name (doubled braces are literal braces)
    # rather than spliced in at hard-coded character offsets.
    return f'''
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT DISTINCT ?property {{
    {{ dbr:{resource} ?property ?o }}
    union
    {{ ?s ?property dbr:{resource} }}
}}'''

def sparql_properties(entity):
    """Return the names of the properties that involve ``entity`` in DBpedia.

    Sends the query built by ``sparqlQuery2`` to the public DBpedia endpoint.

    Returns:
        A set with the last path segment of every returned property IRI, or
        None when the request or the result handling raises.
    """
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setReturnFormat(JSON)
    endpoint.setQuery(sparqlQuery2(entity))

    try:
        response = endpoint.queryAndConvert()
        return {
            binding['property']['value'].split('/')[-1]
            for binding in response['results']['bindings']
        }
    except Exception:
        # Best effort: any failure is reported to the caller as None.
        return None

In [6]:
def sparqlQuery3(type, property, limit):
    """Build the SPARQL query that samples resources of a type having a property.

    Args:
        type: DBpedia ontology class name (placed after ``dbo:``).
        property: Property name, looked up under both the ``prop:`` and the
            ``dbo:`` namespaces.
        limit: Maximum number of rows; formatted into the ``LIMIT`` clause.

    Returns:
        The query text selecting distinct subjects in random order
        (``ORDER BY RAND()``).
    """
    # The values are placed by name in the template (doubled braces are literal
    # braces) instead of being spliced in at four hard-coded character offsets,
    # which would silently corrupt the query on any edit to its text.
    return f'''
PREFIX prop: <http://dbpedia.org/property/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbo: <http://dbpedia.org/ontology/>

SELECT DISTINCT ?s {{
    {{ ?s (prop:{property}|dbo:{property}) ?o .
      ?s rdf:type dbo:{type} }}
}} ORDER BY RAND() LIMIT {limit}
'''

def sparql_searchPages(type, property, limit=400, wexea=False):
    """Return titles of DBpedia resources of ``type`` that have ``property``.

    Sends the query built by ``sparqlQuery3`` to the public DBpedia endpoint.

    Args:
        type: DBpedia ontology class name.
        property: Property name the resources must have.
        limit: Maximum number of rows requested.
        wexea: When true, titles keep their underscores; otherwise underscores
            are replaced by spaces.

    Returns:
        A set of titles (last path segment of each resource IRI), or None after
        printing the error when the request or the result handling raises.
    """
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setReturnFormat(JSON)
    endpoint.setQuery(sparqlQuery3(type, property, limit))

    try:
        response = endpoint.queryAndConvert()
        local_names = (
            binding['s']['value'].split('/')[-1]
            for binding in response['results']['bindings']
        )
        if wexea:
            return set(local_names)
        return {name.replace('_',' ') for name in local_names}
    except Exception as e:
        print('Error:', e)
        return None

## Pipeline common functions

In [7]:
def get_triple(text_triple):
    """Split a ``'subject | property | object'`` string into its three parts.

    The subject ends one character before the first ``|`` and the object starts
    two characters after the last ``|``; that is, exactly one separator
    character is dropped on each side of those bars. Any bars between the first
    and the last one stay inside the property.

    Returns:
        ``[subject, property, object]`` as a list of strings.
    """
    first_bar = text_triple.find('|')
    last_bar = text_triple.rfind('|')
    return [
        text_triple[:first_bar - 1],
        text_triple[first_bar + 2:last_bar - 1],
        text_triple[last_bar + 2:],
    ]

In [8]:
def modify_sentence(subject, object, entitySubject_info, entityObject_info, sentence, remove_underscores = False, put_OriginalEntities = True):
    """Replace two entity mentions in a sentence and report their new starts.

    Args:
        subject: Text inserted at the subject mention when
            ``put_OriginalEntities`` is true.
        object: Text inserted at the object mention when
            ``put_OriginalEntities`` is true.
        entitySubject_info: ``(entity string, start, end)`` of the subject
            mention in ``sentence``.
        entityObject_info: ``(entity string, start, end)`` of the object
            mention in ``sentence``.
        sentence: Source sentence; converted with ``str()``.
        remove_underscores: When true, underscores in the text around the two
            mentions become spaces (the inserted texts are left as they are).
        put_OriginalEntities: When false, the mentions' own entity strings are
            inserted instead, with spaces replaced by underscores.

    Returns:
        ``(modified_sentence, sub_startChar, obj_startChar)`` where the two
        offsets are the start positions of the inserted subject and object.
    """
    entitySubject_str, subj_start, subj_end = entitySubject_info
    entityObject_str, obj_start, obj_end = entityObject_info
    sentence = str(sentence)

    # Choose the text inserted at each mention.
    if put_OriginalEntities:
        subject_text, object_text = subject, object
    else:
        subject_text = entitySubject_str.replace(' ','_')
        object_text = entityObject_str.replace(' ','_')

    # Order the two mentions by position; on equal starts the object goes first.
    subject_first = subj_start < obj_start
    if subject_first:
        first_text, first_start, first_end = subject_text, subj_start, subj_end
        second_text, second_start, second_end = object_text, obj_start, obj_end
    else:
        first_text, first_start, first_end = object_text, obj_start, obj_end
        second_text, second_start, second_end = subject_text, subj_start, subj_end

    before = sentence[0:first_start]
    between = sentence[first_end:second_start]
    after = sentence[second_end:]

    # The first mention keeps its start; the second one shifts by the length
    # difference introduced by the first replacement.
    second_startChar = len(before + first_text + between)

    if remove_underscores:
        # Replacing '_' with ' ' keeps lengths, so the offsets above still hold.
        before, between, after = (piece.replace('_',' ') for piece in (before, between, after))

    modified_sentence = before + first_text + between + second_text + after

    if subject_first:
        return modified_sentence, first_start, second_startChar
    return modified_sentence, second_startChar, first_start

## Type match with WEXEA

### Set up wexea

In [9]:
# Decompressed WEXEA corpus file; wexea_search opens it and seeks to an offset
# taken from the dictionary below.
wexeaFile_path = '/content/en_wexea.txt'
# Pickle read by load_wexeaDict; wexea_search looks up article titles (with
# underscores) in it and uses the stored value as a seek position.
wexeaDict_path = '/content/wexea_dict.pkl'

In [10]:
import pickle 

def load_wexeaDict(path=None):
    """Load the pickled WEXEA lookup dictionary.

    Args:
        path: Pickle file to read. Defaults to the module-level
            ``wexeaDict_path`` (resolved at call time), which keeps the original
            zero-argument call working.

    Returns:
        The unpickled object.
    """
    if path is None:
        path = wexeaDict_path
    # NOTE(security): pickle.load can execute arbitrary code from the file. The
    # default file is downloaded earlier in this notebook, so only run this
    # against a source you trust.
    with open(path, 'rb') as f:
        return pickle.load(f)
    
# Loaded once when this cell runs. wexea_search captures this object as the
# default value of its wexea_dict parameter, so this cell must run before that
# function is defined.
wexea_dict = load_wexeaDict()

In [11]:
#Some info

#print('''Total Wikipedia articles: 6458670
#Total WEXEA articles: 2676086
#Wikipedia articles available at WEXEA: 2628436 (98.22% of WEXEA articles match with Wikipedia ones) (40.70% of Wikipedia articles)
#Wikipedia articles not available at WEXEA: 3830234''')

### Functions

In [12]:
def get_articles_typeMatch_WEXEA(type, property, n=30):
    """Pick up to ``n`` DBpedia titles of a given type that also exist in WEXEA.

    Args:
        type: DBpedia ontology class name of the wanted articles. A falsy value
            aborts the search.
        property: Property the articles must have in DBpedia.
        n: Maximum number of titles to return.

    Returns:
        A list of at most ``n`` titles present in ``wexea_dict``, or None when
        there is no type to search for or the DBpedia query failed.
    """
    if not type:
        print('No type to find articles')
        return None
    print('Finding articles with dbpedia type', type, '(and property "'+property+'")', end=' ')

    # Ask for twice as many titles as needed, since titles missing from
    # wexea_dict are filtered out below.
    dbpTitles = sparql_searchPages(type, property, n*2, True)
    if dbpTitles is None:
        # The query failed (sparql_searchPages has printed the error). Without
        # this guard, len(None) below would raise TypeError.
        return None
    n = min(n, len(dbpTitles))

    n_dbpTitles = []
    # The context manager closes the progress bar even on an early exit.
    with tqdm(total=n) as pbar:
        for dbpTitle in dbpTitles:
            if len(n_dbpTitles) >= n:
                break
            if dbpTitle in wexea_dict:
                n_dbpTitles.append(dbpTitle)
                pbar.update(1)
    return n_dbpTitles

In [13]:
import re

def replace_and_index(long_string, old_substring, new_substring):
    """Replace the first occurrence of a substring and report where it landed.

    Only the first occurrence is replaced; later occurrences are left in place.

    Args:
        long_string: Text to search.
        old_substring: Text to replace. An empty string is treated as
            "not found".
        new_substring: Replacement text.

    Returns:
        ``(new_string, [indexes, new_substring])`` where ``indexes`` is the
        ``(start, end)`` tuple of the inserted text in ``new_string``, or an
        empty list when nothing was replaced.
    """
    # A single find() replaces the earlier loop, which after the first
    # replacement advanced one character at a time and never terminated for an
    # empty old_substring.
    position = long_string.find(old_substring) if old_substring else -1
    if position == -1:
        return long_string, [[], new_substring]

    replaced = (
        long_string[:position]
        + new_substring
        + long_string[position + len(old_substring):]
    )
    return replaced, [(position, position + len(new_substring)), new_substring]

def extract_text(text):
    """Replace ``[[...]]`` markup with entity names and collect the entities.

    Every ``[[a|b|...]]`` span is replaced by its first field with spaces
    turned into underscores. Only spans with exactly three ``|``-separated
    fields are recorded as entities.

    Returns:
        ``(entities, text, entities_indexes)``: the set of recorded entity
        names, the rewritten text, and one ``replace_and_index`` result per
        recorded entity.
    """
    entities = set()
    entities_indexes = []
    for markup in re.findall(r'\[\[(.*?)\]\]', text):
        fields = markup.split('|')
        entity = fields[0].replace(' ','_')
        text, indexes = replace_and_index(text, '[[' + markup + ']]', entity)

        # Spans without exactly three fields are still replaced in the text,
        # but they are not recorded as entities.
        if not indexes or len(fields) != 3:
            continue
        entities.add(entity)
        entities_indexes.append(indexes)
    return entities, text, entities_indexes

def wexea_search(entity, file_path = wexeaFile_path, wexea_dict = wexea_dict):
    """Read one article from the WEXEA file and extract its entity sentences.

    Args:
        entity: Article title; spaces are replaced by underscores before lookup.
        file_path: WEXEA corpus file.
        wexea_dict: Mapping from article title to the position passed to
            ``seek`` in ``file_path``.

    Returns:
        ``(entities, texts, entities_indexes)`` built from the article lines
        that contain at least two distinct recorded entities, or
        ``(None, None, None)`` when the title is not in ``wexea_dict``.
    """
    article_key = entity.replace(' ','_')
    if article_key not in wexea_dict:
        print(f"Article '{article_key}' not found in file.")
        return None, None, None

    article_lines = []
    with open(file_path, 'rb') as handle:
        handle.seek(wexea_dict[article_key])
        handle.readline()  # skip the first line at the article's position
        for raw_line in handle:
            decoded = raw_line.decode().rstrip('\r\n')
            # A '###FILENAME###' line ends the article.
            if decoded.startswith('###FILENAME###'):
                break
            article_lines.append(decoded)

    found_entities = set()
    sentences = []
    sentence_indexes = []
    for article_line in article_lines:
        line_entities, line_text, line_indexes = extract_text(article_line)
        if len(line_entities) < 2:
            continue
        found_entities.update(line_entities)
        sentence_indexes.append(line_indexes)
        if line_text:
            sentences.append(line_text)
    return list(found_entities), sentences, sentence_indexes

In [14]:
def extractSentences_Wexea(entities, subject, object, subjectType, objectType, entities_indexes, texts, subObj_maxDistance=20):
    """Build sentences in which a type-matching entity pair is swapped for the triple's entities.

    For every sentence, each mention whose DBpedia type equals ``subjectType``
    is paired with each mention whose type equals ``objectType``. Pairs whose
    start offsets are closer than ``subObj_maxDistance`` characters yield one
    sentence in which the two mentions are replaced by ``subject`` and
    ``object``.

    Args:
        entities: Entity names of the article, as returned by ``wexea_search``.
        subject: Text inserted in place of the subject-typed mention.
        object: Text inserted in place of the object-typed mention.
        subjectType: DBpedia type a mention must have to stand for the subject.
        objectType: DBpedia type a mention must have to stand for the object.
        entities_indexes: Per sentence, a list of ``[(start, end), entity]``.
        texts: Sentences, aligned by position with ``entities_indexes``.
        subObj_maxDistance: Exclusive upper bound on the distance between the
            two mentions' start offsets.

    Returns:
        ``(finalSentences, final_entitiesIndexes)`` where each index entry is
        ``[sub_startChar, obj_startChar]`` in the matching sentence, or
        ``(None, None)`` when ``entities``, ``texts`` and ``entities_indexes``
        are all None.
    """
    if entities is None and texts is None and entities_indexes is None:
        return None, None

    # One DBpedia lookup per distinct entity; per the original note, this is
    # where almost all of the time is spent.
    dict_entityIndex = {entity: sparql_entityType(entity) for entity in tqdm(entities)}

    finalSentences = []
    final_entitiesIndexes = []

    for i, entity_index in enumerate(tqdm(entities_indexes)):
        # Group this sentence's mentions by type. The two wanted types always
        # get an entry, so the pairing below never hits a missing key.
        mentions_by_type = {subjectType: [], objectType: []}
        for index, entity in entity_index:
            mentions_by_type.setdefault(dict_entityIndex[entity], []).append([entity, index])

        for subject_mention, object_mention in product(mentions_by_type[subjectType], mentions_by_type[objectType]):
            subject_ent, (sub_startChar, sub_endChar) = subject_mention
            object_ent, (obj_startChar, obj_endChar) = object_mention

            # When subjectType == objectType both lists hold the same mentions,
            # so a mention would be paired with itself and the sentence would
            # get subject and object glued into one span.
            if (sub_startChar, sub_endChar) == (obj_startChar, obj_endChar):
                continue

            if abs(sub_startChar - obj_startChar) < subObj_maxDistance:
                modified_sentence, new_sub_start, new_obj_start = modify_sentence(
                    subject, object,
                    [subject_ent, sub_startChar, sub_endChar],
                    [object_ent, obj_startChar, obj_endChar],
                    texts[i], True)
                finalSentences.append(modified_sentence)
                final_entitiesIndexes.append([new_sub_start, new_obj_start])

    return finalSentences, final_entitiesIndexes


In [15]:
def extractSentencesFromTriple_TypeMatch_WEXEA(triple):
    """Collect WEXEA sentences whose entity types match those of a triple.

    Looks up the DBpedia types of the triple's subject and object, picks
    articles of the subject's type that have the triple's property, and
    gathers the sentences produced by ``extractSentences_Wexea`` for each one.

    Args:
        triple: ``[subject, property, object]``.

    Returns:
        ``(sentences, start_offsets)`` accumulated over all articles, or
        ``(None, None)`` when no article was found.
    """
    subject, property, object = triple
    subjectType = sparql_entityType(subject)
    # NOTE(review): objectType may be None when the lookup fails; mentions whose
    # own type lookup failed would then count as object candidates -- confirm
    # whether that is intended.
    objectType = sparql_entityType(object)

    n_dbpTitles = get_articles_typeMatch_WEXEA(subjectType, property)
    if not n_dbpTitles:
        return None, None

    all_finalSentences = []
    all_final_entitiesIndexes = []
    print('articles:', end=' ')
    for title in tqdm(n_dbpTitles):
        entities, texts, entities_indexes = wexea_search(title)
        finalSentences, final_entitiesIndexes = extractSentences_Wexea(entities, subject, object, subjectType, objectType, entities_indexes, texts)
        if finalSentences is None:
            # The article could not be read; skip it rather than fail on
            # "list += None".
            continue
        all_finalSentences += finalSentences
        all_final_entitiesIndexes += final_entitiesIndexes
    return all_finalSentences, all_final_entitiesIndexes

In [16]:
import os

def WEXEA_Run_SaveAll(triples):
    """Apply the method to several triples and save the files the next pipeline steps need.

    For the triple at position ``i`` three pickles are written to
    ``ResultFiles_WEXEA/``: ``<i>_triple(<sub>-<prop>-<obj>).pkl`` (the triple),
    ``<i>_sentences.pkl`` and ``<i>_startSubObj.pkl`` (the two results of
    ``extractSentencesFromTriple_TypeMatch_WEXEA``, which may be None).

    Args:
        triples: Iterable of ``[subject, property, object]`` string triples.
    """
    outputFolder = 'ResultFiles_WEXEA/'
    # exist_ok replaces the separate existence check.
    os.makedirs(outputFolder, exist_ok=True)

    for i, triple in enumerate(triples):
        ner_article_extractedSentences, ner_article_startSubObj = extractSentencesFromTriple_TypeMatch_WEXEA(triple)
        sub, prop, obj = triple

        # A path separator inside a name would be read as a directory by
        # open() and fail after the expensive extraction; replace it.
        triple_label = (sub + '-' + prop + '-' + obj).replace('/', '_').replace(os.sep, '_')

        outputs = {
            str(i) + '_triple(' + triple_label + ').pkl': triple,
            str(i) + '_sentences.pkl': ner_article_extractedSentences,
            str(i) + '_startSubObj.pkl': ner_article_startSubObj,
        }
        for fileName, payload in outputs.items():
            with open(outputFolder + fileName, 'wb') as file:
                pickle.dump(payload, file)

## Run

In [None]:
# Example triple. This cell has no execution count, and the next cell rebinds
# the same names, so it appears not to be part of the saved run.
# NOTE(review): `property` and `object` rebind Python builtins at kernel level.
subject = 'Xavi'
property = 'birthPlace'
object = 'Terrassa'

triple = [subject, property, object]

In [17]:
# Triple used by the run below: [subject, property, object], with underscores
# in the entity names.
# NOTE(review): `property` and `object` rebind Python builtins at kernel level.
subject = 'Cristiano_Ronaldo'
property = 'birthPlace'
object = 'Portugal'

triple = [subject, property, object]

In [18]:
# WEXEA_Run_SaveAll takes a list of triples; here it holds a single one.
triples = [triple]

In [36]:
# Runs the extraction for every triple and writes the pickles under
# ResultFiles_WEXEA/. Each article triggers one DBpedia lookup per entity.
WEXEA_Run_SaveAll(triples)

Finding articles with dbpedia type Person (and property "birthPlace") 

  0%|          | 0/30 [00:00<?, ?it/s]

articles: 

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/36 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

  0%|          | 0/208 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/158 [00:00<?, ?it/s]

  0%|          | 0/26 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/72 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

In [24]:
# Rebuild, at kernel level, the values WEXEA_Run_SaveAll uses for the first
# triple's file name (index 0 and the triple's three parts).
i = 0
[sub,prop,obj] = triple

In [25]:
# Re-write only the triple pickle, using the same file-name pattern as
# WEXEA_Run_SaveAll. Relies on `i`, `sub`, `prop`, `obj` from the previous cell.
outputFolder = 'ResultFiles_WEXEA/'
fileName_triple = str(i) + '_triple('+sub+'-'+prop+'-'+obj+').pkl'
with open(outputFolder + fileName_triple, 'wb') as file:
    pickle.dump(triple, file)

In [26]:
# Read the triple pickle back; this rebinds the kernel-level `triple`.
with open('/content/ResultFiles_WEXEA/0_triple(Cristiano_Ronaldo-birthPlace-Portugal).pkl', 'rb') as file:
    triple = pickle.load(file)

In [37]:
# Load the three files saved for triple 0: the extracted sentences, the
# [subject start, object start] offsets for each sentence, and the triple.
with open('/content/ResultFiles_WEXEA/0_sentences.pkl', 'rb') as file:
    sentences = pickle.load(file)
with open('/content/ResultFiles_WEXEA/0_startSubObj.pkl', 'rb') as file:
    startSubObj = pickle.load(file)
with open('/content/ResultFiles_WEXEA/0_triple(Cristiano_Ronaldo-birthPlace-Portugal).pkl', 'rb') as file:
    triple = pickle.load(file)

In [38]:
# Number of sentences extracted for triple 0.
len(sentences)

58

In [None]:
/content/ResultFiles_WEXEA/0_triple(Cristiano_Ronaldo-birthPlace-Portugal).pkl

In [23]:
# NOTE(review): the saved output of this cell is an UnpicklingError. Its
# execution count (23) is lower than that of the cell that rewrites this file
# (25), so it was presumably run before that rewrite -- confirm, and consider
# deleting this cell.
with open('/content/ResultFiles_WEXEA/0_triple(Cristiano_Ronaldo-birthPlace-Portugal).pkl', 'rb') as file:
    triple = pickle.load(file)

UnpicklingError: ignored

In [39]:
# Archive the result folder recursively (entries keep the content/... prefix,
# as the saved output shows).
!zip -r /content/ResultFiles_WEXEA.zip /content/ResultFiles_WEXEA

# Colab-only helper: download the archive through the browser.
from google.colab import files
files.download("/content/ResultFiles_WEXEA.zip")

  adding: content/ResultFiles_WEXEA/ (stored 0%)
  adding: content/ResultFiles_WEXEA/0_triple(Cristiano_Ronaldo-birthPlace-Portugal).pkl (deflated 3%)
  adding: content/ResultFiles_WEXEA/0_startSubObj.pkl (deflated 50%)
  adding: content/ResultFiles_WEXEA/0_sentences.pkl (deflated 58%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [43]:
# NOTE(review): the saved output is "zip error: Nothing to do!". The source
# path is /ResultFiles_WEXEA/ rather than /content/ResultFiles_WEXEA/, which is
# presumably a typo -- confirm.
!zip -j /content/ResultFiles_WEXEA.zip /ResultFiles_WEXEA/


zip error: Nothing to do! (/content/ResultFiles_WEXEA.zip)


In [44]:
# Extract the archive to inspect its contents. The saved output lists each
# pickle twice: under content/ResultFiles_WEXEA/ and at the archive root.
!unzip /content/ResultFiles_WEXEA.zip

Archive:  /content/ResultFiles_WEXEA.zip
   creating: content/ResultFiles_WEXEA/
  inflating: content/ResultFiles_WEXEA/0_triple(Cristiano_Ronaldo-birthPlace-Portugal).pkl  
  inflating: content/ResultFiles_WEXEA/0_startSubObj.pkl  
  inflating: content/ResultFiles_WEXEA/0_sentences.pkl  
  inflating: 0_sentences.pkl         
  inflating: 0_startSubObj.pkl       
  inflating: 0_triple(Cristiano_Ronaldo-birthPlace-Portugal).pkl  
