In [1]:
import json
import pandas as pd
import time

# refer to this paper
# https://arxiv.org/pdf/2108.07337.pdf

In [2]:
def get_relation_label(rel_id):
    label = ''
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'labels',
        'ids': ''
    }
      
    params['ids'] = str(rel_id)

    try:
        response = requests.get(API_ENDPOINT, params = params).json()['entities']
        label = response[str(rel_id)]['labels']['en']['value']
    except:
        return (rel_id, label)
    return (rel_id, label)

In [3]:
def get_entity_relation_ids(ent_id):
    relations = []
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'claims',
        'ids': ''
    }
        
    params['ids'] = str(ent_id)
    try:
        response = requests.get(API_ENDPOINT, params = params).json()['entities']
        relations = list(response[str(ent_id)]['claims'].keys())
    except:
        return (ent_id, relations)
    return (ent_id, relations)

In [4]:
def get_entity_labels_and_aliases(ent_id):
    labels = []
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'labels|aliases',    
        'ids': ''
    }

        
    params['ids'] = str(ent_id)
    try:
        response = requests.get(API_ENDPOINT, params = params).json()['entities']
        labels.append(response[str(ent_id)]['labels']['en']['value'])
        
        for alias in response[str(ent_id)]['aliases']['en']:
            labels.append(alias['value'])
    except:
        return (ent_id, labels)
    return (ent_id, labels)

In [5]:
from fuzzywuzzy import fuzz

# when creating a function the arguments are (questions and the results wikidata api after lower())
def find_nearest_word_in_question(questions, lower_wikidata_replies):
    entities_list = []
    found_match = False
    counter = 0
    for i in range(len(questions)):
        found_match = False
        for label in lower_wikidata_replies[i][1]:
            if label in questions[i].lower(): 
                found_match = True
                index = questions[i].lower().find(label)
                entities_list.append(questions[i][index : len(label)+index])
                break
        if not found_match:
            split_tokens = questions[i][1].split()
            list_of_comb = []
            list_of_comb.extend(split_tokens)
            scores = []
            for j in range(len(split_tokens)-1):
                list_of_comb.append(split_tokens[j] + ' ' + split_tokens[j+1])
            for j in range(len(split_tokens)-2):
                list_of_comb.append(split_tokens[j] + ' ' + split_tokens[j+1] + ' ' + split_tokens[j+2])
            for j in range(len(split_tokens)-3):
                list_of_comb.append(split_tokens[j] + ' ' + split_tokens[j+1] + ' ' + split_tokens[j+2] + ' ' + split_tokens[j+3])

            for label in lower_wikidata_replies[i][1]:
                for token in list_of_comb: 
                    scores.append((fuzz.ratio(label, token.lower()), token, label))
            scores = sorted(scores, key=lambda x: x[0], reverse=True)

            if len(scores) and scores[0][0] < 85:
                counter += 1
                entities_list.append('')
            elif not(len(scores)):
                counter += 1
                entities_list.append('')
            elif scores[0][0] >= 85:
                entities_list.append(scores[0][1])
    return (entities_list, counter)

In [6]:
# jupyter notebook never finishes processing using multiprocessing python 3
# https://stackoverflow.com/questions/47313732/jupyter-notebook-never-finishes-processing-using-multiprocessing-python-3

from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests


def parallal_task(func, iterable, *params): 
    with open(f'./tmp_func.py', 'w') as file:
        file.write("import requests \n")
        file.write(inspect.getsource(func).replace(func.__name__, 'task'))

    from tmp_func import task
    pool = Pool(processes=15)
    res = pool.map(task, iterable)
    pool.close()
    
    os.remove('./tmp_func.py')
    return res
#     else:
#         raise "Not in Jupyter Notebook"

In [7]:
from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests


def parallal_task_2(func, iterable, *params): 
    with open(f'./tmp_func_2.py', 'w') as file:
        file.write("import requests \n")
        file.write(inspect.getsource(func).replace(func.__name__, 'task'))

    from tmp_func_2 import task
    pool = Pool(processes=15)
    res = pool.map(task, iterable)
    pool.close()
    
    os.remove('./tmp_func_2.py')
    return res

In [8]:
from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests


def parallal_task_3(func, iterable, *params): 
    with open(f'./tmp_func_3.py', 'w') as file:
        file.write("import requests \n")
        file.write(inspect.getsource(func).replace(func.__name__, 'task'))

    from tmp_func_2 import task
    pool = Pool(processes=15)
    res = pool.map(task, iterable)
    pool.close()
    
    os.remove('./tmp_func_3.py')
    return res

## Simple_questions Training Data

In [9]:
with open('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_train.txt', encoding='utf-8') as file:
    lines = file.readlines()

# print(len(lines))
training_data = []
for line in lines:
    line = line.strip().split('\t', 3)
    del line[2]
    
    training_data.append(line)
    
qids = [line[0] for line in training_data]
target_relation = [line[1].replace('R', 'P') for line in training_data]
questions = [line[2] for line in training_data]
# print(len(training_data))
# print(training_data[0])
# print(qids[0])
# print(questions[0])

In [10]:
start = time.time()
relation_ids_results = parallal_task(get_entity_relation_ids, qids)
print(time.time() - start)
len(relation_ids_results)

869.7217333316803


34374

In [11]:
unique_relation_ids = set()
all_test = []
for couple in relation_ids_results:
    unique_relation_ids.update(couple[1])

In [12]:
len(unique_relation_ids)

4901

In [13]:
start = time.time()
result_relation_labels = parallal_task_2(get_relation_label, unique_relation_ids)
print(time.time() - start)

75.85443043708801


In [14]:
relation_dict = dict()
for (relation_id, label) in result_relation_labels:
    relation_dict[relation_id] = label

In [15]:
entities_with_relation_labels = []
for (entity_id, relation_ids) in relation_ids_results:
    relation_labels = ''
    for relation_id in relation_ids:
        relation_labels += f' {relation_dict[relation_id]} , '
    
    relation_labels = f'[{relation_labels[:-2]}]'
    entities_with_relation_labels.append([entity_id, relation_labels])

In [16]:
start = time.time()
entity_labels_and_aliases = parallal_task_3(get_entity_labels_and_aliases, qids)
# entity_labels_and_aliases = [get_entity_labels_and_aliases(qid) for qid in qids]
print(time.time() - start)

670.6623435020447


In [17]:
lower_results = []
for result in entity_labels_and_aliases:
    results_for_one = [label.lower() for label in result[1]]
    lower_results.append((result[0], results_for_one))
lower_results[:2]

[('Q126399', ['w', 'a', 'r', 'n', 'e', 'r', ' ', 'b', 'r', 'o', 's', '.']),
 ('Q12439', ['d', 'e', 't', 'r', 'o', 'i', 't'])]

In [18]:
(entities_list, counter) = find_nearest_word_in_question(questions, lower_results)

In [19]:
rows = []
for i in range(len(questions)):
    rows.append([questions[i], entities_list[i], entities_with_relation_labels[i][1], target_relation[i]])

In [20]:
# training_data_results = pd.DataFrame([questions[:10], entities_with_relation_labels, columns= ['entity id', 'relation labels'])
training_data_results = pd.DataFrame(rows, columns=['question', 'entity label', 'relation labels', 'target relation id'])
training_data_results

Unnamed: 0,question,entity label,relation labels,target relation id
0,what movie is produced by warner bros.,w,"[ Commons category , instance of , founded b...",P272
1,who is a musician born in detroit,d,"[ topic's main category , continent , countr...",P19
2,who produced the film rough house rosie,r,"[ IMDb ID , Commons category , instance of ,...",P162
3,what is the language in which mera shikar was ...,m,"[ IMDb ID , instance of , director , cast m...",P364
4,Whats the name of a battle that happened in ch...,c,"[ Dewey Decimal Classification , WOEID , top...",P276
...,...,...,...,...
34369,What is the nationality of anthony bailey,a,"[ instance of , sex or gender , given name ,...",P27
34370,who was homi k. bhabha especially influenced by,h,"[ VIAF ID , ISNI , Library of Congress autho...",P737
34371,which artist composes video game music,v,"[ topic's main category , Freebase ID , Comm...",P136
34372,What gender is gastón filgueira,g,"[ member of sports team , occupation , date ...",P21


In [21]:
training_data_results.to_csv('./3-Relation_Linking_Data/1-csv_format/training_data.csv', index=False)

## Simple_questions Validation set

In [22]:
with open('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_valid.txt', encoding='utf-8') as file:
    lines = file.readlines()

# print(len(lines))
validation_data = []
for line in lines:
    line = line.strip().split('\t', 3)
    del line[2]
    validation_data.append(line)
    
qids = [line[0] for line in validation_data]
target_relation = [line[1].replace('R', 'P') for line in validation_data]
questions = [line[2] for line in validation_data]

In [23]:
start = time.time()
relation_ids_results = parallal_task(get_entity_relation_ids, qids)
print(time.time() - start)
len(relation_ids_results)

101.32181668281555


4867

In [24]:
unique_relation_ids = set()
all_test = []
for couple in relation_ids_results:
    unique_relation_ids.update(couple[1])

In [25]:
start = time.time()
result_relation_labels = parallal_task_2(get_relation_label, unique_relation_ids)
print(time.time() - start)

54.09446668624878


In [26]:
relation_dict = dict()
for (relation_id, label) in result_relation_labels:
    relation_dict[relation_id] = label

In [27]:
entities_with_relation_labels = []
for (entity_id, relation_ids) in relation_ids_results:
    relation_labels = ''
    for relation_id in relation_ids:
        relation_labels += f' {relation_dict[relation_id]} , '
    
    relation_labels = f'[{relation_labels[:-2]}]'
    entities_with_relation_labels.append([entity_id, relation_labels])

In [28]:
start = time.time()
entity_labels_and_aliases = parallal_task_3(get_entity_labels_and_aliases, qids)
# entity_labels_and_aliases = [get_entity_labels_and_aliases(qid) for qid in qids]
print(time.time() - start)

169.11288189888


In [29]:
lower_results = []
for result in entity_labels_and_aliases:
    results_for_one = [label.lower() for label in result[1]]
    lower_results.append((result[0], results_for_one))
lower_results[:2]

[('Q3541144',
  ['j',
   'w',
   ' ',
   'm',
   'a',
   'r',
   'r',
   'i',
   'o',
   't',
   't',
   ' ',
   'p',
   'a',
   'n',
   'a',
   'm',
   'a']),
 ('Q318926',
  ['s', 'a', 's', 'h', 'a', ' ', 'v', 'u', 'j', 'a', 'č', 'i', 'ć'])]

In [30]:
(entities_list, counter) = find_nearest_word_in_question(questions, lower_results)

In [31]:
rows = []
for i in range(len(questions)):
    rows.append([questions[i], entities_list[i], entities_with_relation_labels[i][1], target_relation[i]])

In [32]:
validation_data_results = pd.DataFrame(rows, columns=['question', 'entity label', 'relation labels', 'target relation id'])
validation_data_results.head(10)

Unnamed: 0,question,entity label,relation labels,target relation id
0,Who was the trump ocean club international hot...,W,"[ coordinate location , Commons category , i...",P138
1,where was sasha vujačić born,s,"[ sex or gender , occupation , member of spo...",P19
2,What is a film directed by wiebke von carolsfeld?,W,"[ sex or gender , instance of , country of c...",P57
3,What was Seymour Parker Gilbert's profession?,s,"[ sex or gender , VIAF ID , GND ID , place ...",P106
4,in what french city did antoine de févin die,a,"[ VIAF ID , ISNI , MusicBrainz artist ID , ...",P20
5,What job does jamie hewlett have,j,"[ VIAF ID , ISNI , Library of Congress autho...",P106
6,what country is ghost house from,g,"[ instance of , original language of film or ...",P495
7,which country was the yamakinkarudu movie prod...,y,"[ IMDb ID , instance of , producer , cast m...",P495
8,What's the time zone in sub-saharan africa,s,"[ Commons category , topic's main category , ...",P421
9,who is the chid of fritz leiber?,f,"[ Library of Congress authority ID , VIAF ID ...",P40


In [33]:
validation_data_results.to_csv('./3-Relation_Linking_Data/1-csv_format/validation_data.csv', index=False)

## Simple_questions Testing Data

In [34]:
with open('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_test.txt', encoding='utf-8') as file:
    lines = file.readlines()

# print(len(lines))
test_data = []
for line in lines:
    line = line.strip().split('\t', 3)
    del line[2]
    del line[1]
    test_data.append(line)
    
qids = [line[0] for line in test_data]
target_relation = [line[1].replace('R', 'P') for line in test_data]
questions = [line[1] for line in test_data]

In [35]:
start = time.time()
relation_ids_results = parallal_task(get_entity_relation_ids, qids)
print(time.time() - start)
len(relation_ids_results)

207.55484914779663


9961

In [36]:
unique_relation_ids = set()
all_test = []
for couple in relation_ids_results:
    unique_relation_ids.update(couple[1])

In [37]:
start = time.time()
result_relation_labels = parallal_task_2(get_relation_label, unique_relation_ids)
print(time.time() - start)

160.63125014305115


In [38]:
relation_dict = dict()
for (relation_id, label) in result_relation_labels:
    relation_dict[relation_id] = label

In [39]:
entities_with_relation_labels = []
for (entity_id, relation_ids) in relation_ids_results:
    relation_labels = ''
    for relation_id in relation_ids:
        relation_labels += f' {relation_dict[relation_id]} , '
    
    relation_labels = f'[{relation_labels[:-2]}]'
    entities_with_relation_labels.append([entity_id, relation_labels])

In [None]:
start = time.time()
entity_labels_and_aliases = parallal_task_3(get_entity_labels_and_aliases, qids)
# entity_labels_and_aliases = [get_entity_labels_and_aliases(qid) for qid in qids]
print(time.time() - start)

In [None]:
lower_results = []
for result in entity_labels_and_aliases:
    results_for_one = [label.lower() for label in result[1]]
    lower_results.append((result[0], results_for_one))
lower_results[:2]

In [None]:
(entities_list, counter) = find_nearest_word_in_question(questions, lower_results)

In [None]:
rows = []
for i in range(len(questions)):
    rows.append([questions[i], entities_list[i], entities_with_relation_labels[i][1], target_relation[i]])

In [None]:
test_data_results = pd.DataFrame(rows, columns=['question', 'entity label', 'relation labels', 'target relation id'])
test_data_results.head(10)

In [None]:
test_data_results.to_csv('./3-Relation_Linking_Data/1-csv_format/test_data.csv', index=False)