In [1]:
import json
import pandas as pd
import time

# refer to this paper
# https://arxiv.org/pdf/2108.07337.pdf

In [2]:
def get_relation_label(rel_id):
    label = ''
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'labels',
        'ids': ''
    }
      
    params['ids'] = str(rel_id)

    try:
        response = requests.get(API_ENDPOINT, params = params).json()['entities']
        label = response[str(rel_id)]['labels']['en']['value']
    except:
        return (rel_id, label)
    return (rel_id, label)

In [3]:
def get_entity_relation_ids(ent_id):
    relations = []
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'claims',
        'ids': ''
    }
        
    params['ids'] = str(ent_id)
    try:
        response = requests.get(API_ENDPOINT, params = params).json()['entities']
        relations = list(response[str(ent_id)]['claims'].keys())
    except:
        return (ent_id, relations)
    return (ent_id, relations)

In [4]:
def get_entity_labels_and_aliases(ent_id):
    labels = []
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'labels|aliases',    
        'ids': ''
    }

        
    params['ids'] = str(ent_id)
    try:
        response = requests.get(API_ENDPOINT, params = params).json()['entities']
        labels.append(response[str(ent_id)]['labels']['en']['value'])
        
        for alias in response[str(ent_id)]['aliases']['en']:
            labels.append(alias['value'])
    except:
        return (ent_id, labels)
    return (ent_id, labels)

In [5]:
from fuzzywuzzy import fuzz

# when creating a function the arguments are (questions and the results wikidata api after lower())
def find_nearest_word_in_question(questions, lower_wikidata_replies):
    entities_list = []
    found_match = False
    counter = 0
    for i in range(len(questions)):
        found_match = False
        for label in lower_wikidata_replies[i][1]:
            if label in questions[i].lower(): 
                found_match = True
                index = questions[i].lower().find(label)
                entities_list.append(questions[i][index : len(label)+index])
                break
        if not found_match:
            split_tokens = questions[i][1].split()
            list_of_comb = []
            list_of_comb.extend(split_tokens)
            scores = []
            for j in range(len(split_tokens)-1):
                list_of_comb.append(split_tokens[j] + ' ' + split_tokens[j+1])
            for j in range(len(split_tokens)-2):
                list_of_comb.append(split_tokens[j] + ' ' + split_tokens[j+1] + ' ' + split_tokens[j+2])
            for j in range(len(split_tokens)-3):
                list_of_comb.append(split_tokens[j] + ' ' + split_tokens[j+1] + ' ' + split_tokens[j+2] + ' ' + split_tokens[j+3])

            for label in lower_wikidata_replies[i][1]:
                for token in list_of_comb: 
                    scores.append((fuzz.ratio(label, token.lower()), token, label))
            scores = sorted(scores, key=lambda x: x[0], reverse=True)

            if len(scores) and scores[0][0] < 85:
                counter += 1
                entities_list.append('')
            elif not(len(scores)):
                counter += 1
                entities_list.append('')
            elif scores[0][0] >= 85:
                entities_list.append(scores[0][1])
    return (entities_list, counter)

In [6]:
# jupyter notebook never finishes processing using multiprocessing python 3
# https://stackoverflow.com/questions/47313732/jupyter-notebook-never-finishes-processing-using-multiprocessing-python-3

from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests


def parallal_task(func, iterable, *params): 
    with open(f'./tmp_func.py', 'w') as file:
        file.write("import requests \n")
        file.write(inspect.getsource(func).replace(func.__name__, 'task'))

    from tmp_func import task
    pool = Pool(processes=15)
    res = pool.map(task, iterable)
    pool.close()
    
    os.remove('./tmp_func.py')
    return res
#     else:
#         raise "Not in Jupyter Notebook"

In [7]:
from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests


def parallal_task_2(func, iterable, *params): 
    with open(f'./tmp_func_2.py', 'w') as file:
        file.write("import requests \n")
        file.write(inspect.getsource(func).replace(func.__name__, 'task'))

    from tmp_func_2 import task
    pool = Pool(processes=15)
    res = pool.map(task, iterable)
    pool.close()
    
    os.remove('./tmp_func_2.py')
    return res

In [8]:
from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests


def parallal_task_3(func, iterable, *params): 
    with open(f'./tmp_func_3.py', 'w') as file:
        file.write("import requests \n")
        file.write(inspect.getsource(func).replace(func.__name__, 'task'))

    from tmp_func_2 import task
    pool = Pool(processes=15)
    res = pool.map(task, iterable)
    pool.close()
    
    os.remove('./tmp_func_3.py')
    return res

## Simple_questions Training Data

In [9]:
with open('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_train.txt', encoding='utf-8') as file:
    lines = file.readlines()

# print(len(lines))
training_data = []
for line in lines:
    line = line.strip().split('\t', 3)
    del line[2]
    
    training_data.append(line)
    
qids = [line[0] for line in training_data]
target_relation = [line[1].replace('R', 'P') for line in training_data]
questions = [line[2] for line in training_data]
# print(len(training_data))
# print(training_data[0])
# print(qids[0])
# print(questions[0])

In [10]:
print(target_relation[0])
len(target_relation)

P272


34374

In [11]:
get_entity_relation_ids('Q126399')[1]

['P373',
 'P31',
 'P112',
 'P749',
 'P856',
 'P966',
 'P646',
 'P159',
 'P910',
 'P1711',
 'P18',
 'P214',
 'P571',
 'P2002',
 'P345',
 'P2531',
 'P740',
 'P154',
 'P452',
 'P3221',
 'P2013',
 'P1278',
 'P3417',
 'P3762',
 'P1220',
 'P1056',
 'P166',
 'P169',
 'P1417',
 'P2581',
 'P3347',
 'P17',
 'P3219',
 'P2011',
 'P1424',
 'P1830',
 'P6150',
 'P691',
 'P244',
 'P463',
 'P1296',
 'P7632',
 'P2003',
 'P3836',
 'P355',
 'P7859',
 'P2627',
 'P138',
 'P2088',
 'P1889',
 'P8317',
 'P10297',
 'P8313',
 'P2388',
 'P460',
 'P213',
 'P7293',
 'P8179',
 'P409',
 'P1375',
 'P2163',
 'P8189',
 'P3630',
 'P1207',
 'P1695',
 'P3553',
 'P9885',
 'P1955',
 'P3943',
 'P7708',
 'P9807',
 'P7502',
 'P4264',
 'P6839',
 'P8895',
 'P1320',
 'P8687',
 'P648',
 'P3847',
 'P3222',
 'P4272',
 'P4342',
 'P6683',
 'P7982',
 'P6058',
 'P6760',
 'P10225',
 'P10804',
 'P8168',
 'P3185']

In [12]:
# qids[0]


target_relation[0]
# target_relation[0] in get_entity_relation_ids('Q126399')[1]

'P272'

In [13]:
start = time.time()
relation_ids_results = parallal_task(get_entity_relation_ids, qids)
print(time.time() - start)
len(relation_ids_results)

KeyboardInterrupt: 

In [None]:
relation_ids_results[:2]

In [None]:
unique_relation_ids = set()
all_test = []
for couple in relation_ids_results:
    unique_relation_ids.update(couple[1])

In [None]:
len(unique_relation_ids)

In [None]:
start = time.time()
result_relation_labels = parallal_task_2(get_relation_label, unique_relation_ids)
print(time.time() - start)

In [None]:
relation_dict = dict()
for (relation_id, label) in result_relation_labels:
    relation_dict[relation_id] = label

In [None]:
# target_relation

In [None]:
target_relation_labels = []
for id in target_relation:
    if id in relation_dict.keys():
        target_relation_labels.append(relation_dict[id])

In [None]:
len(target_relation_labels)*100/len(target_relation)

In [None]:
relation_strs = pd.DataFrame(target_relation_labels, columns=['label'])
relation_strs.head()

In [None]:
relation_strs['label'].str.len().hist(bins=30)

In [None]:
f'{len(relation_strs[relation_strs["label"].str.len()> 40])*100/len(relation_strs)}%'

In [None]:
entities_with_relation_labels = []
for (entity_id, relation_ids) in relation_ids_results:
    relation_labels = ''
    for relation_id in relation_ids:
        if len(relation_dict[relation_id]) < 40:
            relation_labels += f' {relation_dict[relation_id]} , '
    
    relation_labels = f'[{relation_labels[:-2]}]'
    entities_with_relation_labels.append([entity_id, relation_labels])

In [None]:
entities_with_relation_labels

In [None]:
get_entity_labels_and_aliases(qids[0])

In [None]:
start = time.time()
entity_labels_and_aliases = parallal_task_3(get_entity_labels_and_aliases, qids)
# entity_labels_and_aliases = [get_entity_labels_and_aliases(qid) for qid in qids]
print(time.time() - start)

In [None]:
lower_results = []
for result in entity_labels_and_aliases:
    results_for_one = [label.lower() for label in result]
    lower_results.append((result[0], results_for_one))
lower_results[:2]

In [None]:
(entities_list, counter) = find_nearest_word_in_question(questions, lower_results)

In [None]:
rows = []
for i in range(len(questions)):
    rows.append([questions[i], entities_list[i], entities_with_relation_labels[i][1], target_relation[i]])

In [None]:
# training_data_results = pd.DataFrame([questions[:10], entities_with_relation_labels, columns= ['entity id', 'relation labels'])
training_data_results = pd.DataFrame(rows, columns=['question', 'entity label', 'relation labels', 'target relation id'])
training_data_results

In [None]:
training_data_results.to_csv('./3-Relation_Linking_Data/1-csv_format/training_data.csv', index=False)

## Simple_questions Validation set

In [None]:
with open('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_valid.txt', encoding='utf-8') as file:
    lines = file.readlines()

# print(len(lines))
validation_data = []
for line in lines:
    line = line.strip().split('\t', 3)
    del line[2]
    validation_data.append(line)
    
qids = [line[0] for line in validation_data]
target_relation = [line[1].replace('R', 'P') for line in validation_data]
questions = [line[2] for line in validation_data]

In [None]:
start = time.time()
relation_ids_results = parallal_task(get_entity_relation_ids, qids)
print(time.time() - start)
len(relation_ids_results)

In [None]:
unique_relation_ids = set()
all_test = []
for couple in relation_ids_results:
    unique_relation_ids.update(couple[1])

In [None]:
start = time.time()
result_relation_labels = parallal_task_2(get_relation_label, unique_relation_ids)
print(time.time() - start)

In [None]:
relation_dict = dict()
for (relation_id, label) in result_relation_labels:
    relation_dict[relation_id] = label

In [None]:
entities_with_relation_labels = []
for (entity_id, relation_ids) in relation_ids_results:
    relation_labels = ''
    for relation_id in relation_ids:
        if len(relation_dict[relation_id]) < 40:
            relation_labels += f' {relation_dict[relation_id]} , '
    
    relation_labels = f'[{relation_labels[:-2]}]'
    entities_with_relation_labels.append([entity_id, relation_labels])

In [None]:
start = time.time()
entity_labels_and_aliases = parallal_task_3(get_entity_labels_and_aliases, qids)
# entity_labels_and_aliases = [get_entity_labels_and_aliases(qid) for qid in qids]
print(time.time() - start)

In [None]:
lower_results = []
for result in entity_labels_and_aliases:
    results_for_one = [label.lower() for label in result]
    lower_results.append((result[0], results_for_one))
lower_results[:2]

In [None]:
(entities_list, counter) = find_nearest_word_in_question(questions, lower_results)

In [None]:
rows = []
for i in range(len(questions)):
    rows.append([questions[i], entities_list[i], entities_with_relation_labels[i][1], target_relation[i]])

In [None]:
validation_data_results = pd.DataFrame(rows, columns=['question', 'entity label', 'relation labels', 'target relation id'])
validation_data_results.head(10)

In [None]:
validation_data_results.to_csv('./3-Relation_Linking_Data/1-csv_format/validation_data.csv', index=False)

## Simple_questions Testing Data

In [9]:
with open('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_test.txt', encoding='utf-8') as file:
    lines = file.readlines()

# print(len(lines))
test_data = []
for line in lines:
    line = line.strip().split('\t', 3)
    del line[2]
    test_data.append(line)

    
qids = [line[0] for line in test_data]
target_relation = [line[1].replace('R', 'P') for line in test_data]
questions = [line[2] for line in test_data]

In [10]:
len(qids)

9961

In [11]:
start = time.time()
relation_ids_results = parallal_task(get_entity_relation_ids, qids)
print(time.time() - start)
len(relation_ids_results)

2190.7412259578705


9961

In [12]:
# relation_ids_results

In [13]:
unique_relation_ids = set()
all_test = []
for couple in relation_ids_results:
    unique_relation_ids.update(couple[1])

In [14]:
start = time.time()
result_relation_labels = parallal_task_2(get_relation_label, unique_relation_ids)
print(time.time() - start)

888.3796243667603


In [15]:
relation_dict = dict()
for (relation_id, label) in result_relation_labels:
    relation_dict[relation_id] = label

In [31]:
target_relation

['P136',
 'P19',
 'P58',
 'P20',
 'P509',
 'P19',
 'P136',
 'P172',
 'P413',
 'P21',
 'P413',
 'P509',
 'P136',
 'P20',
 'P364',
 'P737',
 'P19',
 'P710',
 'P264',
 'P149',
 'P123',
 'P136',
 'P136',
 'P19',
 'P106',
 'P413',
 'P20',
 'P112',
 'P136',
 'P57',
 'P19',
 'P276',
 'P86',
 'P19',
 'P421',
 'P136',
 'P19',
 'P19',
 'P136',
 'P149',
 'P19',
 'P27',
 'P136',
 'P136',
 'P136',
 'P175',
 'P136',
 'P495',
 'P84',
 'P175',
 'P136',
 'P31',
 'P115',
 'P19',
 'P404',
 'P136',
 'P737',
 'P19',
 'P27',
 'P421',
 'P737',
 'P170',
 'P19',
 'P136',
 'P19',
 'P21',
 'P136',
 'P495',
 'P136',
 'P17',
 'P421',
 'P136',
 'P57',
 'P131',
 'P136',
 'P136',
 'P276',
 'P264',
 'P136',
 'P136',
 'P175',
 'P404',
 'P136',
 'P136',
 'P58',
 'P136',
 'P509',
 'P112',
 'P21',
 'P136',
 'P106',
 'P397',
 'P19',
 'P136',
 'P179',
 'P58',
 'P19',
 'P509',
 'P738',
 'P403',
 'P364',
 'P112',
 'P136',
 'P136',
 'P136',
 'P509',
 'P413',
 'P19',
 'P61',
 'P106',
 'P19',
 'P676',
 'P136',
 'P123',
 'P27',
 

In [29]:
# target_relation_labels = []
# for id in target_relation:
#     if id in relation_dict.keys():
#         target_relation_labels.append(relation_dict[id])

In [33]:
# len(target_relation_labels)

9902

In [16]:
entities_with_relation_labels = []
for (entity_id, relation_ids) in relation_ids_results:
    relation_labels = ''
    for relation_id in relation_ids:
        if len(relation_dict[relation_id]) < 40:
            relation_labels += f' {relation_dict[relation_id]} , '
    
    relation_labels = f'[{relation_labels[:-2]}]'
    entities_with_relation_labels.append([entity_id, relation_labels])

In [17]:
start = time.time()
entity_labels_and_aliases = parallal_task_3(get_entity_labels_and_aliases, qids)
# entity_labels_and_aliases = [get_entity_labels_and_aliases(qid) for qid in qids]
print(time.time() - start)

2180.26535654068


In [18]:
lower_results = []
for result in entity_labels_and_aliases:
    results_for_one = [label.lower() for label in result]
    lower_results.append((result[0], results_for_one))
lower_results[:2]

[('Q5487302', ['q5487302', 'harder ... faster']),
 ('Q16330302', ['q16330302', 'alex golfis'])]

In [19]:
(entities_list, counter) = find_nearest_word_in_question(questions, lower_results)

In [27]:
c_empty = 0
for e in entities_list:
    if e == '':
        c_empty += 1
print(c_empty)
print(c_empty*100/len(entities_list))

909
9.125589800220862


In [34]:
len(questions)

9961

In [42]:
target_relation_labels = []
for id in target_relation:
    if id in relation_dict.keys():
        target_relation_labels.append(relation_dict[id])
    else:
        print(id)
        target_relation_labels.append('')

P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738
P738


In [43]:
get_relation_label('P738')

('P738', '')

In [39]:
len(target_relation_labels)

9961

In [40]:
rows = []
for i in range(len(questions)):
    rows.append([questions[i], entities_list[i], entities_with_relation_labels[i][1], target_relation[i], target_relation_labels[i]])

In [41]:
test_data_results = pd.DataFrame(rows, columns=['question', 'entity label', 'relation labels', 'target relation id', 'target relation labels'])
test_data_results.head(10)

Unnamed: 0,question,entity label,relation labels,target relation id,target relation labels
0,Which genre of album is harder.....faster?,,"[ record label , instance of , follows , fo...",P136,genre
1,what city was alex golfis born in,alex golfis,"[ instance of , sex or gender , occupation ,...",P19,place of birth
2,what film is by the writer phil hay?,phil hay,"[ instance of , sex or gender , country of c...",P58,screenwriter
3,Where did roger marquis die,roger marquis,"[ date of birth , date of death , instance o...",P20,place of death
4,what was the cause of death of yves klein,yves klein,"[ sex or gender , place of birth , VIAF ID ,...",P509,cause of death
5,Which equestrian was born in dublin?,dublin,"[ category for people born here , Dewey Decim...",P19,place of birth
6,What is a tv action show?,,"[ coordinate location , country , instance o...",P136,genre
7,what's akbar tandjung's ethnicity,,"[ VIAF ID , ISNI , member of political party...",P172,ethnic group
8,What position does carlos gomez play?,,"[ Commons category , date of birth , instanc...",P413,position played on team / speciality
9,how does engelbert zaschka identify,engelbert zaschka,"[ sex or gender , Commons category , VIAF ID...",P21,sex or gender


In [44]:
test_data_results

Unnamed: 0,question,entity label,relation labels,target relation id,target relation labels
0,Which genre of album is harder.....faster?,,"[ record label , instance of , follows , fo...",P136,genre
1,what city was alex golfis born in,alex golfis,"[ instance of , sex or gender , occupation ,...",P19,place of birth
2,what film is by the writer phil hay?,phil hay,"[ instance of , sex or gender , country of c...",P58,screenwriter
3,Where did roger marquis die,roger marquis,"[ date of birth , date of death , instance o...",P20,place of death
4,what was the cause of death of yves klein,yves klein,"[ sex or gender , place of birth , VIAF ID ,...",P509,cause of death
...,...,...,...,...,...
9956,who was the creator of the fictional character...,doctor faustus,"[ from narrative universe , Freebase ID , co...",P170,creator
9957,what's a college sporting event that took plac...,oklahoma city,"[ topic's main category , Dewey Decimal Class...",P276,location
9958,what celestial object is 2974 holden,2974 holden,"[ named after , discoverer or inventor , fol...",P31,instance of
9959,what is the film genre for snow falling on ced...,snow falling on cedars,"[ instance of , IMDb ID , director , cast m...",P136,genre


In [45]:
test_data_results.to_csv('./3-Relation_Linking_Data/1-csv_format/test_data.csv', index=False)