In [1]:
import json
import pandas as pd
import time

# refer to this paper
# https://arxiv.org/pdf/2108.07337.pdf

In [2]:
def get_relation_label(rel_id):
    """Fetch the English label of a Wikidata id.

    Sends a ``wbgetentities`` request (``props=labels``, ``languages=en``)
    to the Wikidata API and reads ``entities[<id>]['labels']['en']['value']``
    from the JSON reply.

    Args:
        rel_id: Wikidata id to look up; it is converted with ``str``.

    Returns:
        tuple: ``(rel_id, label)``. ``label`` is the empty string when the
        request fails or the reply carries no English label.
    """
    # Imported locally so the function stays self-contained when its source
    # is copied into a standalone module for worker processes.
    import requests

    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'labels',
        'ids': str(rel_id),
    }

    # Default returned on any failure. The original initialised ``labels``
    # (plural) and then returned the unbound ``label`` from the except branch.
    label = ''
    try:
        # The timeout keeps one stalled connection from blocking a worker forever.
        entities = requests.get(API_ENDPOINT, params=params, timeout=30).json()['entities']
        label = entities[str(rel_id)]['labels']['en']['value']
    except Exception:
        # Best effort: network errors, malformed JSON and missing keys all
        # fall back to the empty label. KeyboardInterrupt/SystemExit propagate.
        pass
    return (rel_id, label)

In [3]:
def get_entity_relation_ids(ent_id):
    """Fetch the ids of all claims attached to a Wikidata entity.

    Sends a ``wbgetentities`` request (``props=claims``) to the Wikidata API
    and returns the keys of ``entities[<id>]['claims']`` from the JSON reply.

    Args:
        ent_id: Wikidata entity id to look up; it is converted with ``str``.

    Returns:
        tuple: ``(ent_id, relations)`` where ``relations`` is a list of the
        claim keys, or an empty list when the request fails or the reply
        has no claims for the entity.
    """
    # Imported locally so the function stays self-contained when its source
    # is copied into a standalone module for worker processes.
    import requests

    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'claims',
        'ids': str(ent_id),
    }

    relations = []
    try:
        # The timeout keeps one stalled connection from blocking a worker forever.
        entities = requests.get(API_ENDPOINT, params=params, timeout=30).json()['entities']
        relations = list(entities[str(ent_id)]['claims'].keys())
    except Exception:
        # Best effort: network errors, malformed JSON and missing keys all
        # fall back to the empty list. KeyboardInterrupt/SystemExit propagate.
        pass
    return (ent_id, relations)

In [4]:
# Workaround for "Jupyter notebook never finishes processing using multiprocessing (Python 3)":
# https://stackoverflow.com/questions/47313732/jupyter-notebook-never-finishes-processing-using-multiprocessing-python-3

from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests


def parallal_task(func, iterable, *params, processes=15):
    """Map ``func`` over ``iterable`` using a pool of worker processes.

    The source of ``func`` is written to ``./tmp_func.py`` with every
    occurrence of its name replaced by ``task``, preceded by an
    ``import requests`` line. The module is then imported and ``task`` is
    handed to ``Pool.map``. ``func`` therefore has to be self-contained:
    apart from ``requests`` it may only use names it defines or imports
    itself.

    Args:
        func: Function whose source ``inspect.getsource`` can retrieve.
        iterable: Items passed one by one to the function.
        *params: Accepted for backward compatibility; not used.
        processes: Number of worker processes (default 15).

    Returns:
        list: Results of ``Pool.map``, in the order of ``iterable``.
    """
    import importlib
    import importlib.util
    import sys

    module_name = 'tmp_func'
    module_path = f'./{module_name}.py'

    try:
        with open(module_path, 'w') as file:
            file.write("import requests \n")
            file.write(inspect.getsource(func).replace(func.__name__, 'task'))

        # A module cached in sys.modules by an earlier call would be reused
        # as-is, so a different ``func`` would silently run the old task.
        sys.modules.pop(module_name, None)
        # Make the import system notice the file that was just created.
        importlib.invalidate_caches()
        task = importlib.import_module(module_name).task

        # map() returns only after every result is in, so letting the
        # context manager tear the pool down afterwards loses nothing.
        with Pool(processes=processes) as pool:
            return pool.map(task, iterable)
    finally:
        # Remove the generated module and its cached bytecode, also on
        # failure. A stale .pyc could otherwise be picked up when a
        # same-sized source is written within the same second.
        leftovers = [module_path]
        try:
            leftovers.append(importlib.util.cache_from_source(module_path))
        except NotImplementedError:
            pass  # this interpreter does not cache bytecode
        for leftover in leftovers:
            if os.path.exists(leftover):
                os.remove(leftover)
#     else:
#         raise "Not in Jupyter Notebook"

In [5]:
from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests


def parallal_task_2(func, iterable, *params, processes=15):
    """Map ``func`` over ``iterable`` using a pool of worker processes.

    The source of ``func`` is written to ``./tmp_func_2.py`` with every
    occurrence of its name replaced by ``task``, preceded by an
    ``import requests`` line. The module is then imported and ``task`` is
    handed to ``Pool.map``. ``func`` therefore has to be self-contained:
    apart from ``requests`` it may only use names it defines or imports
    itself.

    Args:
        func: Function whose source ``inspect.getsource`` can retrieve.
        iterable: Items passed one by one to the function.
        *params: Accepted for backward compatibility; not used.
        processes: Number of worker processes (default 15).

    Returns:
        list: Results of ``Pool.map``, in the order of ``iterable``.
    """
    import importlib
    import importlib.util
    import sys

    module_name = 'tmp_func_2'
    module_path = f'./{module_name}.py'

    try:
        with open(module_path, 'w') as file:
            file.write("import requests \n")
            file.write(inspect.getsource(func).replace(func.__name__, 'task'))

        # A module cached in sys.modules by an earlier call would be reused
        # as-is, so a different ``func`` would silently run the old task.
        sys.modules.pop(module_name, None)
        # Make the import system notice the file that was just created.
        importlib.invalidate_caches()
        task = importlib.import_module(module_name).task

        # map() returns only after every result is in, so letting the
        # context manager tear the pool down afterwards loses nothing.
        with Pool(processes=processes) as pool:
            return pool.map(task, iterable)
    finally:
        # Remove the generated module and its cached bytecode, also on
        # failure. A stale .pyc could otherwise be picked up when a
        # same-sized source is written within the same second.
        leftovers = [module_path]
        try:
            leftovers.append(importlib.util.cache_from_source(module_path))
        except NotImplementedError:
            pass  # this interpreter does not cache bytecode
        for leftover in leftovers:
            if os.path.exists(leftover):
                os.remove(leftover)

## Simple_questions Training Data

In [6]:
# Read the annotated training split; each line is tab-separated.
with open('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_train.txt', encoding='utf-8') as file:
    lines = file.readlines()

# Split every line into at most four fields and keep only the first and the
# last one; the two middle fields are discarded.
training_data = [
    [fields[0], fields[3]]
    for fields in (raw_line.strip().split('\t', 3) for raw_line in lines)
]

# Column-wise views: first kept field -> qids, second kept field -> questions.
qids = [row[0] for row in training_data]
questions = [row[1] for row in training_data]

In [7]:
# Fetch the claim ids of every training entity in parallel and time the run.
start = time.time()
relation_ids_results = parallal_task(get_entity_relation_ids, qids)
elapsed = time.time() - start
print(elapsed)
len(relation_ids_results)

0.8247733116149902


10

In [8]:
all_test = []
# Flatten the second element of every (entity id, relation ids) pair into
# one set of distinct relation ids.
unique_relation_ids = {
    relation_id
    for couple in relation_ids_results
    for relation_id in couple[1]
}

In [9]:
# Number of distinct relation ids collected across all fetched entities.
len(unique_relation_ids)

333

In [11]:
# Resolve every distinct relation id to its label in parallel and time the run.
start = time.time()
result_relation_labels = parallal_task_2(get_relation_label, unique_relation_ids)
elapsed = time.time() - start
print(elapsed)

6.142807722091675


In [13]:
# Lookup table: relation id -> label, built from the (id, label) pairs.
relation_dict = {
    relation_id: label
    for relation_id, label in result_relation_labels
}

In [29]:
# Render the relations of each entity as a single bracketed string of labels.
entities_with_relation_labels = []
for entity_id, relation_ids in relation_ids_results:
    # Every label contributes ' <label> , '; the trailing ', ' is sliced off
    # before the brackets are added (an entity without relations gives '[]').
    joined_labels = ''.join(
        f' {relation_dict[relation_id]} , ' for relation_id in relation_ids
    )
    relation_labels = f'[{joined_labels[:-2]}]'
    entities_with_relation_labels.append([entity_id, relation_labels])

In [35]:
# Two-column table: one row per entity with its rendered relation labels.
training_data_results = pd.DataFrame(
    data=entities_with_relation_labels,
    columns=['entity id', 'relation labels'],
)
training_data_results.head(n=10)

Unnamed: 0,entity id,relation labels
0,Q126399,"[ Commons category , instance of , founded b..."
1,Q12439,"[ topic's main category , continent , countr..."
2,Q7370831,"[ IMDb ID , Commons category , instance of ,..."
3,Q6817891,"[ IMDb ID , instance of , director , cast m..."
4,Q1297,"[ Dewey Decimal Classification , WOEID , top..."
5,Q193592,"[ instance of , sport , Freebase ID , Commo..."
6,Q19896779,"[ instance of , part of , Freebase ID , Com..."
7,Q6849115,"[ member of sports team , occupation , insta..."
8,Q842256,"[ Commons category , subclass of , instance ..."
9,Q7273,"[ sex or gender , Library of Congress authori..."


In [36]:
# Persist the training table as CSV without the index column.
training_data_results.to_csv(
    path_or_buf='./3-Relation_Linking_Data/1-csv_format/training_data.csv',
    index=False,
)

## Simple_questions Validation set

In [37]:
# Read the annotated validation split; each line is tab-separated.
with open('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_valid.txt', encoding='utf-8') as file:
    lines = file.readlines()

# Split every line into at most four fields and keep only the first and the
# last one; the two middle fields are discarded.
validation_data = [
    [fields[0], fields[3]]
    for fields in (raw_line.strip().split('\t', 3) for raw_line in lines)
]

# Column-wise views: first kept field -> qids, second kept field -> questions.
qids = [row[0] for row in validation_data]
questions = [row[1] for row in validation_data]

In [None]:
# Fetch the claim ids of every validation entity in parallel and time the run.
start = time.time()
relation_ids_results = parallal_task(get_entity_relation_ids, qids)
elapsed = time.time() - start
print(elapsed)
len(relation_ids_results)

In [None]:
all_test = []
# Flatten the second element of every (entity id, relation ids) pair into
# one set of distinct relation ids.
unique_relation_ids = {
    relation_id
    for couple in relation_ids_results
    for relation_id in couple[1]
}

In [None]:
# Resolve every distinct relation id to its label in parallel and time the run.
start = time.time()
result_relation_labels = parallal_task_2(get_relation_label, unique_relation_ids)
elapsed = time.time() - start
print(elapsed)

In [None]:
# Lookup table: relation id -> label, built from the (id, label) pairs.
relation_dict = {
    relation_id: label
    for relation_id, label in result_relation_labels
}

In [None]:
# Render the relations of each entity as a single bracketed string of labels.
entities_with_relation_labels = []
for entity_id, relation_ids in relation_ids_results:
    # Every label contributes ' <label> , '; the trailing ', ' is sliced off
    # before the brackets are added (an entity without relations gives '[]').
    joined_labels = ''.join(
        f' {relation_dict[relation_id]} , ' for relation_id in relation_ids
    )
    relation_labels = f'[{joined_labels[:-2]}]'
    entities_with_relation_labels.append([entity_id, relation_labels])

In [None]:
# Two-column table: one row per entity with its rendered relation labels.
validation_data_results = pd.DataFrame(
    data=entities_with_relation_labels,
    columns=['entity id', 'relation labels'],
)
validation_data_results.head(n=10)

In [None]:
# Persist the validation table as CSV without the index column.
validation_data_results.to_csv(
    path_or_buf='./3-Relation_Linking_Data/1-csv_format/validation_data.csv',
    index=False,
)

## Simple_questions Testing Data

In [38]:
# Read the annotated test split; each line is tab-separated.
with open('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_test.txt', encoding='utf-8') as file:
    lines = file.readlines()

# Split every line into at most four fields and keep only the first and the
# last one; the two middle fields are discarded.
test_data = [
    [fields[0], fields[3]]
    for fields in (raw_line.strip().split('\t', 3) for raw_line in lines)
]

# Column-wise views: first kept field -> qids, second kept field -> questions.
qids = [row[0] for row in test_data]
questions = [row[1] for row in test_data]

In [None]:
# Fetch the claim ids of every test entity in parallel and time the run.
start = time.time()
relation_ids_results = parallal_task(get_entity_relation_ids, qids)
elapsed = time.time() - start
print(elapsed)
len(relation_ids_results)

In [None]:
all_test = []
# Flatten the second element of every (entity id, relation ids) pair into
# one set of distinct relation ids.
unique_relation_ids = {
    relation_id
    for couple in relation_ids_results
    for relation_id in couple[1]
}

In [None]:
# Resolve every distinct relation id to its label in parallel and time the run.
start = time.time()
result_relation_labels = parallal_task_2(get_relation_label, unique_relation_ids)
elapsed = time.time() - start
print(elapsed)

In [None]:
# Lookup table: relation id -> label, built from the (id, label) pairs.
relation_dict = {
    relation_id: label
    for relation_id, label in result_relation_labels
}

In [None]:
# Render the relations of each entity as a single bracketed string of labels.
entities_with_relation_labels = []
for entity_id, relation_ids in relation_ids_results:
    # Every label contributes ' <label> , '; the trailing ', ' is sliced off
    # before the brackets are added (an entity without relations gives '[]').
    joined_labels = ''.join(
        f' {relation_dict[relation_id]} , ' for relation_id in relation_ids
    )
    relation_labels = f'[{joined_labels[:-2]}]'
    entities_with_relation_labels.append([entity_id, relation_labels])

In [None]:
# Two-column table: one row per entity with its rendered relation labels.
test_data_results = pd.DataFrame(
    data=entities_with_relation_labels,
    columns=['entity id', 'relation labels'],
)
test_data_results.head(n=10)

In [None]:
# Persist the test table as CSV without the index column.
test_data_results.to_csv(
    path_or_buf='./3-Relation_Linking_Data/1-csv_format/test_data.csv',
    index=False,
)