In [1]:
import json
import pandas as pd
import time

# refer to this paper
# https://arxiv.org/pdf/2108.07337.pdf

In [2]:
# https://stackoverflow.com/questions/51419785/extract-data-from-wikidata-in-python


def get_candidates(candidate_entities):
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'language': 'en',
        'search': ''
    }
    
    candidates_sent = []
    for label in candidate_entities:
        candidates_for_one = []
        if len(label) ==0:
            continue
        
        params['search'] = str(label)  # set search from the params to our query word
        try:
            r = requests.get(API_ENDPOINT, params = params).json()
        except:
            return "API failed  " + label
#             return label


        try:
            search_arr = r["search"]
        except:
            print (label)
            return label

        for c in search_arr:
            c_in =[]
            try:
                c_in.append(c["id"])
                c_in.append(c["label"])
                c_in.append(c["description"])
                candidates_for_one.append(c_in)
            except:
                pass

        candidates_sent.append(candidates_for_one)
            
    return candidates_sent

In [46]:
def get_labels(ent_id):
    labels = []
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
#         'props': 'labels|aliases',
        'props': 'labels',
        'ids': ''
    }

        
    params['ids'] = str(ent_id)
    try:
        response = requests.get(API_ENDPOINT, params = params).json()['entities']
        labels.append(response[str(ent_id)]['labels']['en']['value'])
        
        for alias in response[str(ent_id)]['aliases']['en']:
            labels.append(alias['value'])
    except:
        return (ent_id, labels)
    return (ent_id, labels)

In [4]:
# jupyter notebook never finishes processing using multiprocessing python 3
# https://stackoverflow.com/questions/47313732/jupyter-notebook-never-finishes-processing-using-multiprocessing-python-3

from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests


def parallal_task(func, iterable, *params): 

    with open(f'./tmp_func.py', 'w') as file:
        file.write("import requests \n")
        file.write(inspect.getsource(func).replace(func.__name__, 'task'))

    from tmp_func import task
    pool = Pool(processes=10)
    res = pool.map(task, iterable)
    pool.close()
    
    os.remove('./tmp_func.py')
    return res
#     else:
#         raise "Not in Jupyter Notebook"

In [5]:
from multiprocessing import Pool
from functools import partial
import inspect
import time
import os
import requests


def parallal_task_2(func, iterable, *params): 

    with open(f'./tmp_func_2.py', 'w') as file:
        file.write("import requests \n")
        file.write(inspect.getsource(func).replace(func.__name__, 'task'))

    from tmp_func_2 import task
    pool = Pool(processes=10)
    res = pool.map(task, iterable)
    pool.close()
    
    os.remove('./tmp_func_2.py')
    return res

In [6]:
from fuzzywuzzy import fuzz

# when creating a function the arguments are (questions and the results wikidata api after lower())
def find_nearest_word_in_question(questions, lower_wikidata_replies):
    entities_list = []
    found_match = False
    counter = 0
    for i in range(len(questions)):
        found_match = False
        for label in lower_wikidata_replies[i][1]:
            if label in questions[i].lower(): 
                found_match = True
                index = questions[i].lower().find(label)
                entities_list.append(questions[i][index : len(label)+index])
                break
        if not found_match:
            split_tokens = questions[i][1].split()
            list_of_comb = []
            list_of_comb.extend(split_tokens)
            scores = []
            for j in range(len(split_tokens)-1):
                list_of_comb.append(split_tokens[j] + ' ' + split_tokens[j+1])
            for j in range(len(split_tokens)-2):
                list_of_comb.append(split_tokens[j] + ' ' + split_tokens[j+1] + ' ' + split_tokens[j+2])
            for j in range(len(split_tokens)-3):
                list_of_comb.append(split_tokens[j] + ' ' + split_tokens[j+1] + ' ' + split_tokens[j+2] + ' ' + split_tokens[j+3])

            for label in lower_wikidata_replies[i][1]:
                for token in list_of_comb: 
                    scores.append((fuzz.ratio(label, token.lower()), token, label))
            scores = sorted(scores, key=lambda x: x[0], reverse=True)

            if len(scores) and scores[0][0] < 85:
                counter += 1
                entities_list.append('')
            elif not(len(scores)):
                counter += 1
                entities_list.append('')
            elif scores[0][0] >= 85:
                entities_list.append(scores[0][1])
    return (entities_list, counter)

In [17]:
with open('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_train.txt', encoding='utf-8') as file:
    lines = file.readlines()

# print(len(lines))
training_data = []
for line in lines:
    line = line.strip().split('\t', 3)
    del line[2]
    del line[1]
    training_data.append(line)
    
qids = [line[0] for line in training_data]
questions = [line[1] for line in training_data]
# print(len(training_data))
# print(training_data[0])
# print(qids[0])
# print(questions[0])

In [19]:
qids

['Q126399',
 'Q12439',
 'Q7370831',
 'Q6817891',
 'Q1297',
 'Q193592',
 'Q19896779',
 'Q6849115',
 'Q842256',
 'Q7273',
 'Q247643',
 'Q6135854',
 'Q4021138',
 'Q184622',
 'Q2405480',
 'Q145410',
 'Q4921907',
 'Q1698',
 'Q237514',
 'Q3738651',
 'Q388408',
 'Q93204',
 'Q3643721',
 'Q7063408',
 'Q707230',
 'Q590103',
 'Q384243',
 'Q71277',
 'Q2227506',
 'Q101625',
 'Q2321544',
 'Q5667368',
 'Q25989',
 'Q734861',
 'Q55442',
 'Q3986646',
 'Q590145',
 'Q438503',
 'Q7749083',
 'Q36322',
 'Q234472',
 'Q1530721',
 'Q43',
 'Q710600',
 'Q5327069',
 'Q1297',
 'Q6230278',
 'Q984029',
 'Q668083',
 'Q154581',
 'Q1348496',
 'Q6687871',
 'Q5553855',
 'Q5506118',
 'Q6836909',
 'Q56094',
 'Q1120576',
 'Q3154598',
 'Q222052',
 'Q43343',
 'Q1219013',
 'Q6249503',
 'Q2023740',
 'Q2138789',
 'Q7680816',
 'Q6050048',
 'Q10280325',
 'Q7349401',
 'Q3116899',
 'Q154194',
 'Q2605514',
 'Q1075868',
 'Q6221698',
 'Q6285529',
 'Q13636457',
 'Q147589',
 'Q5174096',
 'Q605186',
 'Q5466725',
 'Q29172',
 'Q6012446',
 'Q

In [32]:
labels = []
API_ENDPOINT = "https://www.wikidata.org/w/api.php"
params = {
    'action': 'wbgetentities',
    'format': 'json',
    'languages': 'en',
    'props': 'claims',    
    'ids': 'Q126399'
}

#     try:
response = requests.get(API_ENDPOINT, params = params).json()['entities']
response
#         labels.append(response[str(ent_id)]['labels']['en']['value'])
        
#         for alias in response[str(ent_id)]['aliases']['en']:
#             labels.append(alias['value'])
#     except:
#         return (ent_id, labels)

{'Q126399': {'type': 'item',
  'id': 'Q126399',
  'claims': {'P373': [{'mainsnak': {'snaktype': 'value',
      'property': 'P373',
      'hash': 'a628796d86bec5f43a27156210ee152fbee3cec1',
      'datavalue': {'value': 'Warner Bros. Entertainment', 'type': 'string'},
      'datatype': 'string'},
     'type': 'statement',
     'id': 'q126399$3B2A1FB8-16B9-4D74-BB8B-3823770A99F6',
     'rank': 'normal'}],
   'P31': [{'mainsnak': {'snaktype': 'value',
      'property': 'P31',
      'hash': '03f36c5d44daa15d8ad8a4f337bd7cae11b4d847',
      'datavalue': {'value': {'entity-type': 'item',
        'numeric-id': 1107679,
        'id': 'Q1107679'},
       'type': 'wikibase-entityid'},
      'datatype': 'wikibase-item'},
     'type': 'statement',
     'id': 'Q126399$EAF40450-3FCB-478E-98B4-5C2BD6CED344',
     'rank': 'normal'},
    {'mainsnak': {'snaktype': 'value',
      'property': 'P31',
      'hash': '09cda8dc5df05f496773981eeb13389efcccf3d1',
      'datavalue': {'value': {'entity-type': 'item

In [49]:
list(response['Q126399']['claims'].keys())[:15]

['P373',
 'P31',
 'P112',
 'P749',
 'P856',
 'P966',
 'P646',
 'P159',
 'P910',
 'P155',
 'P1711',
 'P1766',
 'P18',
 'P214',
 'P571']

In [56]:
len_of_results = 0
list_of_relations = []

for relation_id in list(response['Q126399']['claims'].keys()):
    label = get_labels(relation_id)[1][0]
    list_of_relations.append(label)
    len_of_results += len(label)

In [57]:
# list_of_relations
len_of_results

1662

In [15]:
start = time.time()
results = parallal_task(get_labels, qids)
print(time.time() - start)
len(results)

KeyboardInterrupt: 