In [1]:
import json
import pandas as pd
import os.path
import time

In [2]:
def get_entity_relation_ids(ent_id):
    relations = []
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'claims',
        'ids': ''
    }
        
    params['ids'] = str(ent_id)
    try:
        response = requests.get(API_ENDPOINT, params = params).json()['entities']
        print(response)
        relations = list(response[str(ent_id)]['claims'].keys())
    except:
        return (ent_id, relations)
    return (ent_id, relations)

In [3]:
def get_relation_label(rel_id):
    label = ''
    API_ENDPOINT = "https://www.wikidata.org/w/api.php"
    params = {
        'action': 'wbgetentities',
        'format': 'json',
        'languages': 'en',
        'props': 'labels',
        'ids': ''
    }
      
    params['ids'] = str(rel_id)

    try:
        response = requests.get(API_ENDPOINT, params = params).json()['entities']
        label = response[str(rel_id)]['labels']['en']['value']
    except:
        return (rel_id, label)
    return (rel_id, label)

In [4]:
import requests
import time


def find_relation_names_and_ids(qid):
    relation_name_id = dict()

    url = 'https://query.wikidata.org/sparql'
    query = '''
    SELECT Distinct ?wdt ?wdLabel
    WHERE {
      VALUES (?o) {(wd:entity)}
      ?s ?wdt ?o .
      ?wd wikibase:directClaim ?wdt .
      ?wd rdfs:label ?wdLabel .
      FILTER (lang(?wdLabel) = "en")
      BIND (COALESCE(?sLabel, ?s) AS ?ssLabel)
     } 
    '''
    qid = qid.replace("'", '')
    copy_query = query.replace('entity', qid)
#     time.sleep(3)
    r = requests.get(url, params = {'format': 'json', 'query': copy_query})
    try:
        data = r.json()
    except:
        print(r)
        return [qid]
    for rel in data['results']['bindings']:
#             if len(rel['wdLabel']['value']) < 35:
        try:
            relation_name_id[rel['wdLabel']['value']] = rel['wdt']['value'].rsplit('/', 1)[1]
        except:
            continue
#     print('finished')
    return [qid, relation_name_id]

In [5]:
def find_count_not_found_relation_ids(data):
    counter = 0
    error_ids = []
    for i in range(len(data['relation_labels'])):
        if 'dict_values(' not in str(data.iloc[i]['relation_labels']):
            counter += 1
    return counter

In [6]:
def read_text_file(path):
    with open(path, encoding='utf-8') as file:
        lines = file.readlines()

        # print(len(lines))
        training_data = []
        for line in lines:
            line = line.strip().split('\t', 3)
            del line[2]

            training_data.append(line)

        # only read half of the file
        training_data = training_data[:round(len(training_data)/2)]
        
        qids = [line[0] for line in training_data]
        target_relation = [line[1].replace('R', 'P') for line in training_data]
        questions = [line[2] for line in training_data]
        training_data = pd.DataFrame({'qid':qids, 'target_relation':target_relation, 'questions':questions})
    return training_data

In [7]:
def find_labels_20(qids):
    futures = []

    start = time.time()
    counter = 0

    rate_limit = RateLimit(max_count=1, per=6)  # 2 requests per 5 seconds

    print('start finding the not found qids')
    with ThreadPoolExecutor() as executor:
        for qid in qids:
            counter += 1
            rate_limit.wait()  # wait before creating the task
            future = executor.submit(find_relation_names_and_ids, qid)
            futures.append(future)
            print(counter)
    print(time.time() - start)  
    return futures

In [8]:
def find_labels_10(qids):
    futures = []

    start = time.time()
    counter = 0

    rate_limit = RateLimit(max_count=10, per=60)  # 2 requests per 5 seconds

    print('start finding the not found qids')
    with ThreadPoolExecutor() as executor:
        for qid in list(not_found['qid']):
            counter += 1
            rate_limit.wait()  # wait before creating the task
            future = executor.submit(find_relation_names_and_ids, qid)
            futures.append(future)
            print(counter)
    print(time.time() - start)  
    return futures

In [9]:
def parse_futures(futures):
    output = []
    counter = 0
    counter_false = 0
    for future in futures:
        qid = ''
        ids = []
        ids_labels = []
        try:
            result = future.result()
            if len(result) > 1:
                qid = result[0]
                ids = list(result[1].keys())
                ids_labels = list(result[1].values())
            else:
                counter += 1
                qid = result
                ids = 'error'
                ids_labels = 'error'
        except:
            counter_false += 1
            qid = 'error'
            ids = 'error'
            ids_labels = 'error'

        output.append([qid, ids, ids_labels])
    return output

In [10]:
def remove_brackets(row):
    row['qid'] = str(row['qid']).replace('[', '')
    row['qid'] = row['qid'].replace(']', '')
    row['qid'] = row['qid'].replace("'", '')
    return str(row['qid'])

## Training data

In [10]:
relation_training_data = pd.read_csv('3-Relation_Linking_Data/0-preprocess/qid_object_relations_training.csv')

In [11]:
original_training_data = read_text_file('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_train.txt')
original_training_data = original_training_data.drop_duplicates()
original_training_data

Unnamed: 0,qid,target_relation,questions
0,Q126399,P272,what movie is produced by warner bros.
1,Q12439,P19,who is a musician born in detroit
2,Q7370831,P162,who produced the film rough house rosie
3,Q6817891,P364,what is the language in which mera shikar was ...
4,Q1297,P276,Whats the name of a battle that happened in ch...
...,...,...,...
17182,Q94033,P136,What kind of film is the twilight samurai?
17183,Q4933124,P413,What baseball position did bob lawsonp play
17184,Q1078039,P19,what is the place of birth for john tait rober...
17185,Q7289124,P364,What language is ramayan filmed in?


In [12]:
find_count_not_found_relation_ids(relation_training_data)

937

In [13]:
relation_training_data = relation_training_data.drop_duplicates()

In [14]:
find_count_not_found_relation_ids(relation_training_data)

746

In [15]:
result = pd.merge(original_training_data, relation_training_data, how="left", on='qid')
result = result.drop_duplicates(subset=['qid', 'target_relation', 'questions'])
result

Unnamed: 0,qid,target_relation,questions,relation_id,relation_labels
0,Q126399,P272,what movie is produced by warner bros.,"dict_keys(['employer', 'owned by', 'record lab...","dict_values(['P108', 'P127', 'P264', 'P162', '..."
7,Q12439,P19,who is a musician born in detroit,"dict_keys(['place of birth', 'home port', 'loc...","dict_values(['P19', 'P504', 'P276', 'P495', 'P..."
9,Q7370831,P162,who produced the film rough house rosie,dict_keys([]),dict_values([])
10,Q6817891,P364,what is the language in which mera shikar was ...,dict_keys([]),dict_values([])
11,Q1297,P276,Whats the name of a battle that happened in ch...,"dict_keys(['place of birth', 'residence', 'pla...","dict_values(['P19', 'P551', 'P291', 'P276', 'P..."
...,...,...,...,...,...
36320,Q94033,P136,What kind of film is the twilight samurai?,"dict_keys(['notable work', 'follows'])","dict_values(['P800', 'P155'])"
36321,Q4933124,P413,What baseball position did bob lawsonp play,dict_keys([]),dict_values([])
36322,Q1078039,P19,what is the place of birth for john tait rober...,dict_keys([]),dict_values([])
36323,Q7289124,P364,What language is ramayan filmed in?,dict_keys([]),dict_values([])


In [16]:
not_found = result[result['relation_id'].isnull()]
not_found['qid']

114            Q43
316       Q7605291
325         Q82955
392         Q36180
611        Q262802
           ...    
36309       Q25973
36314     Q3072796
36315      Q594866
36316       Q14960
36317    Q16081732
Name: qid, Length: 765, dtype: object

In [17]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from ratemate import RateLimit

futures = find_labels_20(list(not_found['qid']))

start finding the not found qids
1
2
3
4
5
6
7
8
9
10
11
<Response [500]>
12
13
<Response [500]>
14
<Response [500]>
15
<Response [429]>
16
17
18
19
20
21
22
23
24
25
26
<Response [500]>
27
28
29
30
31
32
33
34
35
<Response [500]>
36
37
38
39
40
41
42
43
44
<Response [429]>
45
<Response [429]>
46
<Response [500]>
47
<Response [500]>
48
49
<Response [500]>
50
51
52
53
54
55
56
57
58
<Response [500]>
59
60
61
62
63
64
65
66
67
68
69
<Response [500]>
70
<Response [500]>
<Response [429]>
71
<Response [500]>
72
73
<Response [500]>
74
75
76
77
78
<Response [500]>
79
80
81
82
<Response [500]>
83
<Response [500]>
<Response [500]>
84
<Response [500]>
85
<Response [500]>
86
87
88
89
90
91
<Response [500]>
92
93
<Response [429]>
94
<Response [500]>
95
96
<Response [500]>
97
98
99
<Response [500]>
100
101
102
<Response [429]>
103
104
105
<Response [429]>
<Response [500]>
106
107
108
<Response [500]>
109
110
111
<Response [500]>
112
113
114
115
<Response [429]>
116
<Response [500]>
117
<Response [5

In [83]:
len(result)

17065

In [28]:
output = parse_futures(futures)
output = pd.DataFrame(output, columns=['qid', 'relation_id', 'relation_labels'])
output['qid'] = output.apply(remove_brackets, axis = 1)
output

In [91]:
output = output[output['relation_id'] != 'error']
output

Unnamed: 0,qid,relation_id,relation_labels
1,Q7605291,[],[]
4,Q262802,"[statistical leader, captain, flag bearer]","[P3279, P634, P3022]"
5,Q3710055,"[doctoral advisor, author, doctoral student, s...","[P184, P50, P185, P1066]"
6,Q2756904,"[present in work, derivative work]","[P1441, P4969]"
7,Q16832088,[],[]
...,...,...,...
760,Q25973,"[author, father, spouse, child, named after, m...","[P50, P22, P26, P40, P138, P921, P180, P2650, ..."
761,Q3072796,[has part(s)],[P527]
762,Q594866,"[named after, author, main subject, category's...","[P138, P50, P921, P301]"
763,Q14960,"[place of birth, work location, location, resi...","[P19, P937, P276, P551, P291, P180, P190, P301..."


In [92]:
training_output = pd.merge(not_found[['qid','target_relation','questions']], output, how="left", on='qid')
training_output = training_output.drop_duplicates(subset=['qid','target_relation','questions'])
training_output
# output = 
# pd.merge(original_training_data, relation_training_data, how="left", on='qid')

Unnamed: 0,qid,target_relation,questions,relation_id,relation_labels
0,Q43,P17,what is a second level division of turkey,,
1,Q7605291,P495,what country does the film stealing a nation t...,[],[]
2,Q82955,P106,what is politician is founder and chairperson ...,,
3,Q36180,P106,what British science fiction author and writer...,,
4,Q262802,P19,Where in canada was christine sinclair born,"[statistical leader, captain, flag bearer]","[P3279, P634, P3022]"
...,...,...,...,...,...
764,Q25973,P738,what known person did hermann hesse influence,"[author, father, spouse, child, named after, m...","[P50, P22, P26, P40, P138, P921, P180, P2650, ..."
765,Q3072796,P178,who developed the cvg fire emblem: seisen no k...,[has part(s)],[P527]
766,Q594866,P20,where did anton tomaž linhart die,"[named after, author, main subject, category's...","[P138, P50, P921, P301]"
767,Q14960,P19,Who was a person that was born in brno,"[place of birth, work location, location, resi...","[P19, P937, P276, P551, P291, P180, P190, P301..."


In [93]:
len(not_found)

765

In [94]:
new_training_output = pd.concat([result, training_output])
new_training_output.head()

Unnamed: 0,qid,target_relation,questions,relation_id,relation_labels
0,Q126399,P272,what movie is produced by warner bros.,"dict_keys(['employer', 'owned by', 'record lab...","dict_values(['P108', 'P127', 'P264', 'P162', '..."
7,Q12439,P19,who is a musician born in detroit,"dict_keys(['place of birth', 'home port', 'loc...","dict_values(['P19', 'P504', 'P276', 'P495', 'P..."
9,Q7370831,P162,who produced the film rough house rosie,dict_keys([]),dict_values([])
10,Q6817891,P364,what is the language in which mera shikar was ...,dict_keys([]),dict_values([])
11,Q1297,P276,Whats the name of a battle that happened in ch...,"dict_keys(['place of birth', 'residence', 'pla...","dict_values(['P19', 'P551', 'P291', 'P276', 'P..."


In [95]:
new_training_output = new_training_output[~new_training_output['relation_id'].isna()]

In [96]:
new_training_output.to_csv('3-Relation_Linking_Data/0-preprocess/training_data_find_obj.csv', index=False)

## Validation data

In [11]:
val_relation_data = pd.read_csv('3-Relation_Linking_Data/0-preprocess/qid_object_relations_valid.csv')

In [12]:
val_original_data = read_text_file('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_valid.txt')
val_original_data = val_original_data.drop_duplicates()
val_original_data

Unnamed: 0,qid,target_relation,questions
0,Q3541144,P138,Who was the trump ocean club international hot...
1,Q318926,P19,where was sasha vujačić born
2,Q2568216,P57,What is a film directed by wiebke von carolsfeld?
3,Q2275923,P106,What was Seymour Parker Gilbert's profession?
4,Q2856873,P20,in what french city did antoine de févin die
...,...,...,...
2429,Q1142644,P136,what kind of music does don snow make
2430,Q980677,P86,Who did the music for lage raho munna bhai?
2431,Q3566870,P21,What is wayne wade's gender?
2432,Q132311,P136,what book is in the genre fantasy?


In [13]:
find_count_not_found_relation_ids(val_relation_data)

100

In [14]:
val_relation_data = val_relation_data.drop_duplicates()

In [15]:
find_count_not_found_relation_ids(val_relation_data)

84

In [16]:
val_result = pd.merge(val_original_data, val_relation_data, how="left", on='qid')
val_result = val_result.drop_duplicates(subset=['qid', 'target_relation', 'questions'])
val_result

Unnamed: 0,qid,target_relation,questions,relation_id,relation_labels
0,Q3541144,P138,Who was the trump ocean club international hot...,dict_keys(['location']),dict_values(['P276'])
1,Q318926,P19,where was sasha vujačić born,dict_keys([]),dict_values([])
2,Q2568216,P57,What is a film directed by wiebke von carolsfeld?,"dict_keys(['director', 'screenwriter', 'film e...","dict_values(['P57', 'P58', 'P1040'])"
3,Q2275923,P106,What was Seymour Parker Gilbert's profession?,dict_keys(['main subject']),dict_values(['P921'])
4,Q2856873,P20,in what french city did antoine de févin die,dict_keys(['main subject']),dict_values(['P921'])
...,...,...,...,...,...
2920,Q1142644,P136,what kind of music does don snow make,dict_keys(['has part(s)']),dict_values(['P527'])
2921,Q980677,P86,Who did the music for lage raho munna bhai?,dict_keys([]),dict_values([])
2922,Q3566870,P21,What is wayne wade's gender?,dict_keys([]),dict_values([])
2923,Q132311,P136,what book is in the genre fantasy?,"dict_keys(['instance of', 'field of work', 'ge...","dict_values(['P31', 'P101', 'P136', 'P135', 'P..."


In [17]:
val_not_found = val_result[val_result['relation_id'].isnull()]
val_not_found['qid']

39        Q444947
143         Q2736
144           Q43
228          Q262
230      Q4670857
          ...    
2833     Q6581072
2864    Q10860861
2883     Q5978442
2889     Q1918017
2893     Q1208949
Name: qid, Length: 83, dtype: object

In [18]:
len(val_not_found['qid'])

83

In [19]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from ratemate import RateLimit

val_futures = find_labels_20(list(val_not_found['qid']))

start finding the not found qids
1
2
3
4
5
6
7
8
9
10
11
12
<Response [500]>
13
<Response [500]>
14
<Response [500]>
15
16
17
<Response [500]>
18
<Response [429]>
19
20
21
<Response [500]>
22
<Response [500]>
23
24
25
26
27
28
29
30
31
32
<Response [500]>
33
34
35
36
<Response [500]>
37
<Response [500]>
38
39
40
41
42
43
<Response [500]>
44
<Response [500]>
45
46
47
48
<Response [500]>
49
<Response [500]>
<Response [429]>
50
<Response [500]>
<Response [429]>
51
52
<Response [500]>
53
54
<Response [500]>
55
56
57
58
59
60
61
62
63
64
<Response [500]>
65
66
67
68
69
<Response [500]>
70
71
72
73
74
75
76
77
78
79
80
<Response [500]>
81
82
83
<Response [500]>
<Response [500]>
531.888664484024


In [20]:
len(val_result)

2430

In [25]:
val_output = parse_futures(val_futures)
val_output = pd.DataFrame(val_output, columns=['qid', 'relation_id', 'relation_labels'])
val_output['qid'] = val_output.apply(remove_brackets, axis = 1)
val_output

Unnamed: 0,qid,relation_id,relation_labels
0,Q444947,"[spouse, child, named after, depicts, category...","[P26, P40, P138, P180, P301, P921, P3373]"
1,Q2736,error,error
2,Q43,error,error
3,Q262,error,error
4,Q4670857,"[follows, followed by]","[P155, P156]"
...,...,...,...
78,Q6581072,error,error
79,Q10860861,[],[]
80,Q5978442,[participant in],[P1344]
81,Q1918017,[],[]


In [26]:
val_output = val_output[val_output['relation_id'] != 'error']
val_output

Unnamed: 0,qid,relation_id,relation_labels
0,Q444947,"[spouse, child, named after, depicts, category...","[P26, P40, P138, P180, P301, P921, P3373]"
4,Q4670857,"[follows, followed by]","[P155, P156]"
5,Q107659,[],[]
7,Q5488318,[],[]
8,Q2605094,"[follows, followed by, part of]","[P155, P156, P361]"
9,Q5237018,"[producer, composer]","[P162, P86]"
12,Q1461,"[demonym of, item for this sense, place of bir...","[P6271, P5137, P19, P1855, P190, P36, P20, P47..."
13,Q460824,"[doctoral advisor, editor, doctoral student, o...","[P184, P98, P185, P1308, P527, P921]"
14,Q5246566,[],[]
15,Q937220,"[creator, said to be the same as]","[P170, P460]"


In [27]:
val_output_2 = pd.merge(val_not_found[['qid','target_relation','questions']], val_output, how="left", on='qid')
val_output_2 = val_output_2.drop_duplicates(subset=['qid','target_relation','questions'])
val_output_2
# output = 
# pd.merge(original_training_data, relation_training_data, how="left", on='qid')

Unnamed: 0,qid,target_relation,questions,relation_id,relation_labels
0,Q444947,P138,Which city in Alabama was named after richard ...,"[spouse, child, named after, depicts, category...","[P26, P40, P138, P180, P301, P921, P3373]"
1,Q2736,P136,what is a football film on netflix?,,
2,Q43,P19,List an actor born in turkey,,
3,Q262,P17,where is the administrative division in algeria?,,
4,Q4670857,P31,how is (35063) 1988 fd classified,"[follows, followed by]","[P155, P156]"
...,...,...,...,...,...
84,Q6581072,P21,what character is a female?,,
85,Q10860861,P364,What language is there are things you dont kno...,[],[]
86,Q5978442,P364,What is the language of the film i melt with you?,[participant in],[P1344]
87,Q1918017,P495,is oxygen created in netherlands or finland,[],[]


In [28]:
len(val_not_found)

83

In [29]:
new_val_output = pd.concat([val_result, val_output_2])
new_val_output.head()

Unnamed: 0,qid,target_relation,questions,relation_id,relation_labels
0,Q3541144,P138,Who was the trump ocean club international hot...,dict_keys(['location']),dict_values(['P276'])
1,Q318926,P19,where was sasha vujačić born,dict_keys([]),dict_values([])
2,Q2568216,P57,What is a film directed by wiebke von carolsfeld?,"dict_keys(['director', 'screenwriter', 'film e...","dict_values(['P57', 'P58', 'P1040'])"
3,Q2275923,P106,What was Seymour Parker Gilbert's profession?,dict_keys(['main subject']),dict_values(['P921'])
4,Q2856873,P20,in what french city did antoine de févin die,dict_keys(['main subject']),dict_values(['P921'])


In [30]:
new_val_output = new_val_output[~new_val_output['relation_id'].isna()]

In [31]:
new_val_output.to_csv('3-Relation_Linking_Data/0-preprocess/val_data_find_obj.csv', index=False)

## Test data

In [33]:
test_relation_data = pd.read_csv('3-Relation_Linking_Data/0-preprocess/qid_object_relations_test.csv')

In [34]:
original_test_data = read_text_file('2-NEL_Data/0-raw_data/simple_questions_v2/annotated_wd_data_test.txt')
original_test_data = original_test_data.drop_duplicates()
original_test_data

Unnamed: 0,qid,target_relation,questions
0,Q5487302,P136,Which genre of album is harder.....faster?
1,Q16330302,P19,what city was alex golfis born in
2,Q16225521,P58,what film is by the writer phil hay?
3,Q7358590,P20,Where did roger marquis die
4,Q154335,P509,what was the cause of death of yves klein
...,...,...,...
4975,Q2074562,P413,what position does corey webster play in football
4976,Q1633373,P21,what is the sex of hubert renfro knickerbocker?
4977,Q9759,P136,what is the name of a blues artist that plays ...
4978,Q740939,P20,What is the place of death of lane smith?


In [35]:
find_count_not_found_relation_ids(test_relation_data)

317

In [36]:
test_relation_data = test_relation_data.drop_duplicates()

In [37]:
find_count_not_found_relation_ids(test_relation_data)

265

In [38]:
test_result = pd.merge(original_test_data, test_relation_data, how="left", on='qid')
test_result = test_result.drop_duplicates(subset=['qid', 'target_relation', 'questions'])
test_result

Unnamed: 0,qid,target_relation,questions,relation_id,relation_labels
0,Q5487302,P136,Which genre of album is harder.....faster?,"dict_keys(['follows', 'followed by', 'part of'])","dict_values(['P155', 'P156', 'P361'])"
1,Q16330302,P19,what city was alex golfis born in,dict_keys(['cast member']),dict_values(['P161'])
2,Q16225521,P58,what film is by the writer phil hay?,"dict_keys(['father', 'spouse', 'screenwriter',...","dict_values(['P22', 'P26', 'P58', 'P162'])"
3,Q7358590,P20,Where did roger marquis die,dict_keys(['different from']),dict_values(['P1889'])
4,Q154335,P509,what was the cause of death of yves klein,"dict_keys(['spouse', 'founded by', 'child', 'f...","dict_values(['P26', 'P112', 'P40', 'P22', 'P73..."
...,...,...,...,...,...
6900,Q2074562,P413,what position does corey webster play in football,dict_keys([]),dict_values([])
6901,Q1633373,P21,what is the sex of hubert renfro knickerbocker?,dict_keys([]),dict_values([])
6902,Q9759,P136,what is the name of a blues artist that plays ...,"dict_keys(['item for this sense', 'genre', 'Wi...","dict_values(['P5137', 'P136', 'P1855', 'P144',..."
6903,Q740939,P20,What is the place of death of lane smith?,"dict_keys(['cast member', 'performer', 'contri...","dict_values(['P161', 'P175', 'P767', 'P736'])"


In [39]:
test_not_found = test_result[test_result['relation_id'].isnull()]
test_not_found['qid']

50         Q1085
129     Q1627134
185     Q4760949
190     Q3735565
192     Q1063264
          ...   
6823      Q11678
6824    Q6429913
6826    Q7546039
6835    Q2724278
6836    Q2712639
Name: qid, Length: 284, dtype: object

In [40]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from ratemate import RateLimit

test_futures = find_labels_20(list(test_not_found['qid']))

start finding the not found qids
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
<Response [429]>
20
<Response [500]>
21
<Response [500]>
22
23
24
25
<Response [500]>
26
27
28
29
30
<Response [500]>
31
<Response [500]>
32
33
<Response [500]>
34
35
36
37
<Response [429]>
38
<Response [500]>
39
40
<Response [500]>
41
<Response [500]>
<Response [500]>
42
<Response [429]>
43
44
45
46
47
48
49
50
<Response [500]>
51
52
53
54
55
56
57
58
59
60
61
<Response [500]>
62
<Response [500]>
63
<Response [500]>
64
65
<Response [500]>
66
67
68
69
70
71
72
73
74
75
76
77
78
79
<Response [500]>
80
<Response [429]>
81
82
83
<Response [500]>
84
85
<Response [429]>
86
87
88
89
90
91
92
93
<Response [500]>
94
95
96
<Response [500]>
97
98
99
100
<Response [500]>
101
102
<Response [500]>
103
<Response [500]>
<Response [429]>
104
105
106
107
108
109
110
111
<Response [500]>
112
<Response [500]>
113
114
115
116
117
118
119
120
121
122
123
124
<Response [500]>
125
126
127
128
<Response [500]>
129
<Response [429]

In [41]:
len(test_result)

4970

In [42]:
test_output = parse_futures(test_futures)
test_output = pd.DataFrame(test_output, columns=['qid', 'relation_id', 'relation_labels'])
test_output['qid'] = test_output.apply(remove_brackets, axis = 1)
test_output

Unnamed: 0,qid,relation_id,relation_labels
0,error,error,error
1,Q1627134,[],[]
2,Q4760949,[],[]
3,Q3735565,"[follows, followed by]","[P155, P156]"
4,Q1063264,"[enemy of, characters]","[P7047, P674]"
...,...,...,...
279,Q11678,"[based on, follows, has part(s), edition or tr...","[P144, P155, P527, P629, P1441]"
280,Q6429913,[head coach],[P286]
281,Q7546039,"[performer, category's main topic, category co...","[P175, P301, P971]"
282,Q2724278,"[follows, followed by]","[P155, P156]"


In [43]:
test_output = test_output[test_output['relation_id'] != 'error']
test_output

Unnamed: 0,qid,relation_id,relation_labels
1,Q1627134,[],[]
2,Q4760949,[],[]
3,Q3735565,"[follows, followed by]","[P155, P156]"
4,Q1063264,"[enemy of, characters]","[P7047, P674]"
5,Q7795073,[],[]
...,...,...,...
279,Q11678,"[based on, follows, has part(s), edition or tr...","[P144, P155, P527, P629, P1441]"
280,Q6429913,[head coach],[P286]
281,Q7546039,"[performer, category's main topic, category co...","[P175, P301, P971]"
282,Q2724278,"[follows, followed by]","[P155, P156]"


In [44]:
test_output_2 = pd.merge(test_not_found[['qid','target_relation','questions']], test_output, how="left", on='qid')
test_output_2 = test_output_2.drop_duplicates(subset=['qid','target_relation','questions'])
test_output_2
# output = 
# pd.merge(original_training_data, relation_training_data, how="left", on='qid')

Unnamed: 0,qid,target_relation,questions,relation_id,relation_labels
0,Q1085,P19,Who was born in prague,,
1,Q1627134,P404,What is a gampelay mode found in the game alle...,[],[]
2,Q4760949,P170,what American cartoonist is the creator of and...,[],[]
3,Q3735565,P676,Who is the main lyricist from everything dies,"[follows, followed by]","[P155, P156]"
4,Q1063264,P21,what gender is mohinder suresh,"[enemy of, characters]","[P7047, P674]"
...,...,...,...,...,...
284,Q11678,P156,what is the next book in the hunger games series,"[based on, follows, has part(s), edition or tr...","[P144, P155, P527, P629, P1441]"
285,Q6429913,P27,what is konstantin dzutsev's nationality?,[head coach],[P286]
286,Q7546039,P136,what type of music does is smoking popes known...,"[performer, category's main topic, category co...","[P175, P301, P971]"
287,Q2724278,P136,what sort of music is the album entangled in c...,"[follows, followed by]","[P155, P156]"


In [45]:
len(test_not_found)

284

In [46]:
new_test_output = pd.concat([test_result, test_output_2])
new_test_output.head()

Unnamed: 0,qid,target_relation,questions,relation_id,relation_labels
0,Q5487302,P136,Which genre of album is harder.....faster?,"dict_keys(['follows', 'followed by', 'part of'])","dict_values(['P155', 'P156', 'P361'])"
1,Q16330302,P19,what city was alex golfis born in,dict_keys(['cast member']),dict_values(['P161'])
2,Q16225521,P58,what film is by the writer phil hay?,"dict_keys(['father', 'spouse', 'screenwriter',...","dict_values(['P22', 'P26', 'P58', 'P162'])"
3,Q7358590,P20,Where did roger marquis die,dict_keys(['different from']),dict_values(['P1889'])
4,Q154335,P509,what was the cause of death of yves klein,"dict_keys(['spouse', 'founded by', 'child', 'f...","dict_values(['P26', 'P112', 'P40', 'P22', 'P73..."


In [47]:
new_test_output = new_test_output[~new_test_output['relation_id'].isna()]

In [48]:
new_test_output.to_csv('3-Relation_Linking_Data/0-preprocess/test_data_find_obj.csv', index=False)