In [23]:
import torch

import pandas as pd
import numpy as np
import json
from SPARQLWrapper import SPARQLWrapper, JSON
import requests 
import itertools
import spotlight
import tagme
import inflect
p = inflect.engine()
import re
import sys

from nltk.stem.porter import *
stemmer = PorterStemmer()

tagme.GCUBE_TOKEN = ""

In [24]:
with open('data/lc-quad/templates.json') as f:
    templates = json.load(f)

with open('data/lc-quad/test/id.txt') as f:
    ids = f.read().strip().split('\n')
    
with open('data/lc-quad/test/input.txt') as f:
    questions = f.read().strip().split('\n')
    
def preprocess_relations(file, prop=False):
    relations = {}
    with open(file) as f:
        content = f.readlines()
        for line in content:
            split_line = line.split()

            key = ' '.join(split_line[2:])[1:-3].lower()
            key = ' '.join([stemmer.stem(word) for word in key.split()])
            
            if key not in relations:
                relations[key] = []
            
            uri = split_line[0].replace('<', '').replace('>', '')
            
            if prop is True:                
                uri_property = uri.replace('/ontology/', '/property/')
                relations[key].extend([uri, uri_property])
            else:
                relations[key].append(uri)                
    return relations    

properties = preprocess_relations('dbpedia_3Eng_property.ttl', True)
classes = preprocess_relations('dbpedia_3Eng_class.ttl')

In [25]:
# Prediction k-templates
#saved_model = torch.load('checkpoints/Even more grammar changes,epoch=6,test_acc=0.8246887966804979.pt')
saved_model = torch.load('checkpoints/rnn-question-answering,epoch=7,test_acc=0.8143153526970954.pt')
trainer = saved_model['trainer']
test_dataset = torch.load('data/lc-quad/pth/lc_quad_test.pth')

In [26]:
y_true = []
y_pred = []
output_vocab = trainer.vocabs['output']

for index in range(len(test_dataset)):
    torch.no_grad()    
    _, toks_sent, _, _, _ = test_dataset[index]
    tree, emb, target = trainer.get_data(test_dataset[index], test_dataset.num_classes)
    output = trainer.model.forward(tree, emb, training=False)
    _, pred = torch.topk(output[0].squeeze(0), 2)
    
    pred = pred.numpy()
    target = target.numpy()
    
    pred_0 = output_vocab.idxToLabel[pred[0]]
    pred_1 = output_vocab.idxToLabel[pred[1]]
    target = output_vocab.idxToLabel[target[0]]    
    
    y_true.append(target)
    y_pred.append([pred_0, pred_1])



In [27]:
# Sorts based on descending order of values
def sort_dict_by_values(dictionary):
    keys = []
    for key, value in sorted(dictionary.items(), key=lambda item: (item[1], item[0]), reverse=True):
        keys.append(key)
    return keys

def get_earl_entities(query):
    THRESHOLD = 0.0001
    MAX_RESOURCES = 3
    MAX_PREDICATES = 5
    MAX_CLASSES = 3
    
    response = requests.post('http://sda.tech/earl/api/processQuery', 
                             json={"nlquery": query, "pagerankflag": False})
    
    json_response = json.loads(response.text)
    r_dict = {}
    c_dict = {}
    p_dict = {}
    
    reranked_lists = json_response['rerankedlists']
    for key in reranked_lists.keys():
        for result in reranked_lists[key]:
            if result[0] < THRESHOLD:
                continue                
            
            if result[1].startswith('http://dbpedia.org/resource/'):
                r_dict[result[1]] = result[0]
            elif result[1].startswith('http://dbpedia.org/'): 
                # Add to Classes
                if result[1].split('/')[-1][0].isupper(): 
                    c_dict[result[1]] = result[0]
                # Add to Predicates
                else:
                    p_dict[result[1]] = result[0]
        
    return {
        'r': sort_dict_by_values(r_dict)[:MAX_RESOURCES], 
        'p': sort_dict_by_values(p_dict)[:MAX_PREDICATES], 
        'c': sort_dict_by_values(c_dict)[:MAX_CLASSES]
    }

In [28]:
def get_tag_me_entities(query):
    MAX_ENTITIES = 5
    results = []
    response = requests.get("https://tagme.d4science.org/tagme/tag?lang=en&gcube-token={}&text={}"
                           .format('1b4eb12e-d434-4b30-8c7f-91b3395b96e8-843339462', query))
    
    annotations = {}
    for annotation in json.loads(response.text)['annotations']:        
        annotations['http://dbpedia.org/resource/' + annotation['title'].replace(' ', '_')] = annotation['rho']
    return sort_dict_by_values(annotations)[:MAX_ENTITIES]

In [29]:
def get_nliwod_entities(query, hashmap, include_properties = False):
#     ignore_list = ['name', 'list']
    ignore_list = []
    entities = []
    singular_query = [stemmer.stem(word) if p.singular_noun(word) == False else stemmer.stem(p.singular_noun(word)) for word in query.lower().split(' ')]
    
    for key in hashmap.keys():
        if key in ' '.join(singular_query) and len(key) > 2 and key not in ignore_list:
            entities += hashmap[key]
    return list(set(entities))

In [30]:
def get_entities(query):    
    return {
        'r': get_tag_me_entities(query),
        'p': get_nliwod_entities(query, properties, True),
        'c': get_nliwod_entities(query, classes)
    }

In [31]:
def get_spotlight_entities(query):
    entities = []
    try:
        annotations = spotlight.annotate('http://model.dbpedia-spotlight.org/en/annotate', query, confidence=0.4)
        for annotation in annotations:
            entities.append(annotation['URI'])
    except:
        pass
    return entities

In [32]:
def make_sparql_query(query, return_var):
    sparql = SPARQLWrapper("http://akswnc9.aksw.uni-leipzig.de/dbpedia/sparql")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    
    if return_var == 'boolean':
        if 'boolean' in results:            
            return results['boolean']
        else:
            return None
    
    output = []
    for result in results["results"]["bindings"]:
        if return_var in result:
            output.append(result[return_var]['value'])
    return output


def get_rdfs_label(prop):
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery("""
    SELECT ?label WHERE { <""" + prop + """> rdfs:label ?label . FILTER(lang(?label) = 'en') }
    """)
    sparql.setReturnFormat(JSON)    
    results = sparql.query().convert()
    
    for result in results["results"]["bindings"]:
        return result['label']['value']
    return None

In [33]:
df = pd.read_csv('data/lc-quad/dataset.csv')
def get_entities_from_answer(index):
#     string = df[df['_id'] == int(id)]['sparql_query'].tolist()[0]        
    string = qald[index]['sparql_query'] 
    matches = re.findall('<[^>]*>', string)
    matches = [uri.replace('<', '').replace('>', '') for uri in matches if uri != '<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>']
    
    results = {'r': set(), 'p': set(), 'c': set()}    
    for match in matches:
        if match.startswith('http://dbpedia.org/resource/'):
            results['r'].add(match)
        elif match.split('/')[-1][0].isupper():
            results['c'].add(match)
        else:
            results['p'].add(match)
    
    return results

FileNotFoundError: [Errno 2] No such file or directory: 'data/lc-quad/dataset.csv'

In [17]:
# # QALD Slot Filling
# hashmap = {}
# with open('data/qald/qald-7-train-multilingual.json', 'r') as f:
#     qald_train = json.load(f)

# with open('data/qald/qald.json', 'r') as f:
#     qald = json.load(f)

# for question in qald_train['questions']:
#     hashmap[question['question'][0]['string']] = question['query']['sparql']

# for index in range(len(qald)):
#     qald[index]['sparql_query'] = hashmap[qald[index]['question']]

In [34]:
def detect_entities(question, index):
    entities_list = get_entities_from_answer(index)
    detected_entities = get_earl_entities(question)
    detected_entities['r'] = get_tag_me_entities(question) + get_spotlight_entities(question)

    for e_index in range(len(detected_entities['p'])): # Adding property namespace to Earl predicates
        entity = detected_entities['p'][e_index]
        detected_entities['p'].append(entity.replace('/ontology/', '/property/'))

    detected_entities['p'] += get_nliwod_entities(question, properties)

    for e_index in range(len(detected_entities['p'])): # Adding plural properties to all detected predicates
        entity = detected_entities['p'][e_index]
        if entity.startswith('http://dbpedia.org/property/'):
            pred = entity.split('/')[-1].split('_')
            if len(pred) == 1:
                detected_entities['p'].append('http://dbpedia.org/property/' + p.plural_noun(pred[0]))

    detected_entities['r'] = set(detected_entities['r'])
    detected_entities['p'] = set(detected_entities['p'])

    unique_p = set()
    for entity in detected_entities['p']:
        e = p.singular_noun(entity.split('/')[-1])
        if e is not False:
            unique_p.add(e)

    for entity in unique_p:
        if entity == entity.lower():
            detected_entities['c'].append('http://dbpedia.org/ontology/' + entity[0].upper() + entity[1:])        
    detected_entities['c'] = set(detected_entities['c'])
    return detected_entities, entities_list, unique_p

In [36]:
start = 0
exists = 0
count = 1
results = []

# res_df = pd.read_csv('slot_results.csv')

for index in range(start, len(questions)):
#     df_row = res_df.loc[index,:]
#     if df_row['Correct'] == True:
#         continue
        
#     if df_row['C_R'] != df_row['N_R'] and df_row['C_P'] == df_row['N_P']:
    question = questions[index]

    print('\n' * 2)    
    print(index, question)
    
    detected_entities, entities_list, unique_p = detect_entities(question, index)
    row = [len(entities_list['r']), len(entities_list['p']), len(entities_list['c']),
           len(detected_entities['r']), len(unique_p), len(detected_entities['c']),

           len(entities_list['r'].intersection(detected_entities['r'])),
           len(entities_list['p'].intersection(detected_entities['p'])),
           len(entities_list['c'].intersection(detected_entities['c']))
          ]
    row.append(row[0] == row[-3] and row[1] == row[-2] and row[2] == row[-1]) # Check if number and correct are the same    

#     results.append(row)
#     res_df.loc[index,:] = row
    print('\n', entities_list['r'] - detected_entities['r'], '\n', entities_list['p'] - detected_entities['p'])    
    print(index, row)            
    break




0 Philadelphia City Council is the governing body of which city?


NameError: name 'get_entities_from_answer' is not defined

In [20]:
# res_df = pd.DataFrame(results, columns=['N_R', 'N_P', 'N_C', 'D_R', 'D_P', 'D_C', 'C_R', 'C_P', 'C_C', 'Correct'])
# res_df.to_csv('slot_results.csv', index=False)

In [21]:
# with open('props.csv') as f:
#     props = f.read().split('\n')
# props = props[1:]

# string = ''
# for prop in props:
#     label = get_rdfs_label(prop)
#     if label is not None:
#         print(prop, label)
#         string += '<' + prop + '>' + '<http://www.w3.org/2000/01/rdf-schema#label> "' + label + '" .\n'
        
# with open('props.txt', 'w') as f:
#     f.write(string)

In [22]:
correct = res_df['Correct']
accuracy = 0
top_2_accuracy = 0

for index in range(len(questions)):
    if correct[index] == True:
        if y_true[index] == y_pred[index][0]:
            accuracy += 1
            top_2_accuracy += 1
        elif y_true[index] == y_pred[index][1]:
            top_2_accuracy += 1
accuracy / len(correct), top_2_accuracy / len(correct), correct.sum() / len(correct)

NameError: name 'res_df' is not defined

In [30]:
len(res_df[res_df['N_R'] == res_df['C_R']]) / len(questions), len(res_df[res_df['N_P'] == res_df['C_P']]) / len(questions), len(res_df[(res_df['N_C'] == res_df['C_C']) & res_df['N_C'] > 0]) / len(res_df[res_df['N_C'] > 0])

(0.8132780082987552, 0.6182572614107884, 0.837465564738292)

In [49]:
def get_answer(entities, template_id):
    output = []
    sparql_query = ''
    
    template = templates[str(template_id)]
    slots = {}
    for slot in template['slots']:
        slots[slot] = entities[slot[0]]        
    
    # This means something probably went wrong and no predicates or resources were detected for the query
    if len(slots['p']) == 0 or len(slots['r']) == 0:
        return output, sparql_query
    
    ranges = []
    slot_keys = list(slots.keys())
    slot_len = len(slot_keys)
    for slot in slot_keys:
        ranges.append(range(len(slots[slot]))) 
    
    for i in itertools.product(*ranges):
        if slot_keys[0] == 'p' and slot_keys[1] == 'p2' and slots['p'][i[0]] == slots['p2'][i[1]]:
            continue
        if slot_keys[-2] == 'r' and slot_keys[-1] == 'r2' and slots['r'][i[-2]] == slots['r2'][i[-1]]:
            continue
            
        sparql_query = template['sparql']                
        for index in range(slot_len):
            sparql_query = sparql_query.replace('<' + slot_keys[index] + '>', '<' + slots[slot_keys[index]][i[index]] + '>')
        
        print('.', end='')
        
        output = make_sparql_query(sparql_query, template['return'])        
        if template['return'] == 'boolean':
            if output == None:
                output = []
            else:
                output = [output]

        if template['return'] == 'count' and int(output[0]) == 0:
            continue
            
        elif len(output) > 0:
            break

    return output, sparql_query

In [50]:
def filter_out_entities(resources, predicates):
    sparql = SPARQLWrapper("http://akswnc9.aksw.uni-leipzig.de/dbpedia/sparql")
    result = []
    
    if len(resources) == 0:
        return []
    
    query = """
    SELECT DISTINCT ?p WHERE {
        VALUES ?r {
            """ + '<' + '> <'.join(resources) + '>' + """
        }

        { ?r ?p ?x }
        UNION {?r ?p2 ?x . ?x ?p ?x2}
        UNION {?x ?p ?r}
        UNION {?x ?p2 ?r . ?x ?p ?x2}
    }
    """
    
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)    
    result_set = sparql.query().convert()
    preds = set()    
    
    for result in result_set['results']['bindings']:
        preds.add(result['p']['value'])
    
    return list(predicates.intersection(preds))

def filter_out_classes(classes):
    results = []
    for _class in list(classes):
        if get_rdfs_label(_class) != None:
            results.append(_class)
    return results    

In [172]:
with open('data/qald/qald.json', 'r') as f:
    qald = json.load(f)

qald[45]

{'actual': '2',
 'predictions': '2,1',
 'question': 'Who is the mayor of Tel Aviv?',
 'sparql_query': 'SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Tel_Aviv> <http://dbpedia.org/ontology/leaderName> ?uri .}'}

In [53]:
# res_df = pd.read_csv('slot_results.csv')
with open('data/qald/qald.json', 'r') as f:
    qald = json.load(f)

answers = {
    "dataset": {
        "id": "LC-QuAD-Custom",
        "name": "LC-QuAD Custom Test Datset"
    },
    "questions": [
        
    ]
}

# answers_to_questions = []
start = 0
sparql = SPARQLWrapper("http://akswnc9.aksw.uni-leipzig.de/dbpedia/sparql")
questions = qald

for index in range(start, len(questions)):
    print("\n" * 3)
    print(index, questions[index], y_true[index])

#     if res_df.loc[index]['Correct'] == False:
#         print('SKIPPED')
#         continue
    
#     q = df[df['_id'] == int(ids[index])]
#     sparql_query = q['sparql_query'].tolist()[0].strip()
    template = int(questions[index]['actual'])
    
    if template == 151:
        answertype = "Boolean"
    elif template < 100:
        answertype = "ListOfResource"
    else:
        answertype = "Number"
    
    detected_entities, entities_list, unique_p = detect_entities(questions[index]['question'], index)
        
#     detected_entities['p'] = filter_out_entities(list(detected_entities['r'].intersection(entities_list['r'])), detected_entities['p'])
#     detected_entities['p'].sort(key = len, reverse=True)
    
#     detected_entities['r'] = list(detected_entities['r'])
#     detected_entities['c'] = filter_out_classes(detected_entities['c'])

    detected_entities['r'] = list(detected_entities['r'].intersection(entities_list['r']))
    detected_entities['p'] = list(detected_entities['p'].intersection(entities_list['p']))
    classes = list(detected_entities['c'].intersection(entities_list['c']))
    
    if len(classes) == 0:
        detected_entities['c'] = filter_out_classes(detected_entities['c'])
    else:
        detected_entities['c'] = classes
        
    output, sparql_query = get_answer(detected_entities, y_pred[index][0])
    
    if len(output) == 0:        
        output, sparql_query = get_answer(detected_entities, y_pred[index][1])
        
    if len(sparql_query) > 0:
        sparql.setQuery(sparql_query)
        sparql.setReturnFormat(JSON)    
        result_set = sparql.query().convert()
    else:
        result_set = []
        
    question = {
        "id": index,
        "metadata": {
            "answertype": answertype
        },
        "question": [{
            "language": "en",
            "string": questions[index]
        }],
        "query": {
            "sparql": sparql_query
        },
        "answers": result_set
    }
    
    answers_to_questions.append(question)





0 {'question': 'List all the musicals with music by Elton John.', 'predictions': '1,8', 'actual': '1', 'sparql_query': 'SELECT DISTINCT ?uri\nWHERE { \n        ?uri <http://dbpedia.org/ontology/musicBy> <http://dbpedia.org/resource/Elton_John> .\n        ?uri <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Musical> .\n\n}'} 1
..



1 {'question': 'How high is the lighthouse in Colombo?', 'predictions': '101,151', 'actual': '2', 'sparql_query': 'SELECT DISTINCT ?num WHERE {  <http://dbpedia.org/resource/Colombo_Lighthouse> <http://dbpedia.org/ontology/height> ?num . } '} 3




2 {'question': 'Who was the wife of U.S. president Lincoln?', 'predictions': '2,1', 'actual': '2', 'sparql_query': 'SELECT DISTINCT ?uri \nWHERE {\n\t<http://dbpedia.org/resource/Abraham_Lincoln? <http://dbpedia.org/ontology/spouse> ?uri.\n}'} 8




3 {'question': 'Who is the host of the BBC Wildlife Specials?', 'predictions': '2,1', 'actual': '2', 'sparql_query': 'SELECT DISTINC

In [227]:
with open('qald_results.json', 'r') as f:
    answers_to_questions = json.load(f)

with open('data/qald/qald.json', 'r') as f:
    qald = json.load(f)
    
start = 80
questions = qald
for index in range(start, start + 1):
    answer = answers_to_questions[index]
    if answer['answers'] != []:
        continue
    
    if answer['metadata']['answertype'] != 'ListOfResource':
        continue
    
#     if answer['question'][0]['string']['actual'] not in answer['question'][0]['string']['predictions'].split(','):
#         continue
    
    print(index, answer['question'][0]['string']['question'])
    
    detected_entities, entities_list, unique_p = detect_entities(questions[index]['question'], index)
    detected_entities['r'] = list(detected_entities['r'].intersection(entities_list['r'])) + ['http://dbpedia.org/resource/The_Pillars_of_the_Earth']
    detected_entities['p'] = list(detected_entities['p'].intersection(entities_list['p']))
    classes = list(entities_list['c'])
    
    if len(classes) == 0:
        detected_entities['c'] = [list(detected_entities['c'])[0]]
    else:
        detected_entities['c'] = classes
    
    template = int(questions[index]['actual'])
    output, sparql_query = get_answer(detected_entities, template)
            
    if len(sparql_query) > 0:
        sparql.setQuery(sparql_query)
        sparql.setReturnFormat(JSON)    
        result_set = sparql.query().convert()
    else:
        result_set = []
    
    answers_to_questions[index]['answers'] = result_set
    answers_to_questions[index]['query']['sparql'] = sparql_query

    print(index)
    print('\n' * 2)
    print(entities_list, '\n' * 3, detected_entities)    
    print(result_set)
    print('\n' * 5)

80 Which movies did Kurosawa direct?
.80



{'r': {'http://dbpedia.org/resource/Akira_Kurosawa'}, 'p': {'http://dbpedia.org/ontology/director'}, 'c': {'http://dbpedia.org/ontology/Film'}} 


 {'r': ['http://dbpedia.org/resource/Akira_Kurosawa', 'http://dbpedia.org/resource/The_Pillars_of_the_Earth'], 'p': ['http://dbpedia.org/ontology/director'], 'c': ['http://dbpedia.org/ontology/Film']}
{'head': {'link': [], 'vars': ['uri']}, 'results': {'distinct': False, 'ordered': True, 'bindings': [{'uri': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Rhapsody_in_August'}}, {'uri': {'type': 'uri', 'value': 'http://dbpedia.org/resource/The_Lower_Depths_(1957_film)'}}, {'uri': {'type': 'uri', 'value': "http://dbpedia.org/resource/The_Men_Who_Tread_on_the_Tiger's_Tail"}}, {'uri': {'type': 'uri', 'value': 'http://dbpedia.org/resource/Horse_(1941_film)'}}, {'uri': {'type': 'uri', 'value': 'http://dbpedia.org/resource/The_Idiot_(1951_film)'}}, {'uri': {'type': 'uri', 'value': 'http://dbpedia.or

In [228]:
# with open('qald_results.json', 'w') as f:
#     json.dump(answers_to_questions, f)

In [230]:
with open('qald_results.json', 'r') as f:
    answers_to_questions = json.load(f)

micro_p = []
micro_tp = []
micro_fp = []
micro_fn = []

def get_uris(answers):
    uris = set()
    if answers != []:
        for answer in answers['results']['bindings']:
            uris.add(answer[answers['head']['vars'][0]]['value'])
    return uris

def safe_div(x, y):
    if y == 0:
        return 0
    return x / y

for index in range(len(answers_to_questions)):
    answertype = answers_to_questions[index]['metadata']['answertype']
    
    answers = answers_to_questions[index]['answers']
    golden_answers = answers_to_questions[index]['golden_answers']
    
    if answertype == 'ListOfResource':
        answers_uris = get_uris(answers)
        golden_answers_uris = get_uris(golden_answers)        
    else:
        answers_uris = set()
        golden_answers_uris = set()
        
        if 'boolean' in answers:
            answers_uris.add(answers['boolean'])
        golden_answers_uris.add(golden_answers['boolean'])
    micro_p.append(len(golden_answers_uris))
    micro_tp.append(len(golden_answers_uris.intersection(answers_uris)))
    micro_fp.append(len(answers_uris - golden_answers_uris))
    micro_fn.append(len(golden_answers_uris - answers_uris))
    
pr = []
r = []
f = []
for index in range(len(answers_to_questions)):
    pr.append(safe_div(micro_tp[index], micro_tp[index] + micro_fp[index]))
    r.append(safe_div(micro_tp[index], micro_tp[index] + micro_fn[index]))
    f.append(safe_div(2 * pr[index] * r[index], pr[index] + r[index]))

In [231]:
micro_tp[80]

32

In [232]:
pd.Series(pr).mean(), pd.Series(r).mean(), pd.Series(f).mean()

(0.4164335664335664, 0.4230769230769231, 0.41723076923076924)

In [233]:
tp = sum(micro_tp)
fp = sum(micro_fp)
fn = sum(micro_fn)

p_macro = safe_div(tp, tp + fp)
r_macro = safe_div(tp, tp + fn)
p_macro, r_macro, safe_div(2 * p_macro * r_macro, p_macro + r_macro)

(0.7575757575757576, 0.4666666666666667, 0.5775577557755777)

In [241]:
p = 0.612
r = 0.466
p, r, 2 * p * r / (p + r)

(0.612, 0.466, 0.529113172541744)

In [207]:
for i in range(len(micro_fn)):
    if micro_fn[i] > 10:
        print(i, micro_fn[i])

36 12
46 13
74 16
80 32
101 34


In [157]:
micro_p

[4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 7,
 1,
 2,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 2,
 1,
 1,
 1,
 4,
 1,
 4,
 1,
 1,
 1,
 1,
 3,
 12,
 10,
 6,
 1,
 1,
 1,
 2,
 1,
 7,
 7,
 13,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 2,
 1,
 1,
 32,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 4,
 1,
 1,
 7,
 3,
 1,
 2,
 1,
 16,
 1,
 1,
 1,
 3,
 1,
 32,
 1,
 1,
 3,
 1,
 10,
 2,
 1,
 1,
 1,
 1,
 9,
 1,
 3,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 34,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 11,
 1,
 1,
 1,
 1,
 3,
 1,
 5,
 1]

In [114]:
with open('qald_results.json', 'r') as f:
    answers_to_questions = json.load(f)
    
for index in range(len(answers_to_questions)):
    answer = answers_to_questions[index]
    
    print(answer['question'][0]['string']['sparql_query'].replace('\n', ' ').replace('\t', ' '))
    sparql.setQuery(answer['question'][0]['string']['sparql_query'].replace('\n', ' ').replace('\t', ' '))
    sparql.setReturnFormat(JSON)    
    result_set = sparql.query().convert()
    answers_to_questions[index]['golden_answers'] = result_set

SELECT DISTINCT ?uri WHERE {          ?uri <http://dbpedia.org/ontology/musicBy> <http://dbpedia.org/resource/Elton_John> .         ?uri <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Musical> .  }
SELECT DISTINCT ?num WHERE {  <http://dbpedia.org/resource/Colombo_Lighthouse> <http://dbpedia.org/ontology/height> ?num . } 
SELECT DISTINCT ?uri  WHERE {  <http://dbpedia.org/resource/Abraham_Lincoln> <http://dbpedia.org/ontology/spouse> ?uri. }
SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/BBC_Wildlife_Specials> <http://dbpedia.org/ontology/presenter> ?uri . }
SELECT DISTINCT ?n WHERE {  <http://dbpedia.org/resource/Pulp_Fiction> <http://dbpedia.org/ontology/budget> ?n . } 
SELECT DISTINCT ?uri WHERE { <http://dbpedia.org/resource/Heineken> <http://dbpedia.org/ontology/manufacturer> ?x . ?x <http://dbpedia.org/ontology/locationCity> ?uri . }
SELECT DISTINCT ?uri WHERE {  <http://dbpedia.org/resource/Chile_Route_68> <http://dbpedia.org/ontology/r

In [102]:
index

68

In [301]:
detected_entities
q = df[df['_id'] == int(ids[0])]
print(q['sparql_query'].tolist()[0].strip())
print(answers_to_questions[0]['query']['sparql'])
print(index)

SELECT DISTINCT ?uri WHERE {?uri <http://dbpedia.org/property/governingBody> <http://dbpedia.org/resource/Philadelphia_City_Council> }
SELECT DISTINCT ?uri WHERE { ?uri <http://dbpedia.org/property/governingBody> <http://dbpedia.org/resource/Philadelphia_City_Council> . OPTIONAL { ?uri rdf:type <http://dbpedia.org/ontology/Company> } }
963


In [421]:
with open('answers_model.json', 'r') as f:
    answers_model = json.load(f)

with open('golden_answers.json', 'r') as f:
    golden_answers = json.load(f)['questions']

In [424]:
no_records = []    
for index in range(len(data)):
    if res_df.loc[index, 'Correct'] == True:
        continue
    
    if 'boolean' in golden_answers[index]['answers']:
        pass
    elif golden_answers[index]['answers']['head']['vars'][0] in ['uri', 'count']:
        var = golden_answers[index]['answers']['head']['vars'][0]
        
        g_answers = set()
        m_answers = set()
        
        for i in golden_answers[index]['answers']['results']['bindings']:
            g_answers.add(i[var]['value'])
        
        try:
            for i in answers_model[index]['answers']['results']['bindings']:
                m_answers.add(i[var]['value'])
                
            if g_answers - m_answers != set():     
                print('\n\n', index)
                print(len(golden_answers[index]['answers']['results']['bindings']))
                print(golden_answers[index]['query']['sparql'])

                print(len(answers_model[index]['answers']['results']['bindings']))
                print(answers_model[index]['query']['sparql'])
                print('\n' * 5)
        except:
            print(index, 'NO RECORDS IT SEEMS', answers_model[index]['answers'])
            
            detected_entities, entities_list, unique_p = detect_entities(questions[index], index)
            
            detected_entities['r'] = list(detected_entities['r'].intersection(entities_list['r']))
            detected_entities['p'] = list(detected_entities['p'].intersection(entities_list['p']))
            detected_entities['c'] = list(detected_entities['c'].intersection(entities_list['c']))
            
            print(detected_entities)
            
            print(entities_list)
            continue
            
            output, sparql_query = get_answer(detected_entities, y_true[index])
            
            if len(sparql_query) > 0:
                sparql.setQuery(sparql_query)
                sparql.setReturnFormat(JSON)    
                result_set = sparql.query().convert()
                                
                print('found the right answer')
                answers_model[index]['query']['sparql'] = sparql_query
                answers_model[index]['answers'] = result_set     
            else:
                print(index, 'STILL NOT WORKING')
            no_records.append(index)
            break

#     print(golden_answers[index]['answers'])
#     print(golden_answers[index]['query']['sparql'])

#     print('\n\n')
    
#     print(answers_model[index]['answers'])
#     print(answers_model[index]['query']['sparql'])    

2 NO RECORDS IT SEEMS []
{'r': [], 'p': ['http://dbpedia.org/ontology/portrayer'], 'c': ['http://dbpedia.org/ontology/FictionalCharacter']}
{'r': {'http://dbpedia.org/resource/Terminator_(franchise)', 'http://dbpedia.org/resource/Terminator_2:_Judgment_Day'}, 'p': {'http://dbpedia.org/ontology/portrayer', 'http://dbpedia.org/ontology/series'}, 'c': {'http://dbpedia.org/ontology/FictionalCharacter'}}
3 NO RECORDS IT SEEMS []
{'r': ['http://dbpedia.org/resource/Mariveles,_Bataan'], 'p': ['http://dbpedia.org/ontology/sport'], 'c': []}
{'r': {'http://dbpedia.org/resource/Mariveles,_Bataan'}, 'p': {'http://dbpedia.org/ontology/sport', 'http://dbpedia.org/property/city'}, 'c': set()}
6 NO RECORDS IT SEEMS []
{'r': ['http://dbpedia.org/resource/Ganz_UV'], 'p': ['http://dbpedia.org/ontology/nationality', 'http://dbpedia.org/ontology/operator'], 'c': []}
{'r': {'http://dbpedia.org/resource/Géza_Horváth', 'http://dbpedia.org/resource/Ganz_UV'}, 'p': {'http://dbpedia.org/ontology/nationality', 'h

KeyboardInterrupt: 

In [408]:
for index in range(len(data)):
    if res_df.loc[index, 'Correct'] == False:
        continue
    
    if 'boolean' in golden_answers[index]['answers']:
        pass
    elif golden_answers[index]['answers']['head']['vars'][0] in ['uri', 'count']:
        var = golden_answers[index]['answers']['head']['vars'][0]
        
        g_answers = set()
        m_answers = set()
        
        for i in golden_answers[index]['answers']['results']['bindings']:
            g_answers.add(i[var]['value'])
        
        try:
            for i in answers_model[index]['answers']['results']['bindings']:
                m_answers.add(i[var]['value'])
                
            if g_answers - m_answers != set():     
                print('\n\n', index)
                print(len(golden_answers[index]['answers']['results']['bindings']))
                print(golden_answers[index]['query']['sparql'])

                print(len(answers_model[index]['answers']['results']['bindings']))
                print(answers_model[index]['query']['sparql'])
                print('\n' * 5)
        except:
            print(index, 'NO RECORDS IT SEEMS', answers_model[index]['answers'])
            no_records.append(index)


31 NO RECORDS IT SEEMS []
34 NO RECORDS IT SEEMS []


 45
1
SELECT DISTINCT (COUNT(?uri) as ?count) WHERE { ?x <http://dbpedia.org/ontology/kingdom> <http://dbpedia.org/resource/Animal> . ?x <http://dbpedia.org/ontology/species> ?uri  . ?uri <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Species>}
1
SELECT DISTINCT (COUNT(?uri) as ?count) WHERE { ?x <http://dbpedia.org/ontology/kingdom> <http://dbpedia.org/resource/Animal> . ?x <http://dbpedia.org/ontology/species> ?uri . OPTIONAL { ?uri rdf:type <http://dbpedia.org/ontology/Species> } }








 61
5
SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/children> <http://dbpedia.org/resource/Levi_Lincoln,_Jr.> . ?x <http://dbpedia.org/property/successor> ?uri  . ?x <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Governor>}
2
SELECT DISTINCT ?uri WHERE { ?x <http://dbpedia.org/property/successor> <http://dbpedia.org/resource/Levi_Lincoln,_Jr.> . ?x <http://dbpedia.or

In [409]:
index = 222
print(golden_answers[index]['query']['sparql'])
print('\n' * 3)
print(answers_model[index]['query']['sparql'])
print(y_true[index], y_pred[index][0], y_pred[index][1])

SELECT DISTINCT (COUNT(?uri) as ?count) WHERE {?uri <http://dbpedia.org/property/training> <http://dbpedia.org/resource/San_Francisco_Art_Institute>  . ?uri <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Artist>}





101 105 106


In [412]:
detected_entities, entities_list, unique_p = detect_entities(questions[222], index)

In [414]:
detected_entities, entities_list

({'c': {'http://dbpedia.org/ontology/Artist',
   'http://dbpedia.org/ontology/Institution',
   'http://dbpedia.org/ontology/Institutions',
   'http://dbpedia.org/ontology/Number',
   'http://dbpedia.org/ontology/Occupation',
   'http://dbpedia.org/ontology/Place',
   'http://dbpedia.org/ontology/Relative',
   'http://dbpedia.org/ontology/Training',
   'http://dbpedia.org/ontology/Work'},
  'p': {'http://dbpedia.org/ontology/artist',
   'http://dbpedia.org/ontology/institution',
   'http://dbpedia.org/ontology/number',
   'http://dbpedia.org/ontology/occupation',
   'http://dbpedia.org/ontology/relative',
   'http://dbpedia.org/ontology/training',
   'http://dbpedia.org/property/artist',
   'http://dbpedia.org/property/artists',
   'http://dbpedia.org/property/institution',
   'http://dbpedia.org/property/institutions',
   'http://dbpedia.org/property/institutionss',
   'http://dbpedia.org/property/number',
   'http://dbpedia.org/property/numbers',
   'http://dbpedia.org/property/occupa

In [392]:
sparql = SPARQLWrapper("http://akswnc9.aksw.uni-leipzig.de/dbpedia/sparql")
index = 73
# for index in range(len(data)):
#     if answers_model[index]['query']['sparql'].startswith('SELECT ?uri'):
#         answers_model[index]['query']['sparql'] = answers_model[index]['query']['sparql'].replace('SELECT ?uri', 'SELECT DISTINCT ?uri')
# print(answers_model[index]['query']['sparql'])

sparql.setQuery(answers_model[index]['query']['sparql'])
sparql.setReturnFormat(JSON)    
result_set = sparql.query().convert()

answers_model[index]['answers'] = result_set
print('\n' * 5)







