In [61]:
import spacy
import json


In [79]:
with open("../data/predicates_clean.json") as f:
    preds = json.load(f)

In [93]:
similarity = fuzz.partial_ratio("MPA film rating", "What is the MPA film rating of Avatar?")
print(similarity)

64


In [118]:
def get_relation(question) -> list:
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(question)
        relations = None
        for token in doc:
            print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                    token.shape_, token.is_alpha, token.is_stop)
        try:
            similarities = {}
            for v in preds.values():
                similarity = fuzz.partial_ratio(v, question)
                if similarity > 65:
                    similarities[v] = similarity

            similarities = {k: v for k, v in sorted(similarities.items(), key=lambda item: item[1])}  
            relations = list(similarities.keys())[-1]         
            assert relations, "No exact match found"
        
        except :
            try:
                relations = [
                    tok.lemma_
                    for tok in doc
                    if tok.dep_ in ("attr", "nsubj") and tok.pos_ in ("PROPN", "NOUN")
                ]
                assert relations, "No relation found ..."
                relations = " ".join(relations)  

            except AssertionError:
                print(2)
                relations = [
                    tok.lemma_
                    for tok in doc
                    if tok.pos_ in ("VERB")
                ]
        
        finally:
            return relations    
            
        


In [133]:
print(get_relation("Who directed Titanic?"))

Who who PRON WP nsubj Xxx True True
directed direct VERB VBD ROOT xxxx True False
Titanic Titanic PROPN NNP dobj Xxxxx True False
? ? PUNCT . punct ? False False
director


In [12]:
import spacy
import sys
sys.path.append("../models/")
import NER_CRF


class NLP_Operations:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")

    def get_ner(self, question):
        ner = NER_CRF.get_ner(question)
        return ner

    def get_relation(self, question):
        doc = self.nlp(question)
        relations = [
            tok.lemma_
            for tok in doc
            if tok.dep_ in ("attr", "nsubj") and tok.pos_ in ("NOUN")
        ]
        return relations


In [13]:
import json


def get_dicti(path):
    with open(path, "r") as f:
        dicti = json.load(f)
    return dicti


In [27]:
import rdflib


class GraphOperations:
    def __init__(self, graph_file="data/14_graph.nt"):
        self.graph = rdflib.Graph()
        # self.load_graph(graph_file)

    def load_graph(self, graph_file):
        print("loading graph ...")
        self.graph.parse(graph_file, format="turtle")
        print("loaded graph successfully!")

    def query(self, message):
        # remember to delete these 2 lines after this boring evaluation
        message = message.replace('"""', "").replace("'''", "")
        print("message in sparql")
        message = str(message)
        print(message)
        temp = [str(s) for s, in self.graph.query(message)]
        print(temp)
        return temp

    def query2(self, message):
        return calculate_answer(message)


In [49]:
# sys.path.append("../usecases/")
# import utils
# import nlp_operations
# import graph_operations
import copy
import editdistance


class AnswerCalculator:
    def __init__(self):
        self.nodes = get_dicti("../data/nodes.json")
        self.predicates = get_dicti("../data/predicates_clean.json")
        self.nlp_operator = NLP_Operations()
        self.graph_operator = GraphOperations()
        self.wh_words = [
            "What",
            "what",
            "when",
            "When",
            "where",
            "Where",
            "Who",
            "who",
            "Whom",
            "whom",
            "Which",
            "which",
            "Whose",
            "whose",
            "Why",
            "why",
            "How",
            "how",
        ]
        self.useless_words = [
            "am",
            "is",
            "are",
            "was",
            "were",
            "a",
            "an",
            "the",
            "that",
            "this",
            "these",
            "those",
            "above",
            "across",
            "against",
            "along",
            "among",
            "around",
            "at",
            "before",
            "behind",
            "below",
            "beneath",
            "beside",
            "between",
            "by",
            "down",
            "from",
            "in",
            "into",
            "near",
            "off",
            "on",
            "to",
            "woward",
            "under",
            "upon",
            "with",
            "and",
            "within",
            "of",
            "for",
            "since",
            "during",
            "over",
        ]
        self.all_delete_words = self.wh_words + self.useless_words

    def calculate_answer(self, question):
        question_list = question.split(" ")
        tag_list = self.nlp_operator.get_ner(
            question
        )  # returns e.g. [['O', 'O', 'O', 'O', 'O']]

        print("tag_list: ", tag_list)
        wh_word = question_list[0].upper()
        if wh_word == "WHEN":
            return self.calculate_when_answer(
                copy.deepcopy(question), tag_list[0]
            )  # this is supposed to be tag_list, right?
        else:
            return self.calculate_other_answer(copy.deepcopy(question), tag_list[0])

    # this is also not a question_list but a question
    def calculate_other_answer(self, question, tag_list):
        print("calculate other answer")
        question_list = question.split(" ")
        print(tag_list)
        try:
            # find entity
            indexes = [index for index, val in enumerate(tag_list) if val != "O"]
            print(indexes)
            entity = (
                " ".join(question_list[indexes[0] : indexes[-1] + 1])
                .rstrip("?")
            )
            print(entity)

            relations = self.nlp_operator.get_relation(question)
            print(relations)
            assert len(relations) == 1
            relations = relations[0]

            possible_answer = self.search_answer(entity, relations, 0)

        except:
            temp = copy.deepcopy(question_list)
            possible_answer = self.forcely_search(temp, 0)

        finally:
            return possible_answer

    # you are not passing the question list, but question?
    def calculate_when_answer(self, question_list, tag_list):
        try:
            # find entity
            indexes = [index for index, val in enumerate(tag_list) if val != "O"]
            entity = (
                " ".join(question_list[indexes[0] : indexes[-1] + 1])
                .rstrip("?")
                .rstrip('"')
                .rstrip("'")
            )

            entity_list = " ".join(question_list[indexes[0] : indexes[-1] + 1]).split(
                " "
            )

            # delete entity word
            temp = copy.deepcopy(question_list)
            for word in entity_list:
                temp.remove(word)

            for word in self.all_delete_words:
                try:
                    temp.remove(word)
                except:
                    pass

            assert len(temp) == 1
            relations = temp[0] + " time"

            possible_answer = []
            possible_answer = self.search_answer(entity, relations, 1)

        except:
            temp = copy.deepcopy(question_list)
            possible_answer = self.forcely_search(temp, 1)

        return possible_answer

    def search_answer(self, entity_word, related_word, is_when):
        search_loop = 0
        search_flag = 0
        edit_distance = 1

        node_distance_dict = self.calculate_node_distance(entity_word)
        pred_distance_dict = self.calculate_pred_distance(related_word)

        searched_answers = []
        for n_key in node_distance_dict.keys():
            search_loop += 1

            if node_distance_dict[n_key] == 0:
                edit_distance = 0
            else:
                edit_distance = 1

            for p_key in pred_distance_dict.keys():                
                if is_when:
                    query = f'''
                        PREFIX ddis: <http://ddis.ch/atai/>
                        PREFIX wd: <http://www.wikidata.org/entity/>
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                        PREFIX schema: <http://schema.org/>
                        SELECT ?date WHERE{{    
                            wd:{n_key} wdt:{p_key} ?date.
                        }}
                        LIMIT 1
                        '''
                else:
                    query = f'''
                        PREFIX ddis: <http://ddis.ch/atai/>
                        PREFIX wd: <http://www.wikidata.org/entity/>
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                        PREFIX schema: <http://schema.org/>
                        SELECT ?entity_name WHERE{{    
                            wd:{n_key} wdt:{p_key} ?temp.
                            ?temp rdfs:label ?entity_name.
                        }}
                        LIMIT 1
                    '''

                print(query)

                answers = graph_operator.query(query)
                print(answers)
                if len(answers) > 0:
                    search_flag = 1
                if search_flag == 1:
                    searched_answers.append(answers)
                    break
            if search_flag == 1 and edit_distance == 1:
                break
            if search_loop > 10:
                answers = []
                break

        return searched_answers

    def search_answer_for_all_O(self, entity_word, related_word, is_when):
        node_distance_dict = self.calculate_node_distance(entity_word)
        pred_distance_dict = self.calculate_pred_distance(related_word)

        search_flag = 0

        # entity distance seems reliable, try 5 times
        try_times = 0
        for n_key in node_distance_dict.keys():
            try_times += 1
            for p_key in pred_distance_dict.keys():
                if is_when:
                    query = f'''
                        PREFIX ddis: <http://ddis.ch/atai/>
                        PREFIX wd: <http://www.wikidata.org/entity/>
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                        PREFIX schema: <http://schema.org/>
                        SELECT ?date WHERE{{    
                            wd:{n_key} wdt:{p_key} ?date.
                        }}
                        LIMIT 1
                        '''
                else:
                    query = f'''
                        PREFIX ddis: <http://ddis.ch/atai/>
                        PREFIX wd: <http://www.wikidata.org/entity/>
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                        PREFIX schema: <http://schema.org/>
                        SELECT ?entity_name WHERE{{    
                            wd:{n_key} wdt:{p_key} ?temp.
                            ?temp rdfs:label ?entity_name.
                        }}
                        LIMIT 1
                    '''

                answers = [str(s) for s, in g.query(query)]
                if len(answers) > 0 and len(answers[0]) == 10:
                    search_flag = 1
                if search_flag == 1:
                    break
            if search_flag == 1:
                break
            if try_times > 10:
                break
        return answers

    def calculate_node_distance(self, word):
        distance_dict = {}
        print("entity matching for {}".format(word))
        for key, value in self.nodes.items():
            distance_dict[key.split("/")[-1]] = editdistance.eval(value, word)
        distance_dict = dict(sorted(distance_dict.items(), key=lambda x: x[1]))
        return distance_dict

    def calculate_pred_distance(self, related_word):
        pred_distance_dict = {}
        print("relation matching for {}".format(related_word))
        for key, value in self.predicates.items():
            pred_distance_dict[key.split("/")[-1]] = editdistance.eval(
                value, related_word
            )
        pred_distance_dict = dict(
            sorted(pred_distance_dict.items(), key=lambda x: x[1])
        )

        # don't comment this block
        try:
            del pred_distance_dict["rdf-schema#label"]
        except:
            pass

        return pred_distance_dict

    def forcely_search(self, question_list, is_when):
        for word in self.all_delete_words:
            try:
                question_list.remove(word)
            except:
                pass
        possible_word = copy.deepcopy(question_list)
        for i in range(len(possible_word)):
            possible_word[i] = (
                possible_word[i].replace("?", "").replace('"', "").replace("'", "")
            )

        possible_relation_word_first = possible_word[0]
        possible_entity_word_first = " ".join(possible_word[1:])
        possible_relation_word_last = possible_word[-1]
        possible_entity_word_last = " ".join(possible_word[0:-1])

        if is_when == 1:
            possible_relation_word_first += " time"
            possible_relation_word_last += +" time"

        possible_answer_list1 = self.search_answer_for_all_O(
            possible_entity_word_first, possible_relation_word_first
        )
        possible_answer_list2 = self.search_answer_for_all_O(
            possible_entity_word_last, possible_relation_word_last
        )

        possible_answer = []
        if len(possible_answer_list1) != 0:
            possible_answer = possible_answer_list1
        if len(possible_answer_list2) != 0:
            possible_answer = possible_answer_list2

        return possible_answer


In [21]:
graph_operator = GraphOperations()


loading graph ...
loaded graph successfully!


In [50]:
calculator = AnswerCalculator()
answer = calculator.calculate_other_answer("Who is the director of Star Wars: Episode VI - Return of the Jedi?", [['O', 'O', 'O', 'O', 'O', 'B-org', 'B-per', 'I-per', 'I-per', 'I-per', 'I-per', 'O', 'O', 'B-org']])

answer

calculate other answer
[['O', 'O', 'O', 'O', 'O', 'B-org', 'B-per', 'I-per', 'I-per', 'I-per', 'I-per', 'O', 'O', 'B-org']]
[0]
Who
['director']
entity matching for Who
relation matching for director

                        PREFIX ddis: <http://ddis.ch/atai/>
                        PREFIX wd: <http://www.wikidata.org/entity/>
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                        PREFIX schema: <http://schema.org/>
                        SELECT ?entity_name WHERE{    
                            wd:Q58880906 wdt:P57 ?temp.
                            ?temp rdfs:label ?entity_name.
                        }
                        LIMIT 1
                    
message in sparql

                        PREFIX ddis: <http://ddis.ch/atai/>
                        PREFIX wd: <http://www.wikidata.org/entity/>
                        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
                        PREFIX schema: <http://schema.org/>
   

[['Dr. Seuss'], ['Daisy von Scherler Mayer']]

In [3]:
import random

In [21]:
def answers_in_template(s_a_e, s_a_r, s_a, n_k_l, p_k_l):
    # first_templates = [
    #     "In my opinion, ",
    #     "As far as I'm concerned, ",
    #     "In my point of view, ",
    #     "Personally speaking, ",
    # ]
    
    first_templates = "Here is some information I found: "
    
    #     R:@@@
    #     E:>>>
    #     A:<<<
    middle_templates = ["the @@@ of >>> is <<<", ">>>'s @@@ is <<<"]
    # concatenate_words = ["Also, ", "What's more, ", "In addition, "]
    concatenate_words = ", "

    # number1 = random.randint(0, 3)
    # number2 = random.randint(0, 1)
    
    number2 =random.randint(0, 1)
    answer_sentence = first_templates + middle_templates[number2].replace(
        "<<<", s_a[0][0]
    ).replace(
        ">>>", s_a_e[0][0]
    ).replace(
        "@@@", s_a_r[0][0]
    )

    # if len(s_a_e[0]) == 1: 
    #     answer_sentence += ". "
    
    for i in range(len(s_a_e) - 2):
        number2 = 0
        number3 = 0
        t2 = (
            middle_templates[number2]
            .replace("<<<", s_a[i + 1][0])
            .replace(">>>", s_a_e[i + 1][0])
            .replace("@@@", s_a_r[i + 1][0])
        )
        answer_sentence = answer_sentence + concatenate_words + t2
        answer_sentence += " "

    if len(s_a_e) -1 > 0:
        answer_sentence += " and " + (
                middle_templates[number2]
                .replace("<<<", s_a[len(s_a_e)-1][0])
                .replace(">>>", s_a_e[len(s_a_e)-1][0])
                .replace("@@@", s_a_r[len(s_a_e)-1][0])
            )
    
    swear_words = ". If you don't believe me, you can check it yourself. I will show you labels of entities and relations. "
    entities_words = str(n_k_l) + " "
    relations_words = str(p_k_l)
    answer_sentence = answer_sentence + swear_words + entities_words + relations_words
    return answer_sentence

In [22]:
s_a_e, s_a_r, s_a, n_k_l, p_k_l = [["1"]],[["1", "2", "3", "4", "5"]],[["1", "2", "3", "4", "5"]], "4", "5"
answers_in_template(s_a_e, s_a_r, s_a, n_k_l, p_k_l)

"Here is some information I found: the 1 of 1 is 1. If you don't believe me, you can check it yourself. I will show you labels of entities and relations. 4 5"