In [3]:
# "What ..." and "Who ..." questions
q1 = "What are the genres of Inception?"
q2 = "What is the title of Pulp Fiction?"
q3 = "What are the main subjects of Saving Private Ryan?"
q4 = "What are the given names of Leonardo DiCaprio?"
q5 = "What is the duration of Black Mirror?"
q6 = "What are the names of the Coen Brothers?"
q7 = "What is the country of origin of Black Mirror?"
q8 = "Who are the directors of Schindler's List?"
q9 = "What is the box office of Interstellar?"
q10 = "What is the native language of Gollum?"

# Further question shapes: when / where / which / how long / how many
q11 = "When did Alan Rickman die?"
q12 = "When was Pulp Fiction published?"
q13 = "Where was Morgan Freeman born?"
q14 = "Where does Home Alone originate?"
q15 = "Which movies are directed by Alice Wu?"
q16 = "How long is Pulp Fiction?"
q17 = "How many episodes does Twin Peaks have?"
q18 = "How long is Interstellar?"
q19 = "Which character was married by Aragorn"
q20 = "Which character did Aragorn marry?"

In [5]:
import requests
from itertools import product
import spacy
from spacy import displacy
from spacy.matcher import Matcher

# Wikidata endpoints: `url` receives the SPARQL queries (see sparqlRequest),
# `url2` is the api.php endpoint used for the wbsearchentities lookup (see findIDs).
url = 'https://query.wikidata.org/sparql'
url2 = 'https://www.wikidata.org/w/api.php'

# Hand-written verb -> noun mapping. WordNet could be used instead of
# hard-coding these values, but it was unclear whether it was allowed
# for this assignment.
# NOTE(review): no function visible in this section reads this dict —
# confirm whether it is still needed.
nominalization = {
    "die" : "death",
    "pass away" : "death",
    "bear" : "birth",
    "direct" : "director"
}

# Load the spaCy model used for analysing English text
nlp = spacy.load("en_core_web_sm")

"""Returns the subtree decending from a word"""
def phrase(word):
    """Collect the tokens of the subtree rooted at ``word`` into a new list.

    A falsy ``word`` (for example the ``False`` returned by a failed
    findDep lookup) yields an empty list.
    """
    if not word:
        return []
    return list(word.subtree)

"""Returns direct children and self if needed"""
def children(q, head, includeSelf):
    """List the tokens of ``q`` whose head is ``head``, in sentence order.

    ``head`` itself is part of the result only when ``includeSelf`` is truthy.
    """
    return [
        token
        for token in q
        if (token != head and token.head == head) or (includeSelf and token == head)
    ]

"""Tokenize the question string"""
def tokenize(question):
    """Run the loaded spaCy pipeline ``nlp`` over the question text and return its result."""
    parsed = nlp(question)
    return parsed

def stringify(tokens):
    """Turn a token sequence into a search string.

    Determiners and punctuation are dropped, one trailing adposition is
    removed, nouns contribute their lemma and every other token its text.
    Pieces are separated by a single space, except that a token starting
    with an apostrophe is glued to the text before it.
    """
    kept = strip(tokens, ['DET', 'PUNCT'], [])
    if kept and kept[-1].pos_ == 'ADP':
        kept.pop()

    pieces = []
    started = False  # True once some non-empty text has been emitted
    for tok in kept:
        if started and tok.text[0] != "'":
            pieces.append(" ")
        piece = tok.lemma_ if tok.pos_ == "NOUN" else tok.text
        pieces.append(piece)
        started = started or piece != ""
    return "".join(pieces)

"""Find list of IDs corresponding to a string
@prop True if should be a prop"""
def findIDs(entity, prop):
    """Search Wikidata (wbsearchentities) and return the IDs of the hits.

    @entity  search string; empty or whitespace-only input is reported as a
             failure without contacting the API
    @prop    truthy to restrict the search to properties
    Returns a list of ID strings in the order the API returned them. On any
    failure (network error, non-JSON reply, reply without a 'search' list)
    a message is printed and whatever was collected so far is returned.
    """
    failure = "Failed to retrieve the entity or property."
    res = []

    search = entity.strip() if isinstance(entity, str) else ""
    if not search:
        # Nothing to search for: report the failure without a round trip.
        print(failure)
        return res

    params = {'action': 'wbsearchentities',
              'language': 'en',
              'format': 'json',
              'search': search}
    if prop:
        params['type'] = 'property'

    try:
        # The timeout keeps a stalled connection from hanging the notebook.
        payload = requests.get(url2, params=params, timeout=30).json()
        for result in payload['search']:
            res.append(result['id'])
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # RequestException: transport problems; ValueError: body is not JSON;
        # KeyError/TypeError: reply does not have the expected shape.
        print(failure)
    return res

def passiveChecker(q):
    """Return True when ``q`` contains a token with the 'nsubjpass' dependency."""
    return bool(findDep(q, ['nsubjpass']))

"""Returns the first token corresponding to the dependency. Else false"""
def findDep(q, dep):
    """Return the first token of ``q`` whose ``dep_`` is one of ``dep``.

    @dep  a collection of dependency labels, or a single label as a string
    Returns False when no token matches.
    """
    # With a bare string, `in` would be a substring test (e.g. 'n' in 'nsubj'),
    # so a single label is wrapped to force an exact comparison.
    labels = (dep,) if isinstance(dep, str) else dep
    for word in q:
        if word.dep_ in labels:
            return word
    return False

"""Returns the first token corresponding to the pos. Else false"""
def findPos(q, pos):
    """Return the first token of ``q`` whose ``pos_`` is one of ``pos``.

    @pos  a collection of part-of-speech tags, or a single tag as a string
    Returns False when no token matches.
    """
    # With a bare string, `in` would be a substring test (e.g. 'X' in 'AUX'),
    # so a single tag is wrapped to force an exact comparison.
    tags = (pos,) if isinstance(pos, str) else pos
    for word in q:
        if word.pos_ in tags:
            return word
    return False

"""Removes tokens of specified POS's or exclusionSet"""
def strip(tokens, pos, exclusionSet):
    """Return a new list with the tokens that survive both filters.

    A token is dropped when its ``pos_`` is listed in ``pos`` or when the
    token itself is contained in ``exclusionSet``.
    """
    return [
        token
        for token in tokens
        if token.pos_ not in pos and token not in exclusionSet
    ]

"""Returns head of entity"""
def entities(tokens, q):
    """Return the list of tokens taken to name the entity of the question.

    Three strategies are tried in order, stopping at the first non-empty one:
    1. every token of ``tokens`` from the first title-cased, non-adverb
       token onwards;
    2. all tokens of the spans in ``q.ents``;
    3. the subtree of the first 'prep' or 'agent' token in ``tokens``.
    """
    found = []
    collecting = False
    for tok in tokens:
        if not collecting and tok.text.istitle() and tok.pos_ != 'ADV':
            collecting = True
        if collecting:
            found.append(tok)

    if not found:
        for span in q.ents:
            found.extend(span)

    if not found:
        found = phrase(findDep(tokens, ['prep', 'agent']))
    return found

"""Retrieve answer to sparql query"""
def sparqlRequest(query, max_retries=5, backoff=1.0):
    """Run a SPARQL query and print every ``itemLabel`` value it returns.

    @query        SPARQL query text sent to the endpoint in ``url``
    @max_retries  how many times a non-200 response is retried before giving up
    @backoff      base delay in seconds; the wait grows linearly per retry
    Returns True when at least one result row came back, False when the
    result set is empty or the endpoint never answered with status 200.
    Exceptions raised by ``requests`` or by JSON decoding propagate.
    """
    import time  # function-scope import: only needed for the retry delay

    params = {'query': query, 'format': 'json'}
    data = requests.get(url, params=params, timeout=60)
    attempts = 0
    while data.status_code != 200:
        # Client errors other than 429 are not fixed by resending the same
        # request, so stop instead of looping on them.
        if 400 <= data.status_code < 500 and data.status_code != 429:
            return False
        if attempts >= max_retries:
            return False
        attempts += 1
        # Wait a little longer each time rather than hammering the endpoint.
        time.sleep(backoff * attempts)
        data = requests.get(url, params=params, timeout=60)

    bindings = data.json()['results']['bindings']
    if not bindings:
        return False
    for item in bindings:
        if 'itemLabel' in item:
            print(item['itemLabel']['value'])
    return True


def simpleRequest(entity, prop, extra):
    """Print the values of property ``prop`` for entity ``entity``.

    Both strings are resolved to candidate IDs with findIDs; every
    (entity ID, property ID) pair is queried in turn, entity IDs in the
    outer loop, until sparqlRequest reports a result. ``extra`` is inserted
    verbatim into the WHERE clause. Prints "No answer found." when no
    pair produced a result.
    """
    entityIDs = findIDs(entity, False)
    propIDs = findIDs(prop, True)
    for entityID in entityIDs:
        for propID in propIDs:
            query = (
                "SELECT ?item ?itemLabel WHERE { wd:" + entityID
                + " wdt:" + propID + " ?item ." + extra
                + "\n        SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". }}"
            )
            if sparqlRequest(query):
                return
    print("No answer found.")

def passiveRequest(entity, prop):
    """Print the items whose property ``prop`` has entity ``entity`` as value.

    Both strings are resolved to candidate IDs with findIDs; every
    (entity ID, property ID) pair is queried in turn until sparqlRequest
    reports a result. Prints "No answer found." when no pair produced a
    result, matching what simpleRequest does.
    """
    possibleEntityIDs = findIDs(entity, False)
    possiblePropIDs = findIDs(prop, True)
    for entityID, propID in product(possibleEntityIDs, possiblePropIDs):
        # The entity is the object of the triple here: ?item <prop> <entity>.
        request = '''SELECT ?item ?itemLabel WHERE {
                      ?item wdt:'''+propID+''' wd:'''+entityID+''' .
                      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
                  }'''
        if sparqlRequest(request):
            return
    # Without this the caller gets no feedback at all when nothing matched.
    print("No answer found.")

"""The following functions query particular question types"""
def simpleQ(q):
    """Split a parsed question into entity tokens and property tokens.

    The subtree of the subject is used as the relevant part of the sentence;
    when the subject is a pronoun, the child of the root with the largest
    subtree is used instead. If that yields no entity, the question minus
    its first token is analysed, using the 'attr' subtree for the property
    when there is one.

    Returns (entity, prop): two lists of tokens.
    """
    entity = []
    # head of the relevant part of the sentence
    # (labels are passed as lists so they are compared exactly)
    relPart = findDep(q, ['nsubj'])
    if relPart:
        if relPart.pos_ == 'PRON':
            # alternative: find the root's child with the largest subtree
            root = findDep(q, ['ROOT'])
            candidates = children(q, root, False)
            if candidates:
                # max() keeps the first of several equally large subtrees
                relPart = max(candidates, key=lambda child: len(phrase(child)))
        entity = entities(phrase(relPart), q)
        prop = strip(phrase(relPart), [], entity)
    if not relPart or not entity:
        rest = q[1:]
        entity = entities(rest, q)
        attr = findDep(rest, ['attr'])
        if attr:
            prop = strip(phrase(attr), [], entity)
        else:
            prop = strip(rest, [], entity)
    return entity, prop

def whatOrWho(q):
    """Answer a "What ..." / "Who ..." question.

    The entity and property token lists from simpleQ are turned into
    search strings and handed to simpleRequest without an extra filter.
    """
    entityTokens, propTokens = simpleQ(q)
    entityText = stringify(entityTokens)
    propText = stringify(propTokens)
    return simpleRequest(entityText, propText, "")

def whenOrWhere(q):
    """Answer a "When ..." / "Where ..." question.

    The text of the first verb is used as the property search string and
    the subject's subtree supplies the entity. For "When" the results are
    restricted to xsd:dateTime values. Prints "No answer found." when the
    parse contains no verb or no subject.
    """
    verb = findPos(q, ["VERB"])
    # head of the relevant part of the sentence
    relPart = findDep(q, ['nsubj'])
    if not verb or not relPart:
        # Without a verb or a subject there is nothing to build a query from.
        print("No answer found.")
        return

    if (q[0].text == 'When'):
        extra = "FILTER ( datatype(?item) = xsd:dateTime)"
    else:
        extra = ""
    prop = verb.text

    if relPart.pos_ == 'PRON':
        # alternative: find the root's child with the largest subtree
        root = findDep(q, ['ROOT'])
        candidates = children(q, root, False)
        if candidates:
            # max() keeps the first of several equally large subtrees
            relPart = max(candidates, key=lambda child: len(phrase(child)))
    entity = entities(phrase(relPart), q)
    simpleRequest(stringify(entity), prop, extra)

def passive(q):
    """Answer a passive-voice question.

    The entity is taken from the subtree of the first 'pobj' token and the
    lemma of the first verb is used as the property search string; both go
    to passiveRequest. Prints "No answer found." when the parse contains
    no verb.
    """
    verb = findPos(q, ['VERB'])
    if not verb:
        # This function is also the fallback for unsupported question types,
        # so a parse without a verb must not crash on `verb.lemma_`.
        print("No answer found.")
        return
    pobj = phrase(findDep(q, ['pobj']))
    entity = entities(pobj, q)
    prop = verb.lemma_
    passiveRequest(stringify(entity), prop)

def howMany(q):
    """Answer a "How many ..." question.

    The property search string is "number of " followed by the text of the
    direct object; the entity comes from simpleQ. Prints "No answer found."
    when the parse contains no direct object.
    """
    dobj = findDep(q, ['dobj'])
    if not dobj:
        # No direct object means there is nothing to count.
        print("No answer found.")
        return
    addition = "number of "
    entity, _ = simpleQ(q)
    simpleRequest(stringify(entity), addition+dobj.text, "")
    
def howLong(q):
    """Answer a "How long ..." question by requesting the "duration" property
    of the entity that simpleQ extracts from the question."""
    entityTokens, _ = simpleQ(q)
    simpleRequest(stringify(entityTokens), "duration", "")

"""Select question type"""
def questionType(q):
    """Dispatch a parsed question to the handler for its question type.

    Passive questions go to passive(). Otherwise the first two tokens
    joined together ("Howlong", "Howmany") are looked up, then the first
    token alone ("What", "Who", "When", "Where"). Anything else is
    announced as unsupported and tried with passive(). An empty input
    prints "No answer found." and returns.
    """
    if len(q) == 0:
        # nothing was typed, so there is nothing to classify
        print("No answer found.")
        return
    if passiveChecker(q):
        passive(q)
        return
    options = {
            'What' : whatOrWho,
            'Who' : whatOrWho,
            'When' : whenOrWhere,
            'Where' : whenOrWhere,
            'Howlong' : howLong,
            'Howmany' : howMany}
    first = q[0].text
    # a single-token input has no second token to combine with
    firstTwo = first + q[1].text if len(q) > 1 else None
    if firstTwo in options:
        options[firstTwo](q)
    elif first in options:
        options[first](q)
    else:
        print("question type not supported, but we'll try...")
        passive(q)

What's on your mind? (-1: Hit me with some facts)
Inception was directed by whom?
Answer(s):
Failed to retrieve the entity or property.


In [None]:
# Ask for a question; "-1" runs every predefined question q1..q20 instead.
x = input('''What's on your mind? (-1: Hit me with some facts)\n''')
if x == "-1":
    for i in range(1,21):
        # the demo questions are module-level variables named q1 .. q20
        question = globals()['q'+str(i)]
        tokens = tokenize(question)
        print('Answer(s) to \"'+question+'\" (q'+str(i)+'):')
        questionType(tokens)
else:
    tokens = tokenize(x)
    print('Answer(s):')
    questionType(tokens)