In [199]:
import spacy
# Shared spaCy pipeline used by every helper in this notebook.
# NOTE(review): the 'en' shortcut name is only resolvable on older spaCy releases;
# newer releases expect a full package name such as 'en_core_web_sm' — confirm
# which spaCy version this notebook targets before re-running.
nlp = spacy.load('en')
# Sample sentences used to exercise the event extractor.
# s1: event expressed as a noun ("promotion of IronMan ... was held").
s1 = "The promotion of IronMan was held in the Greater Union Theater at George Street, Sydney, on April 14, 2008."
# s2: event expressed as a verb ("premiered").
s2 = "Black Panther premiered in Riyadh, Saudi Arabia on April 18, 2018"
# s3 and s4 are not used by any cell visible in this notebook.
s3 = "Ironman screened at the Toronto International Film Festival, the San Sebastián International Film Festival, and the Zurich Film Festival in September 2018."
s4 = "A screening of the Captain America during the 2010 Fordham University Law School Film festival, Supreme Court Justice Sonia Sotomayor stated that seeing 12 Angry Men while she was in college influenced her decision to pursue a career in law."


In [200]:
# End-to-end demo on s1.
# NOTE(review): this call is positioned above the cells that define
# match_premiere_event and its helpers, so a top-to-bottom run on a fresh
# kernel would raise NameError here — move this cell below the definitions.
match_premiere_event(s1)

Movie Name:IronMan
Event type:promotion
Location:the Greater Union Theater
Time:April 14, 2008


In [192]:
def match_premiere_event(sent):
    """
    Extract a promotion/premiere/screening event from a sentence and print it.

    The noun pattern ("the premiere of X was held ...") is tried first. If no
    event noun is present, or the noun pattern cannot be parsed, the verb
    pattern ("X premiered in ...") is tried as a fallback.

    sent: the raw sentence text.
    Returns nothing; the outcome is reported through printResult, which
    prints "Could not parse" when neither pattern produced a result.
    """
    result = None

    promotionNoun = findPromotionNoun(sent)
    if promotionNoun is not None:
        result = parseNounPattern(sent, promotionNoun)

    # Only look for an event verb when the noun route produced nothing: this
    # avoids a needless parse and lets a sentence whose event noun is not
    # governed by a verb still be handled by the verb pattern.
    if result is None:
        promotionVerb = findPromotionVerb(sent)
        if promotionVerb is not None:
            result = parseVerbPattern(sent, promotionVerb)

    printResult(result)
    

In [197]:
def findPromotionNoun(sent):
    """
    Return the lemma of the first event noun in the sentence, or None.

    A token qualifies when its lemma is one of the known event nouns and it
    is tagged as a NOUN.
    """
    eventNounLemmas = ("premiere", "screening", "promotion")
    nounLemmas = (
        tok.lemma_
        for tok in nlp(sent)
        if tok.lemma_ in eventNounLemmas and tok.pos_ == "NOUN"
    )
    # First qualifying lemma in sentence order, None when there is none.
    return next(nounLemmas, None)

In [196]:
def findPromotionVerb(sent):
    """
    Return the lemma of the first event verb in the sentence, or None.

    A token qualifies when its lemma is one of the known event verbs and it
    is tagged as a VERB.
    """
    eventVerbLemmas = ("premier", "screen", "promote")
    verbLemmas = (
        tok.lemma_
        for tok in nlp(sent)
        if tok.lemma_ in eventVerbLemmas and tok.pos_ == "VERB"
    )
    # First qualifying lemma in sentence order, None when there is none.
    return next(verbLemmas, None)

In [179]:
# Quick check of the verb detector on s2; the recorded output below is 'premier'.
findPromotionVerb(s2)

'premier'

In [180]:
def parseNounPattern(sent, promotionNoun):
    """
    Parse an event that is expressed as a noun governed by a verb.

    sent: the raw sentence text.
    promotionNoun: lemma of the event noun, as returned by findPromotionNoun.

    Returns [movie, promotionNoun, location, time], or None when no NOUN token
    with that lemma is attached to a verb. Location and time are read from the
    governing verb; the movie is read from the noun's "of" phrase.
    """
    for token in nlp(sent):
        # Require the NOUN tag as well as the lemma so the token anchored on
        # is one that findPromotionNoun would have accepted, not a different
        # token that merely shares the lemma.
        isEventNoun = token.lemma_ == promotionNoun and token.pos_ == "NOUN"
        if isEventNoun and token.head.pos_ == "VERB":
            governingVerb = token.head
            location = getLocationOfVerbToken(sent, governingVerb)
            time = getTimeOfVerbToken(sent, governingVerb)
            movie = getEntityRelatedByOF(sent, token)
            return [movie, promotionNoun, location, time]
    return None

In [181]:
def parseVerbPattern(sent, promotionVerb):
    """
    Parse an event that is expressed directly as a verb.

    sent: the raw sentence text.
    promotionVerb: lemma of the event verb, as returned by findPromotionVerb.

    Returns [movie, promotionVerb, location, time], or None when no VERB token
    with that lemma exists. The movie is the verb's subject; location and time
    are read from the verb's prepositional children.
    """
    for token in nlp(sent):
        # Require the VERB tag as well as the lemma so the token anchored on
        # is one that findPromotionVerb would have accepted, not a different
        # token that merely shares the lemma.
        if token.lemma_ == promotionVerb and token.pos_ == "VERB":
            location = getLocationOfVerbToken(sent, token)
            time = getTimeOfVerbToken(sent, token)
            movie = getSubjectEntityOfVerbToken(sent, token)
            return [movie, promotionVerb, location, time]
    return None

In [182]:
def getSubjectEntityOfVerbToken(sent, token):
    """
    Return the text of the subject of a verb token.

    For each child whose dependency label contains "subj", the named entity
    covering that child is preferred; if there is none, the noun phrase
    covering it is used instead.

    sent: the raw sentence text.
    token: the verb token whose subject is wanted.

    Returns the matched text, or "<no-match>" when the verb has no subject
    child or no entity/phrase could be resolved for it. The placeholder is
    the same one the other extractors in this notebook return, so the result
    can always be concatenated as a string by printResult.
    """
    for child in token.children:
        if "subj" not in str(child.dep_):
            continue
        entity = getEntityThatContainsToken(sent, child)
        if entity is None:
            entity = getPhraseThatContainsToken(sent, child)
        if entity is not None:
            return entity
        # Nothing resolved for this subject child; keep looking at the others.
    return "<no-match>"

In [183]:
# NOTE(review): parsePremiereNounPattern is not defined in any visible cell, so
# this cell and its recorded output look stale; the closest visible function is
# parseNounPattern(sent, promotionNoun) — confirm and update or delete this cell.
parsePremiereNounPattern(s1)

['IronMan', 'Premiere', 'the Greater Union Theater', 'April 14, 2008']

In [184]:
def getEntityRelatedByOF(sent, token):
    """
    Return the entity attached to a token through an "of" preposition.

    Looks at each child of `token` whose text is exactly "of" and returns the
    first entity found for any of that preposition's own children.

    Returns the entity text, or "<no-match>" when nothing is found.
    """
    ofPrepositions = (child for child in token.children if child.text == "of")
    for preposition in ofPrepositions:
        # e.g. "premiere" -> "of" -> "IronMan"
        for dependent in preposition.children:
            matched = getEntityThatContainsToken(sent, dependent)
            if matched is not None:
                return matched
    return "<no-match>"

In [60]:
def getTimeOfVerbToken(sent, verbToken):
    """
    Return the date expression attached to a verb through "on" or "at".

    Each child of `verbToken` whose text is "on" or "at" is inspected, and the
    first temporal entity found for one of that preposition's children wins.

    Returns the entity text, or "<no-match>" when nothing is found.
    """
    timePrepositions = ("on", "at")
    for preposition in verbToken.children:
        if preposition.text not in timePrepositions:
            continue
        # e.g. "on" -> "April"
        for dependent in preposition.children:
            found = getTemporalThatContainsToken(sent, dependent)
            if found is not None:
                return found
    return "<no-match>"

In [114]:
def getLocationOfVerbToken(sent, verbToken):
    """
    Return the place attached to a verb through "in", "throughout" or "at".

    Each child of `verbToken` whose text is one of those prepositions is
    inspected, and the first location found for one of that preposition's
    children wins.

    Returns the entity text, or "<no-match>" when nothing is found.
    """
    placePrepositions = ("in", "throughout", "at")
    for preposition in verbToken.children:
        if preposition.text not in placePrepositions:
            continue
        for dependent in preposition.children:
            place = getLocationThatContainsToken(sent, dependent)
            if place is not None:
                return place
    return "<no-match>"

In [115]:
def getLocationThatContainsToken(statement, token):
    """
    Return the text of the non-temporal entity that covers `token`, or None.

    statement: the raw sentence text; it is parsed again here to get entities.
    token: a token taken from a parse of the same sentence.

    An entity qualifies when it is not a DATE or TIME entity, its text
    contains the token's text, and the token's start offset lies inside the
    entity's character span. Excluding DATE matters because a date phrase can
    hang off the same prepositions as a place ("in September 2018").
    """
    temporalLabels = ("DATE", "TIME")
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if ent.label_ in temporalLabels:
            continue
        # end_char is exclusive, so a token starting exactly there is outside.
        if token.text in ent.text and ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return None

In [37]:
import re
def calculateStartPositionOfToken(statement, targetToken):
    """
    Return the character offset at which `targetToken` starts in `statement`.

    statement: the raw sentence text.
    targetToken: a token with a `text` attribute; spaCy tokens also carry
        `idx` (character offset within their document) and `i` (token index).

    Resolution order:
      1. If the token exposes `idx` and the statement really contains the
         token text at that offset, that exact offset is returned.
      2. Otherwise the literal token text is searched for in the statement;
         a single occurrence is returned directly.
      3. With several occurrences, the one closest to an estimated position
         (sum of preceding token lengths plus one space each) is returned.

    Raises ValueError when the token text does not occur in the statement.
    """
    tokenText = targetToken.text

    # The token already knows where it starts; trust it when it lines up with
    # the statement. This is exact, whereas the estimate below can pick the
    # wrong occurrence of a repeated word.
    exactOffset = getattr(targetToken, "idx", None)
    if (
        isinstance(exactOffset, int)
        and exactOffset >= 0
        and statement.startswith(tokenText, exactOffset)
    ):
        return exactOffset

    # re.escape: token text such as "(", "." or "?" must be matched literally,
    # not interpreted as a regular expression.
    matches = [m.start() for m in re.finditer(re.escape(tokenText), statement)]
    if not matches:
        raise ValueError(
            "token text %r does not occur in the statement" % (tokenText,)
        )
    if len(matches) == 1:
        return matches[0]

    # Estimate the position by assuming every preceding token is followed by
    # exactly one space, then take the closest literal occurrence.
    precedingTokens = list(nlp(statement))[:targetToken.i]
    heuristicPosition = sum(len(token.text) + 1 for token in precedingTokens)
    return min(matches, key=lambda start: abs(start - heuristicPosition))

In [58]:
def getTemporalThatContainsToken(statement, token):
    """
    Return the text of the DATE entity that covers `token`, or None.

    statement: the raw sentence text; it is parsed again here to get entities.
    token: a token taken from a parse of the same sentence.

    An entity qualifies when its label is DATE, its text contains the token's
    text, and the token's start offset lies inside the entity's character span.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if ent.label_ != "DATE":
            continue
        # end_char is exclusive, so a token starting exactly there is outside.
        if token.text in ent.text and ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return None

In [130]:
def getEntityThatContainsToken(statement, token):
    """
    Return the text of the named entity (any label) that covers `token`, or None.

    statement: the raw sentence text; it is parsed again here to get entities.
    token: a token taken from a parse of the same sentence.

    An entity qualifies when its text contains the token's text and the
    token's start offset lies inside the entity's character span.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        # end_char is exclusive, so a token starting exactly there is outside.
        if token.text in ent.text and ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return None

In [142]:
def getPhraseThatContainsToken(statement, token):
    """
    Return the text of the noun chunk that covers `token`, or None.

    statement: the raw sentence text; it is parsed again here to get chunks.
    token: a token taken from a parse of the same sentence.

    A chunk qualifies when its text contains the token's text and the token's
    start offset lies inside the chunk's character span.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for chunk in doc.noun_chunks:
        # end_char is exclusive, so a token starting exactly there is outside.
        if token.text in chunk.text and chunk.start_char <= tokenStartPos < chunk.end_char:
            return chunk.text
    return None

In [91]:
def printResult(result):
    """
    Print an extracted event, one labelled field per line.

    result: None, or a sequence whose first four items are
        [movie, event type, location, time].

    Prints "Could not parse" when result is None. A field that is None is
    printed as "<no-match>" (the placeholder the extractors use) instead of
    failing on string concatenation; any other value is printed via str().
    """
    if result is None:
        print("Could not parse")
        return

    labels = ("Movie Name:", "Event type:", "Location:", "Time:")
    for position, label in enumerate(labels):
        value = result[position]
        print(label + ("<no-match>" if value is None else str(value)))