In [1]:
# Load the spaCy English pipeline once; the helper functions in this notebook
# parse their input through this shared `nlp` object.
# NOTE(review): assumes the 'en' shortcut resolves to an installed English
# model for the spaCy version in use -- confirm.
import spacy
nlp = spacy.load('en')

In [141]:
# Sample sentences used to try out the filming-event matcher.
# s1/s2: "Filming of <movie>" with the location phrase before or after the verb.
# s3/s5: the same Fallout event, with the date at the end (s3) or the start (s5).
# s4: a long sentence with several places and years in it.
s1 = "Filming of Ironman at Edwards Air Force Base began in April 2, and ended on May 2."
s2 = "Filming of Ironman began in April at Edwards Air Force Base and ended on May 2."
s3 = "Filming of Fallout was slated to start in Paris on April 10, 2017."
s4 = "Filming was scheduled to take place that month in the Northern Quarter of Manchester, where parts of the 2004 film Alfie and the 2009 Sherlock Holmes had been shot, followed by the Stanley Dock area of Liverpool, both doubling for the period's Lower East Side of Manhattan."
s5 = "On April 10, 2017, filming of Fallout was slated to start in Paris."

In [4]:
def match_filming_event(sent):
    """
    Extract the filming event described by `sent` and print it.

    The sentence is handed to pattern1Match, and whatever that returns
    (a [movie, location, time] list, or None) is passed on to printResult.
    Nothing is returned.
    """
    printResult(pattern1Match(sent))

In [134]:
def pattern1Match(sent):
    """
    Match sentences shaped like
    "Filming of Ironman at Edwards Air Force Base began in mid-April, and ended on May 2."

    A "filming"/"shooting" noun is required. The location is looked up on that
    noun first and then on its head (the verb, "began" in the example); the
    time is looked up on the head only. The movie/scene is whatever is
    attached to the noun through "of".

    Returns [movie, location, time] when at least one of location/time was
    found (any element may still be "<no-match>"); otherwise returns None.
    """
    NO_MATCH = "<no-match>"

    noun = getFilmingNounToken(sent)
    if noun is None:
        return None

    verb = noun.head
    when = getTimeOfToken(sent, verb)
    where = getLocationOfToken(sent, noun)
    if where == NO_MATCH:
        where = getLocationOfToken(sent, verb)

    # Last resort: take the first suitable entity anywhere in the sentence.
    # This can pick up a wrong match, but it fills in more results overall.
    if where == NO_MATCH:
        where = getFirstGPE(sent)
    if when == NO_MATCH:
        when = getFirstEntity(sent, ["TIME", "DATE"])

    # Nothing to report unless at least one of the two was found.
    if where == NO_MATCH and when == NO_MATCH:
        return None

    return [getEntityRelatedByOF(sent, noun), where, when]

In [142]:
# Earlier experiments, kept for reference:
# pattern1Match(s2)
# pattern1Match("Filming of Ironman began in April at Edwards Air Force Base and ended on May 2.")

# Run the matcher on s5 and print the extracted movie, location and time.
match_filming_event(s5)

Movie/Scene:Fallout
Location:Paris
Time:April 10, 2017


In [6]:
def getFilmingNounToken(sent):
    """
    Find the noun that announces a filming event.

    Parses `sent` and returns the first token tagged NOUN whose lemma is
    "filming" or "shooting". Returns None when the sentence has no such token.
    """
    wanted_lemmas = ("filming", "shooting")
    candidates = (
        tok for tok in nlp(sent)
        if tok.pos_ == "NOUN" and tok.lemma_ in wanted_lemmas
    )
    return next(candidates, None)

In [15]:
def printResult(result):
    """
    Print a match result, one labelled line per field.

    `result` is either None, which prints "No Parse", or a sequence whose
    first three items are the movie/scene, the location and the time.
    """
    if result is None:
        print("No Parse")
        return

    labels = ("Movie/Scene:", "Location:", "Time:")
    for position, label in enumerate(labels):
        print(label + str(result[position]))
        
    

In [86]:
def getTimeOfToken(sent, token):
    """
    Find a date expression attached to `token` through a preposition.

    Looks at the children of `token` whose text is "on", "at" or "in" (in any
    letter case) and, for each of their children, asks
    getTemporalThatContainsToken for the temporal entity containing it.

    Returns the text of the first entity found, or "<no-match>".
    """
    timePrepositions = ("on", "at", "in")
    for child in token.children:
        # Compare lower-cased so a sentence-initial "On"/"In"/"At"
        # (e.g. "On April 10, 2017, filming ...") is recognised too.
        if child.text.lower() not in timePrepositions:
            continue
        # The preposition's child is usually the date word, e.g. [August].
        for subChild in child.children:
            temporalEntity = getTemporalThatContainsToken(sent, subChild)
            if temporalEntity is not None:
                return temporalEntity
    return "<no-match>"

In [30]:
def getTemporalThatContainsToken(statement, token):
    """
    Return the text of the DATE entity of `statement` that contains `token`.

    The statement is parsed again, the token's character offset is obtained
    from calculateStartPositionOfToken, and the first entity labelled "DATE"
    whose text includes the token text and whose character span covers that
    offset is returned. Returns None when there is no such entity.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if ent.label_ != "DATE" or token.text not in ent.text:
            continue
        # end_char is the offset just past the entity, so it is excluded;
        # otherwise the comma right after "April 10, 2017" would count as
        # being inside the date.
        if ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return None

In [87]:
import re
def calculateStartPositionOfToken(statement, targetToken):
    """
    Return the character offset in `statement` where `targetToken` starts.

    Every literal occurrence of the token text is located. A single
    occurrence is returned directly. With several occurrences, the offset is
    estimated from the token's index (`targetToken.i`) and the occurrence
    closest to that estimate wins; ties go to the earliest occurrence.

    Returns -1 when the token text does not occur in `statement` at all.
    """
    # Escape the token text: punctuation tokens such as "." or "(" are regex
    # metacharacters and would otherwise match everything or fail to compile.
    pattern = re.escape(targetToken.text)
    matches = [m.start() for m in re.finditer(pattern, statement)]
    if not matches:
        return -1
    if len(matches) == 1:
        return matches[0]

    # Estimate the offset by assuming every preceding token is followed by
    # exactly one space.
    huristicPosition = 0
    for index, token in enumerate(nlp(statement)):
        if index >= targetToken.i:
            break
        huristicPosition += len(token.text) + 1

    return min(matches, key=lambda start: abs(start - huristicPosition))

In [130]:
def getLocationOfToken(sent, token):
    """
    Find a location attached to `token` through a preposition.

    Looks at the children of `token` whose text is "in", "throughout" or "at"
    (in any letter case). For each of their children it first asks
    getLocationThatContainsToken for a containing entity. If there is none,
    and the word is not part of a temporal entity, it falls back to the
    phrase returned by getPhraseThatContainsToken.

    Returns the first location text found, or "<no-match>".
    """
    locationPrepositions = ("in", "throughout", "at")
    for child in token.children:
        # Compare lower-cased so a sentence-initial "In"/"At" is recognised too.
        if child.text.lower() not in locationPrepositions:
            continue
        for subChild in child.children:
            GPEEntity = getLocationThatContainsToken(sent, subChild)
            if GPEEntity is None:
                # "at"/"in" also introduce times ("in April"), so only fall
                # back to the phrase when the word is not part of a date.
                canConfuseWithTime = getTemporalThatContainsToken(sent, subChild) is not None
                if not canConfuseWithTime:
                    GPEEntity = getPhraseThatContainsToken(sent, subChild)

            if GPEEntity is not None:
                return GPEEntity
    return "<no-match>"

In [131]:
def getLocationThatContainsToken(statement, token):
    """
    Return the text of the non-temporal entity of `statement` containing `token`.

    The statement is parsed again, the token's character offset is obtained
    from calculateStartPositionOfToken, and the first entity that is not
    labelled "TIME" or "DATE", whose text includes the token text and whose
    character span covers that offset is returned. Returns None when there is
    no such entity.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if ent.label_ in ("TIME", "DATE") or token.text not in ent.text:
            continue
        # end_char is the offset just past the entity, so it is excluded;
        # a token that starts exactly there is the one after the entity.
        if ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return None

In [90]:
def getPhraseThatContainsToken(statement, token):
    """
    Return the text of the noun chunk of `statement` that contains `token`.

    The statement is parsed again, the token's character offset is obtained
    from calculateStartPositionOfToken, and the first noun chunk whose text
    includes the token text and whose character span covers that offset is
    returned. Returns None when there is no such chunk.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for chunk in doc.noun_chunks:
        if token.text not in chunk.text:
            continue
        # end_char is the offset just past the chunk, so it is excluded;
        # a token that starts exactly there is the one after the chunk.
        if chunk.start_char <= tokenStartPos < chunk.end_char:
            return chunk.text
    return None

In [104]:
def getEntityRelatedByOF(sent, token):
    """
    Find what `token` is "of", e.g. the movie name in "Filming of Ironman".

    For every child of `token` whose text is exactly "of", each of that
    child's own children is resolved to the entity containing it, or, when
    there is no such entity, to the phrase containing it.

    Returns the first text found, or "<no-match>".
    """
    of_tokens = (kid for kid in token.children if kid.text == "of")
    for of_token in of_tokens:
        # Shape of the parse: ['of', 'prep', 'premiere', 'NOUN', [IronMan]]
        for grandChild in of_token.children:
            found = getEntityThatContainsToken(sent, grandChild)
            if found is None:
                found = getPhraseThatContainsToken(sent, grandChild)
            if found is not None:
                return found
    return "<no-match>"

In [102]:
def getEntityThatContainsToken(statement, token):
    """
    Return the text of the entity of `statement` that contains `token`.

    The statement is parsed again, the token's character offset is obtained
    from calculateStartPositionOfToken, and the first entity (of any label)
    whose text includes the token text and whose character span covers that
    offset is returned. Returns None when there is no such entity.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if token.text not in ent.text:
            continue
        # end_char is the offset just past the entity, so it is excluded;
        # a token that starts exactly there is the one after the entity.
        if ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return None

In [118]:
def getFirstGPE(sent):
    """
    Return the text of the first entity labelled "GPE" in `sent`.

    Returns "<no-match>" when the sentence has no such entity.
    """
    gpe_texts = (ent.text for ent in nlp(sent).ents if ent.label_ == "GPE")
    return next(gpe_texts, "<no-match>")

In [126]:
def getFirstEntity(sent, expectedLabels):
    """
    Return the text of the first entity in `sent` whose label is one of
    `expectedLabels`.

    Returns "<no-match>" when no entity carries any of those labels.
    """
    wanted_texts = (
        ent.text for ent in nlp(sent).ents
        if ent.label_ in expectedLabels
    )
    return next(wanted_texts, "<no-match>")