In [12]:
import spacy

# Load the English pipeline once; every helper in this notebook re-uses this
# module-level `nlp` object.
try:
    nlp = spacy.load('en')
except OSError:
    # The bare 'en' shortcut link is not available on spaCy 3.x, where
    # pipelines must be loaded by their full package name instead.
    nlp = spacy.load('en_core_web_sm')

In [90]:
# Sample box-office sentences used to exercise the earning-template extractor.
s1 = "As of December 6, 2018, Ironman has grossed $194.4 million in the United States and Canada, and $170.1 million in other territories, for a total worldwide gross of $364.5 million, against a production budget of $36–40 million."
s2 = "Black Panther grossed $700.1 million in the United States and Canada, and $646.9 million in other territories, for a worldwide total of $1.347 billion."
s3 = "In March 2018 Deadline Hollywood estimated the net profit of the film would be $461 million, accounting for production budgets, P&A, talent participations and other costs, with box office grosses and ancillary revenues from home media."
# Author's note on s3: it yields no parse because the sentence is rooted at
# "estimated" rather than at one of the earning verbs.
s4 = "On Friday, The Captain America opened at the number one spot at the American and Canadian box office with $25.7 million."
# Author's note on s4: the location is extracted as "the number one spot"
# (a noun phrase, not a real place).
s5 = "Mission: Impossible – Fallout grossed $220.2 million in the United States and Canada, and $570.9 million in other territories, for a total worldwide gross of $791 million, against a production budget of $178 million."
s6 = "The film made $6 million in the United States"
s7 = "As of December 2, 2018, First Man has grossed $44.7 million in the United States and Canada, and $55.2 million in other territories, for a total worldwide gross of $99.9 million, against a production budget of $59 million."
s8 = "It earned $81.5 million in theatrical rentals in the USA and Canada during its initial release, increasing its earnings to $85.7 million through a reissue in 1973, and including a limited re-release in 1997 it ultimately earned an equivalent exhibition gross of $135 million."
s9 = "Iron Man earned $318.4 million in the United States and Canada and $266.8 million in other territories, for a worldwide gross of $585.2 million."
s10 = "Mission: Impossible – Rogue Nation grossed $195 million in the U.S. and Canada, and $487.7 million in other countries, for a worldwide total of $682.7 million."
s11 = "Searching has grossed $26 million in the United States and Canada, and $44.8 million in other territories, for a total worldwide gross of $70.8 million."
s12 = "The Dark Knight earned $534.9 million in North America and $469.7 million in other territories for a worldwide total of $1 billion."

In [95]:
# Run the earning-template extraction on the first sample sentence and print
# the filled template.
sent = s1
parse(sent)

---- Earning template ----
Movie:Ironman
Amount:$194.4 million
Location:the United States


In [92]:
def parse(sent):
    """Extract the earning template from ``sent`` and print it.

    The sentence is matched with ``pattern1Match`` and whatever that returns
    (a ``[movie, amount, location]`` list, or None when nothing matched) is
    handed straight to ``printResult``. Nothing is returned.
    """
    printResult(pattern1Match(sent))

In [54]:
def pattern1Match(sent):
    """Fill the earning template (movie, amount, location) from one sentence.

    Targets sentences shaped like
    "<Movie> has grossed $194.4 million in the United States and Canada, ...".
    The match requires:

    * an earning verb (as found by ``getEarnVerbToken``),
    * a money amount attached as a direct child of that verb,
    * optionally a subject of the verb, which is reported as the movie.

    Args:
        sent: The raw sentence text.

    Returns:
        ``[movieName, amount, location]`` on a match, otherwise None.
        ``location`` is "worldwide", an extracted place/phrase, or
        "<no-match>". ``movieName`` is "<no-match>" when the verb has no
        subject, and may be None when the subject lies in no noun chunk.
    """
    earnToken = getEarnVerbToken(sent)
    if earnToken is None:
        return None

    moneyToken = findChildMoneyToken(sent, earnToken)
    if moneyToken is None:
        # Without an earning amount there is nothing to report.
        return None

    amount = getMoneyThatContainsToken(sent, moneyToken)

    # A "worldwide" modifier on the amount or the verb wins over any place.
    isWorldwide = (doesTokenChildrenIncludeWorldWide(moneyToken)
                   or doesTokenChildrenIncludeWorldWide(earnToken))
    if isWorldwide:
        location = "worldwide"
    else:
        # Prefer a location hanging off the amount; fall back to the verb.
        location = getLocationOfToken(sent, moneyToken)
        if location == "<no-match>":
            location = getLocationOfToken(sent, earnToken)

    subjectToken = getSubjectTokenOfVerb(earnToken)
    if subjectToken is None:
        # Subject-less verbs used to crash further down with an
        # AttributeError; report the file's usual sentinel instead.
        movieName = "<no-match>"
    else:
        movieName = getPhraseThatContainsToken(sent, subjectToken)

    return [movieName, amount, location]

In [91]:
# NOTE(review): match_earning_template is not defined in any visible cell --
# presumably an earlier name for parse(); confirm before re-running this cell.
match_earning_template(s4)

Movie:The Captain America
Amount:$25.7 million
Location:the number one spot


In [67]:
def getEarnVerbToken(sent):
    """Return the first earning verb in ``sent``, or None if there is none.

    A token qualifies when it is tagged as a VERB and its lemma is one of
    "profit", "earn", "make", "gross" or "open". This is a hand-picked,
    domain-specific subset; more synonyms could be drawn from WordNet.
    """
    earning_lemmas = ["profit", "earn", "make", "gross", "open"]
    earning_verbs = (
        tok for tok in nlp(sent)
        if tok.pos_ == "VERB" and tok.lemma_ in earning_lemmas
    )
    return next(earning_verbs, None)

In [94]:
def printResult(result):
    """Print an earning template, or "No Parse" when ``result`` is None.

    ``result`` is expected to be indexable with the movie at position 0, the
    amount at position 1 and the location at position 2.
    """
    print("---- Earning template ----")
    if result is None:
        print("No Parse")
        return

    # One output line per template slot, in a fixed order.
    for position, label in enumerate(("Movie:", "Amount:", "Location:")):
        print(label + str(result[position]))
        
    

In [23]:
def getTimeOfToken(sent, token):
    """Return the date expression attached to ``token``, or "<no-match>".

    Time phrases usually hang off the token through "on", "at" or "in"
    (e.g. "on" -> "August"). Each dependent of such a preposition is checked
    with ``getTemporalThatContainsToken`` and the first hit is returned.
    """
    time_prepositions = ("on", "at", "in")
    for preposition in token.children:
        if preposition.text not in time_prepositions:
            continue
        for dependent in preposition.children:
            found = getTemporalThatContainsToken(sent, dependent)
            if found is not None:
                return found
    return "<no-match>"

In [24]:
def getTemporalThatContainsToken(statement, token):
    """Return the text of the DATE entity that covers ``token``, or None.

    ``statement`` is re-parsed with the module-level ``nlp`` pipeline and the
    token's character position is resolved with
    ``calculateStartPositionOfToken``. Only entities labelled "DATE" are
    considered.

    Args:
        statement: The sentence text the token was taken from.
        token: The token to locate; only ``token.text`` is read here.

    Returns:
        The entity text, or None when no DATE entity covers the token.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if ent.label_ != "DATE" or token.text not in ent.text:
            continue
        # end_char is exclusive: a token starting exactly there (for example
        # the comma right after "December 6, 2018") is outside the entity.
        if ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return None

In [25]:
import re
def calculateStartPositionOfToken(statement, targetToken):
    """Return the character offset at which ``targetToken`` starts.

    Args:
        statement: The sentence text the token was taken from.
        targetToken: A token exposing ``text`` and, for the approximate
            fallback, ``i`` (its index in the token sequence). When it also
            exposes ``idx`` (its character offset in the parent document)
            and that offset really points at the token text inside
            ``statement``, the offset is returned directly.

    Returns:
        The start offset of the token in ``statement``, or -1 when the token
        text does not occur in ``statement`` at all.
    """
    text = targetToken.text

    # Exact path: trust the recorded offset whenever it is consistent with
    # the statement, instead of guessing among repeated occurrences.
    offset = getattr(targetToken, "idx", None)
    if (isinstance(offset, int) and offset >= 0
            and statement[offset:offset + len(text)] == text):
        return offset

    # Fallback: literal search. The text must be escaped, otherwise tokens
    # such as "$", "." or "(" are interpreted as regex syntax.
    matches = [m.start() for m in re.finditer(re.escape(text), statement)]
    if not matches:
        return -1
    if len(matches) == 1:
        return matches[0]

    # Several occurrences: estimate the position by assuming every preceding
    # token is followed by one space, then pick the closest occurrence.
    doc = nlp(statement)
    heuristicPosition = sum(
        len(tok.text) + 1 for tok in doc if tok.i < targetToken.i
    )
    return min(matches, key=lambda start: abs(start - heuristicPosition))

In [26]:
def getLocationOfToken(sent, token):
    """Return the location phrase attached to ``token``, or "<no-match>".

    Locations usually hang off the token through "in", "throughout" or "at".
    For each dependent of such a preposition a named entity is preferred; if
    there is none, the enclosing noun phrase is used instead -- unless the
    dependent is part of a date, because "at"/"in" also introduce times.
    """
    location_prepositions = ("in", "throughout", "at")
    for preposition in token.children:
        if preposition.text not in location_prepositions:
            continue
        for dependent in preposition.children:
            place = getLocationThatContainsToken(sent, dependent)
            # Only fall back to a noun phrase when it cannot be a date.
            if place is None and getTemporalThatContainsToken(sent, dependent) is None:
                place = getPhraseThatContainsToken(sent, dependent)
            if place is not None:
                return place
    return "<no-match>"

In [27]:
def getLocationThatContainsToken(statement, token):
    """Return the text of the non-temporal entity covering ``token``, or None.

    ``statement`` is re-parsed with the module-level ``nlp`` pipeline and the
    token's character position is resolved with
    ``calculateStartPositionOfToken``. Every entity label except "TIME" and
    "DATE" is accepted.
    NOTE(review): that also admits non-place labels; confirm this is intended.

    Args:
        statement: The sentence text the token was taken from.
        token: The token to locate; only ``token.text`` is read here.

    Returns:
        The entity text, or None when no such entity covers the token.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if ent.label_ in ("TIME", "DATE") or token.text not in ent.text:
            continue
        # end_char is exclusive: a token starting exactly there lies just
        # after the entity, not inside it.
        if ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return None

In [28]:
def getPhraseThatContainsToken(statement, token):
    """Return the text of the noun chunk that covers ``token``, or None.

    ``statement`` is re-parsed with the module-level ``nlp`` pipeline and the
    token's character position is resolved with
    ``calculateStartPositionOfToken``.

    Args:
        statement: The sentence text the token was taken from.
        token: The token to locate; only ``token.text`` is read here.

    Returns:
        The noun-chunk text, or None when no noun chunk covers the token.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for chunk in doc.noun_chunks:
        if token.text not in chunk.text:
            continue
        # end_char is exclusive: a token starting exactly there lies just
        # after the chunk, not inside it.
        if chunk.start_char <= tokenStartPos < chunk.end_char:
            return chunk.text
    return None

In [29]:
def getEntityRelatedByOF(sent, token):
    """Return what ``token`` is "of" (e.g. "premiere of IronMan"), or "<no-match>".

    For each dependent of an "of" child of ``token`` a named entity is
    preferred; when there is none, the enclosing noun phrase is used. The
    first dependent that yields either one wins.
    """
    for preposition in token.children:
        if preposition.text != "of":
            continue
        for dependent in preposition.children:
            found = getEntityThatContainsToken(sent, dependent)
            if found is None:
                found = getPhraseThatContainsToken(sent, dependent)
            if found is not None:
                return found
    return "<no-match>"

In [30]:
def getEntityThatContainsToken(statement, token):
    """Return the text of the named entity that covers ``token``, or None.

    ``statement`` is re-parsed with the module-level ``nlp`` pipeline and the
    token's character position is resolved with
    ``calculateStartPositionOfToken``. Entities of any label are accepted.

    Args:
        statement: The sentence text the token was taken from.
        token: The token to locate; only ``token.text`` is read here.

    Returns:
        The entity text, or None when no entity covers the token.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if token.text not in ent.text:
            continue
        # end_char is exclusive: a token starting exactly there lies just
        # after the entity, not inside it.
        if ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return None

In [31]:
def getFirstGPE(sent):
    """Return the text of the first "GPE" entity in ``sent``, or "<no-match>"."""
    gpe_texts = (ent.text for ent in nlp(sent).ents if ent.label_ == "GPE")
    return next(gpe_texts, "<no-match>")

In [32]:
def getFirstEntity(sent, expectedLabels):
    """Return the first entity in ``sent`` whose label is in ``expectedLabels``.

    The entity text is returned; "<no-match>" is returned when no entity
    carries one of the expected labels.
    """
    wanted_texts = (
        ent.text for ent in nlp(sent).ents if ent.label_ in expectedLabels
    )
    return next(wanted_texts, "<no-match>")

In [33]:
def findChildMoneyToken(sent, token):
    """Return the first direct child of ``token`` that is part of a money amount.

    Each child is checked with ``getMoneyThatContainsToken``; None is
    returned when no child belongs to a money entity.
    """
    money_children = (
        child for child in token.children
        if getMoneyThatContainsToken(sent, child) is not None
    )
    return next(money_children, None)

In [34]:
def getMoneyThatContainsToken(statement, token):
    """Return the text of the MONEY entity that covers ``token``, or None.

    ``statement`` is re-parsed with the module-level ``nlp`` pipeline and the
    token's character position is resolved with
    ``calculateStartPositionOfToken``. Only entities labelled "MONEY" are
    considered.

    Args:
        statement: The sentence text the token was taken from.
        token: The token to locate; only ``token.text`` is read here.

    Returns:
        The entity text, or None when no MONEY entity covers the token.
    """
    doc = nlp(statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if ent.label_ != "MONEY" or token.text not in ent.text:
            continue
        # end_char is exclusive: a token starting exactly there lies just
        # after the amount, not inside it.
        if ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return None

In [40]:
def doesTokenChildrenIncludeWorldWide(token):
    """Tell whether any direct child of ``token`` mentions "worldwide".

    The check is a case-insensitive substring test on each child's text.
    """
    return any("worldwide" in child.text.lower() for child in token.children)

In [51]:
def getSubjectTokenOfVerb(verbToken):
    """Return the first child of ``verbToken`` that acts as its subject.

    A child counts as a subject when its dependency label contains "subj";
    None is returned when no child does.
    """
    subjects = (child for child in verbToken.children if "subj" in child.dep_)
    return next(subjects, None)