In [1]:
import spacy
# Shared spaCy pipeline; the helper functions in this notebook call it as `nlp(text)`.
# NOTE(review): the 'en' shortcut name is not accepted by spaCy v3+, which expects a
# full package name such as 'en_core_web_sm' — confirm against the installed version.
nlp = spacy.load('en')

In [37]:
# Sample sentences for the view-count pattern; both are built around the verb "watched".
s1 = "The trailer was watched 224.6 million times in its first 24 hours, becoming the 2nd most viewed trailer in that time period."
s2 = "It was a big hit. Iron Man was watched 503.2 million times in first weekend."

In [38]:
# Run the extractor on the second sample sentence; parse() prints its report.
sent = s2
parse(sent)

---- Movie Views ----
Movie:Iron Man
View Count:503.2 million
Time span:first weekend


In [29]:
def parse(sent):
    """Apply the view-count pattern to ``sent`` and print the outcome.

    Nothing is returned; the report is written by ``printResult``.
    """
    printResult(pattern1Match(sent))

In [25]:
def pattern1Match(sent):
    """Extract movie-view facts from a sentence built around "view"/"watch".

    Example of a sentence this is meant for:
    "Iron Man was watched 503.2 million times in first weekend."

    The sentence must contain a verb accepted by ``getViewVerbToken``;
    otherwise there is no match.  When it does:

    * movie     -- theme of that verb, as resolved by ``getThemeOfVerbToken``
    * viewCount -- first CARDINAL entity found in the sentence
    * duration  -- first TIME or DATE entity found in the sentence

    Returns:
        ``None`` when no view verb is found, otherwise the list
        ``[movie, viewCount, duration]``.  ``movie`` may be ``None`` and the
        two entity fields are ``"<no-match>"`` when no entity qualifies.
    """
    viewToken = getViewVerbToken(sent)
    if viewToken is None:
        return None

    movie = getThemeOfVerbToken(sent, viewToken)
    viewCount = getFirstEntity(sent, ["CARDINAL"])
    duration = getFirstEntity(sent, ["TIME", "DATE"])
    return [movie, viewCount, duration]

In [3]:
def getAgentOfVerbToken(sent, verbToken):
    """Return the noun phrase acting as agent of ``verbToken``.

    First pass: a child with dependency "agent" (the "by" of a passive) is
    followed to its first own child, and the phrase around that child is
    returned.  Second pass: the first child labelled "pobj" or "nsubj" is
    used instead.  Returns ``None`` when neither pass finds a candidate.
    """
    for marker in verbToken.children:
        if marker.dep_ != "agent":
            continue
        for dependent in marker.children:
            # Only the first dependent of the "by" marker is ever consulted.
            return getPhraseThatContainsToken(sent, dependent)

    for candidate in verbToken.children:
        if candidate.dep_ in ("pobj", "nsubj"):
            return getPhraseThatContainsToken(sent, candidate)
    return None

In [4]:
def getThemeOfVerbToken(sent, verbToken):
    """Return the noun phrase that is the theme of ``verbToken``.

    The theme is the first child whose dependency label is "dobj" or
    "nsubjpass".  Returns ``None`` when the verb has no such child.
    """
    theme = next(
        (tok for tok in verbToken.children if tok.dep_ in ("dobj", "nsubjpass")),
        None,
    )
    if theme is None:
        return None
    return getPhraseThatContainsToken(sent, theme)

In [33]:
def getViewVerbToken(sent):
    """Return the first verb in ``sent`` whose lemma is "view" or "watch".

    More synonyms could be taken from WordNet; only this domain-specific
    subset is used.  Returns ``None`` when no such verb is present.
    """
    view_lemmas = ["view", "watch"]
    candidates = (
        tok for tok in nlp(sent)
        if tok.pos_ == "VERB" and tok.lemma_ in view_lemmas
    )
    return next(candidates, None)

In [6]:
def getSellVerbToken(sent):
    """Return the first verb in ``sent`` whose lemma is "sell".

    More synonyms could be taken from WordNet; only this domain-specific
    subset is used.  Returns ``None`` when no such verb is present.
    """
    for tok in nlp(sent):
        if tok.pos_ != "VERB":
            continue
        if tok.lemma_ == "sell":
            return tok
    return None

In [7]:
def getPhraseOfFromChild(sent):
    """Return the noun phrase attached to the word "from" in ``sent``.

    The sentence is parsed only when it contains " from ".  For the first
    "from" token that has children, the phrase around its first child is
    returned (which may itself be ``None``).  Returns ``None`` otherwise.
    """
    if " from " not in sent:
        return None
    for tok in nlp(sent):
        if tok.text != "from":
            continue
        for dependent in tok.children:
            # Only the first dependent is consulted.
            return getPhraseThatContainsToken(sent, dependent)
    return None

In [8]:
def getPhraseOfToChild(sent):
    """Return the noun phrase attached to the word "to" in ``sent``.

    The sentence is parsed only when it contains " to ".  For the first
    "to" token that has children, the phrase around its first child is
    returned (which may itself be ``None``).  Returns ``None`` otherwise.
    """
    if " to " not in sent:
        return None
    for tok in nlp(sent):
        if tok.text != "to":
            continue
        for dependent in tok.children:
            # Only the first dependent is consulted.
            return getPhraseThatContainsToken(sent, dependent)
    return None

In [28]:
def printResult(result):
    """Print a "Movie Views" report for ``result``.

    ``result`` is either ``None`` (reported as "No Parse") or a sequence
    whose first three items are the movie, the view count and the time span.
    """
    print("---- Movie Views ----")
    if result is None:
        print("No Parse")
        return

    # Labels are listed in the same order as the fields inside ``result``.
    labels = ("Movie:", "View Count:", "Time span:")
    for position, label in enumerate(labels):
        print(label + str(result[position]))

In [10]:
def getTimeOfToken(sent, token):
    """Return the date expression attached to ``token`` through a preposition.

    Children of ``token`` whose text is "on", "at" or "in" are examined; for
    each of their own children the enclosing DATE entity is looked up with
    ``getTemporalThatContainsToken``.  The first hit is returned, otherwise
    the string "<no-match>".
    """
    time_prepositions = ("on", "at", "in")
    for prep in token.children:
        if prep.text not in time_prepositions:
            continue
        # e.g. the child of "on" is typically the date word itself
        for dependent in prep.children:
            found = getTemporalThatContainsToken(sent, dependent)
            if found is not None:
                return found
    return "<no-match>"

In [11]:
def getTemporalThatContainsToken(statement, token):
    """Return the text of the DATE entity in ``statement`` that covers ``token``.

    An entity qualifies when it is labelled "DATE", its text contains the
    token's text, and the token's start offset lies between the entity's
    ``start_char`` and ``end_char`` (both bounds inclusive).  Returns
    ``None`` when no entity qualifies.
    """
    parsed = nlp(statement)
    offset = calculateStartPositionOfToken(statement, token)
    for entity in parsed.ents:
        if token.text not in entity.text:
            continue
        if entity.label_ != "DATE":
            continue
        if entity.start_char <= offset <= entity.end_char:
            return entity.text
    return None

In [12]:
import re
def calculateStartPositionOfToken(statement, targetToken):
    """Return the character offset in ``statement`` where ``targetToken`` starts.

    The token's text is searched for literally.  If it occurs exactly once,
    that offset is returned.  If it occurs several times, ``statement`` is
    re-parsed and the offset is estimated by summing ``len(text) + 1`` over
    the tokens that precede index ``targetToken.i``; the occurrence closest
    to that estimate wins (the earliest one on ties).

    Args:
        statement: the sentence text to search in.
        targetToken: an object exposing ``text`` and ``i`` (token index).

    Returns:
        The start offset of the chosen occurrence.

    Raises:
        ValueError: if the token text does not occur in ``statement``.
    """
    # Escape the text: tokens such as "$", "(" or "." are regex
    # metacharacters and would otherwise match the wrong positions or fail
    # to compile.
    pattern = re.escape(targetToken.text)
    matches = [m.start() for m in re.finditer(pattern, statement)]
    if not matches:
        raise ValueError(
            "token text %r does not occur in the statement" % (targetToken.text,)
        )
    if len(matches) == 1:
        return matches[0]

    # Rough estimate only: assumes one separator character after every token.
    heuristicPosition = 0
    for index, token in enumerate(nlp(statement)):
        if index >= targetToken.i:
            break
        heuristicPosition += len(token.text) + 1

    return min(matches, key=lambda start: abs(start - heuristicPosition))

In [13]:
def getLocationOfToken(sent, token):
    """Return the location expression attached to ``token`` through a preposition.

    Children of ``token`` whose text is "in", "throughout" or "at" are
    examined.  For each of their own children, a covering non-temporal
    entity is preferred; failing that, the surrounding noun phrase is used,
    but only when the child is not part of a DATE entity ("at" can introduce
    a time as well as a place).  Returns "<no-match>" when nothing is found.
    """
    for prep in token.children:
        if prep.text not in ("in", "throughout", "at"):
            continue
        for dependent in prep.children:
            place = getLocationThatContainsToken(sent, dependent)
            # Fall back to the noun phrase unless it could be a time expression.
            if place is None and getTemporalThatContainsToken(sent, dependent) is None:
                place = getPhraseThatContainsToken(sent, dependent)
            if place is not None:
                return place
    return "<no-match>"

In [14]:
def getLocationThatContainsToken(statement, token):
    """Return the text of the non-temporal entity in ``statement`` covering ``token``.

    An entity qualifies when its label is neither "TIME" nor "DATE", its
    text contains the token's text, and the token's start offset lies
    between the entity's ``start_char`` and ``end_char`` (both bounds
    inclusive).  Returns ``None`` when no entity qualifies.
    """
    parsed = nlp(statement)
    offset = calculateStartPositionOfToken(statement, token)
    for entity in parsed.ents:
        if token.text not in entity.text:
            continue
        if entity.label_ in ("TIME", "DATE"):
            continue
        if entity.start_char <= offset <= entity.end_char:
            return entity.text
    return None

In [15]:
def getPhraseThatContainsToken(statement, token):
    """Return the text of the noun chunk in ``statement`` that covers ``token``.

    A chunk qualifies when its text contains the token's text and the
    token's start offset lies between the chunk's ``start_char`` and
    ``end_char`` (both bounds inclusive).  Returns ``None`` when no chunk
    qualifies.
    """
    parsed = nlp(statement)
    offset = calculateStartPositionOfToken(statement, token)
    covering = (
        chunk.text
        for chunk in parsed.noun_chunks
        if token.text in chunk.text
        and chunk.start_char <= offset <= chunk.end_char
    )
    return next(covering, None)

In [16]:
def getEntityRelatedByOF(sent, token):
    """Return the entity or phrase linked to ``token`` through "of".

    For every child of ``token`` whose text is "of" (e.g. "premiere of
    IronMan"), each of its own children is resolved first as a named entity
    and, failing that, as a noun phrase.  The first successful lookup is
    returned, otherwise "<no-match>".
    """
    for prep in token.children:
        if prep.text != "of":
            continue
        for dependent in prep.children:
            related = getEntityThatContainsToken(sent, dependent)
            if related is None:
                related = getPhraseThatContainsToken(sent, dependent)
            if related is not None:
                return related
    return "<no-match>"

In [17]:
def getEntityThatContainsToken(statement, token):
    """Return the text of the entity in ``statement`` that covers ``token``.

    An entity of any label qualifies when its text contains the token's
    text and the token's start offset lies between the entity's
    ``start_char`` and ``end_char`` (both bounds inclusive).  Returns
    ``None`` when no entity qualifies.
    """
    parsed = nlp(statement)
    offset = calculateStartPositionOfToken(statement, token)
    covering = (
        entity.text
        for entity in parsed.ents
        if token.text in entity.text
        and entity.start_char <= offset <= entity.end_char
    )
    return next(covering, None)

In [18]:
def getFirstGPE(sent):
    """Return the text of the first entity labelled "GPE" in ``sent``.

    Returns the string "<no-match>" when the sentence has no such entity.
    """
    gpe_texts = (entity.text for entity in nlp(sent).ents if entity.label_ == "GPE")
    return next(gpe_texts, "<no-match>")

In [19]:
def getFirstEntity(sent, expectedLabels):
    """Return the text of the first entity in ``sent`` with an expected label.

    ``expectedLabels`` is the collection of acceptable entity labels, e.g.
    ``["TIME", "DATE"]``.  Returns the string "<no-match>" when no entity
    carries one of them.
    """
    accepted = [
        entity.text for entity in nlp(sent).ents if entity.label_ in expectedLabels
    ]
    return accepted[0] if accepted else "<no-match>"

In [20]:
def findChildMoneyToken(sent, token):
    """Return the first child of ``token`` that lies inside a MONEY entity.

    Each child is checked with ``getMoneyThatContainsToken``.  Returns
    ``None`` when no child qualifies.
    """
    money_children = (
        child
        for child in token.children
        if getMoneyThatContainsToken(sent, child) is not None
    )
    return next(money_children, None)

In [21]:
def getMoneyThatContainsToken(statement, token):
    """Return the text of the MONEY entity in ``statement`` that covers ``token``.

    An entity qualifies when it is labelled "MONEY", its text contains the
    token's text, and the token's start offset lies between the entity's
    ``start_char`` and ``end_char`` (both bounds inclusive).  Returns
    ``None`` when no entity qualifies.
    """
    parsed = nlp(statement)
    offset = calculateStartPositionOfToken(statement, token)
    for entity in parsed.ents:
        if token.text not in entity.text:
            continue
        if entity.label_ != "MONEY":
            continue
        if entity.start_char <= offset <= entity.end_char:
            return entity.text
    return None

In [22]:
def doesTokenChildrenIncludeWorldWide(token):
    """Tell whether any child of ``token`` mentions "worldwide".

    The comparison is case-insensitive and matches on substrings of the
    child's text.
    """
    return any("worldwide" in child.text.lower() for child in token.children)

In [23]:
def getSubjectTokenOfVerb(verbToken):
    """Return the first child of ``verbToken`` whose dependency label contains "subj".

    Returns ``None`` when the verb has no such child.
    """
    subjects = (child for child in verbToken.children if "subj" in child.dep_)
    return next(subjects, None)

In [24]:
def getEntityRelatedByBy(sent, token):
    """Return the named entity linked to ``token`` through "by".

    For every child of ``token`` whose text is "by", each of its own
    children is resolved with ``getEntityThatContainsToken``.  The first
    successful lookup is returned, otherwise "<no-match>".
    """
    for prep in token.children:
        if prep.text != "by":
            continue
        for dependent in prep.children:
            related = getEntityThatContainsToken(sent, dependent)
            if related is not None:
                return related
    return "<no-match>"