In [9]:
# spaCy provides the shared pipeline object `nlp` that the helper functions in this
# notebook call on each statement.
import spacy
# NOTE(review): 'en' is a shortcut model name; newer spaCy releases reportedly require
# the full package name (e.g. 'en_core_web_sm') — confirm against the installed version.
nlp = spacy.load('en')

In [84]:
# Sample sentences describing film premieres/releases. s1 is the one passed to parse()
# in the demo cell; the others are alternative inputs for manual experiments.
s1 = 'The IronMan premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released on January 24, 2018 in the United States by Mr. Bhalani.'
s2 = 'Ironman was released in the United States on May 2, 2008.'
s3 = 'The film was theatrically released in the United States on October 5, 2018, distributed by Warner Bros.'
s4 = "Black Panther premiered in Los Angeles on January 29, 2018, and was released theatrically in the United States on February 16, in 2D, 3D, IMAX and other premium large formats."
s5 = 'Captain America: The First Avenger premiered in Hollywood on July 19, 2011, and was released in the United States on July 22, 2011.'
s6 = 'Mission: Impossible – Fallout had its world premiere in Paris on July 12, 2018 and was released in the United States on July 27, 2018.'
s7 = 'First Man had its world premiere at the Venice Film Festival on August 29, 2018, and was theatrically released in the United States on October 12, 2018, by Universal Pictures.'
s8 = 'The Godfather was commercially released on March 24, 1972, throughout the rest of the United States.'
s8_1 = 'The Godfather was commercially released on March 24, 1972, throughout the United States.'
s9 = 'The Film began releasing in international markets on April 30, and was released in the United States on May 2, 2008.'
s10 = 'A sequel, Mission: Impossible – Fallout, was released on July 27, 2018 with McQuarrie returning as writer and director.'
s11 = 'The Film premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released in the United States on August 24, 2018, by Screen Gems.'

In [89]:
# Demo: run the release-template parser on the first sample sentence.
# NOTE: parse() is defined in a later cell of this notebook, so that cell has to be
# executed before this one on a fresh kernel.
sent = s1
parse(sent)

Release time:January 24, 2018
Movie name is:The IronMan
Location:the United States
Owner:Bhalani
Formats: <no-match>


In [87]:
def parse(statement):
    """Print the release-template fields extracted from ``statement``.

    The statement is first checked with checkReleaseMovieTemplateEligibility.
    If it is not eligible, "No-Parse" is printed. Otherwise the release time,
    movie name, location, owner and formats are extracted and printed, one
    per line, in that order. The function returns nothing.
    """
    # Guard clause: statements that do not fit the release template are rejected early.
    if not checkReleaseMovieTemplateEligibility(statement):
        print("No-Parse")
        return

    releaseTime = extractReleaseTimeSyntactically(statement)
    print("Release time:" + str(releaseTime))

    title = extractMovieName(statement)
    print("Movie name is:" + str(title))

    place = extractReleaseLocationSyntactically(statement)
    print("Location:" + str(place))

    distributor = extractOwnerEntitySyntactically(statement)
    print("Owner:" + str(distributor))

    movieFormats = extractMovieFormat(statement)
    print("Formats:", movieFormats)

In [56]:
def extractMovieFormat(statement):
    """Return the known movie formats that occur in ``statement``.

    Each known format is looked up as a plain, case-sensitive substring.
    The result is a list of the formats found, in the fixed order
    2D, 3D, IMAX, Blu-ray, DVD, or the string "<no-match>" when none occur.
    """
    knownFormats = ("2D", "3D", "IMAX", "Blu-ray", "DVD")
    found = [fmt for fmt in knownFormats if fmt in statement]
    return found if found else "<no-match>"
    

In [78]:
def extractMovieName(statement):
    """Return the noun chunk that is the subject of the release/launch verb.

    The first verb in ``statement`` whose lemma is 'release' or 'launch' is
    located. If that verb is a conjunct of another verb (e.g. "premiered ...
    and was released ..."), the chain of 'conj' heads is followed upwards,
    because the subject is attached to the head verb. The noun subject of the
    resulting verb is then expanded to the noun chunk that contains it.

    Returns the chunk text, or "<no-match>" when no target verb, subject or
    chunk can be found.
    """
    targetVerbLemmas = ['release', 'launch']
    verbText = getTargetVerbAsAppearedInStatement(statement, targetVerbLemmas)
    if verbText == "<no-match>":
        # No release/launch verb: return the sentinel like the sibling extractors
        # do, instead of letting next() raise StopIteration on an empty search.
        return "<no-match>"

    # Parse the statement once. Nodes are looked up by token text and the first
    # occurrence of a given text wins.
    nodesByText = {}
    for node in get_dependency_tree_nodes(statement):
        nodesByText.setdefault(node[0], node)

    verbNode = nodesByText.get(verbText)
    if verbNode is None:
        return "<no-match>"

    # Handle the case where the release verb is conjoined with another verb
    # (e.g. "premiered"): climb to the head verb. `visited` stops the walk if
    # the text-based lookups ever lead back to a verb already seen.
    visited = set()
    while verbNode is not None and verbNode[1] == 'conj' and verbText not in visited:
        visited.add(verbText)
        verbText = verbNode[2]
        verbNode = nodesByText.get(verbText)

    nounSubject = getNounSubject(statement, verbText)
    return getNounChunkThatContainsNoun(statement, nounSubject)

In [57]:
def getNounChunkThatContainsNoun(statement, nounSubject):
    """Return the text of the first noun chunk whose text contains ``nounSubject``.

    The containment check is a plain substring test on the chunk text.
    Returns "<no-match>" if no noun chunk of the parsed statement qualifies.
    """
    candidates = (
        chunk.text
        for chunk in nlp(statement).noun_chunks
        if nounSubject in chunk.text
    )
    return next(candidates, "<no-match>")

In [58]:
def getTemporalThatContainsToken(statement, token):
    """Return the text of the DATE entity that covers ``token``.

    An entity qualifies when its label is "DATE", its text contains the
    token's text, and the token's start offset (as computed by
    calculateStartPositionOfToken) lies between the entity's start_char and
    end_char, both bounds included. The first qualifying entity wins.
    Returns "<no-match>" otherwise.
    """
    entities = nlp(statement).ents
    startOffset = calculateStartPositionOfToken(statement, token)
    for entity in entities:
        if entity.label_ != "DATE":
            continue
        if token.text not in entity.text:
            continue
        if entity.start_char <= startOffset <= entity.end_char:
            return entity.text
    return "<no-match>"
        

In [59]:
def getGPEThatContainsToken(statement, token):
    """Return the text of the GPE entity that covers ``token``.

    An entity qualifies when its label is "GPE", its text contains the
    token's text, and the token's start offset (as computed by
    calculateStartPositionOfToken) lies between the entity's start_char and
    end_char, both bounds included. The first qualifying entity wins.
    Returns "<no-match>" otherwise.
    """
    entities = nlp(statement).ents
    startOffset = calculateStartPositionOfToken(statement, token)
    for entity in entities:
        if entity.label_ != "GPE":
            continue
        if token.text not in entity.text:
            continue
        if entity.start_char <= startOffset <= entity.end_char:
            return entity.text
    return "<no-match>"

In [60]:
def getORGorPersonThatContainsToken(statement, token):
    """Return the text of the ORG or PERSON entity that covers ``token``.

    An entity qualifies when its label is "ORG" or "PERSON", its text
    contains the token's text, and the token's start offset (as computed by
    calculateStartPositionOfToken) lies between the entity's start_char and
    end_char, both bounds included. The first qualifying entity wins.
    Returns "<no-match>" otherwise.
    """
    acceptedLabels = ("ORG", "PERSON")
    entities = nlp(statement).ents
    startOffset = calculateStartPositionOfToken(statement, token)
    for entity in entities:
        if entity.label_ not in acceptedLabels:
            continue
        if token.text not in entity.text:
            continue
        if entity.start_char <= startOffset <= entity.end_char:
            return entity.text
    return "<no-match>"

In [61]:
import re
def calculateStartPositionOfToken(statement, targetToken):
    """
    Return the character offset in ``statement`` at which ``targetToken`` starts.

    The token's text is searched for literally in ``statement``:

    * no occurrence  -> -1 is returned (same convention as ``str.find``);
    * one occurrence -> its start offset is returned;
    * several        -> the occurrence closest to an estimated position is
      returned. The estimate sums ``len(text) + 1`` over the tokens that come
      before index ``targetToken.i`` in a fresh parse of ``statement``, i.e. it
      assumes one separator character after every token.

    Parameters:
        statement: the sentence that was parsed.
        targetToken: object exposing ``text`` (str) and ``i`` (token index).

    Returns:
        int start offset, or -1 when the token text does not occur.
    """
    # The token text is data, not a pattern. Without escaping, "Bros." would also
    # match "BrosX" and a token such as "(" would raise re.error.
    pattern = re.escape(targetToken.text)
    matches = [m.start() for m in re.finditer(pattern, statement)]
    if not matches:
        return -1
    if len(matches) == 1:
        return matches[0]

    # NOTE(review): spaCy tokens expose their own character offset (Token.idx). If
    # targetToken always comes from a parse of this same statement, that could
    # replace the heuristic below — confirm before switching.
    heuristicPosition = 0
    for index, token in enumerate(nlp(statement)):
        if index >= targetToken.i:
            break
        heuristicPosition += len(token.text) + 1

    # min() keeps the first of equally distant candidates.
    return min(matches, key=lambda start: abs(start - heuristicPosition))


In [62]:
def getNounSubject(statement, verbText):
    """Return the text of the first subject token attached to ``verbText``.

    A token qualifies when its dependency label is "nsubjpass" or "nsubj"
    and the text of its head equals ``verbText``. Returns "<no-match>" when
    the parsed statement has no such token.
    """
    subjectLabels = ("nsubjpass", "nsubj")
    subjects = (
        tok.text
        for tok in nlp(statement)
        if tok.dep_ in subjectLabels and tok.head.text == verbText
    )
    return next(subjects, "<no-match>")

In [63]:
def getTargetVerbAsAppearedInStatement(statement, targetVerbLemmas):
    """
    Return the verb exactly as it appears in the statement.

    The tokens returned by filterVerbTokens are scanned in order and the text
    of the first one whose lemma is in ``targetVerbLemmas`` is returned.
    ex: statement = "Movie was written by James.", targetVerbLemmas = ['write','script']
    returns "written", because its lemma matches one of the targetVerbLemmas.
    Returns "<no-match>" when no verb matches.
    """
    surfaceForms = (
        verb.text
        for verb in filterVerbTokens(statement)
        if verb.lemma_ in targetVerbLemmas
    )
    return next(surfaceForms, "<no-match>")

In [64]:
def extractReleaseTimeSyntactically(statement):
    """Return the release date attached to the release/launch verb via "on".

    For every verb accepted by isExpectedToken, its direct children are
    scanned for the preposition "on" (expected children look like
    [was, theatrically, in, on, by]). Each child of that "on" (usually the
    month token, e.g. "August") is resolved to the DATE entity containing it.

    The first candidate that resolves to an entity is returned. Candidates
    that resolve to "<no-match>" are skipped so that later "on" phrases
    (e.g. "released on DVD on May 2") and later verbs are still tried.
    Returns "<no-match>" when nothing resolves.
    """
    for verb in filterVerbTokens(statement):
        if not isExpectedToken(verb):
            continue
        for child in verb.children:
            # usually the release time starts with "on"
            if child.text != "on":
                continue
            for subChild in child.children:
                temporalEntity = getTemporalThatContainsToken(statement, subChild)
                if temporalEntity != "<no-match>":
                    return temporalEntity
    return "<no-match>"

In [65]:
def extractReleaseLocationSyntactically(statement):
    """Return the release location attached to the release/launch verb.

    For every verb accepted by isExpectedToken, its direct children are
    scanned for the prepositions "in" or "throughout" (expected children look
    like [was, theatrically, in, on, by]). Each child of that preposition is
    resolved to the GPE entity containing it.

    The first candidate that resolves to an entity is returned. Candidates
    that resolve to "<no-match>" are skipped so that later prepositional
    phrases (e.g. "released in 3D in the United States") and later verbs are
    still tried. Returns "<no-match>" when nothing resolves.
    """
    locationPrepositions = ["in", "throughout"]
    for verb in filterVerbTokens(statement):
        if not isExpectedToken(verb):
            continue
        for child in verb.children:
            # usually the release location starts with "in"
            if child.text not in locationPrepositions:
                continue
            for subChild in child.children:
                GPEEntity = getGPEThatContainsToken(statement, subChild)
                if GPEEntity != "<no-match>":
                    return GPEEntity
    return "<no-match>"

In [66]:
def extractOwnerEntitySyntactically(statement):
    """Return the owner/distributor attached to the release/launch verb via "by".

    For every verb accepted by isExpectedToken, its direct children are
    scanned for the preposition "by" (expected children look like
    [was, theatrically, in, on, by]). Each child of that "by" is resolved to
    the ORG or PERSON entity containing it.

    The first candidate that resolves to an entity is returned. Candidates
    that resolve to "<no-match>" are skipped so that later "by" phrases and
    later verbs are still tried. Returns "<no-match>" when nothing resolves.
    """
    for verb in filterVerbTokens(statement):
        if not isExpectedToken(verb):
            continue
        for child in verb.children:
            # usually the owner (agent) phrase starts with "by"
            if child.text != "by":
                continue
            for subChild in child.children:
                ORGEntity = getORGorPersonThatContainsToken(statement, subChild)
                if ORGEntity != "<no-match>":
                    return ORGEntity
    return "<no-match>"

In [76]:
def isExpectedToken(token):
    """Tell whether ``token``'s lemma is one of the release verbs ("release", "launch")."""
    releaseLemmas = ("release", "launch")
    return token.lemma_ in releaseLemmas

In [82]:
def checkReleaseMovieTemplateEligibility(statement):
    """Decide whether ``statement`` fits the movie-release template.

    Both checks are always evaluated: check_verb_match (a target verb is
    present) and check_date_entity_match (a DATE entity is present). The
    statement is eligible only when both succeed.
    """
    verbFound = check_verb_match(statement)
    dateFound = check_date_entity_match(statement)
    return verbFound and dateFound

In [69]:
def check_date_entity_match(statement):
    """Return True when the parsed statement has at least one entity labelled "DATE"."""
    return any(entity.label_ == "DATE" for entity in nlp(statement).ents)

In [70]:
def check_verb_match(statement):
    """Return True when any verb from filterVerbTokens has lemma 'release' or 'launch'."""
    targetVerbs = ['release', 'launch']
    return any(verb.lemma_ in targetVerbs for verb in filterVerbTokens(statement))

In [71]:
def filterVerbTokens(statement):
    """Return the tokens of ``statement`` tagged "VERB" whose lemma is not "be".

    The statement is parsed with ``nlp`` and the result is a list in
    document order.
    """
    return [
        tok
        for tok in nlp(statement)
        if tok.pos_ == "VERB" and tok.lemma_ != "be"
    ]

In [72]:
def getRootToken(doc):
    """Return the first token in ``doc`` whose dependency label is "ROOT".

    When no token carries that label, the string "None" (not the None
    object) is returned.
    """
    roots = (tok for tok in doc if tok.dep_ == "ROOT")
    return next(roots, "None")

In [73]:
def get_token_dist(word1, word2):
    """Return the similarity between ``word1`` and the lemma of ``word2``.

    ``word1`` is passed to ``nlp`` as-is, while ``word2`` must expose a
    ``lemma_`` attribute, which is what gets passed to ``nlp``. Only the
    first token of each parsed result is compared, via ``similarity``.
    """
    return nlp(word1)[0].similarity(nlp(word2.lemma_)[0])

In [74]:
def get_dependency_tree_nodes(sentence):
    """Return one record per token of the parsed ``sentence``.

    Each record is a list of
    [token text, dependency label, head text, head part-of-speech, list of children],
    in document order.
    """
    return [
        [tok.text, tok.dep_, tok.head.text, tok.head.pos_, list(tok.children)]
        for tok in nlp(sentence)
    ]