In [83]:
import spacy
# Load the English pipeline. The bare 'en' shortcut only resolves on installs
# that created that shortcut link; spaCy 3 removed shortcuts and raises OSError
# for it, so fall back to the full package name in that case.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

doc = nlp('The film premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released in the United States on August 24, 2018, by Screen Gems.')

# NOTE: this second assignment replaces the parse above, so on a fresh
# top-to-bottom run later cells that read `doc` see this shorter sentence.
doc = nlp('Ironman was released in the United States on May 2, 2008.')

In [40]:
# Print one line per token of the current `doc`: surface text, lemma, coarse
# part-of-speech, fine-grained tag, dependency label, word shape, and the
# is_alpha / is_stop flags.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

The the DET DT det Xxx True False
film film NOUN NN ROOT xxxx True False
premiered premier VERB VBD acl xxxx True False
at at ADP IN prep xx True True
the the DET DT det xxx True True
Sundance sundance PROPN NNP compound Xxxxx True False
Film film PROPN NNP compound Xxxx True False
Festival festival PROPN NNP pobj Xxxxx True False
on on ADP IN prep xx True True
January january PROPN NNP pobj Xxxxx True False
21 21 NUM CD nummod dd False False
, , PUNCT , punct , False False
2018 2018 NUM CD nummod dddd False False
, , PUNCT , punct , False False
and and CCONJ CC cc xxx True True
was be VERB VBD auxpass xxx True True
theatrically theatrically ADV RB advmod xxxx True False
released release VERB VBN conj xxxx True False
in in ADP IN prep xx True True
the the DET DT det xxx True True
United united PROPN NNP compound Xxxxx True False
States states PROPN NNP pobj Xxxxx True False
on on ADP IN prep xx True True
August august PROPN NNP pobj Xxxxx True False
24 24 NUM CD nummod dd False False
,

In [41]:
# Shorter variant of the token dump: text, lemma, coarse part-of-speech and
# fine-grained tag only.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_)

The the DET DT
film film NOUN NN
premiered premier VERB VBD
at at ADP IN
the the DET DT
Sundance sundance PROPN NNP
Film film PROPN NNP
Festival festival PROPN NNP
on on ADP IN
January january PROPN NNP
21 21 NUM CD
, , PUNCT ,
2018 2018 NUM CD
, , PUNCT ,
and and CCONJ CC
was be VERB VBD
theatrically theatrically ADV RB
released release VERB VBN
in in ADP IN
the the DET DT
United united PROPN NNP
States states PROPN NNP
on on ADP IN
August august PROPN NNP
24 24 NUM CD
, , PUNCT ,
2018 2018 NUM CD
, , PUNCT ,
by by ADP IN
Screen screen PROPN NNP
Gems gems PROPN NNP
. . PUNCT .


In [279]:
# Re-parse a sample sentence (this rebinds the notebook-level `doc`) and list
# every named entity with its character span and label.
doc = nlp ("The IronMan premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released on January 24, 2018 in the United States by Bhalani.")
for ent in doc.ents:
    # start_char / end_char are character offsets into the sentence text.
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

IronMan 4 11 ORG
the Sundance Film Festival 25 51 EVENT
January 21, 2018 55 71 DATE
January 24, 2018 106 122 DATE
the United States 126 143 GPE
Bhalani 147 154 PERSON


In [283]:
# End-to-end demo of the "movie release" template: first check that the
# statement qualifies (release/launch verb plus a DATE entity), then pull out
# the release time, movie name, location and owner with the helper functions
# defined in the other cells of this notebook.
statement = 'The IronMan premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released on January 24, 2018 in the United States by Mr. Bhalani.'
isValidForReleaseTemplate = checkReleaseMovieTemplateEligibility(statement)
if(isValidForReleaseTemplate):
    # Each extractor returns the matched text or the "<no-match>" sentinel.
    sTime = extractReleaseTimeSyntactically(statement)
    print("Release time:"+str(sTime))
    movieName = extractMovieName(statement)
    print("Movie name is:"+str(movieName))
    location = extractReleaseLocationSyntactically(statement)
    print("Location:"+str(location))
    owner = extractOwnerEntitySyntactically(statement)
    print("Owner:"+str(owner))
else:
    print("Statement does not talk about movie release or did not match common pattern.")
#     location = extractLocation

Matching with release template: The IronMan premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released on January 24, 2018 in the United States by Mr. Bhalani.
Release time:January 24, 2018
Movie name is:The IronMan
Location:the United States
Owner:Bhalani


In [272]:
def extractMovieName(statement):
    """
    Return the noun chunk that names the movie being released.

    Steps:
      1. find the first verb whose lemma is 'release' or 'launch';
      2. while that verb is attached as a 'conj' (e.g. "premiered ... and was
         released"), move to its head verb, which carries the subject;
      3. look up the nominal subject of the resulting verb and return the
         noun chunk that contains it.

    Returns "<no-match>" when no release/launch verb is found, or when the
    subject / noun-chunk lookups find nothing.
    """
    targetVerbLemmas = ['release', 'launch']
    verbText = getTargetVerbAsAppearedInStatement(statement, targetVerbLemmas)

    # Parse once and reuse the node list for every lookup below.
    nodes = get_dependency_tree_nodes(statement)

    def findNode(text):
        # Nodes are [text, dep, head text, head pos, children]; match on text.
        return next((node for node in nodes if node[0] == text), None)

    verbNode = findNode(verbText)
    if verbNode is None:
        # No release/launch verb in the statement (verbText is the sentinel).
        return "<no-match>"

    # handle when release is with premiered.
    # The walk is keyed on token text, so cap it at the number of nodes to
    # guarantee termination even if repeated words make the lookup circular.
    hops = 0
    while verbNode[1] == 'conj' and hops < len(nodes):
        headNode = findNode(verbNode[2])
        if headNode is None:
            break
        verbText = verbNode[2]
        verbNode = headNode
        hops += 1

    nounSubject = getNounSubject(statement, verbText)
    return getNounChunkThatContainsNoun(statement, nounSubject)

In [157]:
# Smoke test: "released" is conjoined with "premiered", so the subject must be
# found through the head verb. The cell's value is the extracted movie name.
extractMovieName("The Dark Night premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released in the United States on August 24, 2018, by Screen Gems.")

released
ns:Night


'The Dark Night'

In [203]:
def getNounChunkThatContainsNoun(statement, nounSubject):
    """
    Return the text of the noun chunk of `statement` that contains `nounSubject`.

    A chunk that has `nounSubject` as one of its whole tokens is preferred, so
    that e.g. "Night" is not satisfied by an earlier chunk such as "Nightfall".
    If no chunk has it as a whole token, the first chunk whose text merely
    contains `nounSubject` as a substring is returned.

    Returns "<no-match>" when no chunk qualifies.
    """
    doc = nlp(statement)
    chunks = list(doc.noun_chunks)

    # First pass: exact token match inside the chunk.
    for chunk in chunks:
        if any(tok.text == nounSubject for tok in chunk):
            return chunk.text

    # Second pass: plain substring containment (covers multi-word subjects).
    for chunk in chunks:
        if nounSubject in chunk.text:
            return chunk.text
    return "<no-match>"

In [257]:
def getTemporalThatContainsToken(statement, token):
    """
    Return the text of the DATE entity in `statement` that covers `token`.

    An entity qualifies when it is labelled DATE, its text contains the token
    text, and the token's start offset lies inside the entity's character span.
    The span's end offset is exclusive, so a token that begins exactly at
    `end_char` (e.g. the comma right after the date) is not part of the entity.

    Returns "<no-match>" when no DATE entity covers the token.
    """
    doc = nlp (statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if ent.label_ != "DATE":
            continue
        if token.text in ent.text and ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text

    return "<no-match>"
        

In [258]:
def getGPEThatContainsToken(statement, token):
    """
    Return the text of the GPE entity in `statement` that covers `token`.

    An entity qualifies when it is labelled GPE, its text contains the token
    text, and the token's start offset lies inside the entity's character span.
    The span's end offset is exclusive, so a token that begins exactly at
    `end_char` is not part of the entity.

    Returns "<no-match>" when no GPE entity covers the token.
    """
    doc = nlp (statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if ent.label_ != "GPE":
            continue
        if token.text in ent.text and ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return "<no-match>"

In [284]:
def getORGorPersonThatContainsToken(statement, token):
    """
    Return the text of the ORG or PERSON entity in `statement` that covers `token`.

    An entity qualifies when it is labelled ORG or PERSON, its text contains
    the token text, and the token's start offset lies inside the entity's
    character span. The span's end offset is exclusive, so a token that begins
    exactly at `end_char` is not part of the entity.

    Returns "<no-match>" when no such entity covers the token.
    """
    doc = nlp (statement)
    tokenStartPos = calculateStartPositionOfToken(statement, token)
    for ent in doc.ents:
        if ent.label_ not in ("ORG", "PERSON"):
            continue
        if token.text in ent.text and ent.start_char <= tokenStartPos < ent.end_char:
            return ent.text
    return "<no-match>"

In [274]:
import re
def calculateStartPositionOfToken(statement, targetToken):
    """
    Return the character offset in `statement` at which `targetToken` starts.

    Resolution order:
      1. If the token carries an `idx` offset and `statement` really has the
         token text at that offset, that exact offset is returned.
      2. Otherwise the token text is searched literally in `statement`
         (regex metacharacters such as "." or "(" are escaped). A single
         occurrence is returned directly.
      3. With several occurrences, the one closest to a heuristic position is
         chosen. The heuristic assumes every token before `targetToken.i` is
         followed by one space.

    Returns -1 when the token text does not occur in `statement` at all.
    """
    tokenText = targetToken.text

    # Exact offset when available and consistent with the statement.
    exactOffset = getattr(targetToken, "idx", None)
    if exactOffset is not None and statement.startswith(tokenText, exactOffset):
        return exactOffset

    # Literal search: escape so punctuation tokens are not read as regex syntax.
    matches = [m.start() for m in re.finditer(re.escape(tokenText), statement)]
    if not matches:
        return -1
    if len(matches) == 1:
        return matches[0]

    # Estimate the offset from the lengths of the preceding tokens.
    huristicPosition = 0
    for position, token in enumerate(nlp(statement)):
        if position >= targetToken.i:
            break
        huristicPosition += len(token.text) + 1

    # min() keeps the earliest occurrence on ties.
    return min(matches, key=lambda match: abs(match - huristicPosition))


In [154]:
def getNounSubject(statement, verbText):
    """
    Return the text of the first token that is a nominal subject ('nsubj' or
    'nsubjpass') of a head whose text equals `verbText`.

    Returns "<no-match>" when the statement has no such token.
    """
    subjectDeps = ("nsubjpass", "nsubj")
    subjects = (
        tok.text
        for tok in nlp(statement)
        if tok.dep_ in subjectDeps and tok.head.text == verbText
    )
    return next(subjects, "<no-match>")

In [101]:
def getTargetVerbAsAppearedInStatement(statement, targetVerbLemmas):
    """
    Return the verb exactly as it appears in the statement.

    The first verb of `statement` whose lemma is listed in `targetVerbLemmas`
    is returned in its surface form.
    ex: statement = "Movie was written by James.", targetVerbLemmas = ['write','script']
    returns "written", because its lemma matches one of the targetVerbLemmas.

    Returns "<no-match>" when no verb has a matching lemma.
    """
    surfaceForms = (
        verb.text
        for verb in filterVerbTokens(statement)
        if verb.lemma_ in targetVerbLemmas
    )
    return next(surfaceForms, "<no-match>")

In [187]:
def extractReleaseTime(statement):
    """
    Pick the release date of `statement` by position.

    - no DATE entity: returns the string "No Date found."
    - exactly one DATE entity: returns that entity
    - several DATE entities: takes the index of the last release/launch verb
      among the statement's verbs (0 if there is none) and returns the DATE
      entity at that same index, or "<MultiDate Confusion>" when there are
      not enough dates.

    Note that matches are returned as entity objects, not as strings.
    """
    dateEntities = [ent for ent in nlp(statement).ents if ent.label_ == "DATE"]
    if not dateEntities:
        return "No Date found."
    if len(dateEntities) == 1:
        return dateEntities[0]

    # Remember the position of the last release/launch verb.
    lastReleaseVerbAt = 0
    for position, verb in enumerate(filterVerbTokens(statement)):
        if verb.lemma_ in ('release', 'launch'):
            lastReleaseVerbAt = position

    if lastReleaseVerbAt < len(dateEntities):
        return dateEntities[lastReleaseVerbAt]
    return "<MultiDate Confusion>"
        
        

In [268]:
def extractReleaseTimeSyntactically(statement):
    """
    Extract the release date by walking the dependency tree.

    For every release/launch verb, look at its "on" children (the release
    time usually starts with "on", e.g. "released on August 24, 2018") and
    return the first DATE entity that covers one of that preposition's
    children. Candidates that do not resolve to a DATE are skipped rather
    than ending the search.

    Returns "<no-match>" when nothing is found.
    """
    for token in filterVerbTokens(statement):
        if not isExpectedVerbToken(token):
            continue
        #expected children [was, theatrically, in, on, by]
        for child in token.children:
            if child.text != "on":
                continue
            #usually "on"'s child is [August]
            for subChild in child.children:
                temporalEntity = getTemporalThatContainsToken(statement, subChild)
                if temporalEntity != "<no-match>":
                    return temporalEntity
    return "<no-match>"

In [269]:
def extractReleaseLocationSyntactically(statement):
    """
    Extract the release location by walking the dependency tree.

    For every release/launch verb, look at its "in" children (the release
    location usually starts with "in", e.g. "released in the United States")
    and return the first GPE entity that covers one of that preposition's
    children. Candidates that do not resolve to a GPE are skipped rather than
    ending the search.

    Returns "<no-match>" when nothing is found.
    """
    for token in filterVerbTokens(statement):
        if not isExpectedVerbToken(token):
            continue
        #expected children [was, theatrically, in, on, by]
        for child in token.children:
            if child.text != "in":
                continue
            for subChild in child.children:
                GPEEntity = getGPEThatContainsToken(statement, subChild)
                if GPEEntity != "<no-match>":
                    return GPEEntity
    return "<no-match>"

In [288]:
def extractOwnerEntitySyntactically(statement):
    """
    Extract who released the movie by walking the dependency tree.

    For every release/launch verb, look at its "by" children (the releasing
    party usually follows "by", e.g. "released ... by Screen Gems") and return
    the first ORG or PERSON entity that covers one of that preposition's
    children. Candidates that do not resolve to such an entity are skipped
    rather than ending the search.

    Returns "<no-match>" when nothing is found.
    """
    for token in filterVerbTokens(statement):
        if not isExpectedVerbToken(token):
            continue
        #expected children [was, theatrically, in, on, by]
        for child in token.children:
            #usually the owner is introduced with "by"
            if child.text != "by":
                continue
            for subChild in child.children:
                ORGEntity = getORGorPersonThatContainsToken(statement, subChild)
                if ORGEntity != "<no-match>":
                    return ORGEntity
    return "<no-match>"

In [255]:
# Smoke test for the owner extractor; the cell's value is the entity found
# under the "by" child of the release verb.
extractOwnerEntitySyntactically('The Ironman premiered at the Sundance Film Festival on August 21, 2018, and was theatrically released in the United States, by Screen Gems.')

Expected token is:released
TOken start pos is :131


'Screen Gems'

In [256]:
def isExpectedVerbToken(token):
    """Return True when the token's lemma is 'release' or 'launch'."""
    return token.lemma_ in ("release", "launch")


# The syntactic extractors in this notebook call this predicate under the
# shorter name `isExpectedToken`; keep that name bound so they run on a fresh
# kernel.
isExpectedToken = isExpectedVerbToken

In [266]:
def checkReleaseMovieTemplateEligibility(statement):
    """
    Decide whether `statement` fits the movie-release template.

    Prints the statement being matched, then requires both a release/launch
    verb (check_verb_match) and at least one DATE entity
    (check_date_entity_match). Both checks are always evaluated.
    """
    print("Matching with release template: "+statement)

    verbFound = check_verb_match(statement)
    dateFound = check_date_entity_match(statement)

    return verbFound and dateFound

In [66]:
def check_date_entity_match(statement):
    """Return True when the statement contains at least one DATE entity."""
    return any(ent.label_ == "DATE" for ent in nlp(statement).ents)

In [57]:
def check_verb_match(statement):
    """Return True when any verb of the statement has the lemma 'release' or 'launch'."""
    targetVerbs = ['release', 'launch']
    return any(verb.lemma_ in targetVerbs for verb in filterVerbTokens(statement))

In [163]:
def filterVerbTokens(statement):
    """
    Parse `statement` and return, in sentence order, the tokens tagged VERB
    whose lemma is not "be".
    """
    return [
        tok
        for tok in nlp(statement)
        if tok.pos_ == "VERB" and tok.lemma_ != "be"
    ]

In [164]:
# Smoke test: the two "was" auxiliaries are filtered out (lemma "be"), leaving
# only the content verbs.
verbs = filterVerbTokens("Ironman was premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released in the United States on August 24, 2018, by Screen Gems.")
print(verbs)

[premiered, released]


In [237]:
# Dump the dependency parse of a sample sentence: for each token its text,
# part-of-speech, dependency label, head text, head part-of-speech and children.
# NOTE: the sentence is wrapped in literal apostrophes inside the double-quoted
# string, so those apostrophes are part of the parsed text and appear as
# punctuation tokens in the output.
doc = nlp("'The film premiered at the Sundance Film Festival on August 21, 2018, and was theatrically released in the United States, by Screen Gems.'")
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text, token.head.pos_,
          [child for child in token.children])

' PUNCT punct released VERB []
The DET det film NOUN []
film NOUN nsubjpass released VERB [The, premiered, and]
premiered VERB acl film NOUN [at, on, ,]
at ADP prep premiered VERB [Festival]
the DET det Festival PROPN []
Sundance PROPN compound Festival PROPN []
Film PROPN compound Festival PROPN []
Festival PROPN pobj at ADP [the, Sundance, Film]
on ADP prep premiered VERB [August]
August PROPN pobj on ADP [21, ,, 2018]
21 NUM nummod August PROPN []
, PUNCT punct August PROPN []
2018 NUM nummod August PROPN []
, PUNCT punct premiered VERB []
and CCONJ cc film NOUN []
was VERB auxpass released VERB []
theatrically ADV advmod released VERB []
released VERB ROOT released VERB [', film, was, theatrically, in, ,, by, ., ']
in ADP prep released VERB [States]
the DET det States PROPN []
United PROPN compound States PROPN []
States PROPN pobj in ADP [the, United]
, PUNCT punct released VERB []
by ADP agent released VERB [Gems]
Screen PROPN compound Gems PROPN []
Gems PROPN pobj by ADP [Screen

In [7]:
def getRootToken(doc):
    """
    Return the first token of `doc` whose dependency label is "ROOT".

    When there is no such token the string "None" (not the None object) is
    returned.
    """
    return next((tok for tok in doc if tok.dep_ == "ROOT"), "None")

In [8]:
# Show the ROOT token of the current notebook-level `doc`.
getRootToken(doc)

released

In [9]:
def get_token_dist(word1, word2):
    """
    Return the similarity score between a word and a token's lemma.

    `word1` is a string; `word2` is an object with a `lemma_` attribute (a
    parsed token). Each text is run through `nlp` and the first token of each
    result is compared with `similarity`. Despite the name, the value is a
    similarity, not a distance.
    """
    def firstToken(text):
        return nlp(text)[0]

    return firstToken(word1).similarity(firstToken(word2.lemma_))

In [13]:
# Compare the word "resign" with the lemma of the current doc's ROOT token.
get_token_dist("resign", getRootToken(doc))

0.61975265

In [247]:
def get_dependency_tree_nodes(sentence):
    """
    Parse `sentence` and return one record per token, in order.

    Each record is a list:
    [token text, dependency label, head text, head part-of-speech, children],
    where children is a list of the token's child tokens.
    """
    return [
        [tok.text, tok.dep_, tok.head.text, tok.head.pos_, list(tok.children)]
        for tok in nlp(sentence)
    ]

In [287]:
# Inspect dependency records for two sample sentences. Only the value of the
# last expression is displayed as the cell output; the first call's result is
# discarded.
get_dependency_tree_nodes("Avengers was released on 21 October.")
get_dependency_tree_nodes("Avengers premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released in the United States by Screen Gems.")

[['Avengers', 'nsubj', 'premiered', 'VERB', []],
 ['premiered',
  'ROOT',
  'premiered',
  'VERB',
  [Avengers, at, on, ,, and, released, .]],
 ['at', 'prep', 'premiered', 'VERB', [Festival]],
 ['the', 'det', 'Festival', 'PROPN', []],
 ['Sundance', 'compound', 'Festival', 'PROPN', []],
 ['Film', 'compound', 'Festival', 'PROPN', []],
 ['Festival', 'pobj', 'at', 'ADP', [the, Sundance, Film]],
 ['on', 'prep', 'premiered', 'VERB', [January]],
 ['January', 'pobj', 'on', 'ADP', [21, ,, 2018]],
 ['21', 'nummod', 'January', 'PROPN', []],
 [',', 'punct', 'January', 'PROPN', []],
 ['2018', 'nummod', 'January', 'PROPN', []],
 [',', 'punct', 'premiered', 'VERB', []],
 ['and', 'cc', 'premiered', 'VERB', []],
 ['was', 'auxpass', 'released', 'VERB', []],
 ['theatrically', 'advmod', 'released', 'VERB', []],
 ['released', 'conj', 'premiered', 'VERB', [was, theatrically, in, by]],
 ['in', 'prep', 'released', 'VERB', [States]],
 ['the', 'det', 'States', 'PROPN', []],
 ['United', 'compound', 'States', 'PR

In [37]:
# Compare two sentences that both use "release" but in different senses
# (a prisoner vs. a movie).
# NOTE: "Prisioner" and "prision" in the first sentence are misspelled, which
# may influence the score.
s1 = nlp("Prisioner was released from the prision")
s2 = nlp("They planned to release the movie on Nov. 22")
s1.similarity(s2)

0.6178405037901543

In [90]:

# List the noun chunks of a sample sentence (this rebinds the notebook-level `doc`).
doc = nlp("The Dark Knight was released in the United States on May 2, 2008.")
for chunk in doc.noun_chunks:
    print(chunk)

The Dark Knight
the United States
May
