In [83]:
import spacy
# Load the spaCy model registered under the name 'en' once; the helper
# functions in this notebook all call this module-level `nlp` object.
nlp = spacy.load('en')
# Sample sentence containing two verbs ("premiered", "released") and two dates.
doc = nlp('The film premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released in the United States on August 24, 2018, by Screen Gems.')

# NOTE: this rebinds `doc`, so when the cell runs top to bottom the parse of
# the sentence above is discarded and only this shorter sentence remains.
doc = nlp('Ironman was released in the United States on May 2, 2008.')

In [40]:
# Print one row per token of the current `doc`: text, lemma, pos_, tag_,
# dependency label, shape, and the is_alpha / is_stop flags.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

The the DET DT det Xxx True False
film film NOUN NN ROOT xxxx True False
premiered premier VERB VBD acl xxxx True False
at at ADP IN prep xx True True
the the DET DT det xxx True True
Sundance sundance PROPN NNP compound Xxxxx True False
Film film PROPN NNP compound Xxxx True False
Festival festival PROPN NNP pobj Xxxxx True False
on on ADP IN prep xx True True
January january PROPN NNP pobj Xxxxx True False
21 21 NUM CD nummod dd False False
, , PUNCT , punct , False False
2018 2018 NUM CD nummod dddd False False
, , PUNCT , punct , False False
and and CCONJ CC cc xxx True True
was be VERB VBD auxpass xxx True True
theatrically theatrically ADV RB advmod xxxx True False
released release VERB VBN conj xxxx True False
in in ADP IN prep xx True True
the the DET DT det xxx True True
United united PROPN NNP compound Xxxxx True False
States states PROPN NNP pobj Xxxxx True False
on on ADP IN prep xx True True
August august PROPN NNP pobj Xxxxx True False
24 24 NUM CD nummod dd False False
,

In [41]:
# Shorter variant of the token dump: text, lemma, pos_ and tag_ only.
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_)

The the DET DT
film film NOUN NN
premiered premier VERB VBD
at at ADP IN
the the DET DT
Sundance sundance PROPN NNP
Film film PROPN NNP
Festival festival PROPN NNP
on on ADP IN
January january PROPN NNP
21 21 NUM CD
, , PUNCT ,
2018 2018 NUM CD
, , PUNCT ,
and and CCONJ CC
was be VERB VBD
theatrically theatrically ADV RB
released release VERB VBN
in in ADP IN
the the DET DT
United united PROPN NNP
States states PROPN NNP
on on ADP IN
August august PROPN NNP
24 24 NUM CD
, , PUNCT ,
2018 2018 NUM CD
, , PUNCT ,
by by ADP IN
Screen screen PROPN NNP
Gems gems PROPN NNP
. . PUNCT .


In [179]:
# Rebind `doc` to a sentence where the release clause comes first, then list
# every named entity with its character offsets and entity label.
doc = nlp ("The Dark Night was theatrically released in the United States on August 24, 2018, and premiered at the Sundance Film Festival on January 21, 2018, by Screen Gems.")
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

The Dark Night 0 14 WORK_OF_ART
the United States 44 61 GPE
August 24, 2018 65 80 DATE
the Sundance Film Festival 99 125 ORG
January 21, 2018 129 145 DATE
Screen Gems 150 161 ORG


In [178]:
# End-to-end demo of the "release" template: check eligibility first, then
# extract the release date and the movie name.
# NOTE: the helpers called here are defined in cells that appear further down
# in this file, so those cells must have been run before this one.
statement = 'The Dark Night was theatrically released in the United States on August 24, 2018, and premiered at the Sundance Film Festival on January 21, 2018, by Screen Gems.'
isValidForReleaseTemplate = checkReleaseMovieTemplateEligibility(statement)
if(isValidForReleaseTemplate):
    # NOTE: `time` becomes a notebook-level global; it would shadow the
    # standard-library `time` module if that were imported in this kernel.
    time = extractReleaseTime(statement)
    print("extracted time is:"+str(time))
    
    movieName = extractMovieName(statement)
    print("extracted movie name is:"+str(movieName))

Matching with release template: for The Dark Night was theatrically released in the United States on August 24, 2018, and premiered at the Sundance Film Festival on January 21, 2018, by Screen Gems.
Has valid verb:True
Has DATE included:True
extracted time is:August 24, 2018
released
ns:Night
extracted movie name is:The Dark Night


In [170]:
def extractMovieName(statement):
    """
    Extract the movie name from a statement about a release.

    The release/launch verb is located first. When that verb hangs off another
    verb as a conjunct (dependency label 'conj', e.g. "... was released ... and
    premiered ..."), the chain is followed up to the governing verb, since
    that is the verb the subject is looked up against. The subject is then
    widened to the noun chunk that contains it.

    Args:
        statement: sentence text to analyse.

    Returns:
        The noun-chunk text returned by getNounChunkThatContainsNoun for the
        verb's subject, or "<no-match>" when the statement contains no
        release/launch verb.
    """
    targetVerbLemmas = ['release', 'launch']
    verbText = getTargetVerbAsAppearedInStatement(statement, targetVerbLemmas)

    # Parse the statement once and index the nodes by token text. setdefault
    # keeps the FIRST occurrence of a text, matching a first-hit linear search.
    nodesByText = {}
    for node in get_dependency_tree_nodes(statement):
        nodesByText.setdefault(node[0], node)

    verbNode = nodesByText.get(verbText)
    if verbNode is None:
        # No target verb in the statement: report the same sentinel the other
        # helpers use instead of letting a StopIteration escape.
        return "<no-match>"

    # Handle a release verb that is coordinated with another verb (for example
    # "premiered"): climb 'conj' links until the governing verb is reached.
    visitedVerbs = {verbText}
    while verbNode[1] == 'conj':
        headText = verbNode[2]
        headNode = nodesByText.get(headText)
        if headNode is None or headText in visitedVerbs:
            # Lookup is by text, so repeated words could otherwise make the
            # climb revisit the same node forever.
            break
        verbText = headText
        print("verbTextChanged to"+verbText)
        verbNode = headNode
        visitedVerbs.add(verbText)

    print(verbText)
    nounSubject = getNounSubject(statement, verbText)
    print("ns:"+nounSubject)
    return getNounChunkThatContainsNoun(statement, nounSubject)

In [157]:
# Same example with the clauses swapped ("premiered" first, "released" second).
extractMovieName("The Dark Night premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released in the United States on August 24, 2018, by Screen Gems.")

released
ns:Night


'The Dark Night'

In [119]:
def getNounChunkThatContainsNoun(statement, nounSubject):
    """
    Return the text of the first noun chunk that contains `nounSubject`.

    Matching is done on whole tokens, so "Night" matches the chunk
    "The Dark Night" but not "The Nightmare". A multi-word `nounSubject` is
    split on whitespace and must appear as consecutive tokens of the chunk.

    Args:
        statement: sentence text to parse.
        nounSubject: word (or whitespace-separated words) to look for.

    Returns:
        The matching chunk's text, or "<no-match>" when no chunk contains it.
    """
    wantedWords = nounSubject.split()
    width = len(wantedWords)

    doc = nlp(statement)
    for chunk in doc.noun_chunks:
        chunkWords = [token.text for token in chunk]
        # Slide a window of `width` tokens over the chunk; the range is empty
        # when the chunk is shorter than the wanted word sequence.
        for start in range(len(chunkWords) - width + 1):
            if chunkWords[start:start + width] == wantedWords:
                return chunk.text
    return "<no-match>"

In [154]:
def getNounSubject(statement, verbText):
    """
    Find the subject token attached to the verb spelled `verbText`.

    A token qualifies when its dependency label is "nsubjpass" or "nsubj" and
    the text of its head equals `verbText`. The first such token in sentence
    order wins.

    Returns:
        The subject token's text, or "<no-match>" if there is none.
    """
    subjectLabels = ("nsubjpass", "nsubj")
    candidates = (
        tok.text
        for tok in nlp(statement)
        if tok.dep_ in subjectLabels and tok.head.text == verbText
    )
    return next(candidates, "<no-match>")

In [101]:
def getTargetVerbAsAppearedInStatement(statement, targetVerbLemmas):
    """
    Return the verb exactly as it appears in the statement.

    Example: with statement = "Movie was written by James." and
    targetVerbLemmas = ['write', 'script'], the result is "written", because
    its lemma matches one of the target lemmas.

    Only the first matching verb (in the order returned by filterVerbTokens)
    is reported; "<no-match>" is returned when no verb matches.
    """
    surfaceForms = (
        verb.text
        for verb in filterVerbTokens(statement)
        if verb.lemma_ in targetVerbLemmas
    )
    return next(surfaceForms, "<no-match>")

In [169]:
def extractReleaseTime(statement):
    """
    Pick the DATE entity that tells when the movie was released.

    Strategy:
      * no DATE entity  -> the string "No Date found."
      * one DATE entity -> that entity
      * several         -> the first DATE whose nearest governing verb in the
        dependency tree (ignoring "be") is a release/launch verb. If no DATE
        is attached to such a verb, fall back to the positional rule: the
        DATE at the same index as the last release/launch verb, or
        "<MultiDate Confusion>" when that index is out of range.

    The dependency lookup comes first because the positional rule breaks as
    soon as a verb has no date of its own. In "The film, directed by X, was
    released on May 2, 2008 after premiering on Jan 1" the verbs are
    [directed, released, premiering] but the dates are [May 2, 2008; Jan 1],
    so position 1 would wrongly select "Jan 1".

    Args:
        statement: sentence text to analyse.

    Returns:
        A spaCy DATE entity span, or one of the sentinel strings above.
    """
    targetVerbLemmas = ['release', 'launch']

    doc = nlp(statement)
    dates = [entity for entity in doc.ents if entity.label_ == "DATE"]
    if not dates:
        return "No Date found."
    if len(dates) == 1:
        return dates[0]

    # Preferred: climb from each date's root token to its closest real verb
    # (ancestors are visited nearest-first) and check that verb's lemma.
    for date in dates:
        governingVerb = next(
            (ancestor for ancestor in date.root.ancestors
             if ancestor.pos_ == "VERB" and ancestor.lemma_ != "be"),
            None,
        )
        if governingVerb is not None and governingVerb.lemma_ in targetVerbLemmas:
            return date

    # Fallback: assume the i-th verb goes with the i-th date.
    verbTokens = filterVerbTokens(statement)
    matchVerbIndex = 0
    for index, verb in enumerate(verbTokens):
        if verb.lemma_ in targetVerbLemmas:
            matchVerbIndex = index

    if len(dates) > matchVerbIndex:
        return dates[matchVerbIndex]
    return "<MultiDate Confusion>"
        
        

In [92]:
def checkReleaseMovieTemplateEligibility(statement):
    """
    Decide whether a statement fits the "movie release" template.

    A statement is eligible when it contains a release/launch verb
    (check_verb_match) and a DATE entity (check_date_entity_match). Both
    checks always run, and each result is printed as a diagnostic line.

    Returns:
        The verb check's result if it is falsy, otherwise the date check's
        result.
    """
    print("Matching with release template: for " + statement)

    verbMatched = check_verb_match(statement)
    print("Has valid verb:" + str(verbMatched))

    dateFound = check_date_entity_match(statement)
    print("Has DATE included:" + str(dateFound))

    # Written out form of `verbMatched and dateFound`.
    if not verbMatched:
        return verbMatched
    return dateFound

In [66]:
def check_date_entity_match(statement):
    """Return True when the parsed statement has at least one entity labelled "DATE"."""
    entities = nlp(statement).ents
    return any(entity.label_ == "DATE" for entity in entities)

In [57]:
def check_verb_match(statement):
    """Return True when any non-"be" verb of the statement has the lemma 'release' or 'launch'."""
    targetVerbs = ['release', 'launch']
    return any(
        verb.lemma_ in targetVerbs
        for verb in filterVerbTokens(statement)
    )

In [163]:
def filterVerbTokens(statement):
    """
    Parse the statement and return its verb tokens in sentence order.

    A token is kept when its pos_ is "VERB" and its lemma is not "be".
    """
    verbs = []
    for token in nlp(statement):
        if token.pos_ != "VERB":
            continue
        if token.lemma_ == "be":
            continue
        verbs.append(token)
    return verbs

In [164]:
# Demo: list the verbs of the two-clause sample sentence; the auxiliary "was"
# is dropped by filterVerbTokens because its lemma is "be".
verbs = filterVerbTokens("The film premiered at the Sundance Film Festival on January 21, 2018, and was theatrically released in the United States on August 24, 2018, by Screen Gems.")
print(verbs)

[premiered, released]


In [89]:
# Rebind `doc` to a single-clause sentence and print, per token: text, pos_,
# dependency label, the head's text and pos_, and the token's direct children.
doc = nlp("The Dark Knight was released in the United States on May 2, 2008.")
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text, token.head.pos_,
          [child for child in token.children])

The DET det Knight PROPN []
Dark PROPN compound Knight PROPN []
Knight PROPN nsubjpass released VERB [The, Dark]
was VERB auxpass released VERB []
released VERB ROOT released VERB [Knight, was, in, on, .]
in ADP prep released VERB [States]
the DET det States PROPN []
United PROPN compound States PROPN []
States PROPN pobj in ADP [the, United]
on ADP prep released VERB [May]
May PROPN pobj on ADP [2, ,, 2008]
2 NUM nummod May PROPN []
, PUNCT punct May PROPN []
2008 NUM nummod May PROPN []
. PUNCT punct released VERB []


In [7]:
def getRootToken(doc):
    """
    Return the first token of `doc` whose dependency label is "ROOT".

    When no token carries that label, the string "None" (not the None object)
    is returned.
    """
    rootTokens = (token for token in doc if token.dep_ == "ROOT")
    return next(rootTokens, "None")

In [8]:
# Show the ROOT token of whatever `doc` is currently bound to in the kernel.
getRootToken(doc)

released

In [9]:
def get_token_dist(word1, word2):
    """
    Compute the spaCy similarity between two words.

    Each argument may be a plain string or a token-like object exposing
    `lemma_` (in which case its lemma is used). Each word is run through
    `nlp` on its own and only the first resulting token is compared.

    Note: despite the name, this returns `token1.similarity(token2)`, i.e. a
    similarity score rather than a distance.

    Args:
        word1: string or token-like object.
        word2: string or token-like object.

    Returns:
        Whatever `Token.similarity` returns for the two first tokens.
    """
    def asText(word):
        # Plain strings have no `lemma_`, so they are passed through as-is.
        return getattr(word, "lemma_", word)

    token1 = nlp(asText(word1))[0]
    token2 = nlp(asText(word2))[0]
    return token1.similarity(token2)

In [13]:
# Compare the word "resign" with the lemma of the current doc's ROOT token.
get_token_dist("resign", getRootToken(doc))

0.61975265

In [172]:
def get_dependency_tree_nodes(sentence):
    """
    Parse a sentence and describe every token as a plain list.

    Each entry has the layout
    [token text, dependency label, head text, head pos_, list of children],
    and entries follow the token order of the parsed sentence.
    """
    return [
        [tok.text, tok.dep_, tok.head.text, tok.head.pos_, list(tok.children)]
        for tok in nlp(sentence)
    ]

In [177]:
# NOTE: only the last expression of a cell is displayed, so the result of the
# first call is computed and then discarded; the output shown belongs to the
# second sentence.
get_dependency_tree_nodes("Avengers was released on 21 October.")
get_dependency_tree_nodes("The Dark Night was theatrically released in the United States on August 24, 2018, and premiered at the Sundance Film Festival on January 21, 2018, by Screen Gems.")

[['The', 'det', 'Night', 'PROPN', []],
 ['Dark', 'compound', 'Night', 'PROPN', []],
 ['Night', 'nsubjpass', 'released', 'VERB', [The, Dark]],
 ['was', 'auxpass', 'released', 'VERB', []],
 ['theatrically', 'advmod', 'released', 'VERB', []],
 ['released',
  'ROOT',
  'released',
  'VERB',
  [Night, was, theatrically, in, on, ,, and, premiered, .]],
 ['in', 'prep', 'released', 'VERB', [States]],
 ['the', 'det', 'States', 'PROPN', []],
 ['United', 'compound', 'States', 'PROPN', []],
 ['States', 'pobj', 'in', 'ADP', [the, United]],
 ['on', 'prep', 'released', 'VERB', [August]],
 ['August', 'pobj', 'on', 'ADP', [24, ,, 2018]],
 ['24', 'nummod', 'August', 'PROPN', []],
 [',', 'punct', 'August', 'PROPN', []],
 ['2018', 'npadvmod', 'August', 'PROPN', []],
 [',', 'punct', 'released', 'VERB', []],
 ['and', 'cc', 'released', 'VERB', []],
 ['premiered', 'conj', 'released', 'VERB', [at, on, ,, by]],
 ['at', 'prep', 'premiered', 'VERB', [Festival]],
 ['the', 'det', 'Festival', 'PROPN', []],
 ['Sundan

In [37]:
# Whole-sentence similarity between two sentences that use "release" in
# different senses (setting a prisoner free vs. publishing a movie).
s1 = nlp("Prisioner was released from the prision")
s2 = nlp("They planned to release the movie on Nov. 22")
s1.similarity(s2)

0.6178405037901543

In [90]:

# Rebind `doc` and list its noun chunks, one per line.
doc = nlp("The Dark Knight was released in the United States on May 2, 2008.")
for chunk in doc.noun_chunks:
    print(chunk)

The Dark Knight
the United States
May
