In [22]:
import spacy
from nltk.corpus import wordnet

In [23]:
nlp = spacy.load('en')
doc = nlp('Ironman was released in the United States on May 2, 2008')

In [62]:
def get_sentence_tokens(paragraph):
    doc =  nlp(paragraph)
    return [sent.string.strip() for sent in doc.sents]

In [63]:
def get_word_tokens(sentence):
    doc =  nlp(sentence)
    return [token.text for token in doc]

In [79]:
def tokenize_POS_tag(sentence):
    doc = nlp(sentence)
    token_dict = {}
    
    for token in doc:
        token_dict[token]= token.tag_
    
    return token_dict

In [80]:
def get_lemmas(sentence):
    doc =  nlp(sentence)
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

In [81]:
def get_dependency_tree_nodes(sentence):
    nodes = []
    for token in doc:
        nodes.append([token.text, token.dep_, token.head.text, token.head.pos_,
          [child for child in token.children]])
    return nodes

In [82]:
def get_hypernym(token):
    hypernym_list = []
    
    synsets = wordnet.synsets(token)
    
    for synset in synsets:
        for hypernym in synset.hypernyms():
            for lemma in hypernym.lemmas():
                hypernym_list.append(lemma.name())
    
    return hypernym_list

In [83]:
def get_holonym(token):
    holonym_list = []
    
    synsets = wordnet.synsets(token)
    
    for synset in synsets:
        for holonym in synset.member_holonyms():
            for lemma in holonym.lemmas():
                holonym_list.append(lemma.name())
    
    return holonym_list

In [84]:
def get_hyponym(token):
    hyponym_list = []
    
    synsets = wordnet.synsets(token)
    
    for synset in synsets:
        for hyponym in synset.hyponyms():
            for lemma in hyponym.lemmas():
                hyponym_list.append(lemma.name())
    
    return hyponym_list

In [85]:
def get_meronym(token):
    meronym_list = []
    
    synsets = wordnet.synsets(token)
    
    for synset in synsets:
        for meronym in synset.part_meronyms():
            for lemma in meronym.lemmas():
                meronym_list.append(lemma.name())
    
    return meronym_list

In [86]:
def get_word_net_data(token):
    data = {}
    
    hypernyms = get_hypernym(token)
    hyponyms = get_hyponym(token)
    meronyms = get_meronym(token)
    holonyms = get_holonym(token)
    
    data['hypernyms'] = hypernyms
    data['hyponyms'] = hyponyms
    data['meronyms'] = meronyms
    data['holonyms'] = holonyms
    
    return data

In [87]:
word_net_data = get_word_net_data('Jorge')
word_net_data

{'holonyms': [], 'hypernyms': [], 'hyponyms': [], 'meronyms': []}

In [88]:
def sentence_data_dict(sentence):
    mega_dict = {}
    
    word_tokens = get_word_tokens(sentence)
    POS_tag = tokenize_POS_tag(sentence)
    lemmas = get_lemmas(sentence)
    
    word_net_info_dict = {} #{lemma1: {hyper:  , hypo:   , ....}, lemma2: {hyper:  , hypo:   , ....}}
    
    for lemma in lemmas:
        word_net_info_dict[lemma] = get_word_net_data(lemma)
        
    dependency_tree = get_dependency_tree_nodes(sentence)
    
    mega_dict['POS_tag'] = POS_tag
    mega_dict['lemmas'] = lemmas
    mega_dict['word_net_info'] = word_net_info_dict
    mega_dict['dependency_tree'] = dependency_tree
    
    return mega_dict

In [89]:
def lexical_segmantation(paragraph):
    lexical_dict = {}
    
    sentences = get_sentence_tokens(paragraph)
    for sentence in sentences:
        lexical_dict[sentence] = sentence_data_dict(sentence)
    
    return lexical_dict

In [90]:
paragraph = "It was selected by the American Film Institute as one of the ten best films of the year. Filming at Edwards Air Force Base began in mid-April, and ended on May 2."
lexical_dict = lexical_segmantation(paragraph)

In [91]:
lexical_dict

{'Filming at Edwards Air Force Base began in mid-April, and ended on May 2.': {'POS_tag': {Filming: 'VBG',
   at: 'IN',
   Edwards: 'NNP',
   Air: 'NNP',
   Force: 'NNP',
   Base: 'NNP',
   began: 'VBD',
   in: 'IN',
   mid: 'JJ',
   -: 'HYPH',
   April: 'NNP',
   ,: ',',
   and: 'CC',
   ended: 'VBD',
   on: 'IN',
   May: 'NNP',
   2: 'CD',
   .: '.'},
  'dependency_tree': [['Ironman', 'nsubjpass', 'released', 'VERB', []],
   ['was', 'auxpass', 'released', 'VERB', []],
   ['released', 'ROOT', 'released', 'VERB', [Ironman, was, in, on]],
   ['in', 'prep', 'released', 'VERB', [States]],
   ['the', 'det', 'States', 'PROPN', []],
   ['United', 'compound', 'States', 'PROPN', []],
   ['States', 'pobj', 'in', 'ADP', [the, United]],
   ['on', 'prep', 'released', 'VERB', [May]],
   ['May', 'pobj', 'on', 'ADP', [2, ,, 2008]],
   ['2', 'nummod', 'May', 'PROPN', []],
   [',', 'punct', 'May', 'PROPN', []],
   ['2008', 'nummod', 'May', 'PROPN', []]],
  'lemmas': ['film',
   'at',
   'edwards',
   '