In [19]:
import spacy
from nltk.corpus import wordnet
nlp = spacy.load('en')

In [59]:
paragraph = "It was selected by the American Film Institute as one of the ten best films of the year. Filming at Edwards Air Force Base began in mid-April, and ended on May 2."
lexical_dict = lexical_segmantation(paragraph)


In [60]:
lexical_dict

{'Filming at Edwards Air Force Base began in mid-April, and ended on May 2.': {'POS_tag': {Filming: 'VBG',
   at: 'IN',
   Edwards: 'NNP',
   Air: 'NNP',
   Force: 'NNP',
   Base: 'NNP',
   began: 'VBD',
   in: 'IN',
   mid: 'JJ',
   -: 'HYPH',
   April: 'NNP',
   ,: ',',
   and: 'CC',
   ended: 'VBD',
   on: 'IN',
   May: 'NNP',
   2: 'CD',
   .: '.'},
  'dependency_tree': [['Filming', 'nsubj', 'began', 'VERB', [at]],
   ['at', 'prep', 'Filming', 'VERB', [Base]],
   ['Edwards', 'compound', 'Base', 'PROPN', []],
   ['Air', 'compound', 'Force', 'PROPN', []],
   ['Force', 'compound', 'Base', 'PROPN', [Air]],
   ['Base', 'pobj', 'at', 'ADP', [Edwards, Force]],
   ['began', 'ROOT', 'began', 'VERB', [Filming, in, ,, and, ended, .]],
   ['in', 'prep', 'began', 'VERB', []],
   ['mid', 'amod', 'April', 'PROPN', []],
   ['-', 'punct', 'April', 'PROPN', []],
   ['April', 'punct', ',', 'PUNCT', [mid, -]],
   [',', 'punct', 'began', 'VERB', [April]],
   ['and', 'cc', 'began', 'VERB', []],
   ['end

In [20]:
def get_sentence_tokens(paragraph):
    doc =  nlp(paragraph)
    return [sent.string.strip() for sent in doc.sents]

In [21]:
def get_word_tokens(sentence):
    doc =  nlp(sentence)
    return [token.text for token in doc]

In [22]:
def tokenize_POS_tag(sentence):
    doc = nlp(sentence)
    token_dict = {}
    
    for token in doc:
        token_dict[token]= token.tag_
    
    return token_dict

In [23]:
def get_lemmas(sentence):
    doc =  nlp(sentence)
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_)
    return lemmas

In [46]:
def get_dependency_tree_nodes(sentence):
    doc = nlp(sentence)
    nodes = []
    for token in doc:
        nodes.append([token.text, token.dep_, token.head.text, token.head.pos_,
          [child for child in token.children]])
    return nodes

In [47]:
def get_hypernym(token):
    hypernym_list = []
    
    synsets = wordnet.synsets(token)
    
    for synset in synsets:
        for hypernym in synset.hypernyms():
            for lemma in hypernym.lemmas():
                hypernym_list.append(lemma.name())
    
    return hypernym_list

In [48]:
def get_holonym(token):
    holonym_list = []
    
    synsets = wordnet.synsets(token)
    
    for synset in synsets:
        for holonym in synset.member_holonyms():
            for lemma in holonym.lemmas():
                holonym_list.append(lemma.name())
    
    return holonym_list

In [49]:
def get_hyponym(token):
    hyponym_list = []
    
    synsets = wordnet.synsets(token)
    
    for synset in synsets:
        for hyponym in synset.hyponyms():
            for lemma in hyponym.lemmas():
                hyponym_list.append(lemma.name())
    
    return hyponym_list

In [50]:
def get_meronym(token):
    meronym_list = []
    
    synsets = wordnet.synsets(token)
    
    for synset in synsets:
        for meronym in synset.part_meronyms():
            for lemma in meronym.lemmas():
                meronym_list.append(lemma.name())
    
    return meronym_list

In [51]:
def get_synonym(token):
    synonym_list = []
    
    synsets = wordnet.synsets(token)
    
    for synset in synsets:
        for lemma in synset.lemmas():
            synonym_list.append(lemma.name())
    
    return list(set(synonym_list))

In [52]:
print(get_synonym('release'))

['press_release', 'outlet', 'exhaust', 'unloose', 'release', 'relinquish', 'handout', 'loose', 'button', 'tone_ending', 'put_out', 'sacking', 'spillage', 'spill', 'unfreeze', 'freeing', 'eject', 'liberation', 'unloosen', 'exit', 'loss', 'expiration', 'publish', 'waiver', 'unblock', 'issue', 'bring_out', 'dismissal', 'discharge', 'going', 'acquittance', 'departure', 'free', 'give_up', 'dismission', 'resign', 'secrete', 'firing', 'let_go_of', 'sack', 'vent', 'passing', 'let_go', 'expel', 'turn', 'liberate']


In [53]:
def get_word_net_data(token):
    data = {}
    
    hypernyms = get_hypernym(token)
    hyponyms = get_hyponym(token)
    meronyms = get_meronym(token)
    holonyms = get_holonym(token)
    synonyms = get_synonym(token)
    
    data['hypernyms'] = hypernyms
    data['hyponyms'] = hyponyms
    data['meronyms'] = meronyms
    data['holonyms'] = holonyms
    data['synonyms'] = synonyms
    
    return data

In [54]:
word_net_data = get_word_net_data('Jorge')
word_net_data

{'holonyms': [],
 'hypernyms': [],
 'hyponyms': [],
 'meronyms': [],
 'synonyms': []}

In [55]:
def sentence_data_dict(sentence):
    mega_dict = {}
    
    word_tokens = get_word_tokens(sentence)
    POS_tag = tokenize_POS_tag(sentence)
    lemmas = get_lemmas(sentence)
    
    word_net_info_dict = {} #{lemma1: {hyper:  , hypo:   , ....}, lemma2: {hyper:  , hypo:   , ....}}
    
    for lemma in lemmas:
        word_net_info_dict[lemma] = get_word_net_data(lemma)
        
    dependency_tree = get_dependency_tree_nodes(sentence)
    
    mega_dict['POS_tag'] = POS_tag
    mega_dict['lemmas'] = lemmas
    mega_dict['word_net_info'] = word_net_info_dict
    mega_dict['dependency_tree'] = dependency_tree
    
    return mega_dict

In [56]:
def lexical_segmantation(paragraph):
    lexical_dict = {}
    
    sentences = get_sentence_tokens(paragraph)
    for sentence in sentences:
        lexical_dict[sentence] = sentence_data_dict(sentence)
    
    return lexical_dict

In [57]:
paragraph = "It was selected by the American Film Institute as one of the ten best films of the year. Filming at Edwards Air Force Base began in mid-April, and ended on May 2."
lexical_dict = lexical_segmantation(paragraph)

In [58]:
lexical_dict

{'Filming at Edwards Air Force Base began in mid-April, and ended on May 2.': {'POS_tag': {Filming: 'VBG',
   at: 'IN',
   Edwards: 'NNP',
   Air: 'NNP',
   Force: 'NNP',
   Base: 'NNP',
   began: 'VBD',
   in: 'IN',
   mid: 'JJ',
   -: 'HYPH',
   April: 'NNP',
   ,: ',',
   and: 'CC',
   ended: 'VBD',
   on: 'IN',
   May: 'NNP',
   2: 'CD',
   .: '.'},
  'dependency_tree': [['Filming', 'nsubj', 'began', 'VERB', [at]],
   ['at', 'prep', 'Filming', 'VERB', [Base]],
   ['Edwards', 'compound', 'Base', 'PROPN', []],
   ['Air', 'compound', 'Force', 'PROPN', []],
   ['Force', 'compound', 'Base', 'PROPN', [Air]],
   ['Base', 'pobj', 'at', 'ADP', [Edwards, Force]],
   ['began', 'ROOT', 'began', 'VERB', [Filming, in, ,, and, ended, .]],
   ['in', 'prep', 'began', 'VERB', []],
   ['mid', 'amod', 'April', 'PROPN', []],
   ['-', 'punct', 'April', 'PROPN', []],
   ['April', 'punct', ',', 'PUNCT', [mid, -]],
   [',', 'punct', 'began', 'VERB', [April]],
   ['and', 'cc', 'began', 'VERB', []],
   ['end