In [4]:
import spacy
from nltk.corpus import wordnet

In [5]:
# Load the English spaCy pipeline once; the helper functions in this notebook
# all call this shared `nlp` object.
# 'en' is a spaCy 2.x shortcut-link name. spaCy 3 removed shortcut links and
# raises OSError for them, so fall back to the full package name in that case.
# NOTE(review): assumes the 'en' shortcut pointed at 'en_core_web_sm' — confirm.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

# Parse of a sample sentence, bound to the global name `doc`.
doc = nlp('Ironman was released in the United States on May 2, 2008')

In [6]:
def get_sentence_tokens(paragraph):
    """Split `paragraph` into sentences with the shared spaCy pipeline `nlp`.

    Args:
        paragraph: Text to segment.

    Returns:
        A list of sentence strings, in document order, each stripped of
        leading and trailing whitespace.
    """
    doc = nlp(paragraph)
    # Use Span.text rather than the deprecated Span.string alias (removed in
    # spaCy 3). After .strip() both give the same string, since .string only
    # differed by trailing whitespace.
    return [sent.text.strip() for sent in doc.sents]

In [7]:
def get_word_tokens(sentence):
    """Return the text of every token that `nlp` produces for `sentence`.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list of ``token.text`` strings in document order.
    """
    words = []
    for tok in nlp(sentence):
        words.append(tok.text)
    return words

In [8]:
def tokenize_POS_tag(sentence):
    """Map each token of `sentence` to its ``tag_`` attribute.

    The keys are the token objects yielded by ``nlp(sentence)`` themselves,
    not their text, so a word that occurs twice yields two separate entries.

    Args:
        sentence: Text to tag.

    Returns:
        A dict of ``{token: token.tag_}`` in document order.
    """
    return {tok: tok.tag_ for tok in nlp(sentence)}

In [9]:
def get_lemmas(sentence):
    """Return the lemma of every token in `sentence`.

    Args:
        sentence: Text to lemmatize.

    Returns:
        A list of ``token.lemma_`` strings in document order (duplicates kept).
    """
    return [tok.lemma_ for tok in nlp(sentence)]

In [10]:
def get_dependency_tree_nodes(sentence):
    """Describe each token of `sentence` as a node of its dependency parse.

    Args:
        sentence: Text to parse.

    Returns:
        A list with one entry per token, in document order. Each entry is
        ``[text, dep_, head text, head pos_, children]`` where ``children`` is
        a list of the token objects yielded by ``token.children``.
    """
    # Parse the argument itself. Previously the loop iterated a module-level
    # `doc`, so the result never depended on `sentence`.
    doc = nlp(sentence)
    return [
        [token.text, token.dep_, token.head.text, token.head.pos_, list(token.children)]
        for token in doc
    ]

In [11]:
def get_hypernym(token):
    """Collect hypernym lemma names for `token` from WordNet.

    Every synset of `token` is visited; for each of its hypernym synsets the
    name of every lemma is recorded.

    Args:
        token: Word to look up with ``wordnet.synsets``.

    Returns:
        A list of lemma-name strings in traversal order. Duplicates are kept;
        the list is empty when `token` has no synsets or no hypernyms.
    """
    return [
        lemma.name()
        for sense in wordnet.synsets(token)
        for parent in sense.hypernyms()
        for lemma in parent.lemmas()
    ]

In [12]:
def get_holonym(token):
    """Collect member-holonym lemma names for `token` from WordNet.

    Only ``member_holonyms()`` is consulted for each synset; other holonym
    relations are not queried.

    Args:
        token: Word to look up with ``wordnet.synsets``.

    Returns:
        A list of lemma-name strings in traversal order. Duplicates are kept;
        the list is empty when nothing is found.
    """
    names = []
    for sense in wordnet.synsets(token):
        for whole in sense.member_holonyms():
            names.extend(lemma.name() for lemma in whole.lemmas())
    return names

In [13]:
def get_hyponym(token):
    """Collect hyponym lemma names for `token` from WordNet.

    Every synset of `token` is visited; for each of its hyponym synsets the
    name of every lemma is recorded.

    Args:
        token: Word to look up with ``wordnet.synsets``.

    Returns:
        A list of lemma-name strings in traversal order. Duplicates are kept;
        the list is empty when nothing is found.
    """
    return [
        lemma.name()
        for sense in wordnet.synsets(token)
        for child in sense.hyponyms()
        for lemma in child.lemmas()
    ]

In [14]:
def get_meronym(token):
    """Collect part-meronym lemma names for `token` from WordNet.

    Only ``part_meronyms()`` is consulted for each synset; other meronym
    relations are not queried.

    Args:
        token: Word to look up with ``wordnet.synsets``.

    Returns:
        A list of lemma-name strings in traversal order. Duplicates are kept;
        the list is empty when nothing is found.
    """
    names = []
    for sense in wordnet.synsets(token):
        for part in sense.part_meronyms():
            names += [lemma.name() for lemma in part.lemmas()]
    return names

In [15]:
def get_synonym(token):
    """Collect the distinct lemma names of every WordNet synset of `token`.

    Args:
        token: Word to look up with ``wordnet.synsets``.

    Returns:
        A list of unique lemma-name strings, ordered by first appearance
        while walking the synsets. The list is empty when `token` has no
        synsets.
    """
    synonym_list = []
    seen = set()

    for synset in wordnet.synsets(token):
        for lemma in synset.lemmas():
            name = lemma.name()
            # Deduplicate in first-seen order. Going through list(set(...))
            # made the order depend on string hashing, which varies from one
            # interpreter run to the next.
            if name not in seen:
                seen.add(name)
                synonym_list.append(name)

    return synonym_list

In [16]:
# Spot check: print the WordNet lemma names collected for 'release'.
print(get_synonym('release'))

['expel', 'unloosen', 'turn', 'unfreeze', 'free', 'release', 'outlet', 'loose', 'press_release', 'freeing', 'firing', 'liberation', 'acquittance', 'loss', 'issue', 'liberate', 'vent', 'resign', 'exit', 'let_go', 'discharge', 'button', 'publish', 'waiver', 'departure', 'expiration', 'passing', 'going', 'give_up', 'sack', 'spillage', 'exhaust', 'unblock', 'relinquish', 'sacking', 'spill', 'handout', 'let_go_of', 'unloose', 'eject', 'put_out', 'bring_out', 'dismission', 'tone_ending', 'secrete', 'dismissal']


In [17]:
def get_word_net_data(token):
    """Bundle all WordNet relation lists for `token` into one dict.

    Args:
        token: Word passed unchanged to each of the relation helpers.

    Returns:
        A dict with the keys 'hypernyms', 'hyponyms', 'meronyms', 'holonyms'
        and 'synonyms' (inserted in that order), each holding the list
        returned by the corresponding ``get_*`` helper.
    """
    return {
        'hypernyms': get_hypernym(token),
        'hyponyms': get_hyponym(token),
        'meronyms': get_meronym(token),
        'holonyms': get_holonym(token),
        'synonyms': get_synonym(token),
    }

In [18]:
# Look up a proper name; the recorded output shows every relation list empty.
word_net_data = get_word_net_data('Jorge')
word_net_data

{'hypernyms': [],
 'hyponyms': [],
 'meronyms': [],
 'holonyms': [],
 'synonyms': []}

In [19]:
def sentence_data_dict(sentence):
    """Gather the per-sentence linguistic features into a single dict.

    Args:
        sentence: Text of one sentence.

    Returns:
        A dict with four entries, inserted in this order:
          'POS_tag'         - result of ``tokenize_POS_tag(sentence)``
          'lemmas'          - result of ``get_lemmas(sentence)``
          'word_net_info'   - ``{lemma: get_word_net_data(lemma)}`` with one
                              entry per distinct lemma, in first-seen order
          'dependency_tree' - result of ``get_dependency_tree_nodes(sentence)``
    """
    # The word-token list that used to be computed here was never read, so
    # that extra parse of the sentence has been dropped.
    POS_tag = tokenize_POS_tag(sentence)
    lemmas = get_lemmas(sentence)

    word_net_info_dict = {}  # {lemma1: {hyper: ..., hypo: ..., ...}, lemma2: {...}}

    for lemma in lemmas:
        # A lemma that repeats within the sentence maps to the same key, so
        # one WordNet lookup per distinct lemma is enough.
        if lemma not in word_net_info_dict:
            word_net_info_dict[lemma] = get_word_net_data(lemma)

    dependency_tree = get_dependency_tree_nodes(sentence)

    return {
        'POS_tag': POS_tag,
        'lemmas': lemmas,
        'word_net_info': word_net_info_dict,
        'dependency_tree': dependency_tree,
    }

In [20]:
def lexical_segmantation(paragraph):
    """Build the per-sentence feature dict for every sentence of `paragraph`.

    Args:
        paragraph: Text that ``get_sentence_tokens`` splits into sentences.

    Returns:
        A dict mapping each sentence string to ``sentence_data_dict(sentence)``.
        Identical sentence strings share a single key.
    """
    return {
        sent_text: sentence_data_dict(sent_text)
        for sent_text in get_sentence_tokens(paragraph)
    }

In [21]:
# Two-sentence sample paragraph run through the full pipeline.
paragraph = "It was selected by the American Film Institute as one of the ten best films of the year. Filming at Edwards Air Force Base began in mid-April, and ended on May 2."
lexical_dict = lexical_segmantation(paragraph)

In [22]:
# Display the nested result: one entry per sentence of the sample paragraph.
lexical_dict

{'It was selected by the American Film Institute as one of the ten best films of the year.': {'POS_tag': {It: 'PRP',
   was: 'VBD',
   selected: 'VBN',
   by: 'IN',
   the: 'DT',
   American: 'NNP',
   Film: 'NNP',
   Institute: 'NNP',
   as: 'IN',
   one: 'CD',
   of: 'IN',
   the: 'DT',
   ten: 'CD',
   best: 'JJS',
   films: 'NNS',
   of: 'IN',
   the: 'DT',
   year: 'NN',
   .: '.'},
  'lemmas': ['-PRON-',
   'be',
   'select',
   'by',
   'the',
   'american',
   'film',
   'institute',
   'as',
   'one',
   'of',
   'the',
   'ten',
   'good',
   'film',
   'of',
   'the',
   'year',
   '.'],
  'word_net_info': {'-PRON-': {'hypernyms': [],
    'hyponyms': [],
    'meronyms': [],
    'holonyms': [],
    'synonyms': []},
   'be': {'hypernyms': ['metallic_element',
     'metal',
     'typify',
     'symbolize',
     'symbolise',
     'stand_for',
     'represent',
     'take',
     'occupy',
     'use_up',
     'stay',
     'remain',
     'rest',
     'be'],
    'hyponyms': ['abound