In [3]:
import spacy
import pandas as pd
from spacy import displacy
from spacy.tokens import Span

nlp = spacy.load('en_core_web_sm')

In [5]:
original_input = "Frank lives in San Francisco and Elizabeth lives in Los Angeles. If the flight time is 2 hrs when will Elizabeth reach Frank if she starts at 8am in the morning?"

In [6]:
processed_text = nlp(original_input)

In [7]:
keyword_set = set()
entity_mapping = []

In [8]:
for token in processed_text.ents:
    if token.text not in keyword_set:
        keyword_set.add(token.text)
        entity_mapping.append((token.text, token.label_))

In [9]:
print(entity_mapping)

[('Frank', 'PERSON'), ('San Francisco', 'GPE'), ('Elizabeth', 'PERSON'), ('Los Angeles', 'GPE'), ('2', 'CARDINAL'), ('8am in the morning', 'TIME')]


In [10]:
displacy.render(processed_text, style='ent', jupyter=True)

In [11]:
keep_entities_list = ['PERSON', "GPE", "FAC", "ORG", "PRODUCT", "NORP",
                     "MONEY", "LOC", "WORK_OF_ART", "LAW", "LANGUAGE",
                     "QUANTITY"]

In [13]:
finalized_entity_mapping = {}
for ent in entity_mapping:
    if ent[1] in keep_entities_list:
        finalized_entity_mapping[ent[0]] = []

In [14]:
finalized_entity_mapping

{'Frank': [], 'San Francisco': [], 'Elizabeth': [], 'Los Angeles': []}

In [15]:
from gensim.models.keyedvectors import KeyedVectors

In [17]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file='/home/achint/Raghava/AI&ML/NLP/wordembeddings/glove.6B.300d.txt',
              word2vec_output_file='/home/achint/Raghava/AI&ML/NLP/wordembeddings/gensim_word2vec_file_300d.txt')

(400000, 300)

In [18]:
glove_model = KeyedVectors.load_word2vec_format("/home/achint/Raghava/AI&ML/NLP/wordembeddings/gensim_word2vec_file_300d.txt",
                                               binary=False)

In [19]:
word_vectors = glove_model.wv

  """Entry point for launching an IPython kernel.


In [32]:
origin_country = "usa"
target_country = 'india'

final_mapping = {}

In [33]:
for word in finalized_entity_mapping:
    word = word.strip()
    word = word.replace(' ', '_')
    try:
        similar_words_list = glove_model.most_similar(positive = [target_country, word],
                                                     negative=[origin_country], topn=10)
        
        similar_words_list = [choices[0].replace('_', ' ') for choice in similar_words_list]
        
    except:
        similar_words_list = []
    print(word, similar_words_list)

Frank []
San_Francisco []
Elizabeth []
Los_Angeles []


In [43]:
glove_model.most_similar(positive = [target_country, 'san_francisco'],
                         negative=[origin_country], topn=10)

KeyError: "word 'san_francisco' not in vocabulary"

In [35]:
glove_model.most_similar("frank")

[('walter', 0.5094894170761108),
 ('j.', 0.49901607632637024),
 ('jr.', 0.4745505452156067),
 ('joe', 0.46593624353408813),
 ('terry', 0.4624418616294861),
 ('sr.', 0.4598439335823059),
 ('sinatra', 0.4509432911872864),
 ('miller', 0.44492608308792114),
 ('herbert', 0.4441884160041809),
 ('moore', 0.44388148188591003)]