## This notebook explores a pre-trained word2vec (w2v) model using examples and TensorBoard visualization

In [1]:
import pandas as pd
import numpy as np
import re
import os
import sys
import csv 

from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn import metrics
import gensim
from gensim.models import Word2Vec

#### Apply the pre-trained word2vec model to the positive and negative word lists to find the most similar words

In [2]:
# 1. (Optional, currently disabled) download the word2vec model archive and extract it.
#    NOTE(review): `data_util` is not among the imports in the first cell, so it must
#    be imported before these lines are re-enabled.
# ## specify download path and extract path
# download_path = "imf_w2v.zip"
# download_link = "https://www.dropbox.com/sh/6um97x52kweebfx/AACSxB0E9weItCbyQwUqvuWRa?dl=1"
# extract_path = './data'
# data_util.download_data(download_path,download_link,extract_path)

# 2. Load the pre-trained IMF word2vec model. The path is relative to the current
#    working directory: <cwd>/model/imf_160.w2v.
#    NOTE(review): the disabled download step extracts into './data' while this load
#    reads from 'model' - confirm where the extracted model file is expected to live.
model_path = os.path.join('model','imf_160.w2v')
imf_w2v = Word2Vec.load(model_path)

#### Some examples to understand the model

In [10]:
# Sanity check: list the words the model ranks as most similar to 'singapore'.
# The result is a list of (word, similarity score) pairs.
imf_w2v.wv.most_similar('singapore')

[('malaysia', 0.7892636060714722),
 ('hong_kong', 0.7774065732955933),
 ('hong_kong_sar', 0.7603132128715515),
 ('thailand', 0.7355126738548279),
 ('korea', 0.7185163497924805),
 ('philippines', 0.6760509610176086),
 ('indonesia', 0.6705522537231445),
 ('taiwan_province', 0.649632453918457),
 ('new_zealand', 0.6455516219139099),
 ('sri_lanka', 0.6316699981689453)]

In [19]:
def nearest_similarity_cosmul(start1, end1, end2, model=None):
    '''Print the analogy "start1 is to end1, like <X> is to end2".

    <X> is the top-ranked word returned by ``most_similar_cosmul`` when
    ``end2`` and ``start1`` are the positive terms and ``end1`` is the
    negative term.

    Parameters
    ----------
    start1 : str
        First word of the known pair (e.g. 'rmb').
    end1 : str
        Second word of the known pair (e.g. 'china').
    end2 : str
        Word for which the counterpart of ``start1`` is looked up (e.g. 'india').
    model : optional
        Object exposing ``.wv.most_similar_cosmul``. Defaults to the
        notebook-level ``imf_w2v`` so existing three-argument calls keep working.

    Returns
    -------
    None
        The analogy is printed, not returned.
    '''
    # Resolve the default at call time so the function follows whatever model
    # the notebook currently has loaded instead of capturing it at definition time.
    if model is None:
        model = imf_w2v

    # Only the best match is used, so there is no need to rank ten candidates.
    similarities = model.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1],
        topn=1,
    )

    start2 = similarities[0][0]
    # Explicit arguments instead of **locals(): renaming or adding a local can
    # no longer silently change what gets formatted.
    print("{0} is to {1}, like {2} is to {3}".format(start1, end1, start2, end2))

    return None

In [20]:
# Two analogy probes against China as the reference country:
#   - 'rmb' is to 'china' as <?> is to 'india'
#   - 'manufacturing' is to 'china' as <?> is to 'singapore'
nearest_similarity_cosmul('rmb','china','india')
nearest_similarity_cosmul('manufacturing','china','singapore')

rmb is to china, like rs is to india
manufacturing is to china, like restaurants is to singapore


In [3]:
# Query with 'china' and 'manufacturing' as positive terms and 'india' as the
# negative term, keeping only the single best match (topn=1).
imf_w2v.wv.most_similar(positive =['china','manufacturing'],negative =['india'], topn =1)

[('electronics', 0.6150177121162415)]

#### Export the model to TSV files for the TensorFlow [Embedding Projector](https://projector.tensorflow.org/)

In [5]:
def save_for_visulization(model, output_path, meta_file = "imf_w2v_metadata.tsv", vector_file = "imf_w2v_vectordata.tsv"):
    '''Export a word2vec model as two UTF-8 TSV files.

    Two files are written into ``output_path``, with rows in vocabulary order:

    * ``meta_file``   - one vocabulary word per line.
    * ``vector_file`` - one line per word, the vector components joined by tabs.

    Line ``i`` of both files refers to the same word.

    Parameters
    ----------
    model :
        Object whose ``.wv`` exposes the vocabulary list (``index_to_key`` or
        the older ``index2word``) and supports ``wv[word]`` vector lookup.
    output_path : str
        Directory to write into. It is created if it does not exist.
    meta_file : str
        File name for the word list.
    vector_file : str
        File name for the vectors.

    Returns
    -------
    None
    '''
    # A fresh checkout usually has no output directory yet; create it instead of
    # failing with FileNotFoundError on the first open().
    os.makedirs(output_path, exist_ok=True)

    vector_path = os.path.join(output_path, vector_file)
    meta_path = os.path.join(output_path, meta_file)

    keyed_vectors = model.wv
    # gensim 4 renamed `index2word` to `index_to_key`; support both so the
    # export works regardless of which gensim version loaded the model.
    if hasattr(keyed_vectors, 'index_to_key'):
        vocabulary = keyed_vectors.index_to_key
    else:
        vocabulary = keyed_vectors.index2word

    # Binary mode with explicit '\n' keeps the output byte-identical across
    # platforms (no newline translation on Windows).
    with open(vector_path, 'wb') as file_vector, open(meta_path, 'wb') as file_metadata:
        for word in vocabulary:
            file_metadata.write('{0}\n'.format(word).encode('utf-8'))
            # Item lookup is the version-stable way to fetch a single vector.
            vector_row = '\t'.join(map(str, keyed_vectors[word]))
            file_vector.write((vector_row + '\n').encode('utf-8'))

    return None

# Write the metadata and vector TSV files (default file names) into model/tensorboard.
save_for_visulization(model = imf_w2v, output_path = 'model/tensorboard')

# Disabled alternative: gensim's word2vec2tensor conversion script.
# NOTE(review): if re-enabled, 'model\tensorboard' contains "\t", which Python reads
# as a TAB escape - use os.path.join('model', 'tensorboard') or a raw string instead.
# NOTE(review): gensim.scripts.word2vec2tensor looks like a module, so the callable is
# presumably gensim.scripts.word2vec2tensor.word2vec2tensor - verify before use.
# word2vec_model_path = model_path
# tensor_filename = 'model\tensorboard'
# gensim.scripts.word2vec2tensor(word2vec_model_path,tensor_filename,binary=False)