By Amy Weng

Word2Vec Continuous Skip-Gram Model Word Embedding and Visualization

https://radimrehurek.com/gensim/models/word2vec.html

In [25]:
import pandas as pd
from words import remove_stopwords
from gensim.scripts import word2vec2tensor
from gensim.models import Word2Vec
from gensim.models import Phrases

def embed(df,title):
    """Train a skip-gram Word2Vec model on ``df.text`` and persist it under ``title``.

    Parameters
    ----------
    df : object exposing a ``text`` attribute/column (the notebook passes the
        result of ``pd.read_csv``); its values are the documents to embed.
    title : str
        Base file name used for both saved artefacts.

    Returns
    -------
    The trained ``Word2Vec`` model.

    Side effects
    ------------
    Writes two files: the native gensim model at
    ``.../embeddings/<title>.model`` and a word2vec-format export at
    ``.../embeddings/tensor/<title>.model``.
    """
    documents = df.text.values.tolist()

    # Preprocessing and stop-word removal are delegated to the project-local
    # helper. NOTE(review): presumably it also tokenizes each document, since
    # its result is handed straight to Phrases/Word2Vec -- confirm in words.py.
    sentences = remove_stopwords(documents)

    # Fit a bigram detector on the corpus, then view the corpus through it.
    bigram_transformer = Phrases(sentences)
    bigrammed_sentences = bigram_transformer[sentences]

    # sg=1 selects skip-gram training; min_count=5 is passed through as the
    # vocabulary frequency threshold.
    model = Word2Vec(bigrammed_sentences, min_count=5, sg=1)

    # Native save, so the model can be reloaded later.
    model_path = '/home/rapiduser/Materials/embeddings/' + title + '.model'
    model.save(model_path)

    # word2vec-format export, the input that tensor() later reads back.
    tensor_source_path = '/home/rapiduser/Materials/embeddings/tensor/' + title + '.model'
    model.wv.save_word2vec_format(tensor_source_path)

    return model

def similar(model,word,num):
    """Print the ``num`` nearest neighbours of ``word`` and the best similarity score.

    Parameters
    ----------
    model : object exposing ``wv.key_to_index`` and ``wv.most_similar``
        (the notebook passes gensim ``Word2Vec`` models).
    word : str
        Query term (may be a bigram token such as ``'body_politic'``).
    num : int
        Number of neighbours requested via ``topn``.

    Returns
    -------
    None. Output goes to stdout as two lines:
    ``<word>: <neighbour> <neighbour> ...`` and the score of the closest one.

    Notes
    -----
    * Words missing from the vocabulary are skipped silently, so callers can
      loop over lists of spelling variants without guarding each lookup.
    * If no neighbours come back (e.g. ``num`` is 0) nothing is printed,
      rather than reporting a meaningless score of 0.
    """
    # Membership test directly on the mapping; no need to materialise .keys().
    if word not in model.wv.key_to_index:
        return

    neighbours = model.wv.most_similar(word, topn=num)
    if not neighbours:
        return

    # Results arrive best-first as (token, score) pairs.
    best_score = neighbours[0][1]
    print(word + ': ' + ' '.join(token for token, _ in neighbours))
    # most_similar reports cosine *similarity* (higher means closer), so the
    # label says "similarity" rather than "distance".
    print('The most similar n-gram has cosine similarity ' + str(best_score))

def comparePair(model,word1,word2):
    """Print the cosine similarity between ``word1`` and ``word2``.

    Parameters
    ----------
    model : object exposing ``wv.key_to_index`` and ``wv.similarity``
        (the notebook passes gensim ``Word2Vec`` models).
    word1, word2 : str
        Terms to compare.

    Returns
    -------
    None. Prints one line to stdout when both words are in the vocabulary;
    prints nothing otherwise.
    """
    vocabulary = model.wv.key_to_index
    # Both words must be checked explicitly: `word1 and word2 in vocab` parses
    # as `word1 and (word2 in vocab)`, which only tests word2 and lets an
    # out-of-vocabulary word1 reach similarity() and raise KeyError.
    if word1 in vocabulary and word2 in vocabulary:
        print('Cosine similarity between ' + word1 + ' and ' + word2 + ': ',model.wv.similarity(word1, word2))

def tensor(f_name):
    """Run gensim's ``word2vec2tensor`` conversion for the export named ``f_name``.

    Reads the text (``binary=False``) word2vec-format file
    ``.../embeddings/tensor/<f_name>.model`` -- the path ``embed()`` writes its
    word2vec-format export to -- and hands the converter
    ``.../Embeddings TSV/<f_name>`` as the output name.

    Parameters
    ----------
    f_name : str
        Base name shared by the input file and the output.

    Returns
    -------
    None; the conversion's effect is the files it writes.
    """
    word2vec_path = '/home/rapiduser/Materials/embeddings/tensor/' + f_name + '.model'
    tsv_target = '/home/rapiduser/ECBC-21-22/Text_Files/Embeddings TSV/' + f_name
    word2vec2tensor.word2vec2tensor(word2vec_path, tsv_target, binary=False)

In [2]:
# Path of the CSV behind the 'publica_eic_monopoly' model.
myCSV = '/home/rapiduser/Materials/topic model/publica/eic_monopoly.csv'

# read text information into a dataframe (embed() reads its `text` column)
readFile = pd.read_csv(myCSV)

# Train and save the embedding under this title, then run the tensor()
# conversion on the word2vec-format export that embed() just wrote.
publica_eic_monopoly = embed(readFile,'publica_eic_monopoly')
tensor('publica_eic_monopoly')

In [None]:
# Reload the model from the path embed() saves to for the title
# 'publica_eic_monopoly', as an alternative to retraining it.
publica_eic_monopoly = Word2Vec.load('/home/rapiduser/Materials/embeddings/publica_eic_monopoly.model')

In [11]:
# Path of the CSV behind the 'publica_eic' model.
myCSV = '/home/rapiduser/Materials/topic model/publica/eic.csv'
# read text information into a dataframe (embed() reads its `text` column)
readFile = pd.read_csv(myCSV)
# Train and save the embedding under the title 'publica_eic'.
# tensor() is not called for this model in this cell.
publica_eic = embed(readFile,'publica_eic')

In [17]:
# Path of the CSV behind the 'religio_eic' model.
myCSV = '/home/rapiduser/Materials/topic model/religio/eic.csv'
# read text information into a dataframe (embed() reads its `text` column)
readFile = pd.read_csv(myCSV)
# Train and save the embedding under the title 'religio_eic'.
# tensor() is not called for this model in this cell.
religio_eic = embed(readFile,'religio_eic')

In [26]:
# Print the 15 nearest neighbours in the publica_eic model for each probe term,
# in the order listed.
for probe_term in ('corruption', 'monopoly', 'monopolies', 'body_politic', 'odious', 'east_india'):
    similar(publica_eic, probe_term, 15)

corruption: multiply influenc simplicity_original health_infirmity sovereign_authority arts_industry reasonableness endeavour_show subjecting utility genuine task_masters became_tyrants annihilate humane_nature
The most similar n-gram has cosine distance 0.854022741317749
monopoly: joynt_stocks monopolies reas joint_stocks retraints advisable wisdom_nation enlarging_trade national_advantage regulated_companies exclusive_others equitable excluive_others etablihment_new enable_carry
The most similar n-gram has cosine distance 0.8654782176017761
monopolies: retraints erected_maintenance freedom_trade ordering_trade restraints restrictions monopoly grant_sole legally_excluded companies_societies excluding_others prohibitory aerted patents_granted retraining
The most similar n-gram has cosine distance 0.9139752984046936
body_politic: corporate bodies_politic ingroing whole_buying excluding_others ealing_uing selling_using erected_maintenance companies_societies politic_capacity monopolies e

In [27]:
# `monopoly` is imported from the project-local `words` module. It is split on
# '|' below, so it is expected to be a pipe-delimited string of terms.
from words import monopoly
m = monopoly.split('|')
# Print the 10 nearest neighbours of each term in the publica_eic_monopoly
# model; similar() prints nothing for terms missing from the vocabulary.
for w in m:
    similar(publica_eic_monopoly,w,10)

monopolies: laws_land freedom_trade retrained restrain restrained retrain exclude_others wrong legally privilege
The most similar n-gram has cosine distance 0.9599364995956421
monopoly: exclusive depending regulated_company joint_stocks monopolies retrained privilege wrong majeties_subjects allege
The most similar n-gram has cosine distance 0.9274605512619019
monopolise: allegations advantages_profits ingroed naturally_follow befalls free_encumbrances trengthen fact_allege corporation_excluding default_paying
The most similar n-gram has cosine distance 0.9918798804283142
monopolising: enriched greater_prices remote_parts riches_strength rates_exchange feel_effects encreaing increase_navigation increase_seamen attempting
The most similar n-gram has cosine distance 0.9856666326522827
monopolizers: lil vat_extent butchers served_apprentice tillage stil betowed consuming afford_gains trademan
The most similar n-gram has cosine distance 0.991474449634552
monopolised: olely forces_forts diff

In [None]:
# Two pipe-delimited term lists; every term in terms1 is compared against
# every term in terms2 using the publica_eic_monopoly model.
terms1 = 'body_politic|body_politick|public_utility|public_affairs|wicked|illegal|engrossing|odious|evil|repugnant|arbitrary|popery|papist|bribery|remedies|remedy|monopoly_evils|circulation|disease|wasting|waste'
terms2 = 'monopoly|monopolies|monopolise|monopolising|monopolizers|monopolised|corruption|corrupt|corrupting'
list1 = terms1.split('|')
list2 = terms2.split('|')
for word1 in list1: 
    for word2 in list2: 
        comparePair(publica_eic_monopoly,word1,word2)
    # Visual gap after each terms1 word's group of comparisons.
    print('\n')

In [None]:
# Pipe-delimited spellings of "consumption"/"consume" to look up.
# NOTE(review): the 'conum-' forms look like the same words with the 's'
# dropped, like tokens in the model output above (e.g. 'retraints') --
# presumably a transcription artefact in the corpus; confirm.
consumption = 'consumption|consume|consuming|consumed|conume|conumption|conuming|conumed'
c = consumption.split('|')
# Print the 10 nearest neighbours of each spelling in the publica_eic_monopoly
# model; similar() prints nothing for spellings missing from the vocabulary.
for w in c:
    similar(publica_eic_monopoly,w,10)

In [None]:
# Pipe-delimited word forms of "corrupt"/"corruption" to look up.
corruption = 'corruption|corrupt|corrupted|corruptions|corrupting'
corrupt = corruption.split('|')
# Print the 10 nearest neighbours of each form in the publica_eic_monopoly
# model; similar() prints nothing for forms missing from the vocabulary.
for w in corrupt:
    similar(publica_eic_monopoly,w,10)