By Amy Weng

Word2Vec Continuous Skip-Gram Model Word Embedding and Visualization

https://radimrehurek.com/gensim/models/word2vec.html

In [5]:
import pandas as pd
from words import remove_stopwords
from gensim.scripts import word2vec2tensor
from gensim.models import Word2Vec
from gensim.models import Phrases

def embed(df,title):
    """Train and save a skip-gram Word2Vec model on bigram-merged text.

    The ``text`` column of ``df`` is run through ``remove_stopwords``, a
    ``Phrases`` bigram detector is fitted on the result, and a Word2Vec model
    (``sg=1``, ``min_count=5``) is trained on the bigram-transformed corpus.
    The model is then written to disk twice: once with ``model.save`` and once
    with ``save_word2vec_format`` under the ``tensor/`` subdirectory.

    Args:
        df: DataFrame exposing a ``text`` column.
        title: Base file name (no extension) used for both saved files.

    Returns:
        The trained ``Word2Vec`` model.
    """
    native_path = '/home/rapiduser/Materials/embeddings/'+title+'.model'
    word2vec_path = '/home/rapiduser/Materials/embeddings/tensor/'+title+'.model'

    # Preprocess the documents and remove stopwords.
    sentences = remove_stopwords(df.text.values.tolist())

    # Fit the bigram detector, then train skip-gram (sg=1) on its output.
    bigrams = Phrases(sentences)
    model = Word2Vec(bigrams[sentences], min_count=5, sg=1)

    # Saved so the model can be reloaded later.
    model.save(native_path)
    # Saved in a format that can be converted to tensor TSV.
    model.wv.save_word2vec_format(word2vec_path)
    return model

def similar(model,word,num):
    """Print the nearest neighbours of ``word`` and the best similarity score.

    Prints ``"<word>: <neighbour> <neighbour> ..."`` followed by a line with
    the score of the closest neighbour. Nothing is printed when ``word`` is
    not in the model's vocabulary, and the score line is skipped when no
    neighbours are returned.

    Args:
        model: Trained model exposing ``wv.key_to_index`` and
            ``wv.most_similar``.
        word: Query token.
        num: Number of neighbours to request (passed as ``topn``).
    """
    if word not in model.wv.key_to_index:
        return
    neighbours = model.wv.most_similar(word, topn=num)
    print(word + ': ' + ' '.join(w for w, _ in neighbours))
    if neighbours:
        # most_similar scores are cosine similarities (higher means closer),
        # so the label must not call them distances.
        print('The most similar word has cosine similarity ' + str(neighbours[0][1]))

def comparePair(model,word1,word2):
    """Print the cosine similarity of two words when both are in the vocabulary.

    Nothing is printed if either word is missing from the model.

    Args:
        model: Trained model exposing ``wv.key_to_index`` and
            ``wv.similarity``.
        word1: First token.
        word2: Second token.
    """
    vocab = model.wv.key_to_index
    # `word1 and word2 in vocab` parses as `word1 and (word2 in vocab)`, which
    # never checks word1's membership; test each word explicitly.
    if word1 in vocab and word2 in vocab:
        print('Cosine similarity between ' + word1 + ' and ' + word2 + ': ',model.wv.similarity(word1, word2))

def tensor(f_name):
    """Convert a saved word2vec-format model to TSV via gensim's script.

    Passes ``<f_name>.model`` from the embeddings ``tensor`` directory to
    ``word2vec2tensor``, with ``f_name`` under the ``Embeddings TSV``
    directory as the output name.

    Args:
        f_name: Base file name (no extension) of the saved model.
    """
    source_model = '/home/rapiduser/Materials/embeddings/tensor/'+f_name+'.model'
    tsv_target = '/home/rapiduser/ECBC-21-22/Text_Files/Embeddings TSV/'+f_name
    # binary=False: the input is read as a non-binary word2vec file.
    word2vec2tensor.word2vec2tensor(source_model, tsv_target, binary=False)

In [2]:
# path of the CSV whose text is embedded in this cell
myCSV = '/home/rapiduser/Materials/topic model/publica/eic_monopoly.csv'

# read text information into a dataframe
readFile = pd.read_csv(myCSV)

# train and save the model, then convert the saved vectors to TSV
publica_eic_monopoly = embed(readFile,'publica_eic_monopoly')
tensor('publica_eic_monopoly')

In [None]:
# reload the model file written by embed(..., 'publica_eic_monopoly') instead of retraining
publica_eic_monopoly = Word2Vec.load('/home/rapiduser/Materials/embeddings/publica_eic_monopoly.model')

In [None]:
# path of the CSV whose text is embedded in this cell
myCSV = '/home/rapiduser/Materials/topic model/publica/eic.csv'

# read text information into a dataframe
readFile = pd.read_csv(myCSV)

# train and save the model, then convert the saved vectors to TSV
publica_eic = embed(readFile,'publica_eic')
tensor('publica_eic')

In [None]:
# Query the publica_eic model for the 15 nearest neighbours of each word,
# printing a '\n' separator between consecutive queries.
eic_query_words = ['corruption', 'monopoly', 'monopolies', 'body_politic', 'odious']
for query_position, query_word in enumerate(eic_query_words):
    if query_position > 0:
        print('\n')
    similar(publica_eic, query_word, 15)

In [4]:
# `monopoly` is a '|'-separated string of query words imported from the words module
from words import monopoly
m = monopoly.split('|')
# print the 10 nearest neighbours of every query word found in the model
for w in m:
    similar(publica_eic_monopoly,w,10)

monopolies: laws_land freedom_trade retrained restrained equal_right lawful lives_liberties limitations exercie authorities
The most similar word has cosine distance 0.9826760292053223
monopoly: joint_stocks monopolies freedom_trade privilege retrained regulated_company majeties_subjects conideration particulars pretence
The most similar word has cosine distance 0.9539591670036316
monopolise: carved present_constitution ingroed befalls tion establishment_new moreover impounding avenge majesties_revenue
The most similar word has cosine distance 0.9954888224601746
monopolising: enriched increase_navigation rates_exchange encourages_navigation inland remote_parts riches_strength increases increase_seamen luxurious_prodigal
The most similar word has cosine distance 0.9905856847763062
monopolists: careful_avoid queez superstructure expend perspicuous whiteness proportioned rightly_understood lessening exhauting
The most similar word has cosine distance 0.995648205280304
monopolizers: dearet

In [None]:
# Word pairs to score against each other, passed to comparePair in the order listed.
monopoly_word_pairs = [
    ('body_politic', 'monopoly'),
    ('body_politick', 'monopoly'),
    ('body_politic', 'monopolies'),
    ('body_politick', 'monopolies'),
    ('public_utility', 'monopolies'),
    ('public_affairs', 'monopolies'),
    ('wicked', 'monopoly'),
    ('illegal', 'monopoly'),
    ('engrossing', 'monopoly'),
    ('odious', 'monopoly'),
    ('evil', 'monopoly'),
    ('repugnant', 'monopoly'),
]
for pair_first, pair_second in monopoly_word_pairs:
    comparePair(publica_eic_monopoly, pair_first, pair_second)

In [None]:
# Each of these words is compared with 'corruption', in the order listed.
corruption_partner_words = [
    'arbitrary',
    'popery',
    'papist',
    'monopoly',
    'bribery',
    'remedies',
    'remedy',
    'monopoly_evils',
    'body_politick',
    'body_politic',
]
for partner_word in corruption_partner_words:
    comparePair(publica_eic_monopoly, partner_word, 'corruption')

In [None]:
# Print the 10 nearest neighbours of each query word, in the order listed.
misc_query_words = ['circulation', 'disease', 'remedy', 'remedies', 'wasting', 'waste']
for misc_query_word in misc_query_words:
    similar(publica_eic_monopoly, misc_query_word, 10)

In [None]:
# '|'-separated query words; the variants without the first 's' (e.g. 'conume')
# presumably match tokens where a long s was lost in transcription - TODO confirm
consumption = 'consumption|consume|consuming|consumed|conume|conumption|conuming|conumed'
c = consumption.split('|')
# print the 10 nearest neighbours of every variant found in the model
for w in c:
    similar(publica_eic_monopoly,w,10)

In [None]:
# '|'-separated query words built on 'corrupt'
corruption = 'corruption|corrupt|corrupted|corruptions|corrupting'
corrupt = corruption.split('|')
# print the 10 nearest neighbours of every form found in the model
for w in corrupt:
    similar(publica_eic_monopoly,w,10)