By Amy Weng

Word2Vec Continuous Skip-Gram Model Word Embedding and Visualization

https://radimrehurek.com/gensim/models/word2vec.html

In [42]:
import pandas as pd
from words import remove_stopwords
from gensim.scripts import word2vec2tensor
from gensim.models import Word2Vec
from gensim.models import Phrases

def embed(df,title):
    data = df.text.values.tolist()
    # preprocess and remove stopwords
    data = remove_stopwords(data)
    # Train a bigram detector.
    bigram_transformer = Phrases(data)
    # Train a skip-gram model with the bigram detector 
    model = Word2Vec(bigram_transformer[data], min_count=3,sg=1)
    # save model so we can reload later  
    model.save('/home/rapiduser/Materials/embeddings/'+title+'.model')
    # save model in a format that can be converted to tensor TSV 
    model.wv.save_word2vec_format('/home/rapiduser/Materials/embeddings/tensor/'+title+'.model')
    return(model)

def similar(model,word):
    if word in model.wv.key_to_index.keys():
        words = []
        for w, s in model.wv.most_similar(word,topn=10):
            words.append(w)
        print(word + ': '+' '.join(words))

def comparePair(model,word1,word2):
    if word1 and word2 in model.wv.key_to_index.keys():
        print('Cosine similarity between ' + word1 + ' and ' + word2 + ': ',model.wv.similarity(word1, word2))

def tensor(f_name):
    word2vec2tensor.word2vec2tensor(
        '/home/rapiduser/Materials/embeddings/tensor/'+f_name+'.model',
        '/home/rapiduser/ECBC-21-22/Text_Files/Embeddings TSV/'+f_name,
        binary=False)

In [50]:
myCSV = '/home/rapiduser/Materials/topic model/publica/eic_monopoly.csv'

readFile = pd.read_csv(myCSV)

# read text information into a dataframe
publica_eic_monopoly = embed(readFile,'publica_eic_monopoly')
tensor('publica_eic_monopoly')

In [None]:
publica_eic_monopoly = Word2Vec.load('/home/rapiduser/Materials/embeddings/publica_eic_monopoly.model')

In [51]:
from words import monopoly
m = monopoly.split('|')
for w in m:
    similar(publica_eic_monopoly,w)

monopolies: retrained laws_land restrained restrain freedom_trade contitution lives_liberties monopoly legally privilege
monopoly: monopolies privilege joint_stocks retrained wrong contitution majeties_subjects regulated_company allege majesties_subjects
monopolise: entitling obligations_owe allegations humbly_beg parliamentary_sanction commodities_imported writing_signed strengthen extrajudicial first_discoverers
monopolising: remote_parts rates_exchange encourages_navigation increases increase_seamen increase_navigation low_ebb marts greater_prices afford_cheaper
monopolists: groans_plantations straining monopolits exhausting goods_manufactured pony restrictions_limitations cumber monter perishable
monopolizers: dearest recompene envied raiment superfluity spinsters card excellencies beating turky
monopolised: solely forces_forts managing obstructed majeties_revenue factories_forts olely limitation etablihing confining
monoopolies: steephen emond error_aign ectam pragmatical title_cr

In [52]:
comparePair(publica_eic_monopoly,'arbitrary','corruption')
comparePair(publica_eic_monopoly,'popery','corruption')
comparePair(publica_eic_monopoly,'papist','corruption')
comparePair(publica_eic_monopoly,'monopoly','corruption')
comparePair(publica_eic_monopoly,'king','corruption')
comparePair(publica_eic_monopoly,'parliament','corruption')
comparePair(publica_eic_monopoly,'bribery','corruption')
comparePair(publica_eic_monopoly,'remedies','corruption')
comparePair(publica_eic_monopoly,'remedy','corruption')
comparePair(publica_eic_monopoly,'monopoly_evils','corruption')
comparePair(publica_eic_monopoly,'silk','corruption')
comparePair(publica_eic_monopoly,'gold','corruption')
comparePair(publica_eic_monopoly,'body_politick','corruption')
comparePair(publica_eic_monopoly,'body_politic','corruption')
comparePair(publica_eic_monopoly,'body_politic','monopoly')
comparePair(publica_eic_monopoly,'body_politick','monopoly')
comparePair(publica_eic_monopoly,'odious','monopoly')
comparePair(publica_eic_monopoly,'consumption','monopoly')
comparePair(publica_eic_monopoly,'evil','monopoly')
comparePair(publica_eic_monopoly,'repugnant','monopoly')

Cosine similarity between arbitrary and corruption:  0.8638341
Cosine similarity between popery and corruption:  0.8100888
Cosine similarity between papist and corruption:  0.82812756
Cosine similarity between monopoly and corruption:  0.6794247
Cosine similarity between king and corruption:  0.4092655
Cosine similarity between parliament and corruption:  0.5190952
Cosine similarity between bribery and corruption:  0.8954179
Cosine similarity between remedies and corruption:  0.9430104
Cosine similarity between remedy and corruption:  0.88032734
Cosine similarity between monopoly_evils and corruption:  0.918505
Cosine similarity between silk and corruption:  0.631485
Cosine similarity between gold and corruption:  0.7485949
Cosine similarity between body_politick and corruption:  0.90726376
Cosine similarity between body_politic and corruption:  0.72693694
Cosine similarity between body_politic and monopoly:  0.93463105
Cosine similarity between body_politick and monopoly:  0.82322466


In [53]:
similar(publica_eic_monopoly,'circulation')
similar(publica_eic_monopoly,'disease')
similar(publica_eic_monopoly,'remedy')
similar(publica_eic_monopoly,'remedies')
similar(publica_eic_monopoly,'wasting')
similar(publica_eic_monopoly,'waste')

circulation: conceal answer_ends impoverish utain step measure gainer abate intercoure hoard
disease: invention vice discern attention ignorance harder multiply right_knowledge intelligible solid
remedy: expedients grievance evils calculated avoid duly requires practicable seems care_taken
remedies: vulgar explication intricate_subject puts disease calling steps somewhat simplicity_original questions
wasting: intolerable affluence violet wider tend_increase circulate_amongst consum debauchery thinner unspeakable
waste: country_abounding lower_value train wate consum fast comparatively dearth center enables


In [54]:
consumption = 'consumption|consume|consuming|consumed|conume|conumption|conuming|conumed'
c = consumption.split('|')
for w in c:
    similar(publica_eic_monopoly,w)

consumption: conumption manufactured manufactury greater_quantities vent raw silks_linens importations materials consumed
consume: conume workmen working barter utter spend deceitful trades_carried hinder_expense subsistence
consuming: prevented_poible greatest_share proper_remedies goods_manufactured answer_question russians things_premised filthy obstructs ruians
consumed: hinder_expense conumed vend exportation_products scarcity greater_quantities necessaries importations cheaper_manufactures upply_want
conume: consume hinder_expense working transported necessaries raw deceitful furnish growths woollen
conumption: consumption manufactury greater_quantities vent manufactured silks_linens importations importation woollen_goods vat_quantities
conuming: conumptions mint_coined rightly_understood wating tend_increae bargains lessening etimates effectual_methods jotle
conumed: consumed upply_want abundantly prizes hinder_expense overplus greater_value cheaper_manufactures loose employ_poo

In [55]:
corruption = 'corruption|corrupt|corrupted|corruptions|corrupting'
corrupt = corruption.split('|')
for w in corrupt:
    similar(publica_eic_monopoly,w)

corruption: damage_danger rule_regulating laudable right_knowledge enumerated suggest remedies_disease acquir calamity honestly
corrupt: fortitude trike accountable_god unbecoming overeign government_ordinance beseech eminence generosity strife
corrupted: gaming impoverishing improv quickness cherish languishes occupation lay_foundation purely abrogated
corruptions: endeavours_obtain appears_plain best_managed commended unbiased emolument endowment conducing posture vainly
corrupting: fittest ceases dicerning discerning gadding cation disturbed access liable_payment wiet
