By Amy Weng

Word2Vec Continuous Skip-Gram Model Word Embedding and Visualization

https://radimrehurek.com/gensim/models/word2vec.html

In [25]:
import pandas as pd
from words import remove_stopwords
from gensim.scripts import word2vec2tensor
from gensim.models import Word2Vec
from gensim.models import Phrases

def embed(df,title):
    """Train a skip-gram Word2Vec model on ``df.text`` and save it to disk.

    Args:
        df: object exposing a ``text`` column whose ``.values.tolist()``
            yields the documents (a pandas DataFrame at the call site).
        title: file stem used for both saved artefacts.

    Returns:
        The trained gensim ``Word2Vec`` model.
    """
    model_path = '/home/rapiduser/Materials/embeddings/'+title+'.model'
    tensor_path = '/home/rapiduser/Materials/embeddings/tensor/'+title+'.model'

    # Preprocessing and stopword removal are delegated to the `words` helper.
    documents = remove_stopwords(df.text.values.tolist())

    # Learn frequent bigrams, then feed the bigram-merged corpus to Word2Vec.
    # sg=1 selects skip-gram; min_count=1 keeps every token in the vocabulary.
    phrase_detector = Phrases(documents)
    trained = Word2Vec(phrase_detector[documents], min_count=1, sg=1)

    # Native gensim format, reloadable with Word2Vec.load().
    trained.save(model_path)
    # Plain word2vec format, the input expected by the tensor TSV conversion.
    trained.wv.save_word2vec_format(tensor_path)
    return trained

def similar(model,word):
    """Print the ten nearest neighbours of ``word`` on one line.

    Nothing is printed when ``word`` is not in the model's vocabulary.

    Args:
        model: object whose ``wv`` attribute offers ``key_to_index`` and
            ``most_similar`` (a gensim Word2Vec model at the call sites).
        word: the query term.
    """
    vectors = model.wv
    # Guard clause: out-of-vocabulary words are skipped silently.
    if word not in vectors.key_to_index.keys():
        return
    # most_similar yields (neighbour, score) pairs; only the neighbour is shown.
    neighbours = [neighbour for neighbour, _score in vectors.most_similar(word, topn=10)]
    print(word + ': ' + ' '.join(neighbours))

def comparePair(model,word1,word2):
    """Print the cosine similarity of two words if both are in the vocabulary.

    Nothing is printed when either word is missing from the model.

    Args:
        model: object whose ``wv`` attribute offers ``key_to_index`` and
            ``similarity`` (a gensim Word2Vec model at the call sites).
        word1: first term of the pair.
        word2: second term of the pair.
    """
    vocabulary = model.wv.key_to_index
    # Each word needs its own membership test: `word1 and word2 in vocab`
    # parses as `word1 and (word2 in vocab)`, which never checks word1 and
    # lets an unknown word1 reach similarity().
    if word1 in vocabulary and word2 in vocabulary:
        print('Cosine similarity between ' + word1 + ' and ' + word2 + ': ',model.wv.similarity(word1, word2))

def tensor(f_name):
    """Convert a saved word2vec-format file to TSV files via gensim.

    Args:
        f_name: file stem; the input is read from the ``tensor`` embeddings
            directory and the output prefix is placed under
            ``Embeddings TSV``.
    """
    word2vec_file = '/home/rapiduser/Materials/embeddings/tensor/'+f_name+'.model'
    tsv_prefix = '/home/rapiduser/ECBC-21-22/Text_Files/Embeddings TSV/'+f_name
    # binary=False: the input was written as text by save_word2vec_format.
    word2vec2tensor.word2vec2tensor(word2vec_file, tsv_prefix, binary=False)

In [26]:
# corpus CSV for the monopoly texts
myCSV = '/home/rapiduser/Materials/topic model/publica/eic_monopoly.csv'

# read text information into a dataframe
readFile = pd.read_csv(filepath_or_buffer=myCSV)

# train and save the embedding, then export it for tensor visualization
publica_eic_monopoly = embed(df=readFile, title='publica_eic_monopoly')
tensor(f_name='publica_eic_monopoly')

In [2]:
# Reload the model previously written by embed(), so it need not be retrained.
publica_eic_monopoly = Word2Vec.load('/home/rapiduser/Materials/embeddings/publica_eic_monopoly.model')

In [31]:
from words import monopoly

# `monopoly` is a '|'-separated string of search terms; list the nearest
# neighbours of each term (terms missing from the vocabulary print nothing).
m = monopoly.split('|')
for w in m:
    similar(model=publica_eic_monopoly, word=w)

monopolies: retrained laws_land restrained restrain freedom_trade limitations contitution lives_liberties retrain equal_right
monopoly: monopolies regulated_company contitution privilege joint_stocks majeties_subjects wrong retrained conideration expedient
monopolise: escheat present_constitution moreover firm_legal ingroed fifths annual_charge ready_prove trengthen carved
monopolising: increase_seamen increases advancing_lands inland vast_numbers augment increase_navigation enriched rates_exchange fishing_trades
monopolists: exhauting prevented_possible shooing whereoever lack obstructs securely division decoy consuming
monopolizers: shopkeeper hard_labour dearet sprung hares consuming adventures_returns betowed mariner butchers
monopolised: managing confining forces_forts advocates majeties_revenue admiion enlargement factors_servants extenive years_past
monoopolies: total_villa diowns aliquas spelthorn devises_lands insanity desuetude sauk banister bridge_tower
monopolits: pounds_an

In [28]:
# Cosine similarity of selected terms against 'monopoly' / 'monopolies',
# evaluated in the listed order.
for _first, _second in (
    ('body_politic', 'monopoly'),
    ('body_politick', 'monopoly'),
    ('body_politic', 'monopolies'),
    ('body_politick', 'monopolies'),
    ('public_utility', 'monopolies'),
    ('public_affairs', 'monopolies'),
    ('wicked', 'monopoly'),
    ('illegal', 'monopoly'),
    ('engrossing', 'monopoly'),
    ('odious', 'monopoly'),
    ('evil', 'monopoly'),
    ('repugnant', 'monopoly'),
):
    comparePair(publica_eic_monopoly, _first, _second)

Cosine similarity between body_politic and monopoly:  0.94068277
Cosine similarity between body_politick and monopoly:  0.823261
Cosine similarity between body_politic and monopolies:  0.9581043
Cosine similarity between body_politick and monopolies:  0.8644752
Cosine similarity between public_utility and monopolies:  0.8966991
Cosine similarity between public_affairs and monopolies:  0.88422114
Cosine similarity between wicked and monopoly:  0.7428431
Cosine similarity between illegal and monopoly:  0.86903757
Cosine similarity between engrossing and monopoly:  0.8959576
Cosine similarity between odious and monopoly:  0.83295286
Cosine similarity between evil and monopoly:  0.7421546
Cosine similarity between repugnant and monopoly:  0.8678481


In [29]:
# Cosine similarity of selected terms against 'corruption', evaluated in
# the listed order.
for _term in (
    'arbitrary',
    'popery',
    'papist',
    'monopoly',
    'king',
    'parliament',
    'bribery',
    'remedies',
    'remedy',
    'monopoly_evils',
    'body_politick',
    'body_politic',
):
    comparePair(publica_eic_monopoly, _term, 'corruption')

Cosine similarity between arbitrary and corruption:  0.87673855
Cosine similarity between popery and corruption:  0.8303967
Cosine similarity between papist and corruption:  0.8544534
Cosine similarity between monopoly and corruption:  0.70118546
Cosine similarity between king and corruption:  0.43684372
Cosine similarity between parliament and corruption:  0.52153397
Cosine similarity between bribery and corruption:  0.91377383
Cosine similarity between remedies and corruption:  0.9649916
Cosine similarity between remedy and corruption:  0.88699967
Cosine similarity between monopoly_evils and corruption:  0.94034
Cosine similarity between body_politick and corruption:  0.9369623
Cosine similarity between body_politic and corruption:  0.7542465


In [53]:
# Nearest neighbours for a handful of individual terms, in the listed order.
for _term in ('circulation', 'disease', 'remedy', 'remedies', 'wasting', 'waste'):
    similar(publica_eic_monopoly, _term)

circulation: conceal answer_ends impoverish utain step measure gainer abate intercoure hoard
disease: invention vice discern attention ignorance harder multiply right_knowledge intelligible solid
remedy: expedients grievance evils calculated avoid duly requires practicable seems care_taken
remedies: vulgar explication intricate_subject puts disease calling steps somewhat simplicity_original questions
wasting: intolerable affluence violet wider tend_increase circulate_amongst consum debauchery thinner unspeakable
waste: country_abounding lower_value train wate consum fast comparatively dearth center enables


In [54]:
# '|'-separated forms of "consume"/"consumption", including variant spellings.
consumption = 'consumption|consume|consuming|consumed|conume|conumption|conuming|conumed'
c = consumption.split('|')
# list the nearest neighbours of every form
for w in c:
    similar(model=publica_eic_monopoly, word=w)

consumption: conumption manufactured manufactury greater_quantities vent raw silks_linens importations materials consumed
consume: conume workmen working barter utter spend deceitful trades_carried hinder_expense subsistence
consuming: prevented_poible greatest_share proper_remedies goods_manufactured answer_question russians things_premised filthy obstructs ruians
consumed: hinder_expense conumed vend exportation_products scarcity greater_quantities necessaries importations cheaper_manufactures upply_want
conume: consume hinder_expense working transported necessaries raw deceitful furnish growths woollen
conumption: consumption manufactury greater_quantities vent manufactured silks_linens importations importation woollen_goods vat_quantities
conuming: conumptions mint_coined rightly_understood wating tend_increae bargains lessening etimates effectual_methods jotle
conumed: consumed upply_want abundantly prizes hinder_expense overplus greater_value cheaper_manufactures loose employ_poo

In [55]:
# '|'-separated forms of "corrupt"/"corruption".
corruption = 'corruption|corrupt|corrupted|corruptions|corrupting'
corrupt = corruption.split('|')
# list the nearest neighbours of every form
for w in corrupt:
    similar(model=publica_eic_monopoly, word=w)

corruption: damage_danger rule_regulating laudable right_knowledge enumerated suggest remedies_disease acquir calamity honestly
corrupt: fortitude trike accountable_god unbecoming overeign government_ordinance beseech eminence generosity strife
corrupted: gaming impoverishing improv quickness cherish languishes occupation lay_foundation purely abrogated
corruptions: endeavours_obtain appears_plain best_managed commended unbiased emolument endowment conducing posture vainly
corrupting: fittest ceases dicerning discerning gadding cation disturbed access liable_payment wiet
