By Amy Weng

Word2Vec Continuous Skip-Gram Model Word Embedding and Visualization

https://radimrehurek.com/gensim/models/word2vec.html

In [161]:
import pandas as pd
from words import remove_stopwords
from gensim.scripts import word2vec2tensor
from gensim.models import Word2Vec as w2v
from gensim.models import Phrases

def embed(df,title):
    """Train a skip-gram Word2Vec model on ``df.text`` and save its vectors.

    Args:
        df: Object with a ``text`` attribute whose ``.values.tolist()`` yields
            the documents (in this notebook, a pandas DataFrame from read_csv).
        title: File stem; the vectors are written to
            ``/home/rapiduser/Materials/embeddings/<title>.model``.

    Returns:
        The trained Word2Vec model.
    """
    # Hand the raw documents to the project helper for preprocessing and
    # stopword removal (its exact output format is defined in ``words``).
    documents = remove_stopwords(df.text.values.tolist())

    # Fit the phrase (bigram) detector on the cleaned documents.
    phrase_detector = Phrases(documents)

    # sg=1 selects skip-gram; min_count=1 is passed so rare words are kept.
    skipgram = w2v(phrase_detector[documents], min_count=1, sg=1)

    # Despite the '.model' suffix this is the word2vec text format, which is
    # what tensor() reads back later.
    target = '/home/rapiduser/Materials/embeddings/' + title + '.model'
    skipgram.wv.save_word2vec_format(target)
    return skipgram

def similar(model,word):
    """Print the ten nearest neighbours of *word*, if it is in the vocabulary.

    Args:
        model: Object exposing ``wv.key_to_index`` and ``wv.most_similar``.
        word: Token to look up.

    Returns:
        None. Prints ``"<word>: <n1> <n2> ..."``; prints nothing when *word*
        is not in the vocabulary.
    """
    vectors = model.wv
    # Unknown words are skipped silently so callers can loop over candidate
    # spellings without guarding each one.
    if word not in vectors.key_to_index:
        return
    # most_similar yields (token, score) pairs; only the tokens are shown.
    neighbours = [token for token, _score in vectors.most_similar(word, topn=10)]
    print(word + ': '+' '.join(neighbours))

def comparePair(model,word1,word2):
    """Print the cosine similarity of two words when both are in the vocabulary.

    Args:
        model: Object exposing ``wv.key_to_index`` and ``wv.similarity``
            (e.g. the Word2Vec model returned by ``embed``).
        word1: First token (single word or bigram such as ``'body_politic'``).
        word2: Second token.

    Returns:
        None. One line is printed when both tokens are known; nothing is
        printed when either token is missing from the vocabulary.
    """
    vocabulary = model.wv.key_to_index
    # Each word needs its own membership test: ``word1 and word2 in vocab``
    # parses as ``word1 and (word2 in vocab)``, which let an unknown word1
    # through to similarity() where it raised KeyError.
    if word1 in vocabulary and word2 in vocabulary:
        print('Cosine similarity between ' + word1 + ' and ' + word2 + ': ',model.wv.similarity(word1, word2))

def tensor(f_name):
    """Convert the saved word2vec-format vectors for *f_name* to TSV output.

    Args:
        f_name: File stem shared by the input
            (``/home/rapiduser/Materials/embeddings/<f_name>.model``) and the
            output prefix under ``.../Text_Files/Embeddings TSV/``.

    Returns:
        None. The conversion is done by gensim's ``word2vec2tensor`` script.
    """
    # Input: the text-format (binary=False) vectors written by embed().
    vectors_path = '/home/rapiduser/Materials/embeddings/'+f_name+'.model'
    # Output prefix handed to the gensim script, which names the TSV files.
    tsv_prefix = '/home/rapiduser/ECBC-21-22/Text_Files/Embeddings TSV/'+f_name
    word2vec2tensor.word2vec2tensor(vectors_path, tsv_prefix, binary=False)

In [162]:
# Path to the CSV holding the corpus; embed() reads its `text` column.
myCSV = '/home/rapiduser/Materials/topic model/publica/eic_monopoly.csv'

# read text information into a dataframe
readFile = pd.read_csv(myCSV)

# Train the skip-gram model on the corpus; embed() also saves the vectors as
# 'publica_eic_monopoly.model' in the embeddings directory.
publica_eic_monopoly = embed(readFile,'publica_eic_monopoly')
# Convert the saved vectors to TSV, using the same file stem.
tensor('publica_eic_monopoly')

In [164]:
# `monopoly` comes from the project's words module and is treated here as a
# '|'-separated string of word forms.
from words import monopoly
m = monopoly.split('|')
# Print the nearest neighbours of each form; similar() silently skips forms
# that are not in the model's vocabulary.
for w in m:
    similar(publica_eic_monopoly,w)

monopolies: retrained laws_land coniderations agreeable freedom_trade body_politic authorities circumtances privilege equal_right
monopoly: body_politic joint_stocks concerned monopolies privilege majeties_subjects contitution wrong pretence regulated_company
monopolise: partaker corporation_excluding dissolution_present shares_stock firm_legal intereed rrying shares_shares present_constitution tion
monopolising: enriched fishing_trades remote_parts things_necessary wet_indies marts riches_strength fihing_trades vast_numbers greater_prices
monopolists: monopolits double_tax disadvantage cor_living suggeted uefulnes inclinable nations_conume straining lessening
monopolizers: supplied_negroes ot herring_fishery straining necessitated neglects intellect sometimes_carried spared proportioned
monopolised: admission forces_forts admiion managing hout majeties_plantations managed_joint enlargement unneceary engrossed
monoopolies: molleux crowing cloaks privileges_confirmed loveteyns cotton_ca

In [165]:
# Word pairs whose cosine similarity is reported, in display order.
_word_pairs = [
    ('arbitrary', 'corruption'),
    ('popery', 'corruption'),
    ('papist', 'corruption'),
    ('monopoly', 'corruption'),
    ('king', 'corruption'),
    ('parliament', 'corruption'),
    ('public_good', 'corruption'),
    ('body_politick', 'corruption'),
    ('body_politic', 'corruption'),
    ('bribery', 'corruption'),
    ('remedies', 'corruption'),
    ('remedy', 'corruption'),
    ('odious', 'monopoly'),
    ('consumption', 'monopoly'),
    ('body_politic', 'monopoly'),
    ('body_politick', 'monopoly'),
    ('public_good', 'monopoly'),
    ('evil', 'monopoly'),
    ('repugnant', 'monopoly'),
    ('monopoly_evils', 'corruption'),
    ('silk', 'corruption'),
    ('gold', 'corruption'),
]
for _first, _second in _word_pairs:
    comparePair(publica_eic_monopoly, _first, _second)

Cosine similarity between arbitrary and corruption:  0.90487784
Cosine similarity between popery and corruption:  0.8567077
Cosine similarity between papist and corruption:  0.86606336
Cosine similarity between monopoly and corruption:  0.71390474
Cosine similarity between king and corruption:  0.43326586
Cosine similarity between parliament and corruption:  0.4980578
Cosine similarity between public_good and corruption:  0.77165323
Cosine similarity between body_politick and corruption:  0.9415461
Cosine similarity between body_politic and corruption:  0.7938976
Cosine similarity between bribery and corruption:  0.9145992
Cosine similarity between remedies and corruption:  0.9604492
Cosine similarity between remedy and corruption:  0.9067065
Cosine similarity between odious and monopoly:  0.8497932
Cosine similarity between consumption and monopoly:  0.70815563
Cosine similarity between body_politic and monopoly:  0.9530904
Cosine similarity between body_politick and monopoly:  0.8265

In [166]:
# Print nearest neighbours for a handful of individual probe terms.
for _probe in (
    'public_good',
    'circulation',
    'disease',
    'remedy',
    'remedies',
    'wasting',
    'waste',
):
    similar(publica_eic_monopoly, _probe)

public_good: managers joint_stocks regulated_company admit permit apparent depending propoed alteration admitted
circulation: stock_treasure arises gather clog originally subsist impoverish conceal recourse measure
disease: discern spare observation invention simple harder puts right_knowledge suffice circle
remedy: avoid calculated evils community appearing worse merit propose intend obviate
remedies: explication vulgar diseases intricate_subject puts errors counsels iniquity practise virtuous
wasting: immense_sums clipp finical ervile unnecessary_charge slow circulate_amongst live_comfortably tend_increase degenerates
waste: coarse grower less_quantity dearth carcity pretty likely_grow train weighs wate


In [167]:
# '|'-separated forms of "consume". The s-less spellings (e.g. 'conume')
# presumably match transcription artefacts present in the corpus -- TODO confirm.
consumption = 'consumption|consume|consuming|consumed|conume|conumption|conuming|conumed'
c = consumption.split('|')
# Print the nearest neighbours of each form found in the vocabulary.
for w in c:
    similar(publica_eic_monopoly,w)

consumption: conumption manufactured manufactury vent materials greater_quantities consumed exportations supplied importations
consume: conume trades_carried deceitful subsistence produces barter mines_gold working native_fruits silver_gold
consuming: spends sufficient_supply greatest_share meaner_ort correspondence_presidencies bear_charge admiralities thrut beating forty_pounds
consumed: vend hinder_expense greater_quantities importations conumed upplyed inconsiderable exportations supplied inconiderable
conume: consume growths working hinder_expense raw transported necessaries native_fruits deceitful trades_carried
conumption: consumption vent manufactured manufactury importation silks_linens greater_quantities woollen_manufactures cheaper exportations
conuming: ued_utmot merchantable conumptions desirable lessening decreased extraordinary_charge starve shifting prodigal_expense
conumed: consumed inconiderable hinder_expense inconsiderable abundantly exportations supplied fishery im

In [169]:
# '|'-separated forms of "corrupt". Entries such as 'corrupttoi' and
# 'corruptpag' look like tokenisation artefacts from the corpus -- TODO confirm.
corruption = 'corruption|corrupt|corrupted|corruptions|corruptly|corrupting|corrupttoi|corruptpag|corruptness'
corrupt = corruption.split('|')
# Print the nearest neighbours of each form found in the vocabulary.
for w in corrupt:
    similar(publica_eic_monopoly,w)

corruption: woney rule_regulating simplicity false right_knowledge evils_errors explain_meaning intricate_subject instruct universally
corrupt: trike man_naturally anguish remembrance sport revellings painful changeable insufficient humanly_speaking
corrupted: gaming preserv draws lazy coveted ports_mediterranean rouse reasoning thekings augmenting
corruptions: public_utility bye case_stands diadvantageous emolument affecting setting_poor endowment grutched deductions
corruptly: hog carpenters las puerto nycoppen fruit_trees caribdis funen situate_degrees curiously
corrupting: blameable fittest dut contrivances courthip mutinous brachmans god_ordinance poterities visiting
corrupttoi: standar destained beneficence almans plowden hurled algate cahell nimium rursus
corruptpag: information_riot wulferre princerespected raynar offence_committed lawsthe bourk_clanrickard peeping anthropo mephiboheth
corruptness: praetor erberie vision caligula ipsi pars anointed curses academy funerals
