By Amy Weng
Word2Vec Continuous Skip-Gram Model Word Embedding and Visualization
https://radimrehurek.com/gensim/models/word2vec.html

In [102]:
import pandas as pd
from words import remove_stopwords
from gensim.scripts import word2vec2tensor
from gensim.models import Word2Vec as w2v
from gensim.models import Phrases

def embed(df,title):
    """Train a skip-gram Word2Vec model on the ``text`` column of ``df``.

    The documents are passed through ``remove_stopwords``, a ``Phrases``
    bigram detector is fitted on the result, and the phrase-transformed
    corpus is used to train the model. The trained word vectors are then
    written in word2vec format under the embeddings directory.

    Parameters
    ----------
    df : dataframe exposing a ``text`` column
    title : str
        Base name of the output file; ``'.model'`` is appended.

    Returns
    -------
    The trained Word2Vec model.
    """
    # Hand the raw documents to the project's preprocessing helper
    # (presumably it tokenizes as well as dropping stopwords -- see words.py).
    documents = remove_stopwords(df.text.values.tolist())
    # Fit the bigram detector, then apply it lazily to the same corpus.
    phrase_detector = Phrases(documents)
    phrased_corpus = phrase_detector[documents]
    # sg=1 selects skip-gram; min_count=1 keeps even words seen only once.
    embedding = w2v(phrased_corpus, min_count=1, sg=1)
    # Only the vectors are saved (word2vec format), despite the '.model' suffix.
    target = '/home/rapiduser/Materials/embeddings/'+title+'.model'
    embedding.wv.save_word2vec_format(target)
    return embedding

def similar(model,word):
    """Print the ten nearest neighbours of ``word`` on a single line.

    The line has the form ``"<word>: <n1> <n2> ..."``. Nothing is printed
    when ``word`` is not in the model's vocabulary. Returns ``None``.
    """
    # Guard clause: out-of-vocabulary words are skipped silently.
    if word not in model.wv.key_to_index:
        return
    # most_similar yields (token, score) pairs; only the tokens are shown.
    neighbours = [token for token, _score in model.wv.most_similar(word,topn=10)]
    print(word + ': '+' '.join(neighbours))

def comparePair(model,word1,word2):
    """Print the cosine similarity between ``word1`` and ``word2``.

    The similarity is printed only when both words are in the model's
    vocabulary; otherwise the call prints nothing. Returns ``None``.

    Parameters
    ----------
    model : trained model exposing ``wv.key_to_index`` and ``wv.similarity``
    word1, word2 : str
        The two vocabulary entries to compare.
    """
    vocabulary = model.wv.key_to_index
    # Each word needs its own membership test: ``word1 and word2 in vocab``
    # parses as ``word1 and (word2 in vocab)``, which never looks word1 up
    # and lets an unknown word1 fall through to the similarity lookup.
    if word1 in vocabulary and word2 in vocabulary:
        print('Cosine similarity between ' + word1 + ' and ' + word2 + ': ',model.wv.similarity(word1, word2))

def tensor(f_name):
    """Convert a saved word2vec-format file into tensor TSV output.

    Reads ``<f_name>.model`` from the embeddings directory (the file that
    ``embed`` writes) and passes it to gensim's ``word2vec2tensor`` script
    with ``f_name`` under the "Embeddings TSV" directory as the output name.
    Returns ``None``.
    """
    # Input: the text-format (binary=False) vectors file.
    source_path = '/home/rapiduser/Materials/embeddings/'+f_name+'.model'
    # Output: base name handed to the converter for the files it writes.
    output_prefix = '/home/rapiduser/ECBC-21-22/Text_Files/Embeddings TSV/'+f_name
    word2vec2tensor.word2vec2tensor(source_path, output_prefix, binary=False)

In [90]:
# Location of the CSV holding the corpus for this run.
myCSV = '/home/rapiduser/Materials/topic model/publica/eic_monopoly.csv'

# Read the CSV into a dataframe; embed() reads its ``text`` column.
readFile = pd.read_csv(myCSV)

# Train the skip-gram model on the corpus and save its vectors under the
# name 'publica_eic_monopoly'; the later cells query this model.
publica_eic_monopoly = embed(readFile,'publica_eic_monopoly')

In [93]:
# ``monopoly`` is a '|'-separated string of search terms defined in words.py
# (presumably spelling variants of "monopoly" -- confirm there).
from words import monopoly
m = monopoly.split('|')
# Print the nearest neighbours of each term; terms absent from the
# model's vocabulary print nothing.
for w in m:
    similar(publica_eic_monopoly,w)

monopolies: retrained coniderations laws_land reaons equal_right limitations retrain restrain restrained right_subject
monopoly: retrained wrong privilege joint_stocks contitution monopolies body_politic majeties_subjects public_good regulated_company
monopolise: ingroed firm_legal inspect prohibition_denied apprehensive carved values_bought shares_shares parliamentary_sanction respectively
monopolising: remote_parts rates_exchange placeat riches_strength encourages_navigation fishing_trades increase_seamen importation_foreign products_manufactories increase_navigation
monopolists: exhausting cor_living meaner_sort course_living someimes paeth get_livelihood prodigal_expense straining considerate
monopolizers: spared ubtle collecting intrigue steal supplied_negroes ullen consumers silk_weavers prejudice_arise
monopolised: admiion excluive forces_forts admission majeties_plantations managing managed_joint ettling confining solely
monoopolies: molleux crowing cloaks privileges_confirmed 

In [100]:
# Pairwise cosine similarities in the trained model: first a set of terms
# measured against 'corruption', then a set measured against 'monopoly'.
# A pair prints nothing when comparePair's vocabulary check fails.
comparePair(publica_eic_monopoly,'arbitrary','corruption')
comparePair(publica_eic_monopoly,'popery','corruption')
comparePair(publica_eic_monopoly,'papist','corruption')
comparePair(publica_eic_monopoly,'monopoly','corruption')
comparePair(publica_eic_monopoly,'silk','corruption')
comparePair(publica_eic_monopoly,'gold','corruption')
comparePair(publica_eic_monopoly,'king','corruption')
comparePair(publica_eic_monopoly,'parliament','corruption')
# Terms compared against 'monopoly'; the last two are bigram tokens
# produced by the Phrases detector.
comparePair(publica_eic_monopoly,'odious','monopoly')
comparePair(publica_eic_monopoly,'consumption','monopoly')
comparePair(publica_eic_monopoly,'body_politic','monopoly')
comparePair(publica_eic_monopoly,'public_good','monopoly')

Cosine similarity between arbitrary and corruption:  0.85236233
Cosine similarity between popery and corruption:  0.82550555
Cosine similarity between papist and corruption:  0.8281945
Cosine similarity between monopoly and corruption:  0.69432163
Cosine similarity between monopoly and company:  0.5970369
Cosine similarity between silk and corruption:  0.70815533
Cosine similarity between gold and corruption:  0.80199057
Cosine similarity between king and corruption:  0.441147
Cosine similarity between parliament and corruption:  0.49644107
Cosine similarity between odious and monopoly:  0.8537864
Cosine similarity between consumption and monopoly:  0.6824925
Cosine similarity between body_politic and monopoly:  0.9510032
Cosine similarity between public_good and monopoly:  0.9504082


In [None]:
# '|'-separated forms of "consumption"; the list includes spellings with a
# dropped 's' (e.g. 'conume'), presumably to match transcription errors in
# the corpus -- confirm against the source texts.
consumption = 'consumption|consume|consuming|consumed|conume|conumption|conuming|conumed'
c = consumption.split('|')
# Print the nearest neighbours of each form found in the vocabulary.
for w in c:
    similar(publica_eic_monopoly,w)

In [None]:
# '|'-separated forms of "corruption"; entries such as 'corrupttoi' and
# 'corruptpag' look like run-together tokens from the corpus rather than
# real words -- presumably included deliberately, confirm.
corruption = 'corruption|corrupt|corrupted|corruptions|corruptly|corrupting|corrupttoi|corruptpag|corruptness'
corrupt = corruption.split('|')
# Print the nearest neighbours of each form found in the vocabulary.
for w in corrupt:
    similar(publica_eic_monopoly,w)