# Word2Vec Import

In [17]:
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import stopwords
from string import punctuation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pprint as pp

In [4]:
from nltk.corpus import gutenberg
bible_sentences = gutenberg.sents('bible-kjv.txt')
sw = stopwords.words('english')

In [5]:
bible = [[w.lower() for w in s if w not in punctuation and w not in sw] for s in bible_sentences]
# pp.pprint(bible[:200]) # ppretty print the bible

In [21]:
bible_vec_repr = Word2Vec(bible) # bible_vec_repr is our vector representation model
bible_vec_repr.wv.fill_norms() # this is needed to create synonyms
pp.pprint(bible_vec_repr.wv[0])

array([-0.41680467, -0.8382201 ,  0.13424395,  1.3640419 , -0.6760875 ,
       -0.52173406,  0.87343454, -0.00433023, -0.43176237,  0.5492028 ,
       -0.23415169, -0.06460429,  0.1535433 , -0.32025552, -0.31724018,
       -0.8982869 ,  1.0833757 , -0.2905425 , -0.452064  , -1.9540383 ,
        1.0178686 ,  1.1194868 , -0.10984022, -0.17842367,  0.85790056,
       -0.20858185, -0.13293964,  0.55850005, -1.3293628 , -1.1665276 ,
       -1.1726753 ,  1.1164155 ,  0.03278117,  0.28269196,  0.42691007,
        0.33214143, -0.54662585, -1.1147794 , -0.29612616, -0.15167166,
       -1.9943919 ,  0.0516477 , -0.96721566,  0.6152701 ,  0.6417739 ,
       -0.4621336 , -1.2797962 ,  0.8635972 , -0.5663206 ,  1.5159037 ,
       -1.0187337 ,  1.4493569 , -0.37736425,  0.33860862, -0.15099522,
       -0.4659194 ,  0.93663985, -0.9466536 , -0.6552724 ,  2.094246  ,
        1.0316957 , -0.27493966, -0.4299268 , -0.7016775 ,  0.33957526,
       -0.9740273 , -0.30406207,  0.88687634, -0.23039478,  1.11

In [7]:
pp.pprint(bible_vec_repr.wv.most_similar('god', topn=8)) # what are the most similar words to god?

[('mercy', 0.9263613820075989),
 ('liveth', 0.894212007522583),
 ('grace', 0.8868008255958557),
 ('truth', 0.8864462971687317),
 ('hosts', 0.8857892751693726),
 ('glory', 0.8830561637878418),
 ('salvation', 0.8763213157653809),
 ('righteousness', 0.8710134029388428)]


In [8]:
pp.pprint(bible_vec_repr.wv.most_similar('devil', topn=8)) # what are the most similar words to devil?

[('oppression', 0.9780843257904053),
 ('hard', 0.9777470231056213),
 ('hasty', 0.9759880304336548),
 ('bridle', 0.9705972075462341),
 ('unless', 0.9703243970870972),
 ('snared', 0.9701904654502869),
 ('ministry', 0.9694772362709045),
 ('necks', 0.969402551651001)]


In [9]:
pp.pprint(bible_vec_repr.wv.most_similar('heaven', topn=8)) # what are the most similar words to heaven?

[('heavens', 0.8794105648994446),
 ('earth', 0.8657416105270386),
 ('opened', 0.7618483901023865),
 ('face', 0.7432582974433899),
 ('rain', 0.7384818196296692),
 ('rock', 0.7339916229248047),
 ('power', 0.7300347089767456),
 ('maketh', 0.7235194444656372)]


In [10]:
pp.pprint(bible_vec_repr.wv.most_similar('hell', topn=8)) # what are the most similar words to hell?

[('hairs', 0.9672659635543823),
 ('bind', 0.9642070531845093),
 ('belly', 0.9629202485084534),
 ('root', 0.9627992510795593),
 ('treadeth', 0.9622930884361267),
 ('doves', 0.9617848992347717),
 ('pluck', 0.9610278606414795),
 ('grave', 0.9603418111801147)]


In [11]:
pp.pprint(bible_vec_repr.wv.most_similar('jesus', topn=8)) # what are the most similar words to jesus?

[('cried', 0.8468081951141357),
 ('prophet', 0.839110791683197),
 ('word', 0.8253524899482727),
 ('master', 0.8241215944290161),
 ('told', 0.8143592476844788),
 ('samuel', 0.811827540397644),
 ('david', 0.8029812574386597),
 ('spake', 0.7892252206802368)]


In [12]:
pp.pprint(bible_vec_repr.wv.most_similar('angel', topn=8)) # what are the most similar words to angel?

[('cried', 0.9121896624565125),
 ('samuel', 0.8552687764167786),
 ('loud', 0.8469710946083069),
 ('appeared', 0.8005466461181641),
 ('balaam', 0.7972202897071838),
 ('wept', 0.7910525798797607),
 ('besought', 0.7887764573097229),
 ('prayed', 0.7886304259300232)]


In [13]:
pp.pprint(bible_vec_repr.wv.most_similar('satan', topn=8)) # what are the most similar words to satan?

[('kindly', 0.9606541991233826),
 ('bid', 0.9516599178314209),
 ('aloud', 0.9510129690170288),
 ('speech', 0.9443800449371338),
 ('diviners', 0.943240225315094),
 ('quicken', 0.9431401491165161),
 ('martha', 0.9407162666320801),
 ('caesar', 0.9390252232551575)]


# Vector Math

In [72]:
vec = bible_vec_repr.wv['hate'] + bible_vec_repr.wv['pride'] # i.e. hate - prophet = wine
print(vec)

[-0.7569938   0.8879813  -0.78702974 -0.24554564 -0.01011772  0.23533432
 -0.16551168  0.79002535  0.2717863  -0.16873066 -0.0329203  -0.3190724
 -0.02938815 -0.3417831   0.5409791   0.21466517  0.08541109 -0.71320677
  0.08624149 -0.7936401   0.09387267 -0.14707983  1.3149691   0.05255236
 -0.39318427  0.10540803 -0.37265185  0.1222422  -0.05169637  0.6710253
 -0.28758466 -0.33805722 -0.4996088  -0.08908856 -0.0981572   0.5237533
 -0.30051583 -0.3489772   0.22855367 -0.88214177  0.66309416 -0.32017905
 -0.39931786 -0.2582578   0.13474534 -0.37631774 -0.86381626  0.07600288
  0.11230308 -0.4761201   0.69829977 -0.17341009 -0.00612396  0.18559736
  0.00533868  0.16035753  0.43998715 -0.47953367 -0.57589126  0.34792778
  0.3868633  -0.01900236  0.20777294 -0.09766676 -0.43857002  0.37031692
  0.33726227  0.47669125 -0.613973    0.33288127  0.00885321  0.23456916
  0.15450427  0.338553   -0.02226505  0.3800544   0.07221765 -0.17030242
 -0.48297226 -0.4605242  -0.4133175  -0.3676825  -0.60

In [73]:
bible_vec_repr.wv.most_similar(positive=[vec], topn=10) # i.e. hate + pride = folly

[('upright', 0.9789418578147888),
 ('thoughts', 0.9749749898910522),
 ('workers', 0.9743785858154297),
 ('condemn', 0.97227543592453),
 ('hate', 0.9702574014663696),
 ('forbear', 0.968469500541687),
 ('adversaries', 0.9665278792381287),
 ('folly', 0.9664924740791321),
 ('speedily', 0.9653268456459045),
 ('uprightness', 0.9648520350456238)]

# Using a Pretrained W2V Model (Google News)

In [126]:
gn_model = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True, limit=300000)

In [127]:
pp.pprint(gn_model.most_similar('god')) # what are the most similar words to god (Google News)?

[('gods', 0.7856108546257019),
 ('deity', 0.7325143814086914),
 ('God', 0.666293203830719),
 ('diety', 0.6607768535614014),
 ('goddess', 0.6518735289573669),
 ('deities', 0.6512717604637146),
 ('lord', 0.635028064250946),
 ('god_Zeus', 0.6218456625938416),
 ('Almighty', 0.6040672659873962),
 ('Gods', 0.6037754416465759)]


In [128]:
pp.pprint(bible_vec_repr.wv.most_similar('god')) # what are the most similar words to god (King James)?

[('mercy', 0.937945544719696),
 ('glory', 0.909177839756012),
 ('liveth', 0.8952507972717285),
 ('grace', 0.8938044905662537),
 ('truth', 0.8920682668685913),
 ('righteousness', 0.8908671736717224),
 ('salvation', 0.8859186172485352),
 ('hosts', 0.8845880031585693),
 ('bless', 0.8707209825515747),
 ('faith', 0.8679599761962891)]


In [129]:
# Let's perform vector math on the Google News Word2Vec model (https://code.google.com/archive/p/word2vec/)

vec = gn_model['god'] - gn_model['mercy'] - gn_model['faith'] 
pp.pprint(gn_model.most_similar(positive=[vec], topn=10))

# Although with not high certainty, god without mercy and faith is ANT_MAN

[('ANT_MAN', 0.28661471605300903),
 ('pronounced_sah', 0.2814408838748932),
 ('Nub', 0.2810094356536865),
 ('wahs', 0.2796354293823242),
 ('Cephei', 0.2793895900249481),
 ('pronounced_AY', 0.2786712646484375),
 ('rahs', 0.2758975028991699),
 ('Zenbu', 0.27580705285072327),
 ('ferris', 0.2752784490585327),
 ('Boletus_edulis', 0.2744906544685364)]


# KMeans Import

In [18]:
from nltk.cluster import KMeansClusterer, euclidean_distance