#### Word2Vec is a group of models that can learn to create word embeddings, which are continuous vector representations of words. 
##### Essentially, it maps words to vectors of real numbers in a high-dimensional space.
##### Developed by **__Google researchers__** Tomas Mikolov and his team, Word2Vec uses a neural network to achieve this.
##### Refer to the research paper "Efficient Estimation of Word Representations in Vector Space":
##### https://arxiv.org/abs/1301.3781

In [1]:
!pip install gensim





In [2]:
import gensim

In [3]:
from gensim.models import Word2Vec, KeyedVectors

In [4]:
import gensim.downloader as api

# Identifier of the pretrained embedding set requested from gensim's downloader.
PRETRAINED_MODEL = 'word2vec-google-news-300'
word2vec = api.load(PRETRAINED_MODEL)

In [None]:
# Look up the embedding vector stored for the word 'king'.
king = word2vec['king']

In [5]:
# Display the raw embedding values for 'king'.
king

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

In [6]:
# Dimensionality of a single word vector (the output shows 300 components).
king.shape 

(300,)

In [7]:
# Display the raw embedding values for 'sports'.
word2vec['sports']

array([-1.04980469e-01,  1.26953125e-01,  2.23632812e-01, -1.45507812e-01,
        7.03125000e-02, -5.54199219e-02,  3.39843750e-01, -2.01171875e-01,
       -7.22656250e-02, -1.15234375e-01, -4.95605469e-02, -2.85156250e-01,
       -4.95605469e-02,  1.20605469e-01, -7.08007812e-02,  1.37695312e-01,
       -1.29882812e-01,  1.03515625e-01,  3.92578125e-01, -7.32421875e-02,
        1.63574219e-02,  2.87109375e-01,  5.66406250e-02, -1.80664062e-01,
        7.03125000e-02,  9.47265625e-02, -1.53320312e-01,  4.43359375e-01,
        1.64062500e-01,  2.49023438e-02, -2.42919922e-02,  1.44531250e-01,
        2.19726562e-01,  2.91015625e-01,  1.56250000e-02, -2.53906250e-01,
        1.90429688e-01,  7.12890625e-02,  7.51953125e-02,  1.20605469e-01,
        5.27343750e-02, -2.00195312e-01, -2.39562988e-03,  2.22656250e-01,
       -1.00585938e-01,  1.28906250e-01,  1.59179688e-01, -1.53320312e-01,
        6.17675781e-02,  3.30078125e-01,  5.37109375e-02, -4.07714844e-02,
       -1.25976562e-01,  

#### Finding similar words:

In [28]:
# Words ranked closest to 'sport', each paired with its similarity score.
# Note: the output also contains noisy multi-word tokens (e.g. 'Snooki_wannabes').
word2vec.most_similar('sport')

[('sports', 0.6914728283882141),
 ('Snooki_wannabes', 0.5916634798049927),
 ('painkillers_throat_lozenges', 0.5643172264099121),
 ('racing', 0.5616023540496826),
 ('sporting', 0.559779703617096),
 ('athletics', 0.5516576766967773),
 ('alpine_ski_racing', 0.5514240264892578),
 ('Pole_vaulting', 0.5459784269332886),
 ('motorsport', 0.5384281277656555),
 ('boxing', 0.5330564379692078)]

In [9]:
# Words ranked closest to 'happy', each paired with its similarity score.
# Note: 'disappointed' appears in the output — presumably because words of
# opposite sentiment occur in similar contexts.
word2vec.most_similar('happy')

[('glad', 0.7408890724182129),
 ('pleased', 0.6632170677185059),
 ('ecstatic', 0.6626912355422974),
 ('overjoyed', 0.6599286794662476),
 ('thrilled', 0.6514049172401428),
 ('satisfied', 0.6437949538230896),
 ('proud', 0.636042058467865),
 ('delighted', 0.627237856388092),
 ('disappointed', 0.6269949674606323),
 ('excited', 0.6247665286064148)]

In [14]:
# Words ranked closest to 'sad', each paired with its similarity score.
word2vec.most_similar('sad')

[('saddening', 0.7273085713386536),
 ('Sad', 0.6610826849937439),
 ('saddened', 0.6604382395744324),
 ('heartbreaking', 0.6573508381843567),
 ('disheartening', 0.6507317423820496),
 ('Meny_Friedman', 0.6487058401107788),
 ('parishioner_Pat_Patello', 0.6475860476493835),
 ('saddens_me', 0.6407119035720825),
 ('distressing', 0.6399092674255371),
 ('reminders_bobbing', 0.6357713341712952)]

#### Finding the similarity score between two words:

In [21]:
# Similarity score for an unrelated pair; the output shows a low value (~0.10).
word2vec.similarity("happy", "cricket")

0.10414833

In [27]:
# Similarity score for a related pair; the output shows a higher value (~0.43).
word2vec.similarity("cricket", "sport")

0.43420303

#### Finding relationships between words using vector arithmetic:

In [17]:
# Analogy arithmetic in two steps: subtract 'man' from 'king', then add 'woman'.
# The evaluation order matches the single-expression form (king - man) + woman.
king_minus_man = word2vec['king'] - word2vec['man']
vector = king_minus_man + word2vec['woman']

In [29]:
vector   # logically, the vector computed above should be close to 'queen'

array([ 4.29687500e-02, -1.78222656e-01, -1.29089355e-01,  1.15234375e-01,
        2.68554688e-03, -1.02294922e-01,  1.95800781e-01, -1.79504395e-01,
        1.95312500e-02,  4.09919739e-01, -3.68164062e-01, -3.96484375e-01,
       -1.56738281e-01,  1.46484375e-03, -9.30175781e-02, -1.16455078e-01,
       -5.51757812e-02, -1.07574463e-01,  7.91015625e-02,  1.98974609e-01,
        2.38525391e-01,  6.34002686e-02, -2.17285156e-02,  0.00000000e+00,
        4.72412109e-02, -2.17773438e-01, -3.44726562e-01,  6.37207031e-02,
        3.16406250e-01, -1.97631836e-01,  8.59375000e-02, -8.11767578e-02,
       -3.71093750e-02,  3.15551758e-01, -3.41796875e-01, -4.68750000e-02,
        9.76562500e-02,  8.39843750e-02, -9.71679688e-02,  5.17578125e-02,
       -5.00488281e-02, -2.20947266e-01,  2.29492188e-01,  1.26403809e-01,
        2.49023438e-01,  2.09960938e-02, -1.09863281e-01,  5.81054688e-02,
       -3.35693359e-02,  1.29577637e-01,  2.41699219e-02,  3.48129272e-02,
       -2.60009766e-01,  

#### Finding the words most similar to the resulting vector:

In [30]:
# Nearest neighbours of the analogy vector (king - man + woman).
# Note: in the output 'king' ranks first and 'queen' second — the query is a
# raw vector, so presumably the source words are not filtered out of the results.
word2vec.most_similar([vector])

[('king', 0.8449392318725586),
 ('queen', 0.7300517559051514),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676352500916),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613663792610168),
 ('sultan', 0.5376775860786438),
 ('Queen_Consort', 0.5344247817993164),
 ('queens', 0.5289887189865112)]