# Creating Word Vectors with word2vec

In this notebook, we create word vectors from a corpus of public-domain books, a selection from [Project Gutenberg](https://www.gutenberg.org/).

#### Load dependencies

In [1]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure

In [2]:
# Fetch the 'punkt' package: the English-language sentence tokenizer
# (not all periods end sentences; not all sentences start with a capital letter).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Load data

In [3]:
# Fetch NLTK's 'gutenberg' corpus package, the source of the texts used in this notebook.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [4]:
from nltk.corpus import gutenberg

In [5]:
len(gutenberg.fileids())

18

In [6]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

#### Tokenize text

In [7]:
# Split the raw text of the entire corpus into sentence strings; the text inside each
# sentence is left as-is, so embedded newlines remain (see the sample output below).
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [8]:
gberg_sent_tokens[0:6]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
 "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
 'Between _them_ it was more the intimacy\nof sisters.',
 "Even before Miss Taylor had ceased to hold the nominal

In [9]:
gberg_sent_tokens[1]

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

In [10]:
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [11]:
word_tokenize(gberg_sent_tokens[1])[14]

'father'

In [12]:
# a convenient method that handles newlines, as well as tokenizing sentences and words in one shot
gberg_sents = gutenberg.sents()

In [13]:
gberg_sents[0:6]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.'],
 ['Her',
  'mother',
  'h

In [14]:
gberg_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [15]:
gberg_sents[4][14]

'father'

In [16]:
# another convenient method that we don't immediately need: 
gutenberg.words() 

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [17]:
# gutenberg.words() is analogous to the following line, which need not be run: 
# word_tokenize(gutenberg.raw())

In [18]:
# our Gutenberg corpus is 2.6m words in length: 
len(gutenberg.words())

2621613

#### Run word2vec

In [19]:
# model = Word2Vec(sentences=gberg_sents, size=64, sg=1, window=10, min_count=5, workers=8)

In [20]:
# model.save('raw_gutenberg_model.w2v')

#### Explore model

In [21]:
# Skip re-training: load the Word2Vec model from 'raw_gutenberg_model.w2v', the file name
# used by the commented-out training and save cells above. Those cells must have been run
# once (or the file otherwise supplied) for this load to succeed.
model = gensim.models.Word2Vec.load('raw_gutenberg_model.w2v')

In [23]:
model.wv['dog']

array([-2.5793877e-01,  1.1699589e-01, -1.6762055e-01,  1.2229755e-01,
       -4.9805470e-02, -8.9718394e-02, -4.9708080e-01, -5.0261606e-02,
       -3.5851620e-02,  1.7913146e-02, -2.6219659e-02,  1.7865178e-01,
       -1.4097963e-01, -5.0025001e-02,  7.7758893e-02, -4.8630935e-01,
       -2.9639542e-01, -1.7738166e-01,  7.2441064e-04,  9.4514467e-02,
        4.3966466e-01, -2.2670019e-01,  2.6953089e-01, -2.1665299e-01,
       -4.4041045e-02,  4.0036646e-01, -1.9829671e-01,  4.0851843e-02,
        1.0639883e-01,  1.1259313e-01, -5.4921287e-01,  3.7452701e-01,
        5.7400495e-01,  5.5473244e-01,  2.9348418e-02,  2.4654505e-01,
       -4.2376769e-01,  5.3879824e-03,  1.5634653e-01, -2.7653140e-01,
        5.2116376e-01, -1.6660908e-01,  4.9827844e-01,  3.3996269e-02,
       -5.8078498e-01, -4.0201211e-01,  7.8634053e-02,  3.8631949e-01,
       -3.7188753e-02,  3.3487952e-01,  7.5791055e-01, -4.0295161e-02,
       -4.1625524e-01,  5.7259612e-02, -6.0007502e-03,  1.9733144e-01,
      

In [24]:
len(model.wv['dog'])

64

In [26]:
model.wv.most_similar('dog') # scores are cosine similarities (higher = more similar), not distances

[('puppy', 0.8346770405769348),
 ('chimney', 0.7908495664596558),
 ('cage', 0.7744636535644531),
 ('thief', 0.7673177719116211),
 ('whip', 0.7652339935302734),
 ('broth', 0.7602391839027405),
 ('sweeper', 0.7593918442726135),
 ('boy', 0.7445755004882812),
 ('lazy', 0.7435389161109924),
 ('kick', 0.740685224533081)]

In [27]:
model.wv.most_similar('think')

[('contradict', 0.840507984161377),
 ('suppose', 0.8370952606201172),
 ('manage', 0.8311524391174316),
 ('believe', 0.8244760036468506),
 ('Mamma', 0.8190703392028809),
 ('NOW', 0.8185515999794006),
 ('imagine', 0.8127619028091431),
 ('behave', 0.8083094954490662),
 ('guess', 0.8069778084754944),
 ('know', 0.8033992052078247)]

In [28]:
model.wv.most_similar('day')

[('morning', 0.7758408188819885),
 ('night', 0.7631299495697021),
 ('time', 0.7534291744232178),
 ('month', 0.7200314402580261),
 ('week', 0.6975276470184326),
 ('Saturday', 0.6719169616699219),
 ('Adar', 0.6649736762046814),
 ('evening', 0.6564797163009644),
 ('seventh', 0.6562521457672119),
 ('sabbath', 0.6537732481956482)]

In [29]:
model.wv.most_similar('father')

[('mother', 0.8594024181365967),
 ('brother', 0.8391922116279602),
 ('sister', 0.8008431792259216),
 ('daughter', 0.7915855646133423),
 ('wife', 0.7882314324378967),
 ('Amnon', 0.7664843797683716),
 ('Tamar', 0.7450737953186035),
 ('younger', 0.7415950894355774),
 ('servant', 0.740749716758728),
 ('uncle', 0.7388681769371033)]

In [32]:
model.wv.doesnt_match("mother father daughter dog".split())

'dog'

In [33]:
model.wv.similarity('father', 'dog')

0.46528155

In [34]:
# close, but not quite; distinctly in female direction: 
model.wv.most_similar(positive=['father', 'woman'], negative=['man']) 

[('husband', 0.7817324995994568),
 ('daughter', 0.7802364826202393),
 ('wife', 0.7752183675765991),
 ('mother', 0.7721236944198608),
 ('sister', 0.7572321891784668),
 ('brother', 0.7298124432563782),
 ('Rachel', 0.7151593565940857),
 ('Sarah', 0.7043939828872681),
 ('Tamar', 0.6967372894287109),
 ('Leah', 0.692772388458252)]

In [35]:
# more confident about this one: 
model.wv.most_similar(positive=['son', 'woman'], negative=['man']) 

[('Leah', 0.7453159689903259),
 ('Sarah', 0.7336412668228149),
 ('wife', 0.7237880825996399),
 ('daughter', 0.7168752551078796),
 ('Bethuel', 0.7127430438995361),
 ('Rachel', 0.7067197561264038),
 ('Hagar', 0.6947377920150757),
 ('Sarai', 0.6944972276687622),
 ('Tamar', 0.6844563484191895),
 ('Bilhah', 0.6817082166671753)]

In [36]:
model.wv.most_similar(positive=['husband', 'woman'], negative=['man']) 

[('wife', 0.7358366847038269),
 ('sister', 0.6928739547729492),
 ('widow', 0.6860531568527222),
 ('daughter', 0.6839768886566162),
 ('mother', 0.6765022277832031),
 ('child', 0.674767017364502),
 ('conceived', 0.6671896576881409),
 ('Rachel', 0.6533997058868408),
 ('maid', 0.6480797529220581),
 ('nurse', 0.6368674635887146)]

In [37]:
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=30) 

[('Sarah', 0.7076258063316345),
 ('Babylon', 0.6764272451400757),
 ('Rachel', 0.6728379726409912),
 ('Leah', 0.656883716583252),
 ('daughter', 0.6472305655479431),
 ('Abram', 0.6398137211799622),
 ('Vashti', 0.6352875232696533),
 ('Sarai', 0.6350667476654053),
 ('queen', 0.6346942782402039),
 ('Judah', 0.6322956085205078),
 ('tribute', 0.631965160369873),
 ('household', 0.6319213509559631),
 ('Bethuel', 0.627884030342102),
 ('Hagar', 0.6206828355789185),
 ('Rahab', 0.6168578863143921),
 ('Padanaram', 0.6155596971511841),
 ('eunuchs', 0.6131464242935181),
 ('David', 0.6119171380996704),
 ('Cain', 0.6111243963241577),
 ('wife', 0.6110631823539734),
 ('Rebekah', 0.6097539067268372),
 ('Haman', 0.6097317934036255),
 ('Shechem', 0.6087343096733093),
 ('Ephron', 0.6080019474029541),
 ('Esther', 0.6074013113975525),
 ('Hanun', 0.6067836880683899),
 ('Laban', 0.6063657999038696),
 ('Onan', 0.6046965718269348),
 ('Esau', 0.6045227646827698),
 ('magicians', 0.6044384837150574)]

In [None]:
# impressive for such a small data set with no cleaning at all (e.g., no conversion to lower case, which is covered next)

#### Reduce word vector dimensionality with t-SNE

In [39]:
len(model.wv.vocab)

17011

In [41]:
# Look up the vector of every token in the vocabulary. Iterating model.wv.vocab visits the
# tokens in the same order as model.wv.vocab.keys(), which is used later to label the rows
# (presumably one row of X per token -- confirm against the gensim version in use).
X = model.wv[model.wv.vocab]

In [42]:
# Configure a 2-D t-SNE projection. scikit-learn documents that n_iter should be at
# least 250 (the default is 1000). t-SNE is stochastic, so random_state is fixed to make
# the resulting coordinates reproducible from run to run.
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)

In [43]:
X_2d = tsne.fit_transform(X)

In [50]:
X_2d[0:5]

array([[-22.123093, -53.10577 ],
       [-58.997185,  14.092356],
       [-17.393017,  27.653416],
       [-57.936623,  15.661589],
       [-22.081415, -53.052452]], dtype=float32)

In [51]:
# Build the results/plotting table in a single expression: the two t-SNE coordinates
# become columns x and y, and each row is labelled with its vocabulary token.
coords_df = pd.DataFrame(X_2d, columns=['x','y']).assign(token=model.wv.vocab.keys())

In [52]:
coords_df.head()

Unnamed: 0,x,y,token
0,-22.123093,-53.10577,[
1,-58.997185,14.092356,Emma
2,-17.393017,27.653416,by
3,-57.936623,15.661589,Jane
4,-22.081415,-53.052452,]


In [53]:
# coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

#### Visualize 2D representation of word vectors

In [54]:
# Reload the saved coordinates from 'raw_gutenberg_tsne.csv' (the file name written by the
# commented-out to_csv call above). This replaces the in-memory coords_df, so the plot can
# be drawn without re-running t-SNE -- provided the CSV already exists.
coords_df = pd.read_csv('raw_gutenberg_tsne.csv')

In [56]:
output_notebook() # output bokeh plots inline in notebook

In [57]:
# Draw a random 5,000-row subset of the coordinates for plotting. The fixed random_state
# makes the same rows (and therefore the same plot) come out on every run.
subset_df = coords_df.sample(n=5000, random_state=42)

In [58]:
# Square 800 x 800 canvas; every sampled token is drawn as a text glyph at its (x, y) position.
p = figure(plot_height=800, plot_width=800)
_ = p.text(text=subset_df['token'], x=subset_df['x'], y=subset_df['y'])

In [59]:
show(p)