Word Embeddings
===

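This notebook trains word2vec embeddings on the WikiText-103 corpus using the `gensim` Python package, explores them, and compares them with the pre-trained ConceptNet Numberbatch embeddings.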


In [1]:
import gensim
import logging
# Emit timestamped INFO-level log records so that progress messages logged
# by later cells are shown in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [21]:
%matplotlib inline

from pathlib import Path
import os

import pandas as pd
import numpy as np

from collections import Counter
from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib.dates as md
import matplotlib
import pylab as pl
from IPython.core.display import display, HTML

In [3]:
# data is stored relative to the root of the git repository
git_root_dir = !git rev-parse --show-toplevel
git_root_dir = Path(git_root_dir[0].strip())
git_root_dir

PosixPath('/home/levon003/repos/nlp-for-hci-workshop')

In [4]:
# Locate the three WikiText-103 token files (train / validation / test) under
# the repository's data directory and stop early if any of them is missing.
wikitext_dir = git_root_dir.joinpath('data', 'wikitext-103')
train = wikitext_dir.joinpath("wiki.train.tokens")
valid = wikitext_dir.joinpath("wiki.valid.tokens")
test = wikitext_dir.joinpath("wiki.test.tokens")
assert all(split_path.exists() for split_path in (train, valid, test))

In [5]:
# Read the training split into a list of token lists, one per line of the file.
# Tokens are split on whitespace and kept as-is (no lowercasing).
sentences = []
# The corpus contains non-ASCII tokens, so decode explicitly as UTF-8 instead
# of relying on the platform's default encoding.
with open(train, 'r', encoding='utf-8') as infile:
    for line in infile:
        tokens = line.strip().split()
        # Keep only lines with at least three tokens; this drops blank lines.
        if len(tokens) > 2:
            sentences.append(tokens)
len(sentences)

1151408

In [6]:
sentences[0]

['=', 'Valkyria', 'Chronicles', 'III', '=']

In [None]:
# Train a word2vec model on the tokenized corpus. Only the two keyword
# arguments below are set explicitly; every other setting is gensim's default.
model = gensim.models.Word2Vec(
    sentences,
    min_count=100,  # ignore words that occur fewer than 100 times
    workers=16,     # number of worker threads used for training
)

Sample output from the above training process:

```
...
2019-03-12 23:57:31,638 : INFO : collected 267717 word types from a corpus of 101405124 raw words and 1151408 sentences
2019-03-12 23:57:31,639 : INFO : Loading a fresh vocabulary
2019-03-12 23:57:31,767 : INFO : min_count=100 retains 37775 unique words (14% of original 267717, drops 229942)
2019-03-12 23:57:31,767 : INFO : min_count=100 leaves 97718769 word corpus (96% of original 101405124, drops 3686355)
2019-03-12 23:57:31,848 : INFO : deleting the raw counts dictionary of 267717 items
2019-03-12 23:57:31,856 : INFO : sample=0.001 downsamples 34 most-common words
2019-03-12 23:57:31,856 : INFO : downsampling leaves estimated 70703379 word corpus (72.4% of prior 97718769)
2019-03-12 23:57:31,958 : INFO : estimated required memory for 37775 words and 100 dimensions: 49107500 bytes
2019-03-12 23:57:31,958 : INFO : resetting layer weights
2019-03-12 23:57:32,215 : INFO : training model with 16 workers on 37775 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
...
2019-03-13 00:00:41,923 : INFO : training on a 507025620 raw words (353511822 effective words) took 189.7s, 1863460 effective words/s
```

In [9]:
# Vocabulary size: the embedding matrix has one row per retained word.
# `wv.vocab` was removed in gensim 4.0, whereas the `vectors` array is
# available in both gensim 3.x and 4.x, so this works on either version.
model.wv.vectors.shape[0]

37775

In [26]:
# Directory (inside the repository's data folder) where embedding files are kept.
models_dir = git_root_dir / "data" / "models"
# Create it together with any missing parents; exist_ok makes the call safe to
# re-run and removes the need for a separate exists() check.
models_dir.mkdir(parents=True, exist_ok=True)
# The gensim save/load calls in later cells are given this path as a plain string.
model_filepath = (models_dir / "wikitext.w2v").as_posix()

In [27]:
# Persist only the word vectors (`model.wv`), not the full trainable model.
model.wv.save(model_filepath)

2019-03-13 00:09:29,169 : INFO : saving Word2VecKeyedVectors object under /home/levon003/repos/nlp-for-hci-workshop/data/models/wikitext.w2v, separately None
2019-03-13 00:09:29,170 : INFO : not storing attribute vectors_norm
2019-03-13 00:09:29,337 : INFO : saved /home/levon003/repos/nlp-for-hci-workshop/data/models/wikitext.w2v


In [32]:
# Load the saved word vectors from disk; the cells below query `wv`, so they
# can be run without retraining as long as the file at `model_filepath` exists.
wv = gensim.models.KeyedVectors.load(model_filepath)

2019-03-13 00:13:05,607 : INFO : loading Word2VecKeyedVectors object from /home/levon003/repos/nlp-for-hci-workshop/data/models/wikitext.w2v
2019-03-13 00:13:05,723 : INFO : setting ignored attribute vectors_norm to None
2019-03-13 00:13:05,724 : INFO : loaded /home/levon003/repos/nlp-for-hci-workshop/data/models/wikitext.w2v


In [33]:
# Dimensionality of a single word vector (looked up here for the word 'the').
len(wv['the'])

100

In [44]:
# Classic analogy query (king - man + woman): 'woman' and 'king' contribute
# positively to the query, 'man' negatively; only the best match is returned.
wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.7972210645675659)]

In [35]:
# Odd-one-out query: ask which word in the list fits least well with the rest.
group = "breakfast cereal dinner lunch".split()
wv.doesnt_match(group)

'cereal'

In [36]:
# Similarity score between the vectors of two words.
wv.similarity('woman', 'man')

0.8140723100443483

In [43]:
# Capitalized proper nouns can be queried as well: tokens were not lowercased
# when the corpus was read, so case is part of the vocabulary key.
wv.similarity('Kerrigan', 'Raynor')

0.8120143471501855

In [69]:
# Ten most similar words to 'money', with their similarity scores.
wv.most_similar(positive=['money'], topn=10)

[('cash', 0.7672889232635498),
 ('funds', 0.7473315000534058),
 ('profits', 0.6970522999763489),
 ('payments', 0.6843724250793457),
 ('bribes', 0.680188775062561),
 ('debt', 0.6790893077850342),
 ('loans', 0.6788615584373474),
 ('compensation', 0.6747171878814697),
 ('fortune', 0.6683861613273621),
 ('ransom', 0.6646652221679688)]

In [70]:
# 'fortune' appeared among the neighbours of 'money'; inspect its own neighbours.
wv.most_similar(positive=['fortune'], topn=10)

[('wealth', 0.7376638054847717),
 ('money', 0.6683861017227173),
 ('reputation', 0.6277018785476685),
 ('debt', 0.6262955665588379),
 ('debts', 0.6246355772018433),
 ('ambition', 0.618557870388031),
 ('uncle', 0.6147124171257019),
 ('grandmother', 0.6039069890975952),
 ('gifts', 0.6032428741455078),
 ('retinue', 0.6031603813171387)]

In [71]:
# '=' is an ordinary token in this corpus; it appears to delimit headings,
# as in the first sentence: ['=', 'Valkyria', 'Chronicles', 'III', '='].
wv.most_similar(positive=['='], topn=10)

[('Postwar', 0.4719221591949463),
 ('Current', 0.4595763087272644),
 ('Coaching', 0.447429358959198),
 ('Wartime', 0.42003268003463745),
 ('Naming', 0.4007680416107178),
 ('Pre', 0.3979620337486267),
 ('Proposed', 0.3917207717895508),
 ('Preliminary', 0.3714796304702759),
 ('include', 0.3673864006996155),
 ('Gunpowder', 0.36704909801483154)]

In [72]:
# '<unk>' is presumably the corpus's placeholder for rare/unknown words -- TODO confirm.
wv.most_similar(positive=['<unk>'], topn=10)

[('na', 0.7089570760726929),
 ('́', 0.6850618124008179),
 ('्', 0.6753164529800415),
 ('ki', 0.6698192954063416),
 ('das', 0.6543229222297668),
 ('te', 0.6464616656303406),
 ('IN', 0.6373640298843384),
 ('THE', 0.6352424621582031),
 ('og', 0.6086913347244263),
 ('・', 0.6061152219772339)]

## Comparison with other embeddings

We compare our embeddings against ConceptNet Numberbatch, a set of pre-trained embeddings designed to show less bias:

https://github.com/commonsense/conceptnet-numberbatch

In [47]:
# Download the latest version of the Numberbatch embeddings; copied from the repository linked above
!wget https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz -O {models_dir}/numberbatch.txt.gz

--2019-03-13 00:24:16--  https://conceptnet.s3.amazonaws.com/downloads/2017/numberbatch/numberbatch-en-17.06.txt.gz
Resolving conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)... 52.216.110.83
Connecting to conceptnet.s3.amazonaws.com (conceptnet.s3.amazonaws.com)|52.216.110.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 269500348 (257M) [text/plain]
Saving to: ‘/home/levon003/repos/nlp-for-hci-workshop/data/models/numberbatch.txt.gz’


2019-03-13 00:24:22 (47.0 MB/s) - ‘/home/levon003/repos/nlp-for-hci-workshop/data/models/numberbatch.txt.gz’ saved [269500348/269500348]



In [50]:
!gunzip {models_dir}/numberbatch.txt.gz

In [51]:
# Load the decompressed Numberbatch file, read here as word2vec-format text.
cnnb_wv = gensim.models.KeyedVectors.load_word2vec_format((models_dir / "numberbatch.txt").as_posix())

2019-03-13 00:27:17,285 : INFO : loading projection weights from /home/levon003/repos/nlp-for-hci-workshop/data/models/numberbatch.txt
2019-03-13 00:28:26,362 : INFO : loaded (417194, 300) matrix from /home/levon003/repos/nlp-for-hci-workshop/data/models/numberbatch.txt


In [53]:
# The king - man + woman analogy again, this time on the Numberbatch vectors
# and returning the top five matches.
cnnb_wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('male_monarch', 0.7743297815322876),
 ('king_of_france', 0.7732952833175659),
 ('king_of_germans', 0.7699317932128906),
 ('riley_b_king', 0.7568389177322388),
 ('king_of_england', 0.7519593238830566)]

In [57]:
# Bias probe on the WikiText vectors: how similar is each first name to 'violent'?
wv.similarity('John', 'violent'), wv.similarity('Jamal', 'violent')

(-0.17533369082138786, 0.10115992027055618)

In [59]:
# Bias probe on the WikiText vectors: how similar are 'man' and 'woman' to 'leader'?
wv.similarity('man', 'leader'), wv.similarity('woman', 'leader')

(0.3419413144837632, 0.21904274868122742)

In [62]:
# The same name probe on the Numberbatch vectors. The names are queried in
# lowercase here (presumably Numberbatch keys are lowercased -- TODO confirm).
cnnb_wv.similarity('john', 'violent'), cnnb_wv.similarity('jamal', 'violent')

(0.03260216997446677, 0.0031282791130675877)

In [63]:
# The same 'man'/'woman' versus 'leader' probe on the Numberbatch vectors.
cnnb_wv.similarity('man', 'leader'), cnnb_wv.similarity('woman', 'leader')

(0.14042042962506568, 0.12014459835681429)