# Word2Vec Model

In [1]:
import logging
# Show timestamped INFO-level log records in the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """A re-iterable view of the ``lee_background.cor`` corpus shipped with gensim.

    Every call to ``__iter__`` reopens the file located through ``datapath``,
    so one instance can be traversed more than once. Each yielded item is the
    result of ``utils.simple_preprocess`` applied to one line of the file.
    """

    def __iter__(self):
        """Yield one preprocessed sentence per line of the corpus file.

        The file is opened inside a ``with`` block so its handle is released
        when iteration completes or the generator is closed early, instead of
        lingering until garbage collection.
        """
        corpus_path = datapath('lee_background.cor')
        # Explicit encoding keeps decoding independent of the platform default.
        with open(corpus_path, encoding='utf-8') as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)

2021-02-08 15:10:26,445 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-02-08 15:10:26,446 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)


In [3]:
import gensim.models

# MyCorpus opens the corpus file anew on each __iter__ call, so the instance
# itself (not a one-shot generator) is what gets passed as `sentences`.
sentences = MyCorpus()
# Train with the library's default settings; the recorded log for this run
# reports effective_min_count=5 and 100-dimensional vectors.
model = gensim.models.Word2Vec(sentences=sentences)

2021-02-08 15:10:43,333 : INFO : collecting all words and their counts
2021-02-08 15:10:43,336 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-08 15:10:43,416 : INFO : collected 6981 word types from a corpus of 58152 raw words and 300 sentences
2021-02-08 15:10:43,417 : INFO : Loading a fresh vocabulary
2021-02-08 15:10:43,421 : INFO : effective_min_count=5 retains 1750 unique words (25% of original 6981, drops 5231)
2021-02-08 15:10:43,422 : INFO : effective_min_count=5 leaves 49335 word corpus (84% of original 58152, drops 8817)
2021-02-08 15:10:43,426 : INFO : deleting the raw counts dictionary of 6981 items
2021-02-08 15:10:43,427 : INFO : sample=0.001 downsamples 51 most-common words
2021-02-08 15:10:43,427 : INFO : downsampling leaves estimated 35935 word corpus (72.8% of prior 49335)
2021-02-08 15:10:43,430 : INFO : estimated required memory for 1750 words and 100 dimensions: 2275000 bytes
2021-02-08 15:10:43,431 : INFO : resetting layer weight

In [17]:
# Display the words retained in the trained vocabulary (1750 per the training log).
# NOTE(review): gensim 4.x reportedly renames `index2word` to `index_to_key` — confirm against the installed version.
model.wv.index2word

['the',
 'to',
 'of',
 'in',
 'and',
 'he',
 'is',
 'for',
 'on',
 'said',
 'that',
 'has',
 'says',
 'was',
 'have',
 'it',
 'be',
 'are',
 'with',
 'will',
 'at',
 'mr',
 'from',
 'by',
 'we',
 'been',
 'as',
 'an',
 'not',
 'his',
 'but',
 'they',
 'after',
 'were',
 'had',
 'there',
 'new',
 'this',
 'australia',
 'australian',
 'who',
 'people',
 'palestinian',
 'their',
 'two',
 'government',
 'up',
 'south',
 'us',
 'which',
 'year',
 'one',
 'about',
 'out',
 'if',
 'also',
 'more',
 'when',
 'its',
 'would',
 'into',
 'first',
 'against',
 'last',
 'israeli',
 'minister',
 'arafat',
 'all',
 'over',
 'three',
 'afghanistan',
 'united',
 'no',
 'world',
 'police',
 'or',
 'than',
 'fire',
 'before',
 'attacks',
 'some',
 'security',
 'day',
 'states',
 'you',
 'could',
 'them',
 'today',
 'say',
 'now',
 'told',
 'time',
 'any',
 'very',
 'laden',
 'just',
 'bin',
 'can',
 'sydney',
 'still',
 'president',
 'what',
 'company',
 'four',
 'man',
 'taliban',
 'killed',
 'forces',


In [18]:
# Keep a short alias for the trained word vectors and look up one example word.
wv = model.wv
vec_king = wv['king']

# Preview only the first ten vocabulary entries instead of the whole list.
for index, word in enumerate(wv.index2word[:10]):
    print(f"word #{index}/{len(wv.index2word)} is {word}")

word #0/1750 is the
word #1/1750 is to
word #2/1750 is of
word #3/1750 is in
word #4/1750 is and
word #5/1750 is he
word #6/1750 is is
word #7/1750 is for
word #8/1750 is on
word #9/1750 is said


In [19]:
# Shape of the vector looked up for 'king': a single axis whose length is the embedding size.
vec_king.shape

(100,)

### Evaluating the model

In [20]:
# Score the model on the analogy questions bundled with gensim's test data.
# Per the recorded output, the call returns (overall accuracy, list of per-section results).
# In this run 99.3% of quadruplets were out of vocabulary, so the 0.0 score rests on very few analogies.
model.wv.evaluate_word_analogies(datapath('questions-words.txt'))

2021-02-08 15:27:21,864 : INFO : Evaluating word analogies for top 300000 words in the model on /Users/wegzheng/.pyenv/versions/3.7.9/lib/python3.7/site-packages/gensim/test/test_data/questions-words.txt
2021-02-08 15:27:21,868 : INFO : precomputing L2-norms of word weight vectors
2021-02-08 15:27:21,875 : INFO : capital-common-countries: 0.0% (0/6)
2021-02-08 15:27:21,890 : INFO : capital-world: 0.0% (0/2)
2021-02-08 15:27:21,906 : INFO : family: 0.0% (0/6)
2021-02-08 15:27:21,921 : INFO : gram3-comparative: 0.0% (0/20)
2021-02-08 15:27:21,927 : INFO : gram4-superlative: 0.0% (0/12)
2021-02-08 15:27:21,935 : INFO : gram5-present-participle: 0.0% (0/20)
2021-02-08 15:27:21,944 : INFO : gram6-nationality-adjective: 0.0% (0/30)
2021-02-08 15:27:21,954 : INFO : gram7-past-tense: 0.0% (0/20)
2021-02-08 15:27:21,962 : INFO : gram8-plural: 0.0% (0/30)
2021-02-08 15:27:21,966 : INFO : Quadruplets with out-of-vocabulary words: 99.3%
2021-02-08 15:27:21,967 : INFO : NB: analogies containing OOV

(0.0,
 [{'section': 'capital-common-countries',
   'correct': [],
   'incorrect': [('CANBERRA', 'AUSTRALIA', 'KABUL', 'AFGHANISTAN'),
    ('CANBERRA', 'AUSTRALIA', 'PARIS', 'FRANCE'),
    ('KABUL', 'AFGHANISTAN', 'PARIS', 'FRANCE'),
    ('KABUL', 'AFGHANISTAN', 'CANBERRA', 'AUSTRALIA'),
    ('PARIS', 'FRANCE', 'CANBERRA', 'AUSTRALIA'),
    ('PARIS', 'FRANCE', 'KABUL', 'AFGHANISTAN')]},
  {'section': 'capital-world',
   'correct': [],
   'incorrect': [('CANBERRA', 'AUSTRALIA', 'KABUL', 'AFGHANISTAN'),
    ('KABUL', 'AFGHANISTAN', 'PARIS', 'FRANCE')]},
  {'section': 'currency', 'correct': [], 'incorrect': []},
  {'section': 'city-in-state', 'correct': [], 'incorrect': []},
  {'section': 'family',
   'correct': [],
   'incorrect': [('HE', 'SHE', 'HIS', 'HER'),
    ('HE', 'SHE', 'MAN', 'WOMAN'),
    ('HIS', 'HER', 'MAN', 'WOMAN'),
    ('HIS', 'HER', 'HE', 'SHE'),
    ('MAN', 'WOMAN', 'HE', 'SHE'),
    ('MAN', 'WOMAN', 'HIS', 'HER')]},
  {'section': 'gram1-adjective-to-adverb', 'correct': [

In [21]:
# Correlate the model's word similarities with the pairs listed in wordsim353.tsv.
# Per the recorded output, the result is (Pearson pair, Spearman result, percentage of pairs with unknown words).
# In this run 83% of pairs contained unknown words, so the correlations cover only a small subset.
model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))

2021-02-08 15:28:56,949 : INFO : Pearson correlation coefficient against /Users/wegzheng/.pyenv/versions/3.7.9/lib/python3.7/site-packages/gensim/test/test_data/wordsim353.tsv: 0.1991
2021-02-08 15:28:56,950 : INFO : Spearman rank-order correlation coefficient against /Users/wegzheng/.pyenv/versions/3.7.9/lib/python3.7/site-packages/gensim/test/test_data/wordsim353.tsv: 0.1414
2021-02-08 15:28:56,950 : INFO : Pairs with unknown words ratio: 83.0%


((0.19906543384391617, 0.12730312889605422),
 SpearmanrResult(correlation=0.14142003699985478, pvalue=0.281114353497797),
 83.0028328611898)