In [91]:
import sys

!{sys.executable} -W ignore:DEPRECATION -m pip install --quiet duckdb==0.7.1 \
duckdb-engine \
watermark \
jupysql \
sqlalchemy \
python-snappy \
pyarrow \
memray \
pandas \
ipywidgets  \
matplotlib \
gensim \
nltk

In [92]:
import duckdb
import re
import pandas as pd
import shlex
import string
from time import time

from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils

import multiprocessing


In [93]:
# checking local machine specs for model processing
cores = multiprocessing.cpu_count()
cores

16

In [106]:
# set log level for model training
import logging 
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

DEBUG:root:test


In [87]:
%load_ext watermark
# Duckdb 0.7.0 offers a bunch of new JSON stuff that I want to test out, checking to see I have the latest
# https://duckdb.org/2023/03/03/json.html
%watermark --iversions

logging: 0.5.1.2
gensim : 4.3.1
pandas : 1.5.3
re     : 2.2.1
duckdb : 0.7.1



In [88]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [89]:
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

In [11]:
# Create new DB or load existing https://duckdb.org/docs/guides/python/jupyter.html
%sql duckdb:///viberary.duckdb
    
# connect with pyscopg
con = duckdb.connect('viberary.duckdb')

In [26]:
%sql DESCRIBE select * from read_json_auto('/Users/vicki/viberary/viberary/data/goodreads_books.json',lines='true');

Unnamed: 0,column_name,column_type,null,key,default,extra
0,isbn,VARCHAR,YES,,,
1,text_reviews_count,VARCHAR,YES,,,
2,series,BIGINT[],YES,,,
3,country_code,VARCHAR,YES,,,
4,language_code,VARCHAR,YES,,,
5,popular_shelves,"STRUCT(count BIGINT, ""name"" VARCHAR)[]",YES,,,
6,asin,VARCHAR,YES,,,
7,is_ebook,VARCHAR,YES,,,
8,average_rating,VARCHAR,YES,,,
9,kindle_asin,VARCHAR,YES,,,


In [27]:
# Create table in DuckDB
%sql CREATE TABLE goodreads as select * from read_json_auto('/Users/vicki/viberary/viberary/data/goodreads_books.json',lines='true');

(duckdb.CatalogException) Catalog Error: Table with name "goodreads" already exists!
[SQL: CREATE TABLE goodreads as select * from read_json_auto('/Users/vicki/viberary/viberary/data/goodreads_books.json',lines='true');]
(Background on this error at: https://sqlalche.me/e/14/f405)


In [12]:
%sql select book_id, title, popular_shelves as ps, description from goodreads limit 10;

Unnamed: 0,book_id,title,ps,description
0,5333265,W.C. Fields: A Life on Film,"[{'count': 3, 'name': 'to-read'}, {'count': 1,...",
1,1333909,Good Harbor,"[{'count': 2634, 'name': 'to-read'}, {'count':...","Anita Diamant's international bestseller ""The ..."
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","[{'count': 58, 'name': 'to-read'}, {'count': 1...",Omnibus book club edition containing the Ladie...
3,6066819,Best Friends Forever,"[{'count': 7615, 'name': 'to-read'}, {'count':...",Addie Downs and Valerie Adler were eight when ...
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,"[{'count': 32, 'name': 'to-read'}, {'count': 3...",
5,287141,The Aeneid for Boys and Girls,"[{'count': 56, 'name': 'to-read'}, {'count': 1...","Relates in vigorous prose the tale of Aeneas, ..."
6,378460,The Wanting of Levine,"[{'count': 14, 'name': 'to-read'}, {'count': 1...",
7,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,"[{'count': 515, 'name': 'to-read'}, {'count': ...","To Kara's astonishment, she discovers that a p..."
8,34883016,Playmaker: A Venom Series Novella,"[{'count': 4, 'name': 'to-read'}, {'count': 1,...",Secrets. Sometimes keeping them in confidence ...
9,287149,The Devil's Notebook,"[{'count': 961, 'name': 'to-read'}, {'count': ...","Wisdom, humor, and dark observations by the fo..."


In [13]:
%sql select country_code, count(*) from goodreads group by country_code;

Unnamed: 0,country_code,count_star()
0,US,2360165
1,,490


In [15]:
# our results in Word2Vec initially are not so great, can we filter? 
%sql select language_code, count(*) from goodreads group by language_code;

Unnamed: 0,language_code,count_star()
0,ger,30941
1,,1060153
2,eng,708457
3,en-US,91452
4,ara,42978
...,...,...
222,tgk,1
223,chn,1
224,cop,1
225,sla,1


In [43]:
%sql select language_code, count(*) FROM goodreads \
WHERE language_code like 'en%' \
GROUP BY language_code;

Unnamed: 0,language_code,count_star()
0,eng,708457
1,en-US,91452
2,en-GB,58358
3,en-CA,7652
4,en,225
5,enm,37
6,en-IN,2


In [45]:
# Create table that filters for english only for accuracy
%sql CREATE TABLE goodreads_en as select * from goodreads WHERE language_code like 'en%';

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,Count
0,866183


In [38]:
# Percent of null descriptions (descriptions help accuracy)
%sql select count(*) AS total_rows, \
count(*) FILTER (WHERE regexp_matches(description, ' ')) AS nulls ,\
count(*) FILTER (WHERE regexp_matches(description, ' ')) / count(*)::float \
FROM goodreads 

Unnamed: 0,total_rows,nulls,"(count_star() FILTER (WHERE regexp_matches(description, ' ')) / CAST(count_star() AS FLOAT))"
0,2360655,1945918,0.824313


In [46]:
# Let's start with title and description as our sentence features
sentences = con.sql("""select concat_ws(' ' , lower(regexp_replace(title, '[[:^alpha:]]',' ','g')), \
                    lower(regexp_replace(description, '[[:^alpha:]]',' ','g'))) as sentence from goodreads_en;""").df()

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

In [47]:
# Each sentence is a single book
sentences.head()

Unnamed: 0,sentence
0,the unschooled wizard sun wolf and starhawk ...
1,best friends forever addie downs and valerie a...
2,the house of memory pluto s snitch
3,the bonfire of the vanities
4,heaven what is heaven really going to be like ...


In [48]:
# Check for nulls
sentences.isnull().sum()

sentence    0
dtype: int64

In [70]:
sentences.to_csv('sentences_en.csv', index=False, header=False)

In [71]:
! cat sentences_en.csv | head

the unschooled wizard  sun wolf and starhawk        omnibus book club edition containing the ladies of madrigyn and the witches of wenshar 
best friends forever addie downs and valerie adler were eight when they first met and decided to be best friends forever  but  in the wake of tragedy and betrayal during their teenage years  everything changed  val went on to fame and fortune  addie stayed behind in their small midwestern town  destiny  however  had more in store for these two  and when  twenty five years later  val shows up at addie s front door with blood on her coat and terror on her face  it is the beginning of a wild adventure for two women joined by love and history who find strength together that they could not find alone 
the house of memory  pluto s snitch     
the bonfire of the vanities 
heaven what is heaven really going to be like  what will we look like  what will we do  won t heaven get boring after a while  we all have questions about what heaven will be like  a

In [95]:
# input for training Word2Vec is a list of lists or iterable
# needs to be streamable https://radimrehurek.com/gensim/models/word2vec.html

class CorpusReader:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        corpus_path = datapath('sentences_en.csv')
        for line in open(corpus_path):
            yield utils.simple_preprocess(line)

In [73]:
# Move to where word2vec processes them
!cp sentences_en.csv /usr/local/lib/python3.9/site-packages/gensim/test/test_data/sentences_en.csv

In [103]:
sentences = CorpusReader()

In [105]:
# w2v_model = Word2Vec(min_count=20,
#                      window=2,
#                      vector_size=300,
#                      sample=6e-5, 
#                      alpha=0.03, 
#                      min_alpha=0.0007, 
#                      negative=20,
#                      workers=cores-1)

# we can initialize this with our corpus but splitting out the steps makes them easier to see
w2v_model = Word2Vec()

In [107]:
# building vocab context window

t = time()

w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 1367451 words, keeping 52723 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 2719501 words, keeping 74831 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 4088179 words, keeping 92439 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 5440523 words, keeping 107566 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 6795839 words, keeping 121370 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 8163934 words, keeping 133976 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 9525001 words, keeping 145676 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 10911013 wor

INFO:gensim.models.word2vec:PROGRESS: at sentence #740000, processed 100609335 words, keeping 516470 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #750000, processed 101969547 words, keeping 520272 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #760000, processed 103323174 words, keeping 523785 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #770000, processed 104665690 words, keeping 527360 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #780000, processed 106017802 words, keeping 530611 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #790000, processed 107378178 words, keeping 534276 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #800000, processed 108736748 words, keeping 537926 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #810000, processed 110111745 words, keeping 541425 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #820000, processed 111471009 words, keeping 544887 wor

Time to build vocab: 2.19 mins


In [109]:
t = time()

w2v_model.train(corpus, total_examples=w2v_model.corpus_count, epochs=2, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training model with 3 workers on 158872 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-03-25T12:47:25.381142', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 0.52% examples, 475446 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 1.10% examples, 503613 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 1.66% examples, 507580 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 2.23% examples, 508803 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 2.76% examples, 503873 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 3.33% exam

INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 40.31% examples, 503032 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 40.86% examples, 503030 words/s, in_qsize 4, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 41.41% examples, 503129 words/s, in_qsize 4, out_qsize 2
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 41.97% examples, 503312 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 42.55% examples, 503593 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 43.11% examples, 503791 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 43.68% examples, 503877 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 44.24% examples, 504161 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 44.81% examples, 504372 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:

INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 82.52% examples, 505235 words/s, in_qsize 4, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 83.08% examples, 505279 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 83.64% examples, 505369 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 84.21% examples, 505393 words/s, in_qsize 6, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 84.76% examples, 505431 words/s, in_qsize 4, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 85.33% examples, 505514 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 85.89% examples, 505568 words/s, in_qsize 4, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 86.44% examples, 505539 words/s, in_qsize 4, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 87.01% examples, 505617 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:

INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 21.52% examples, 504759 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 22.09% examples, 505030 words/s, in_qsize 4, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 22.63% examples, 504770 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 23.14% examples, 504199 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 23.69% examples, 503696 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 24.20% examples, 502738 words/s, in_qsize 5, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 24.73% examples, 502606 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 25.27% examples, 502401 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 25.80% examples, 502141 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:

INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 63.30% examples, 502402 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 63.87% examples, 502496 words/s, in_qsize 6, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 64.45% examples, 502718 words/s, in_qsize 2, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 64.95% examples, 502380 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 65.51% examples, 502389 words/s, in_qsize 6, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 66.08% examples, 502521 words/s, in_qsize 4, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 66.64% examples, 502736 words/s, in_qsize 4, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 67.19% examples, 502760 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 67.76% examples, 502869 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:

Time to train the model: 6.08 mins


In [110]:
# Saving and checkpointing
w2v_model.save("word2vec.model")

INFO:gensim.utils:Word2Vec lifecycle event {'fname_or_handle': 'word2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-03-25T12:53:39.210730', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'saving'}
INFO:gensim.utils:storing np array 'vectors' to word2vec.model.wv.vectors.npy
INFO:gensim.utils:storing np array 'syn1neg' to word2vec.model.syn1neg.npy
INFO:gensim.utils:not storing attribute cum_table
DEBUG:smart_open.smart_open_lib:{'uri': 'word2vec.model', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:saved word2vec.model


In [111]:
w2v_model = Word2Vec.load("word2vec.model")
word_vectors = w2v_model.wv
word_vectors.save('vectors.kv')

INFO:gensim.utils:loading Word2Vec object from word2vec.model
DEBUG:smart_open.smart_open_lib:{'uri': 'word2vec.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loading wv recursively from word2vec.model.wv.* with mmap=None
INFO:gensim.utils:loading vectors from word2vec.model.wv.vectors.npy with mmap=None
INFO:gensim.utils:loading syn1neg from word2vec.model.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': 'word2vec.model', 'datetime': '2023-03-25T12:53:43.681457', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'loaded'}
DEBUG:gensim.utils:starting a new internal lifecycle event log for KeyedVectors
INFO:gensim.utils:KeyedVecto

In [112]:
w2v_model.wv.most_similar("beast")

[('monster', 0.6978684663772583),
 ('wolf', 0.6856111288070679),
 ('lion', 0.6570420861244202),
 ('creature', 0.656029462814331),
 ('dragon', 0.6286692023277283),
 ('tiger', 0.6162548065185547),
 ('jaguar', 0.6116740107536316),
 ('tigress', 0.6096420884132385),
 ('demon', 0.6021472811698914),
 ('caged', 0.5965006351470947)]

In [114]:
w2v_model.wv.most_similar("beauty")

[('elegance', 0.6451990008354187),
 ('allure', 0.6288026571273804),
 ('loveliness', 0.6085167527198792),
 ('gentleness', 0.6060893535614014),
 ('ravishing', 0.6009306907653809),
 ('sensuous', 0.5986630916595459),
 ('lass', 0.5856353640556335),
 ('charm', 0.5812059640884399),
 ('sensuality', 0.5809239149093628),
 ('lyricism', 0.5799146294593811)]

In [115]:
# now doing a semantic test for actual book characters
w2v_model.wv.most_similar("harry")

[('beatrix', 0.7520222067832947),
 ('peter', 0.630151093006134),
 ('draco', 0.5766912698745728),
 ('dresden', 0.5766406059265137),
 ('hieronymus', 0.5762723684310913),
 ('tom', 0.5648345351219177),
 ('jack', 0.5640392899513245),
 ('charlie', 0.5528959035873413),
 ('drarry', 0.5491214394569397),
 ('snape', 0.5455459952354431)]

In [118]:
w2v_model.wv.most_similar("copperfield")

[('updike', 0.8069586753845215),
 ('thurber', 0.7918877601623535),
 ('morrell', 0.7728648781776428),
 ('gould', 0.7717663049697876),
 ('piccirilli', 0.7693771719932556),
 ('hogg', 0.7692300081253052),
 ('bowles', 0.7571479082107544),
 ('banville', 0.7553629279136658),
 ('lethem', 0.7543212175369263),
 ('straub', 0.749229371547699)]

In [113]:
# Here's an individual embedding!
vec_beauty = w2v_model.wv['beast']
vec_beauty

array([-0.19518006, -1.7672642 ,  0.5005682 ,  0.9529076 , -1.7250446 ,
       -2.6468692 ,  1.9986058 , -0.46701965,  0.2481065 , -0.6591733 ,
       -0.42162445,  1.3421237 ,  1.352153  ,  0.90025795,  0.43388948,
       -1.8020738 ,  1.3958081 , -1.2582821 , -2.0085955 ,  0.22990757,
        1.970394  ,  0.50289434,  1.8834186 , -0.38901907, -1.9855825 ,
        0.08820678,  0.32120776,  1.2121825 ,  0.6935328 , -0.39396322,
        2.7387917 , -0.18162552, -2.168393  ,  0.49018154, -0.88244426,
       -0.88038564, -5.278482  , -0.6654004 , -0.27781636,  0.69942164,
       -1.7041072 , -1.8432969 ,  0.8142995 , -0.15949471,  0.06960123,
       -0.7476442 , -1.2020489 ,  0.5902427 ,  0.04238603, -0.4309581 ,
        1.1691087 ,  0.18789019, -1.8825562 ,  1.7118117 , -0.25407398,
        2.582912  ,  0.5353555 , -1.3065976 ,  0.8374112 ,  0.29024607,
        1.3353986 , -3.106418  , -0.59596825, -0.7341918 , -1.0472411 ,
        2.7758048 , -2.7797241 , -0.5299576 , -1.3266624 ,  0.97

In [119]:
vocab_len = len(w2v_model.wv)
vocab_len

158872

In [121]:
# Let's look at the key-value mapping
import itertools

dict(itertools.islice(w2v_model.wv.key_to_index.items(), 20))

{'the': 0,
 'and': 1,
 'of': 2,
 'to': 3,
 'in': 4,
 'is': 5,
 'her': 6,
 'his': 7,
 'she': 8,
 'for': 9,
 'he': 10,
 'with': 11,
 'that': 12,
 'as': 13,
 'it': 14,
 'but': 15,
 'on': 16,
 'from': 17,
 'an': 18,
 'has': 19}

In [122]:
# enumerate the index/key of each word 

for index, word in enumerate(w2v_model.wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(w2v_model.wv.index_to_key)} is {word}")

word #0/158872 is the
word #1/158872 is and
word #2/158872 is of
word #3/158872 is to
word #4/158872 is in
word #5/158872 is is
word #6/158872 is her
word #7/158872 is his
word #8/158872 is she
word #9/158872 is for


In [None]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling

def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE and log out
    '''
    Expected logs: 
    [t-SNE] Computing 91 nearest neighbors...
    [t-SNE] Indexed 100 samples in 0.000s...
    [t-SNE] Computed neighbors for 100 samples in 0.005s...
    [t-SNE] Computed conditional probabilities for sample 100 / 100
    [t-SNE] Mean sigma: 1.000000
    [t-SNE] KL divergence after 250 iterations with early exaggeration: 55.965740
    [t-SNE] KL divergence after 1000 iterations: 0.386509
    '''
    tsne = TSNE(n_components=num_dimensions, random_state=0, verbose=2)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels

x_vals, y_vals, labels = reduce_dimensions(w2v_model)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 158872 samples in 0.006s...
[t-SNE] Computed neighbors for 158872 samples in 36.561s...
[t-SNE] Computed conditional probabilities for sample 1000 / 158872
[t-SNE] Computed conditional probabilities for sample 2000 / 158872
[t-SNE] Computed conditional probabilities for sample 3000 / 158872
[t-SNE] Computed conditional probabilities for sample 4000 / 158872
[t-SNE] Computed conditional probabilities for sample 5000 / 158872
[t-SNE] Computed conditional probabilities for sample 6000 / 158872
[t-SNE] Computed conditional probabilities for sample 7000 / 158872
[t-SNE] Computed conditional probabilities for sample 8000 / 158872
[t-SNE] Computed conditional probabilities for sample 9000 / 158872
[t-SNE] Computed conditional probabilities for sample 10000 / 158872
[t-SNE] Computed conditional probabilities for sample 11000 / 158872
[t-SNE] Computed conditional probabilities for sample 12000 / 158872
[t-SNE] Computed conditional probab

[t-SNE] Computed conditional probabilities for sample 121000 / 158872
[t-SNE] Computed conditional probabilities for sample 122000 / 158872
[t-SNE] Computed conditional probabilities for sample 123000 / 158872
[t-SNE] Computed conditional probabilities for sample 124000 / 158872
[t-SNE] Computed conditional probabilities for sample 125000 / 158872
[t-SNE] Computed conditional probabilities for sample 126000 / 158872
[t-SNE] Computed conditional probabilities for sample 127000 / 158872
[t-SNE] Computed conditional probabilities for sample 128000 / 158872
[t-SNE] Computed conditional probabilities for sample 129000 / 158872
[t-SNE] Computed conditional probabilities for sample 130000 / 158872
[t-SNE] Computed conditional probabilities for sample 131000 / 158872
[t-SNE] Computed conditional probabilities for sample 132000 / 158872
[t-SNE] Computed conditional probabilities for sample 133000 / 158872
[t-SNE] Computed conditional probabilities for sample 134000 / 158872
[t-SNE] Computed con

In [None]:
def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)