# Creating Word Vectors with word2vec
## by Wilder Rodrigues

https://medium.com/cityai/deep-learning-for-natural-language-processing-part-i-8369895ffb98

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/learning-stack/Colab-ML-Playbook/blob/master/NLP/Deep%20Learning%20for%20NLP%20-%20Part%201/word2vec.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/learning-stack/Colab-ML-Playbook/blob/master/NLP/Deep%20Learning%20for%20NLP%20-%20Part%201/word2vec.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

Let's start with NLTK

#### Load Dependencies

In [0]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline

In [0]:
# Fetch NLTK's 'punkt' resource (the Punkt models behind sent_tokenize / word_tokenize).
nltk.download('punkt')

#### Load Data

In [0]:
# Fetch the 'gutenberg' corpus so that nltk.corpus.gutenberg has files to read.
nltk.download('gutenberg')

In [0]:
from nltk.corpus import gutenberg

In [0]:
# List the file ids available in the downloaded corpus.
gutenberg.fileids()

#### Tokenize Text

In [0]:
# To keep resource usage down, only 12 of the 18 Gutenberg books are used.
selected_raw_text = gutenberg.raw(fileids=[
    'austen-sense.txt',
    'blake-poems.txt',
    'bryant-stories.txt',
    'burgess-busterbrown.txt',
    'carroll-alice.txt',
    'chesterton-ball.txt',
    'chesterton-brown.txt',
    'chesterton-thursday.txt',
    'edgeworth-parents.txt',
    'melville-moby_dick.txt',
    'milton-paradise.txt',
    'shakespeare-caesar.txt',
])

# Split the combined raw text of the selected books into sentence strings.
gberg_sent_tokens = sent_tokenize(selected_raw_text)

In [0]:
# Peek at the first five sentence strings.
gberg_sent_tokens[:5]

In [0]:
# Inspect a single sentence (index 1) as a raw string.
gberg_sent_tokens[1]

In [0]:
# Break the sentence at index 1 into word-level tokens.
word_tokenize(gberg_sent_tokens[1])

In [0]:
# Pick the token at position 14 of the tokenized sentence at index 1.
sentence_words = word_tokenize(gberg_sent_tokens[1])
sentence_words[14]

In [0]:
# To keep resource usage down, only 12 of the 18 Gutenberg books are used.
book_ids = [
    'austen-sense.txt',
    'blake-poems.txt',
    'bryant-stories.txt',
    'burgess-busterbrown.txt',
    'carroll-alice.txt',
    'chesterton-ball.txt',
    'chesterton-brown.txt',
    'chesterton-thursday.txt',
    'edgeworth-parents.txt',
    'melville-moby_dick.txt',
    'milton-paradise.txt',
    'shakespeare-caesar.txt',
]

# Sentences of the selected books, as provided by the corpus reader.
gberg_sents = gutenberg.sents(fileids=book_ids)

In [0]:
# Peek at the first five sentences from the corpus reader.
gberg_sents[:5]

In [0]:
# Element at position 14 of the sentence at index 4.
gberg_sents[4][14]

In [0]:
# Word view over the corpus; no fileids are passed, so this is not restricted to the 12-book subset.
gutenberg.words()

In [0]:
# To keep resource usage down, only 12 of the 18 Gutenberg books are used.
subset_words = gutenberg.words(fileids=[
    'austen-sense.txt',
    'blake-poems.txt',
    'bryant-stories.txt',
    'burgess-busterbrown.txt',
    'carroll-alice.txt',
    'chesterton-ball.txt',
    'chesterton-brown.txt',
    'chesterton-thursday.txt',
    'edgeworth-parents.txt',
    'melville-moby_dick.txt',
    'milton-paradise.txt',
    'shakespeare-caesar.txt',
])

# Total number of word tokens in the selected books.
len(subset_words)

#### Run Word2Vec

In [0]:
# Train word vectors on the tokenized sentences in `gberg_sents`.
# size == dimensions (each word vector has 64 components)
# sg=1: skip-gram training (per the gensim docs; 0 would be CBOW)
# window 10: 20 context words, 10 to the left and 10 to the right
# min_count=5: words seen fewer than 5 times are ignored
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 calls it `vector_size`) -- confirm the installed version.
# NOTE(review): with workers=2 the seed alone presumably does not make runs exactly repeatable -- confirm in the gensim docs.
model = Word2Vec(sentences=gberg_sents, size=64, sg=1, window=10, min_count=5, seed=42, workers=2)

In [0]:
# Saving the model is optional; it is written to disk here only to demonstrate save/load.
model.save('raw_gutenberg_model.w2v')

#### Explore the Model

In [0]:
# Reload the model from the file written by the save cell, replacing the in-memory `model`.
model = Word2Vec.load('raw_gutenberg_model.w2v')

In [0]:
# Vector for 'house', looked up through the KeyedVectors object (`model.wv`).
# Indexing the Word2Vec model directly is deprecated in gensim 3.x and removed in gensim 4.
model.wv['house']

In [0]:
# Length of a word vector (should equal the `size` the model was trained with).
# Uses `model.wv[...]`; direct indexing of the model is deprecated in gensim 3.x and removed in gensim 4.
len(model.wv['house'])

In [0]:
# Nearest neighbours of 'house' in the embedding space.
# Called on `model.wv`: the method on the model itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar('house')

In [0]:
# Nearest neighbours of 'think' in the embedding space.
# Called on `model.wv`: the method on the model itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar('think')

In [0]:
# Nearest neighbours of 'day' in the embedding space.
# Called on `model.wv`: the method on the model itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar('day')

In [0]:
# Nearest neighbours of 'father' in the embedding space.
# Called on `model.wv`: the method on the model itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar('father')

In [0]:
# Ask which of the four words fits least well with the others.
# Called on `model.wv`: the method on the model itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.doesnt_match('mother father daughter house'.split())

In [0]:
# Similarity score between the vectors of 'father' and 'mother'.
# Called on `model.wv`: the method on the model itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.similarity('father', 'mother')

In [0]:
# Analogy query: father - man + woman.
# Called on `model.wv`: the method on the model itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

In [0]:
# Analogy query: son - man + woman.
# Called on `model.wv`: the method on the model itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar(positive=['son', 'woman'], negative=['man'])

In [0]:
# Analogy query: husband - man + woman.
# Called on `model.wv`: the method on the model itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

In [0]:
# Analogy query: king - man + woman, widened to the top 30 candidates.
# Called on `model.wv`: the method on the model itself is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=30)

#### Reduce word vector dimensionality with t-SNE

t-Distributed Stochastic Neighbor Embedding

In [0]:
# Number of entries in the trained model's vocabulary.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 reportedly replaced it with `key_to_index` -- confirm the installed version.
len(model.wv.vocab)

In [0]:
# Stack the vector of every vocabulary word into one matrix, in vocabulary iteration order.
# Uses `model.wv[...]`; direct indexing of the model is deprecated in gensim 3.x and removed in gensim 4.
X = model.wv[model.wv.vocab]

In [0]:
# 2-D t-SNE projection. `random_state` is fixed so the coordinates (and the CSV and plots
# derived from them) are repeatable across runs; without it every run gives a different layout.
# n_iter=250 is the low end of what scikit-learn documents for this parameter.
tsne = TSNE(n_components=2, n_iter=250, random_state=42)

In [0]:
# Fit t-SNE on the word-vector matrix and keep the resulting 2-D coordinates.
X_2d = tsne.fit_transform(X)

In [0]:
# One row per vocabulary word: its 2-D t-SNE coordinates plus the word itself.
coords_df = pd.DataFrame(X_2d, columns=['x', 'y'])
# Materialise the dict view as a list before assigning it: pandas' handling of
# `dict_keys` objects has varied between versions, a plain list is always accepted.
# The order matches the rows of X, which were built by iterating the same vocabulary.
coords_df['token'] = list(model.wv.vocab.keys())

In [0]:
# Preview the first rows of the coordinate table.
coords_df.head()

In [0]:
# Write the coordinates to CSV (index=False leaves the row index out of the file).
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

#### Visualise 2D representation of word vectors

In [0]:
# Reload the saved coordinates into `coords_df`, the name every following cell reads.
# (The original bound the result to a misspelled name, so the CSV was loaded but never used.)
coords_df = pd.read_csv('raw_gutenberg_tsne.csv')

In [0]:
# Preview the first rows of the coordinate table before plotting.
coords_df.head()

In [0]:
# Static scatter plot of every word's 2-D coordinates; small, mostly transparent
# markers keep dense regions readable. The result is bound to `_` to hide the Axes repr.
_ = coords_df.plot.scatter(
    x='x',
    y='y',
    figsize=(8, 8),
    marker='o',
    s=10,
    alpha=0.2,
)

In [0]:
# Configure Bokeh to render its output inline in the notebook.
output_notebook()

In [0]:
# Plot only a random subset of words so the text labels stay legible.
# The size is capped at the table length (sampling 1000 rows without replacement
# from a smaller table raises), and the seed is fixed so the same words are drawn each run.
subset_df = coords_df.sample(n=min(1000, len(coords_df)), random_state=42)

In [0]:
# 600x600 Bokeh canvas with each sampled token drawn as text at its 2-D position.
# NOTE(review): newer Bokeh releases reportedly use `width`/`height` instead of
# `plot_width`/`plot_height` -- confirm against the installed version.
p = figure(plot_width=600, plot_height=600)
p.text(
    x=subset_df['x'],
    y=subset_df['y'],
    text=subset_df['token'],
)

In [0]:
# Display the Bokeh figure.
show(p)