<a href="https://colab.research.google.com/github/kishore145/Mathematics/blob/master/01_Linear_Algebra_Vectors_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Vectors
Create simple vectors using numpy and tensorflow

In [2]:
# Import dependencies
import numpy as np
import tensorflow as tf

In [3]:
# A vector with three components, stored as a 1-D NumPy array
a = np.array((2, 1, 0))
a

array([2, 1, 0])

In [4]:
# A vector with four components, stored as a 1-D NumPy array
b = np.array((2, 1, 3, 4))
b

array([2, 1, 3, 4])

In [5]:
# The same three-component vector, this time held in a TensorFlow variable
a_tf = tf.Variable(initial_value=[2, 1, 0])
a_tf

<tf.Variable 'Variable:0' shape=(3,) dtype=int32, numpy=array([2, 1, 0], dtype=int32)>

In [6]:
# The four-component vector, held in a TensorFlow variable
b_tf = tf.Variable(initial_value=[2, 1, 3, 4])
b_tf

<tf.Variable 'Variable:0' shape=(4,) dtype=int32, numpy=array([2, 1, 3, 4], dtype=int32)>

# Vector Addition and Subtraction

In [7]:
# Two equal-length NumPy vectors used as operands in the arithmetic cells below.
# Note: these rebind the names `a` and `b` introduced earlier in the notebook.
a, b = np.array((2, 1, 3, 4)), np.array((1, 3, 5, 7))

In [8]:
# Vector addition: components are summed position by position
a + b

array([ 3,  4,  8, 11])

In [9]:
# Vector subtraction: components are subtracted position by position
a - b

array([ 1, -2, -2, -3])

In [10]:
# Scalar multiplication: every component of the vector is multiplied by 3
3 * a 

array([ 6,  3,  9, 12])

In [11]:
# TensorFlow counterparts of the two NumPy operand vectors.
# Note: these rebind the names `a_tf` and `b_tf` introduced earlier in the notebook.
a_tf = tf.Variable(initial_value=[2, 1, 3, 4])
b_tf = tf.Variable(initial_value=[1, 3, 5, 7])

In [12]:
# Element-wise addition of two variables; the result is a tf.Tensor, not a tf.Variable
a_tf + b_tf

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 3,  4,  8, 11], dtype=int32)>

In [13]:
# Element-wise subtraction of two variables; the result is a tf.Tensor
a_tf - b_tf

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 1, -2, -2, -3], dtype=int32)>

In [14]:
# Scalar multiplication: every component is multiplied by 3; the result is a tf.Tensor
3 * a_tf

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([ 6,  3,  9, 12], dtype=int32)>

# word2vec
Implementing word2vec to show practical use of vectors

Original paper:  
https://arxiv.org/pdf/1301.3781.pdf


In [15]:
# Import dependencies for creating word vectors
import nltk
import gensim
from gensim.models.word2vec import Word2Vec

In [16]:
# Import dependencies to plot word vectors visually
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure

In [17]:
# Load data: download the Project Gutenberg sample corpus into the local NLTK data directory
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [18]:
from nltk.corpus import gutenberg # Classic english books from project gutenberg

In [21]:
# punkt is the English-language sentence tokenizer. Sentence splitting is non-trivial:
# not all periods end sentences, and not all sentences start with a capital letter.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [22]:
# gutenberg.sents() is a convenience method: in one shot it handles newlines and
# tokenizes the corpus into sentences, each of which is a list of word tokens.
gberg_sents = gutenberg.sents()
# Peek at the first three sentences
gberg_sents[0:3]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I']]

In [23]:
# Fit a word2vec model to the tokenized Gutenberg sentences.
# NOTE(review): training uses several worker threads, so the learned vectors (and every
# similarity score shown further down) will presumably differ slightly from run to run.
model = Word2Vec(
    sentences=gberg_sents,
    size=64,        # length of each learned word vector
    sg=1,           # 1 = skip-gram architecture (0 would select CBOW)
    window=10,      # how far from the target word the context may reach
    min_count=5,    # words rarer than this are left out of the vocabulary
    workers=8,      # number of training threads
)

In [24]:
# Explore model outputs.
# Looking a word up in model.wv returns its learned vector: "dog" is now
# represented by 64 floating-point numbers (the `size` chosen at training time).
model.wv['dog']

array([-0.16563535,  0.22381859,  0.2872279 , -0.5330074 , -0.0824253 ,
        0.1763634 ,  0.4776888 , -0.09288356, -0.02728338,  0.30896917,
        0.5346775 , -0.03481779, -0.2843367 , -0.29405335, -0.5039348 ,
        0.00195951, -0.34182894,  0.11133863,  0.12136966,  0.03606385,
        0.27968678, -0.3153902 ,  0.26258835,  0.54028714, -0.5804574 ,
       -0.34091175,  0.47754934,  0.01064353, -0.40600032, -0.17685528,
       -0.12168735, -0.41927198, -0.18920766, -0.09919676, -0.08705259,
       -0.4067302 ,  0.16825074, -0.23447332,  0.09267248, -0.02598699,
        0.6039108 , -0.25069472, -0.00891941,  0.03545779, -0.22897191,
        0.3597601 , -0.45064688, -0.33651102,  0.21701235, -0.11734752,
       -0.1966571 ,  0.26342654, -0.10768716, -0.3662372 ,  0.45812222,
       -0.06235375,  0.13271028, -0.13758616,  0.4992194 , -0.21861322,
       -0.14679825, -0.14000617,  0.01914772,  0.25202295], dtype=float32)

In [25]:
# Number of dimensions of the vector for "dog"; matches the size=64 passed to Word2Vec
len(model.wv['dog'])

64

In [26]:
# Once words are represented as vectors, you can do interesting things with them,
# like finding similar words. The neighbours are found on the fly from the word
# vectors learned on the Gutenberg corpus. Each result is a (word, similarity score)
# pair, listed from most to least similar.
model.wv.most_similar(positive=['dog'])

  if np.issubdtype(vec.dtype, np.int):


[('puppy', 0.8269394636154175),
 ('chimney', 0.7919608354568481),
 ('sweeper', 0.7804093360900879),
 ('cage', 0.7706351280212402),
 ('thief', 0.7697631120681763),
 ('boy', 0.7695230841636658),
 ('broth', 0.7588620185852051),
 ('wid', 0.758419394493103),
 ('whip', 0.7565804719924927),
 ('gallon', 0.7409327030181885)]

In [27]:
# Nearest neighbours of a verb
model.wv.most_similar(positive=['think'])

  if np.issubdtype(vec.dtype, np.int):


[('contradict', 0.8526637554168701),
 ('suppose', 0.8444442749023438),
 ('manage', 0.8346689939498901),
 ('believe', 0.8254157304763794),
 ('Mamma', 0.8241066336631775),
 ('behave', 0.822253942489624),
 ('awfully', 0.8083606958389282),
 ('NOW', 0.806320071220398),
 ('really', 0.804284930229187),
 ('downright', 0.8024898767471313)]

In [31]:
# Nearest neighbours of a time-related noun
model.wv.most_similar(positive=['day'])

  if np.issubdtype(vec.dtype, np.int):


[('morning', 0.791458010673523),
 ('night', 0.77092045545578),
 ('month', 0.7374434471130371),
 ('time', 0.7183862924575806),
 ('week', 0.7064138054847717),
 ('evening', 0.6713027954101562),
 ('Saturday', 0.6698338985443115),
 ('Adar', 0.6594184041023254),
 ('sabbath', 0.653235137462616),
 ('feasting', 0.6524351835250854)]

In [32]:
# Nearest neighbours of a family-relation noun
model.wv.most_similar(positive=['father'])

  if np.issubdtype(vec.dtype, np.int):


[('mother', 0.8752946853637695),
 ('brother', 0.8497220873832703),
 ('sister', 0.8027050495147705),
 ('wife', 0.7820793390274048),
 ('daughter', 0.7738239169120789),
 ('Amnon', 0.7503934502601624),
 ('Tamar', 0.7226407527923584),
 ('servant', 0.7184084057807922),
 ('uncle', 0.7183826565742493),
 ('younger', 0.716156542301178)]

In [33]:
# Ask the model which of the four words is the odd one out
model.wv.doesnt_match("mother father daughter dog".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'dog'

In [34]:
# Similarity score between two specific words, returned as a single float
model.wv.similarity('father', 'dog')

  if np.issubdtype(vec.dtype, np.int):


0.45768186

In [28]:
# Word-vector arithmetic: check whether word relations survive in the vector representation.
# father + woman - man  ->  expected nearest word: mother
model.wv.most_similar(
    positive=['father', 'woman'],
    negative=['man'],
)

  if np.issubdtype(vec.dtype, np.int):


[('mother', 0.8021386861801147),
 ('daughter', 0.7759180068969727),
 ('wife', 0.7727994918823242),
 ('husband', 0.7648909091949463),
 ('sister', 0.7559406757354736),
 ('brother', 0.7406525611877441),
 ('Rachel', 0.6969828605651855),
 ('Tamar', 0.6896173357963562),
 ('Sarai', 0.6816429495811462),
 ('Sarah', 0.6799650192260742)]

In [29]:
# son + woman - man  ->  expected nearest word: daughter
model.wv.most_similar(
    positive=['son', 'woman'],
    negative=['man'],
)

  if np.issubdtype(vec.dtype, np.int):


[('daughter', 0.7504309415817261),
 ('wife', 0.7419352531433105),
 ('Leah', 0.7360500693321228),
 ('Sarai', 0.7295117378234863),
 ('Sarah', 0.7213910818099976),
 ('Bethuel', 0.717557966709137),
 ('Abram', 0.7154536247253418),
 ('Hagar', 0.7142410278320312),
 ('Rachel', 0.7128888368606567),
 ('Onan', 0.7062435150146484)]

In [30]:
# husband + woman - man  ->  expected nearest word: wife
model.wv.most_similar(
    positive=['husband', 'woman'],
    negative=['man'],
)

  if np.issubdtype(vec.dtype, np.int):


[('wife', 0.7377575635910034),
 ('daughter', 0.6998690962791443),
 ('mother', 0.688650906085968),
 ('widow', 0.6833479404449463),
 ('conceived', 0.6830436587333679),
 ('sister', 0.6826539039611816),
 ('child', 0.6753749847412109),
 ('maid', 0.6712912321090698),
 ('nurse', 0.6483502388000488),
 ('Rachel', 0.6433163285255432)]

In [35]:
# king + woman - man  ->  expected nearest word: queen
# Not a great prediction on this corpus: the list is widened to the top 30 results
# so that "queen" can be looked for further down the ranking.
model.wv.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
    topn=30,
)

  if np.issubdtype(vec.dtype, np.int):


[('Sarah', 0.7133827209472656),
 ('Rachel', 0.686896026134491),
 ('Abram', 0.6860709190368652),
 ('Bethuel', 0.6688151955604553),
 ('Leah', 0.6674652695655823),
 ('Babylon', 0.6648784875869751),
 ('Judah', 0.6607108116149902),
 ('Hagar', 0.6606854200363159),
 ('Padanaram', 0.6586019992828369),
 ('Sarai', 0.6581153869628906),
 ('David', 0.6550211310386658),
 ('Hanun', 0.6543503999710083),
 ('Laban', 0.6539816856384277),
 ('tribute', 0.6465175747871399),
 ('daughter', 0.6433618068695068),
 ('Lot', 0.6427867412567139),
 ('Cain', 0.6420150995254517),
 ('Onan', 0.6417611837387085),
 ('Esther', 0.6378758549690247),
 ('household', 0.6378402709960938),
 ('Jerubbaal', 0.6367617845535278),
 ('Bilhah', 0.6363285183906555),
 ('Jerusalem', 0.636083722114563),
 ('Rahab', 0.6351947784423828),
 ('Rebekah', 0.633100688457489),
 ('Ephron', 0.6315991878509521),
 ('damsel', 0.6311953067779541),
 ('queen', 0.6299747228622437),
 ('Solomon', 0.6288984417915344),
 ('Pharaoh', 0.6277692317962646)]

# Visualize word vectors  
Use t-SNE to reduce the dimensionality of the word vectors from 64 to 2 so that we can plot them and build more intuition.


In [36]:
# Number of distinct words the model learned a vector for
len(model.wv.vocab)

17011

In [37]:
# Collect the vectors of all vocabulary words into one array for t-SNE.
# NOTE(review): rows presumably follow the iteration order of model.wv.vocab; the
# token labels attached later rely on that same ordering - confirm if this is changed.
X = model.wv[model.wv.vocab]

In [38]:
# Configure t-SNE to project the word vectors down to 2 dimensions.
# n_iter is the maximum number of optimisation iterations: scikit-learn's default is
# 1000 and it should be at least 250.
# random_state is fixed so that, for a given set of input vectors, the 2-D layout is
# the same on every Restart & Run All instead of changing with each execution.
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)

In [39]:
# Run t-SNE: each input word vector is mapped to a 2-D point
X_2d = tsne.fit_transform(X)

In [40]:
# Inspect the first five projected points
X_2d[:5]

array([[ 16.941874,  56.10478 ],
       [-44.798504,  43.040802],
       [-37.59002 ,  -9.568027],
       [-47.438976,  41.21396 ],
       [ 16.960491,  56.021297]], dtype=float32)

In [41]:
# create DataFrame for storing results and plotting
coords_df = pd.DataFrame(X_2d, columns=['x','y'])
coords_df['token'] = model.wv.vocab.keys()

In [42]:
# Sanity-check the first rows: x, y coordinates plus the token they belong to
coords_df.head()

Unnamed: 0,x,y,token
0,16.941874,56.104778,[
1,-44.798504,43.040802,Emma
2,-37.590019,-9.568027,by
3,-47.438976,41.213959,Jane
4,16.960491,56.021297,]


In [43]:
# Tell bokeh to render plots inline in the notebook
output_notebook()

In [44]:
# Plotting every token in the vocabulary would be unreadable, so draw a random
# sample of 500 rows. random_state pins the draw so that the same rows are selected
# on every run instead of a different subset each time the cell executes.
subset_df = coords_df.sample(n=500, random_state=42)

In [45]:
# Draw each sampled token as a text label at its 2-D t-SNE position.
# Assigning the return value to `_` keeps it out of the cell output.
p = figure(plot_height=800, plot_width=800)
_ = p.text(text=subset_df['token'], x=subset_df['x'], y=subset_df['y'])

In [46]:
# Render the figure of sampled tokens
show(p)

In [50]:
# Narrow the plot down to the family/royalty words used in the analogy queries
selected_tokens = [
    'father', 'husband', 'king', 'man', 'son',
    'mother', 'wife', 'queen', 'woman', 'daughter',
]
selected_coords = coords_df.loc[coords_df['token'].isin(selected_tokens)]

In [53]:
# Draw only the selected tokens as text labels on a smaller canvas.
# Note: this rebinds `p`, replacing the earlier figure of sampled tokens.
p = figure(plot_height=500, plot_width=500)
_ = p.text(
    text=selected_coords['token'],
    x=selected_coords['x'],
    y=selected_coords['y'],
)

In [54]:
# Render the figure of selected tokens
show(p)

In [None]:
# Code is based on Jon Krohn's lectures on Linear Algebra and Natural Language Processing.
# You can check out the original repo on GitHub under the user JonKrohn.