# Word To Vector Approach

## Data Collection

In [2]:
# Toy corpus: a handful of short sentences used to learn 2-D word embeddings.
corpus = [
    # fruit / juice sentences
    'I like apple juice',
    'I like orange juice',
    # royalty, gender and age sentences
    'king is a strong man',
    'queen is a wise woman',
    'boy is a young man',
    'girl is a young woman',
    'prince is a young king',
    'princess is a young queen',
    'man is strong',
    'woman is pretty',
    'prince is a boy will be king',
    'princess is a girl will be queen',
    # capitalised "Apple" is a different token from "apple" above
    'Apple is good place for work',
]

## Remove Stop Words
Preprocessing steps

In [3]:
def remove_stop_words(corpus, stop_words=None):
    """Return a copy of `corpus` with every stop word removed from each sentence.

    Parameters
    ----------
    corpus : iterable of str
        Sentences whose tokens are separated by single spaces.
    stop_words : iterable of str, optional
        Tokens to drop. Defaults to ['is', 'a', 'will', 'be'].
        Matching is exact and case-sensitive.

    Returns
    -------
    list of str
        One string per input sentence, with the remaining tokens re-joined
        by single spaces. A sentence made only of stop words becomes ''.
    """
    if stop_words is None:
        stop_words = ['is', 'a', 'will', 'be']
    blocked = set(stop_words)

    results = []
    for text in corpus:
        # Filter token by token so that *every* occurrence of a stop word is
        # dropped; list.remove() would only delete the first occurrence.
        kept = [token for token in text.split(' ') if token not in blocked]
        results.append(" ".join(kept))
    return results

In [4]:
# Strip the stop words from every sentence.
# Note: this rebinds `corpus`, so from here on the name refers to the
# filtered sentences and the original text is no longer available.
corpus = remove_stop_words(corpus)
corpus

['I like apple juice',
 'I like orange juice',
 'king strong man',
 'queen wise woman',
 'boy young man',
 'girl young woman',
 'prince young king',
 'princess young queen',
 'man strong',
 'woman pretty',
 'prince boy king',
 'princess girl queen',
 'Apple good place for work']

In [5]:
# Flatten the corpus into one list of tokens, in reading order.
# Repeated words are kept, so this is a token list rather than a vocabulary.
words = [token for line in corpus for token in line.split(' ')]
words

['I',
 'like',
 'apple',
 'juice',
 'I',
 'like',
 'orange',
 'juice',
 'king',
 'strong',
 'man',
 'queen',
 'wise',
 'woman',
 'boy',
 'young',
 'man',
 'girl',
 'young',
 'woman',
 'prince',
 'young',
 'king',
 'princess',
 'young',
 'queen',
 'man',
 'strong',
 'woman',
 'pretty',
 'prince',
 'boy',
 'king',
 'princess',
 'girl',
 'queen',
 'Apple',
 'good',
 'place',
 'for',
 'work']

# Data Generation
Skip-Gram

In [6]:
# Map each word to an integer id taken from its position in `words`.
# When a word occurs more than once, the later position overwrites the
# earlier one, so every word ends up with the index of its LAST occurrence.
word2int = {token: position for position, token in enumerate(words)}

# Tokenise every sentence of the corpus (whitespace split).
sentences = [line.split() for line in corpus]

# Context radius: up to 2 words to the left and 2 to the right of the centre word.
WINDOW_SIZE = 2

# Skip-gram training pairs: [centre word, neighbouring word].
data = []
for tokens in sentences:
    for centre_pos, centre in enumerate(tokens):
        first = max(centre_pos - WINDOW_SIZE, 0)
        last = min(centre_pos + WINDOW_SIZE, len(tokens)) + 1
        for context in tokens[first:last]:
            # Skips the centre word itself (and any identical word inside the window).
            if context != centre:
                data.append([centre, context])

In [7]:
import pandas as pd

# One row per skip-gram pair: 'input' is the centre word, 'label' is its neighbour.
df = pd.DataFrame(data=data, columns=['input', 'label'])
df

Unnamed: 0,input,label
0,I,like
1,I,apple
2,like,I
3,like,apple
4,like,juice
...,...,...
81,for,good
82,for,place
83,for,work
84,work,place


In [8]:
# Preview the first 10 (input, label) training pairs.
df.head(10)

Unnamed: 0,input,label
0,I,like
1,I,apple
2,like,I
3,like,apple
4,like,juice
5,apple,I
6,apple,like
7,apple,juice
8,juice,like
9,juice,apple


In [9]:
# Number of entries in `words` (this value is later used as the one-hot width).
len(words)

41

# Define Tensorflow Graph

In [None]:
import tensorflow as tf
import numpy as np

# Width of every one-hot vector: one slot per entry of `words`.
ONE_HOT_DIM = len(words)


def to_one_hot_encoding(data_point_index):
    """Return a float vector of length ONE_HOT_DIM that is 1 at `data_point_index` and 0 elsewhere."""
    encoded = np.zeros(ONE_HOT_DIM)
    encoded[data_point_index] = 1
    return encoded


X = []  # one-hot vectors of the input words
Y = []  # one-hot vectors of the label (neighbour) words

# Walk the two columns in lockstep and encode both sides of each pair.
for x, y in df[['input', 'label']].itertuples(index=False, name=None):
    X.append(to_one_hot_encoding(word2int[x]))
    Y.append(to_one_hot_encoding(word2int[y]))

# Stack the per-pair vectors into 2-D arrays: one row per training pair.
X_train = np.array(X)
Y_train = np.array(Y)

In [None]:
# Inspect the one-hot vector of the first training input.
X_train[0]

In [None]:
# Show the one-hot vector width used for the network's input and output layers.
ONE_HOT_DIM

In [None]:
# TensorFlow 2
import tensorflow as tf

# Placeholders are graph-mode constructs, so eager execution (the TF 2 default)
# has to be switched off before creating them. TF 1 does not need this step.
tf.compat.v1.disable_eager_execution()

# Graph inputs, fed at run time with X_train / Y_train.
# The first dimension is None so any number of rows can be fed at once.
x = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, ONE_HOT_DIM])
y_label = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, ONE_HOT_DIM])

In [None]:

# Word embeddings are 2-dimensional so they can be plotted directly in 2-D.
EMBEDDING_DIM = 2

# Hidden layer: projects a one-hot input onto the embedding space.
# After training, its parameters serve as the word vectors.
W1 = tf.Variable(tf.random.normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random.normal([1]))  # single bias value, broadcast over both embedding dims
hidden_layer = tf.add(tf.matmul(x, W1), b1)

# Output layer: maps the embedding back to one score (logit) per one-hot slot.
W2 = tf.Variable(tf.random.normal([EMBEDDING_DIM, ONE_HOT_DIM]))
b2 = tf.Variable(tf.random.normal([1]))
logits = tf.add(tf.matmul(hidden_layer, W2), b2)
prediction = tf.nn.softmax(logits)

# Loss: mean cross entropy between the one-hot labels and the softmax output.
# It is computed from the logits with the fused op instead of
# log(softmax(...)): taking the log of an explicit softmax yields -inf/NaN as
# soon as a probability underflows to 0, whereas the fused op is numerically stable.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_label, logits=logits)
)

# Training operation: plain gradient descent with learning rate 0.05.
train_op = tf.compat.v1.train.GradientDescentOptimizer(0.05).minimize(loss)

# Train

In [None]:
# Open a session and initialise all graph variables.
sess = tf.compat.v1.Session()
init = tf.compat.v1.global_variables_initializer()
sess.run(init)

# The full training set is fed on every step:
# inputs are the one-hot input words, labels the one-hot neighbour words.
train_feed = {x: X_train, y_label: Y_train}

iteration = 20000
for step in range(iteration):
    sess.run(train_op, feed_dict=train_feed)
    # Report the loss (evaluated after the update) every 3000 steps.
    if step % 3000 == 0:
        current_loss = sess.run(loss, feed_dict=train_feed)
        print('iteration ' + str(step) + ' loss is : ', current_loss)

In [None]:
# For a one-hot input, x @ W1 + b1 is just one row of W1 plus b1, so evaluating
# W1 + b1 gives the hidden-layer output for every one-hot index at once:
# this is the word lookup table.
embedding_table = W1 + b1
vectors = sess.run(embedding_table)
print(vectors)

# Word2VecTable

In [None]:
# `data1` is not used in this cell; it is kept in case a cell outside this view reads it.
data1 = {}

# Build the lookup table with one row per distinct word.
# Row i of `vectors` belongs to the word whose one-hot index is i, and that
# index comes from `word2int`. Labelling rows by position in `words` would be
# wrong whenever `words` contains repeats: `word2int` keeps only one index per
# word, so the rows at the other positions are never trained.
vocab = list(word2int)
row_ids = [word2int[token] for token in vocab]

w2v_df = pd.DataFrame(vectors[row_ids], columns=['x1', 'x2'])
w2v_df.insert(0, 'word', vocab)  # column order: word, x1, x2
w2v_df