## NLP with autoencoder using word2vec 

### word2vec: a type of autoencoder used to compress multidimensional text data into smaller size vectors that can be used for text classification

In [1]:
import numpy as np
np.random.seed(1)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import OneHotEncoder


In [42]:
# define documents
docs = ['king is man',
'a king marrys queen',
'a queen marrys king',
'some unrelated words'
'queen is woman']
docs

['king is man',
 'a king marrys queen',
 'a queen marrys king',
 'some unrelated wordsqueen is woman']

In [43]:
vocab_size = 50

In [44]:
oh = np.array(one_hot('king queen man woman unrelated', vocab_size)).reshape(-1,1)
print(oh.shape)
oh

(5, 1)


array([[40],
       [11],
       [20],
       [31],
       [10]])

In [45]:
enc = OneHotEncoder()
enc.fit(np.array(range(50)).reshape(-1,1))
oh_enc = enc.transform(oh).toarray()
print(oh_enc.shape)
oh_enc

(5, 50)


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 

In [46]:
encoded_docs = [one_hot(d, vocab_size) for d in docs]
encoded_docs 

[[40, 19, 20], [1, 40, 23, 11], [1, 11, 23, 40], [39, 10, 6, 19, 31]]

In [47]:
max_length = 24
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs.shape)
padded_docs 

(4, 24)


array([[40, 19, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 40, 23, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 11, 23, 40,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [39, 10,  6, 19, 31,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)

### Create array of tuples which contain embedding of neighboring words for each word. This is done for two preceding and succeeding neighbors 

In [48]:
tuples = np.empty((0, 2))
for padded_doc in padded_docs:
    length = len(padded_doc)
    for i in range(length):
        if padded_doc[i] != 0:
            if i<length-1 & padded_doc[i+1] != 0:
                tuples = np.append(tuples, [[padded_doc[i],padded_doc[i+1]]], axis=0) 
                if i<length-2 & padded_doc[i+2] != 0:
                    tuples = np.append(tuples, [[padded_doc[i],padded_doc[i+2]]], axis=0) 
            if i > 0:
                tuples = np.append(tuples, [[padded_doc[i],padded_doc[i-1]]], axis=0) 
                if i > 1:
                    tuples = np.append(tuples, [[padded_doc[i],padded_doc[i-2]]], axis=0) 

    
print(tuples.shape)
tuples

(36, 2)


array([[40., 19.],
       [40., 20.],
       [19., 20.],
       [19., 40.],
       [20., 19.],
       [20., 40.],
       [40., 23.],
       [40., 11.],
       [40.,  1.],
       [23., 11.],
       [23., 40.],
       [23.,  1.],
       [11., 23.],
       [11., 40.],
       [ 1., 11.],
       [ 1., 23.],
       [11., 23.],
       [11.,  1.],
       [23., 11.],
       [23.,  1.],
       [40., 23.],
       [40., 11.],
       [39., 10.],
       [39.,  6.],
       [10.,  6.],
       [10., 19.],
       [10., 39.],
       [ 6., 19.],
       [ 6., 31.],
       [ 6., 10.],
       [ 6., 39.],
       [19., 31.],
       [19.,  6.],
       [19., 10.],
       [31., 19.],
       [31.,  6.]])

In [49]:
# features: input of NN
onehotlabels_x = enc.transform(tuples[:,0].reshape(-1, 1)).toarray()

print(onehotlabels_x.shape)
print('---')
print(onehotlabels_x)
print('---')
print(onehotlabels_x[0])

(36, 50)
---
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
---
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]


In [50]:
# targets: output of NN (what we want to predict)
onehotlabels_y = enc.transform(tuples[:,1].reshape(-1, 1)).toarray()

print(onehotlabels_y.shape)
print('---')
print(onehotlabels_y)
print('---')
print(onehotlabels_y[0])

(36, 50)
---
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
---
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]


### Build autoencoder

In [51]:
model = Sequential()

input = Dense(50, input_shape=(50,), activation='relu')
model.add(input)
bottleneck = Dense(2, activation='relu')
model.add(bottleneck)
model.add(Dense(50, activation='softmax')) # we are mapping sparse vectors (of dimension vocab_size) to sparse vectors
# compile the model
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

# fit the model
model.fit(onehotlabels_x, onehotlabels_y, epochs=500, verbose=1)

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_9 (Dense)              (None, 2)                 102       
_________________________________________________________________
dense_10 (Dense)             (None, 50)                150       
Total params: 2,802
Trainable params: 2,802
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/

Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 227/500
Epoch 228/500
Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 234/500
Epoch 235/500
Epoch 236/500
Epoch 237/500
Epoch 238/500
Epoch 

Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 394/500
Epoch 395/500
Epoch 396/500
Epoch 397/500
Epoch 398/500
Epoch 399/500
Epoch 400/500
Epoch 401/500
Epoch 402/500
Epoch 403/500
Epoch 404/500
Epoch 405/500
Epoch 406/500
Epoch 407/500
Epoch 408/500
Epoch 409/500
Epoch 410/500
Epoch 

<tensorflow.python.keras.callbacks.History at 0x7f1ec8129a90>

### Build a new NN reusing two of the three trained layers in the previous NN

In [52]:
model_2 = Sequential()
model_2.add(input)
model_2.add(bottleneck)
model_2.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['acc'])
pred = model_2.predict(oh_enc)
print(pred)

[[ 7.992171   6.157775 ]
 [ 7.9900002  9.913249 ]
 [ 3.4434202 11.900734 ]
 [ 2.4878886 13.888353 ]
 [ 0.        11.007908 ]]


In [14]:
model = Sequential()

input = Dense(50, input_shape=(50,), activation='relu')
model.add(input)
bottleneck = Dense(2, activation='relu')
model.add(bottleneck)
model.add(Dense(50, activation='softmax')) # we are mapping sparse vectors (of dimension vocab_size) to sparse vectors
# compile the model
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

# fit the model
model.fit(onehotlabels_x, onehotlabels_y, epochs=500, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 102       
_________________________________________________________________
dense_3 (Dense)              (None, 50)                150       
Total params: 2,802
Trainable params: 2,802
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch

<keras.callbacks.History at 0x7f923fee19e8>

In [15]:
model = Sequential()
model.add(input)
model.add(bottleneck)
model.compile(optimizer=Adam(lr=0.001), loss='binary_crossentropy', metrics=['acc'])
test = model.predict(oh_enc)
print (test)


[[  0.          13.04530525]
 [  0.          14.59268951]
 [  0.          10.17055321]
 [  0.           6.23393059]
 [  0.           6.23393154]]


### Keras Embedding layer supports this complex functionality. It takes an input of certain dimension and and creates a low dimensional representation of it

In [22]:
model_3 = Sequential()
model_3.add(Embedding(1000, 2, input_length=5)) # we use wocabulary of 1000 possible words. High dimensional space is compressed to dimensionality or size 2
model_3.compile('rmsprop', 'mse')

In [20]:
input_array = np.random.randint(1000, size=(12, 5)) # 12 documents or sentences, 5 words per document
print(input_array.shape)
print(input_array)

(12, 5)
[[454 917 561 313 515]
 [964 792 497  43 588]
 [ 26 820 336 621 883]
 [297 466  15  64 196]
 [ 25 367 738 471 903]
 [282 665 616  22 777]
 [707 999 126 279 381]
 [356 155 933 313 595]
 [166 648 288 418 778]
 [279 655 751  87 793]
 [967 243 348 586 190]
 [302 928 728 151 695]]


In [23]:
output_array = model_3.predict(input_array)
print(output_array.shape)
print(output_array) # shape: (sentences, words x sentence, word representation of size 2)

(12, 5, 2)
[[[ 0.00873353  0.01683189]
  [-0.03336556 -0.00220304]
  [ 0.00740368  0.01604131]
  [-0.02312704  0.03433776]
  [ 0.00271295  0.00062187]]

 [[ 0.020719    0.04488344]
  [-0.00919532  0.03901578]
  [ 0.00820915  0.04683762]
  [-0.024279   -0.03038523]
  [ 0.02864711 -0.03809894]]

 [[ 0.00908076  0.01814784]
  [-0.02510971  0.01840098]
  [-0.04452644 -0.03145589]
  [ 0.01004372 -0.02500057]
  [ 0.04485198 -0.04449007]]

 [[-0.01882191  0.00446071]
  [ 0.04686885 -0.01876297]
  [-0.00302432 -0.04422064]
  [ 0.03549853 -0.00495008]
  [-0.02539498 -0.00483633]]

 [[-0.00767629  0.04864499]
  [-0.03129484  0.04894128]
  [-0.02639418  0.04804343]
  [ 0.04755181  0.0202137 ]
  [-0.04303333  0.03432877]]

 [[-0.00220926  0.00394775]
  [ 0.04777471 -0.03969891]
  [-0.00393724 -0.00980663]
  [ 0.03202167  0.01493868]
  [-0.03468503  0.03767708]]

 [[ 0.00543623 -0.01748652]
  [ 0.0181114   0.0130722 ]
  [ 0.02840662 -0.0127637 ]
  [ 0.01980129 -0.01787628]
  [ 0.04474458  0.0380576

In [31]:
# define documents
docs = ['Well done!',
'Good work',
'Great effort',
'nice work',
'Excellent!',
'Really Weak',
'Poor effort!',
'not good',
'poor work',
'Could have done better.']
# define class labels
# labels = [1,1,1,1,1,0,0,0,0,0]
labels = np.array([1,1,1,1,1,0,0,0,0,0])

In [25]:
# integer encode the documents
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

[[31, 13], [11, 1], [15, 13], [18, 1], [46], [13, 3], [30, 13], [39, 11], [30, 1], [22, 5, 13, 46]]


In [38]:
# pad documents to a max length of 4 words

max_length = 4 # longest sentence is of length 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(type(padded_docs))
print(padded_docs.shape)
print(padded_docs)

<class 'numpy.ndarray'>
(10, 4)
[[31 13  0  0]
 [11  1  0  0]
 [15 13  0  0]
 [18  1  0  0]
 [46  0  0  0]
 [13  3  0  0]
 [30 13  0  0]
 [39 11  0  0]
 [30  1  0  0]
 [22  5 13 46]]


In [35]:
model_4 = Sequential()
model_4.add(Embedding(vocab_size, 8, input_length=max_length))
model_4.add(Flatten())
model_4.add(Dense(1, activation='sigmoid'))
# compile the model
model_4.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model_4.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_3 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None


In [36]:
# fit the model
model_4.fit(padded_docs, labels, epochs=100, verbose=0)
# evaluate the model
loss, accuracy = model_4.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 89.999998


In [40]:
np.hstack((model_4.predict(padded_docs), np.array(labels).reshape(10,1)))

array([[0.58643997, 1.        ],
       [0.6190235 , 1.        ],
       [0.60626405, 1.        ],
       [0.65018296, 1.        ],
       [0.60447013, 1.        ],
       [0.38351047, 0.        ],
       [0.4628529 , 0.        ],
       [0.41842979, 0.        ],
       [0.5138579 , 0.        ],
       [0.26797736, 0.        ]])

In [41]:
model_5 = Sequential()
model_5.add(Embedding(vocab_size, 4, input_length=max_length))
model_5.add(Flatten())
model_5.add(Dense(1, activation='sigmoid'))
# compile the model
model_5.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
# print(model_5.summary())
# fit the model
model_5.fit(padded_docs, labels, epochs=100, verbose=0)
# evaluate the model
loss, accuracy = model_5.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))
# print predictions and labels
np.hstack((model_5.predict(padded_docs), np.array(labels).reshape(10,1)))

Accuracy: 89.999998


array([[0.54626375, 1.        ],
       [0.57243031, 1.        ],
       [0.5442698 , 1.        ],
       [0.58779037, 1.        ],
       [0.56310326, 1.        ],
       [0.44660532, 0.        ],
       [0.4668698 , 0.        ],
       [0.48059812, 0.        ],
       [0.51284158, 0.        ],
       [0.35799441, 0.        ]])