In [9]:
import tensorflow as tf
# Report which TensorFlow build this notebook is running against.
print(tf.__version__)
# Switch TensorFlow to eager execution for the rest of the session.
# NOTE(review): the captured output shows 1.15.0; this top-level spelling may
# not exist on other major versions — confirm before upgrading.
tf.enable_eager_execution()


1.15.0


In [0]:
import tensorflow_datasets as tfds
# Fetch the IMDB reviews dataset together with its metadata object.
# as_supervised=True is requested so that examples come back as pairs
# (the later cells unpack each example into two values).
imdb, info = tfds.load(
    name='imdb_reviews',
    as_supervised=True,
    with_info=True,
)


In [0]:
import numpy as np

train_data,test_data = imdb['train'],imdb['test']


def _split_examples(dataset):
  """Collect the (text, label) pairs of `dataset` into two parallel lists.

  Each text tensor is converted with `.numpy()` to a bytes object and then
  decoded as UTF-8. Calling `str()` on the bytes instead would yield the
  literal representation ("b'...'" with escaped bytes), which pollutes the
  vocabulary that the tokenizer builds later.

  Args:
    dataset: iterable yielding (text_tensor, label_tensor) pairs.

  Returns:
    A tuple `(sentences, labels)`: a list of `str` and a list of the label
    values returned by `label.numpy()`.
  """
  sentences = []
  labels = []
  for sentence, label in dataset:
    # errors='replace' keeps the best-effort behaviour of the old str() call:
    # a malformed byte never aborts the whole extraction.
    sentences.append(sentence.numpy().decode('utf-8', errors='replace'))
    labels.append(label.numpy())
  return sentences, labels


train_sentences, train_labels = _split_examples(train_data)
test_sentences, test_labels = _split_examples(test_data)



In [0]:
#sentence encoding
vocab_size = 10000 
embedding_dim = 16
max_length = 120 # review length
trunc_type='post'
oov_token="<oov"

# Steps : Tokenize -> create word index -> convert to sequences -> do padding for same size -> testing sequences should be tokenized based on training word index

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size,oov_token=oov_token)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(train_sentences)
padded = pad_sequences(sequences,maxlen=max_length,truncating=trunc_type)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences,maxlen=max_length)

In [35]:
# Define the neural network.
# `keras` is accessed through the already-imported `tf` module; the bare name
# `keras` is never imported in this notebook and raises NameError on a fresh kernel.
model = tf.keras.Sequential([
              # Map each token id to a trainable embedding_dim-sized vector.
              tf.keras.layers.Embedding(vocab_size,embedding_dim,input_length=max_length),
              # Collapse the (max_length, embedding_dim) grid into one flat feature vector.
              tf.keras.layers.Flatten(),
              tf.keras.layers.Dense(6,activation='relu'),
              tf.keras.layers.Dense(1,activation='sigmoid') # two classes , so one neuron
])

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 11526     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Train on the padded training reviews and report validation metrics on the
# padded test reviews after every epoch. The label lists are converted to
# NumPy arrays at the call site.
model.fit(
    x=padded,
    y=np.array(train_labels),
    epochs=10,
    validation_data=(test_padded, np.array(test_labels)),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f1cfae7bc50>

**To view the learned embeddings in the TensorFlow Embedding Projector**

In [50]:
# The first layer of the model is the Embedding layer; the first entry of its
# weight list is the embedding matrix.
embedding_layer = model.layers[0]
weights = embedding_layer.get_weights()[0]
# Show the matrix shape first, then the matrix itself.
print(weights.shape, weights, sep="\n")

(10000, 16)
[[ 2.3717916e-02 -4.1335560e-02  4.8740409e-02 ... -5.4761455e-03
   2.5576169e-02 -2.4499504e-02]
 [ 5.5061463e-02 -7.9436496e-02  6.8881005e-02 ... -5.3180512e-03
  -1.2371118e-02 -7.7541757e-05]
 [ 3.9728680e-03 -5.2733060e-02  9.4237372e-02 ...  2.5939497e-03
   6.0456324e-02 -3.9922852e-02]
 ...
 [-1.6018631e-01 -9.2584811e-02 -3.5184532e-02 ...  1.1890968e-01
   9.6716762e-02  2.3366053e-02]
 [-2.5773803e-02 -2.8773749e-02  6.6927269e-02 ... -4.8390757e-02
   2.8916422e-02  2.7901145e-02]
 [ 3.5977900e-02 -1.1615841e-01 -8.4580844e-03 ... -7.0351437e-02
  -5.8211260e-02  6.4553164e-02]]


In [0]:
# Generate the vector and metadata files to upload to the Embedding Projector.
# imdb_vectors.tsv holds one tab-separated embedding per line and imdb_meta.tsv
# holds the matching word on the same line number.

import io

# Invert the tokenizer mapping: token id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

# Never look up an id the tokenizer did not assign: if the fitted vocabulary is
# smaller than vocab_size, stop at its last id instead of raising KeyError.
last_word_num = min(vocab_size, len(reverse_word_index) + 1)

# Context managers guarantee both files are flushed and closed even if the
# loop raises part-way through.
with io.open("imdb_vectors.tsv",'w',encoding='utf-8') as vectors, \
     io.open("imdb_meta.tsv",'w',encoding='utf-8') as meta:
  # Ids start at 1 here; row 0 of the embedding matrix is skipped.
  for word_num in range(1,last_word_num):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    meta.write(word + "\n")
    vectors.write('\t'.join([str(w) for w in embeddings]) + "\n")

In [0]:
# Offer the two generated files for download when running inside Google Colab.
# Outside Colab the import fails and the cell deliberately does nothing.

try:
  from google.colab import files
except ImportError:
  pass
else:
  for filename in ("imdb_vectors.tsv", "imdb_meta.tsv"):
    files.download(filename)

Go to https://projector.tensorflow.org/

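1. Click **Load** and upload `imdb_vectors.tsv` as the vectors file and `imdb_meta.tsv` as the metadata file.
2. Explore the visualization of the embeddings.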