In [4]:
import tensorflow as tf
import tensorflow_datasets as tfds

# Turn off tensorflow_datasets' progress bars so the download/prepare step
# below does not flood the cell output.
tfds.disable_progress_bar()

## Learning embeddings from scratch

In [5]:
# Load the IMDB reviews in the pre-encoded 8k-subword configuration.
# The two requested splits come back as a pair of datasets; `info` is the
# dataset metadata returned because with_info=True (it is used below to fetch
# the text encoder).
(train_data, test_data), info = tfds.load(
    'imdb_reviews/subwords8k',
    split=[tfds.Split.TRAIN, tfds.Split.TEST],
    as_supervised=True,
    with_info=True,
)

[1mDownloading and preparing dataset imdb_reviews (80.23 MiB) to /home/kaimo/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0...[0m
Shuffling and writing examples to /home/kaimo/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0.incomplete3B2EE8/imdb_reviews-train.tfrecord
Shuffling and writing examples to /home/kaimo/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0.incomplete3B2EE8/imdb_reviews-test.tfrecord
Shuffling and writing examples to /home/kaimo/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0.incomplete3B2EE8/imdb_reviews-unsupervised.tfrecord
[1mDataset imdb_reviews downloaded and prepared to /home/kaimo/tensorflow_datasets/imdb_reviews/subwords8k/1.0.0. Subsequent calls will reuse this data.[0m


Get the encoder (`tfds.features.text.SubwordTextEncoder`), and have a quick look at the vocabulary.

In [7]:
# The dataset's 'text' feature carries the encoder for this configuration.
encoder = info.features['text'].encoder
# Display the first 20 entries of the subword vocabulary.
encoder.subwords[:20]

['the_',
 ', ',
 '. ',
 'a_',
 'and_',
 'of_',
 'to_',
 's_',
 'is_',
 'br',
 'in_',
 'I_',
 'that_',
 'this_',
 'it_',
 ' /><',
 ' />',
 'was_',
 'The_',
 'as_']

In [33]:
# Preview the batched dataset signature: each batch of 10 pads the
# variable-length text dimension ([None]) and leaves the label unpadded ([]).
train_data.padded_batch(batch_size=10, padded_shapes=([None], []))

<DatasetV1Adapter shapes: ((None, None), (None,)), types: (tf.int64, tf.int64)>

In [36]:
# Training input: shuffle with a 1000-element buffer, then form padded
# batches of 10 examples.
train_batches = (
    train_data
    .shuffle(buffer_size=1000)
    .padded_batch(batch_size=10, padded_shapes=([None], []))
)
# Evaluation input: same batching, no shuffling.
test_batches = test_data.padded_batch(batch_size=10, padded_shapes=([None], []))

### Create a simple model

In [37]:
# Width of each learned embedding vector.
embedding_size = 16

# Build the classifier layer by layer:
#   subword ids -> embedding vectors -> average over the sequence axis
#   -> 16-unit ReLU layer -> single output unit with no activation.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=encoder.vocab_size,
        output_dim=embedding_size,
    )
)
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1))

In [39]:
# The final Dense(1) layer has no activation, so the model emits raw logits.
# The loss is told so via from_logits=True; the accuracy metric must agree.
# The plain 'accuracy' string resolves to a binary accuracy that thresholds
# the model output at 0.5, which is only right for probabilities. For logits
# the decision boundary is 0.0 (sigmoid(0) == 0.5), so set it explicitly.
# name='accuracy' keeps the reported keys ('accuracy' / 'val_accuracy') as-is.
model.compile(
    optimizer='Adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

In [40]:
# Train for 10 epochs, using the test batches as validation data.
# fit() returns a History object, which is this cell's displayed output.
model.fit(train_batches, epochs=10, validation_data=test_batches)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fc2091a9450>

## Retrieve the learned embeddings

In [42]:
# The embedding layer is the first layer of the model; the first entry of its
# weight list is the embedding matrix, whose shape is printed below.
embed = model.get_layer(index=0)
weights = embed.get_weights()[0]
print(weights.shape)

(8185, 16)


Save the learned embedding vectors and the vocabulary to temporary TSV files.

In [46]:
import io
import tempfile

In [69]:
# mkstemp() creates the file AND returns an already-open OS-level descriptor
# for it. Discarding that descriptor and re-opening the path leaks one fd per
# call, so wrap the descriptor itself: closing out_vec / out_m then releases it.
vec_fd, vec_file = tempfile.mkstemp(suffix='.tsv')
meta_fd, meta_file = tempfile.mkstemp(suffix='.tsv')

# Text-mode UTF-8 writers for the vectors file and the vocabulary (metadata) file.
out_vec = io.open(vec_fd, 'w', encoding='utf-8')
out_m = io.open(meta_fd, 'w', encoding='utf-8')

In [70]:
# Write one line per subword to each file: the subword itself to the metadata
# file and its tab-separated embedding vector to the vectors file.
# Using the open files as context managers guarantees both are flushed and
# closed even if the loop raises part-way through (previously they were only
# closed on the success path).
with out_vec, out_m:
    for idx, word in enumerate(encoder.subwords):
        vec = weights[idx + 1]  # skip 0, it's padding
        out_m.write(word + '\n')
        out_vec.write('\t'.join([str(x) for x in vec]) + '\n')

In [73]:
!head {vec_file}

-0.033357132	-0.18019158	0.019915733	0.05698549	0.095509514	0.028049178	0.07198314	0.044673074	0.08024245	-0.09789379	-0.038111404	0.06030338	0.08713607	0.045528017	0.0650031	0.10778238
-0.011729498	-0.08036991	0.0344378	0.037724752	0.043161742	-0.010260238	0.058903255	0.0031632227	0.0750861	0.0062241	-0.023241108	0.08388362	-0.0099297315	-0.025998624	-0.036804646	0.022164281
0.021704625	-0.026616946	0.08833752	-0.06613831	-0.06283467	-0.026924789	-0.024841247	-0.029082617	-0.030629328	0.0181491	0.049597535	0.006083052	-0.011908604	-0.04640039	-0.102714196	-0.05951492
-0.035312433	-0.09898356	0.07880238	-0.0023776633	0.0066279373	-0.02269177	0.024111446	0.009804823	0.07577569	0.0050103036	-0.013129178	0.012743426	0.028365336	-0.015440008	0.025633333	0.09226063
-0.10275722	-0.13000952	0.016330292	0.1523128	0.065438606	0.0284161	0.15065762	0.109051965	0.093990445	-0.15007022	-0.018336393	0.08064745	0.062052786	0.03342415	0.034527965	0.09858491
0.106824055	0.020318193	0.14062102	-0.0

In [74]:
!head {meta_file}

the_
, 
. 
a_
and_
of_
to_
s_
is_
br
