In [3]:
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)



In [4]:
# Let's install tensorflow 2.x first :)
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [5]:
try: 
    import tensorflow_datasets as tfds
except Exception:
    !sudo pip install -q tensorflow-datasets

In [29]:
import io

import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
print(tf.__version__)

2.0.0


In [8]:
DATASET_NAME = "imdb_reviews"

In [9]:
imdb, info = tfds.load(DATASET_NAME, with_info=True, as_supervised=True)

In [10]:
# split data between training and testing
train_data, test_data = imdb['train'], imdb['test']

In [11]:
# print info
print(info)

tfds.core.DatasetInfo(
    name='imdb_reviews',
    version=0.1.0,
    description='Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.',
    urls=['http://ai.stanford.edu/~amaas/data/sentiment/'],
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=100000,
    splits={
        'test': 25000,
        'train': 25000,
        'unsupervised': 50000,
    },
    supervised_keys=('text', 'label'),
    citation="""@InProceedings{maas-EtAl:2011:ACL-HLT2011,
      author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
      title     = {Learning Word V

In [12]:
# Declare the data structures for storing sentences and its labels
training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

In [13]:
# str(s.tonumpy()) is needed in Python3 instead of just s.numpy()
# data needs to be transformed as it's coming as string for sentences
def transform_dataset(dataset, is_training=True):
    for sentence, label in dataset:
        if is_training:
            training_sentences.append(str(sentence.numpy()))
            training_labels.append(label.numpy())
        else:
            testing_sentences.append(str(sentence.numpy()))
            testing_labels.append(label.numpy())

In [14]:
transform_dataset(train_data)
transform_dataset(test_data, is_training=False)

In [15]:
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [16]:
# Define some variables
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
oov_tok = "<OOV>"

In [17]:
# Create the tokenizer
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
# Encode the sentences
tokenizer.fit_on_texts(training_sentences)
# Define a word index variable
word_index = tokenizer.word_index
# Get the sequences based on the encoded words
sequences = tokenizer.texts_to_sequences(training_sentences)
# Add padding 
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

In [18]:
# Test the encoding with the testing sentences
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)

In [19]:
# Sneak peek into the word_index
# print(word_index.keys())

In [20]:
# Reverse the index as the key is the word and the value is the number
# We want it as: number: word
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [21]:
def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

In [22]:
# Checkout how we decode the review
print(decode_review(padded[2]))

b i saw this film on true movies which automatically made me <OOV> but actually it was good why not because of the amazing plot twists or breathtaking dialogue of which there is little but because actually despite what people say i thought the film was accurate in it's depiction of teenagers dealing with <OOV> br br it's not <OOV> creek they're not <OOV> cool witty characters who <OOV> through sexuality with <OOV> knowledge they're kids and they act like kids would br br they're blunt awkward and annoyingly confused about everything yes this could be by accident and they could just be bad actors but i don't think so <OOV> <OOV> gives when not trying to be cool a


In [23]:
# Checkout the original sentence
print(training_sentences[2])

b"I saw this film on True Movies (which automatically made me sceptical) but actually - it was good. Why? Not because of the amazing plot twists or breathtaking dialogue (of which there is little) but because actually, despite what people say I thought the film was accurate in it's depiction of teenagers dealing with pregnancy.<br /><br />It's NOT Dawson's Creek, they're not graceful, cool witty characters who breeze through sexuality with effortless knowledge. They're kids and they act like kids would. <br /><br />They're blunt, awkward and annoyingly confused about everything. Yes, this could be by accident and they could just be bad actors but I don't think so. Dermot Mulroney gives (when not trying to be cool) a very believable performance and I loved him for it. Patricia Arquette IS whiny and annoying, but she was pregnant and a teenagers? The combination of the two isn't exactly lavender on your pillow. The plot was VERY predictable and but so what? I believed them, his stress an

In [24]:
# Let's create our neural net using embeddings :) 
'''
Using:
 - Vocabulary size: 1000 
 - 16 dimensions
'''
 
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [25]:
# We will use binary cross entropy because it's either positive or negative
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Configure epochs for training, how many iterations we are going to do 
num_epochs = 10

In [28]:
# Train the model using the padded model and the labels for training and testing
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f25ec4b1080>

In [32]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 16)


In [35]:
out_v = io.open('./datasets/vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('./datasets/meta.tsv', 'w', encoding='utf-8')

In [36]:
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [37]:
# Download the files and try the tool for watching the dimensions
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

In [38]:
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences(sentence)
print(sequence)

[[11], [], [1430], [968], [4], [1537], [1537], [4738], [], [790], [2015], [11], [2922], [2191], [], [790], [2015], [11], [579], [], [11], [579], [], [4], [1783], [4], [4508], [11], [2922], [1277], [], [], [2015], [1005], [2922], [968], [579], [790], []]
