# Week 2: word embeddings
How to get a sentiment from a sequence of numbers? It can be learned from a corpus of words (embedding).
The idea is that words and their associated words are represented as vectors that cluster together in a multidimensional space.

## The IMDB dataset
TensorFlow Datasets (TFDS) is a library that contains many datasets across many different categories.
We will use the `imdb_reviews` dataset, which contains 50k movie reviews categorized as positive or negative.

## Looking into the details

In [1]:
import tensorflow as tf
# Print the TF version. The conversion loops further down call .numpy() on
# dataset elements, which needs eager execution (enabled by default in TF 2.0).
print(tf.__version__)
# Install the TFDS library if it is not available yet:
# pip install -q tensorflow-datasets

import tensorflow_datasets as tfds
# Fetch the IMDB reviews dataset together with its metadata.
# as_supervised=True makes each element a (sentence, label) pair;
# with_info=True additionally returns the dataset info object.
imdb, info = tfds.load(
    name="imdb_reviews",
    as_supervised=True,
    with_info=True,
)

# Data is split in 25K samples for training and 25K samples for testing
import numpy as np
train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

# Convert the dataset elements into plain Python lists of sentences and labels.
# Each sentence tensor holds raw bytes, so it must be decoded to text:
# str() on a bytes object would return its repr ("b'...'" with escape
# sequences), which would leak the b-prefix, quotes and backslash escapes
# into the text that is tokenized later.
for sentence, label in train_data:
    training_sentences.append(sentence.numpy().decode('utf8'))
    training_labels.append(label.numpy())

for sentence, label in test_data:
    testing_sentences.append(sentence.numpy().decode('utf8'))
    testing_labels.append(label.numpy())

# Training expects the labels as numpy arrays rather than Python lists.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

# Tokenize the sentences
## first, define some hyperparams:
# Tokenizer settings
oov_tok = "<OOV>"     # passed to the Tokenizer as oov_token
vocab_size = 10_000   # Tokenizer num_words and Embedding input size

# Padding settings
max_length = 120      # maxlen for pad_sequences and Embedding input_length
trunc_type = "post"   # truncating mode for pad_sequences

# Model settings
embedding_dim = 16    # output size of the Embedding layer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the tokenizer on the training sentences only; the resulting word index
# is reused unchanged for the test sentences.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Replace the words in each sentence with their token values, then
# pad/truncate the sequences so that they all have the same length.
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

# Do the same for the testing sentences. More OOV tokens are expected here
# because the word index comes from the training step.
# The truncating mode must be passed explicitly: pad_sequences defaults to
# 'pre', which would cut long test reviews from the opposite end compared to
# the training reviews.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

# Sentiment classifier, assembled layer by layer.
model = tf.keras.Sequential()

# The embedding layer is the key to text sentiment analysis in TF.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))

# The embedding output is a 2D array (sentence length x embedding dimension),
# so it has to be flattened before the dense layers. Averaging across the
# vector does that; tf.keras.layers.Flatten() is the alternative.
model.add(tf.keras.layers.GlobalAveragePooling1D())

# Dense network that performs the classification.
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

2.4.0
