In [1]:
# Move up one directory (to the repository root shown in the output below);
# presumably so that the `src.*` imports in the next cell resolve.
%cd ..
# Load the nb_black extension (auto-formats executed cells with black).
%load_ext nb_black

/home/hardianlawi/attention-lstm-sentiment-analysis


<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow_hub as hub
import tensorflow_datasets as tfds
from src.preprocess import Preprocessor
from src.data import get_data
from src.models import get_model

<IPython.core.display.Javascript object>

In [3]:
# Load the train / validation / test splits; each split is a (texts, labels) pair.
(str_X_train, y_train), (str_X_val, y_val), (str_X_test, y_test) = get_data()

# Sanity-check the number of examples in each split.
print("Train size:", len(str_X_train))
print("Val size:", len(str_X_val))
print("Test size:", len(str_X_test))

Train size: 23750
Val size: 1250
Test size: 25000


<IPython.core.display.Javascript object>

In [4]:
# Whitespace-token count of every review across all three splits.
sentence_lens = [
    len(review.split()) for review in str_X_train + str_X_val + str_X_test
]
print("Maximum review length: {}".format(max(sentence_lens)))
print("Minimum review length: {}".format(min(sentence_lens)))

Maximum review length: 2470
Minimum review length: 4


<IPython.core.display.Javascript object>

In [5]:
# Sequence length and vocabulary size handed to the Preprocessor; both are
# reused later as the model's input length and embedding-table row count.
max_words = 500
vocab_size = 5000

preprocessor = Preprocessor(maxlen=max_words, vocab_size=vocab_size, oov_token="<OOV>")
# NOTE(review): the vocabulary is fitted on train + val + test text, so the
# test split influences preprocessing — confirm this is intended.
preprocessor.fit_on_texts(str_X_train + str_X_val + str_X_test)

<IPython.core.display.Javascript object>

In [6]:
def generate_unique_tokens(X):
    """Return the distinct vocabulary words that occur in the texts ``X``.

    The texts are converted to id sequences with the fitted tokenizer held by
    the module-level ``preprocessor``; every id is mapped back to its word and
    duplicates are dropped.

    Args:
        X: Sequence of raw text strings.

    Returns:
        A list of unique word strings (in arbitrary set order).
    """
    tokenizer = preprocessor._tokenizer
    sequences = tokenizer.texts_to_sequences(X)
    id_to_word = tokenizer.index_word
    distinct_words = {
        id_to_word[token_id] for sequence in sequences for token_id in sequence
    }
    return list(distinct_words)


# Distinct words seen anywhere in the corpus; these are the words that get an
# ELMo vector further down.
unique_tokens = generate_unique_tokens(str_X_train + str_X_val + str_X_test)

# Convert each split's raw texts into model inputs with the fitted preprocessor.
X_train = preprocessor.transform(str_X_train)
X_val = preprocessor.transform(str_X_val)
X_test = preprocessor.transform(str_X_test)

<IPython.core.display.Javascript object>

In [7]:
def embed_elmo(module):
    """Build a callable that embeds strings with a TF-Hub ELMo module.

    A dedicated graph is created containing a string placeholder, the
    (non-trainable) hub module and its ``"elmo"`` output, and a
    ``MonitoredSession`` is opened on that graph. The session is kept alive by
    the returned closure and is never explicitly closed.

    Args:
        module: Handle (URL or path) passed to ``hub.Module``.

    Returns:
        A one-argument function ``f(x)`` that feeds ``x`` to the placeholder
        and returns the evaluated ``"elmo"`` output.
    """
    graph = tf.Graph()
    with graph.as_default():
        sentences = tf.placeholder(tf.string)
        elmo = hub.Module(module, trainable=False)
        outputs = elmo(sentences, signature="default", as_dict=True)
        embeddings = outputs["elmo"]
        session = tf.train.MonitoredSession()

    def run(x):
        # Evaluate the ELMo output for the given batch of strings.
        return session.run(embeddings, {sentences: x})

    return run


# Instantiate the ELMo embedder once (loads the hub module and opens a session).
embed_fn = embed_elmo("https://tfhub.dev/google/elmo/3")


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


<IPython.core.display.Javascript object>

In [8]:
def _generate_elmo_embeddings(
    tokens=None, embed=None, word_index=None, num_rows=None, chunk_size=1000
):
    """Build an embedding matrix holding one ELMo vector per vocabulary word.

    The words are embedded in consecutive chunks of ``chunk_size`` and each
    vector is written to the row given by the word's id. Rows for ids that
    never appear in ``tokens`` stay all-zero.

    The previous implementation advanced the loop by 10000 while slicing only
    1000 words per step, so every word after the first 1000 of each stride was
    silently skipped and left with a zero vector. The stride and the slice
    width now share ``chunk_size``.

    Args:
        tokens: Words to embed. Defaults to the notebook-level ``unique_tokens``.
        embed: Callable mapping a list of words to an array that
            ``squeeze(axis=1)`` turns into ``(len(words), 1024)`` (i.e. one
            token per input string). Defaults to the notebook-level ``embed_fn``.
        word_index: Mapping from word to row index. Defaults to
            ``preprocessor.word2id``.
        num_rows: Number of rows in the matrix. Defaults to ``vocab_size``.
        chunk_size: How many words are sent to ``embed`` per call.

    Returns:
        ``numpy.ndarray`` of shape ``(num_rows, 1024)``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # Resolve notebook globals lazily so plain `_generate_elmo_embeddings()`
    # behaves as before, while callers/tests may inject their own inputs.
    if tokens is None:
        tokens = unique_tokens
    if embed is None:
        embed = embed_fn
    if word_index is None:
        word_index = preprocessor.word2id
    if num_rows is None:
        num_rows = vocab_size
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    emb_matrix = np.zeros((num_rows, 1024))
    # Stride and slice width must match, otherwise words are skipped.
    for start in range(0, len(tokens), chunk_size):
        chunk = tokens[start : start + chunk_size]
        embeddings = embed(chunk)
        # Each word is a single token, so drop the length-1 token axis.
        for word, emb in zip(chunk, embeddings.squeeze(axis=1)):
            emb_matrix[word_index[word]] = emb
    return emb_matrix


# Build the ELMo embedding matrix once; the trailing expression displays its
# shape as the cell output.
emb_matrix = _generate_elmo_embeddings()
emb_matrix.shape

(5000, 1024)

<IPython.core.display.Javascript object>

In [9]:
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras import Model

# Training hyper-parameters passed to model.fit below.
batch_size = 64
epochs = 5


def build_model():
    """Create the LSTM classifier whose embedding table starts from ELMo vectors.

    Architecture: integer-id input of length ``max_words`` -> 1024-d embedding
    (initialised from ``emb_matrix``, id 0 masked, still trainable) -> LSTM with
    32 units -> single sigmoid output.

    Returns:
        An uncompiled ``tensorflow.keras.Model``.
    """
    token_ids = Input(shape=(max_words,))
    embedding_layer = Embedding(
        vocab_size,
        1024,
        mask_zero=True,
        weights=[emb_matrix],
        trainable=True,
    )
    embedded = embedding_layer(token_ids)
    encoded = LSTM(units=32)(embedded)
    probability = Dense(1, activation="sigmoid")(encoded)
    return Model(inputs=token_ids, outputs=probability)


# Build the ELMo-initialised model, print its layer summary, then compile it
# for binary classification.
model = build_model()
model.summary(line_length=200)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train on the training split, monitoring the validation split each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
)

# evaluate returns the loss followed by the compiled metrics, so index 1 is
# the accuracy on the held-out test split.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Test accuracy:", scores[1])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Model: "model"
________________________________________________________________________________________________________________________________________________________________________________________________________
Layer (type)                                                                              Output Shape                                                                    Param #                       
input_1 (InputLayer)                                                                      [(None, 500)]                                                                   0                             
________________________________________________________________________________________________________________________________________________________________________________________________________
embedding (Embedding)                                                                     (None, 500, 1024)                                                               5120000    

<IPython.core.display.Javascript object>

Compare against a baseline that uses no pre-trained embeddings at all

In [10]:
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras import Model

# Same training hyper-parameters as the ELMo run, so the two runs are comparable.
batch_size = 64
epochs = 5


def build_model():
    """Create the baseline LSTM classifier with a randomly initialised embedding.

    Same architecture as the ELMo-initialised model defined earlier in this
    notebook (whose name this definition shadows), except that no pre-trained
    weights are passed to the embedding layer.

    Returns:
        An uncompiled ``tensorflow.keras.Model``.
    """
    token_ids = Input(shape=(max_words,))
    embedding_layer = Embedding(vocab_size, 1024, mask_zero=True, trainable=True)
    embedded = embedding_layer(token_ids)
    encoded = LSTM(units=32)(embedded)
    probability = Dense(1, activation="sigmoid")(encoded)
    return Model(inputs=token_ids, outputs=probability)


# Build the baseline model (this rebinds `model`, replacing the ELMo run's
# model), print its layer summary, then compile it for binary classification.
model = build_model()
model.summary(line_length=200)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train on the training split, monitoring the validation split each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
)

# evaluate returns the loss followed by the compiled metrics, so index 1 is
# the accuracy on the held-out test split.
scores = model.evaluate(X_test, y_test, verbose=0)
print("Test accuracy:", scores[1])

Model: "model_1"
________________________________________________________________________________________________________________________________________________________________________________________________________
Layer (type)                                                                              Output Shape                                                                    Param #                       
input_2 (InputLayer)                                                                      [(None, 500)]                                                                   0                             
________________________________________________________________________________________________________________________________________________________________________________________________________
embedding_1 (Embedding)                                                                   (None, 500, 1024)                                                               5120000  

<IPython.core.display.Javascript object>