<a href="https://colab.research.google.com/github/harenlin/Sentiment-Analysis-With-Tensorflow/blob/main/Sentiment_Analysis_with_RNN_%26_CNN_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis with RNN

import package

In [None]:
import os
import time
import pprint
import logging
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import tensorflow as tf
from collections import Counter
from pathlib import Path
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


load in dataset

In [None]:
# Location of the training corpus; each line is "<label>\t<space separated text>".
train_path = '/content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/data/train.txt'

# Tally how often every distinct token occurs in the corpus.
counter = Counter()
with open(train_path, encoding='utf-8') as f:
    for line in f:
        label, text = line.rstrip().split('\t')
        counter.update(text.split(' '))

# Vocabulary = padding token (index 0) followed by every word seen at least 10 times,
# in the order returned by Counter.most_common() (most frequent first).
words = ['<pad>']
words.extend(w for w, freq in counter.most_common() if freq >= 10)
print('Vocab Size:', len(words)) # Vocab Size: 20598

# Persist the vocabulary, one word per line, so that the line number is the word index.
Path('/content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/vocab').mkdir(exist_ok=True)
with open('/content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/vocab/word.txt', 'w',encoding='utf-8') as f:
    f.writelines(w + '\n' for w in words)

# Read the file back to build word -> index, then invert it to get index -> word.
with open('/content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/vocab/word.txt', encoding='utf-8') as f:
    word2idx = {entry.rstrip(): position for position, entry in enumerate(f)}

idx2word = {position: word for word, position in word2idx.items()}

Vocab Size: 20598


Embedding Layer
-> 可以抓取別人已經Pre-trained好的模型來用(e.g. https://nlp.stanford.edu/projects/glove/)
，也可以用自己手上的資料集來訓練

In [None]:
# Use the pre-trained 50-dimensional GloVe vectors as the embedding representation.
# Row i of `embedding` holds the vector of vocabulary word i; vocabulary words that do not
# appear in the GloVe file keep an all-zero row. One extra row (index len(word2idx)) is
# allocated and also stays zero; data_generator maps out-of-vocabulary tokens to that index.
embedding = np.zeros((len(word2idx)+1, 50))

count = 0  # number of vocabulary words for which a pre-trained vector was found
with open('/content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/data/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        # each GloVe line is "<word> <v1> <v2> ... <v50>"
        word, *representation = line.rstrip().split(' ')
        if word not in word2idx:
            continue
        embedding[word2idx[word]] = np.asarray(representation, dtype='float32')
        count += 1

print("[%d / %d] words have found pre-trained values" % (count, len(word2idx)))
np.save('/content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/vocab/word.npy', embedding)
print('Saved /content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/vocab/word.npy')

[19676 / 20598] words have found pre-trained values
Saved /content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/vocab/word.npy


Establish Training Data
-> note that every input sequence must have the same length, so shorter texts are padded and longer ones are truncated

In [None]:
# tf.data.Dataset.from_tensor_slices(tensor)
# tf.data.Dataset.from_generator(data_generator,output_data_type,output_data_shape)

In [None]:
def data_generator(file_path, parameters):
    """Yield one fixed-length example ``(x_data, y_label)`` per line of ``file_path``.

    Each non-empty line must look like ``"<label>\\t<space separated text>"``.

    Args:
        file_path: path of the UTF-8 text file to read.
        parameters: dict providing
            ``'word2idx'``    - mapping word -> integer index (``'<pad>'`` is index 0),
            ``'max_seq_len'`` - length every yielded sequence is cut or padded to.

    Yields:
        ``(x_data, y_label)`` where ``x_data`` is a list of exactly ``max_seq_len``
        word indices and ``y_label`` is the integer label.

    Raises:
        ValueError: if a non-empty line does not contain exactly one tab, or its
            label is not an integer.
    """
    # Take the vocabulary from `parameters` only, so the generator does not depend on
    # a notebook-global `word2idx` happening to be defined (and being the same mapping).
    word2idx = parameters['word2idx']
    max_seq_len = parameters['max_seq_len']
    unk_idx = len(word2idx)  # out-of-vocabulary words share the index right after the vocab
    with open(file_path, encoding='utf-8') as f:
        print('Reading Data from ', file_path)
        for line in f:
            line = line.rstrip()
            if not line:
                # blank lines (e.g. a trailing newline at end of file) carry no example
                continue
            label, text = line.split('\t')
            x_data = [word2idx.get(word, unk_idx) for word in text.split(' ')]
            x_data = x_data[:max_seq_len]                 # truncate long texts
            x_data += [0] * (max_seq_len - len(x_data))   # pad with 0 == word2idx['<pad>']
            yield x_data, int(label)  # yield, not return: this is a generator

In [None]:
# next(data_generator(parameters['train_path'], parameters))

In [None]:
# remember to add "lambda" to avoid TypeError: `generator` must be callable.
def dataset(isTraining, parameters):
    """Build the batched ``tf.data.Dataset`` that streams the training or the test file.

    Args:
        isTraining: truthy -> read ``parameters['train_path']`` and shuffle with a
            buffer of ``parameters['num_samples']``; falsy -> read
            ``parameters['test_path']`` in file order.
        parameters: configuration dict; also needs ``'max_seq_len'`` and ``'batch_size'``.

    Returns:
        A dataset of ``(token_ids, label)`` batches, both ``tf.int32``, with prefetching.
    """
    path_key = 'train_path' if isTraining else 'test_path'
    # from_generator needs a zero-argument callable, hence the lambda wrapper.
    pipeline = tf.data.Dataset.from_generator(
        lambda: data_generator(parameters[path_key], parameters),
        output_types=(tf.int32, tf.int32),
        output_shapes=(parameters['max_seq_len'], ()),
    )
    if isTraining:
        pipeline = pipeline.shuffle(parameters['num_samples'])
    return pipeline.batch(parameters['batch_size']).prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
parameters = {
    # file locations
    'vocab_path': '/content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/vocab/word.txt',
    'train_path': '/content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/data/train.txt',
    'test_path': '/content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/data/test.txt',
    # data pipeline
    'num_samples': 25000,   # used as the shuffle buffer size in dataset()
    'num_labels': 2,
    'batch_size': 128,
    'max_seq_len': 1000,    # every text is truncated / padded to this many tokens
    # network
    'rnn_units': 200,
    'dropout_rate': 0.2,
    # optimisation
    'clip_norm': 10.,
    'num_patience': 5,      # NOTE(review): not read by any code visible in this notebook
    'learning_rate': 1e-3,
}

# Rebuild word -> index from the saved vocabulary file (line number == index).
with open(parameters['vocab_path'], encoding='utf-8') as f:
    word2idx = {entry.rstrip(): row for row, entry in enumerate(f)}

# vocab_size counts one extra slot for the shared out-of-vocabulary index.
parameters.update(word2idx=word2idx, vocab_size=len(word2idx) + 1)

Define Network

In [None]:
class Model(tf.keras.Model):
    """Sentiment classifier: frozen pre-trained embeddings -> 3 stacked BiLSTMs -> MLP.

    Args:
        parameters: configuration dict. Required keys: ``'dropout_rate'`` and
            ``'rnn_units'``. Optional keys:
            ``'num_labels'``     - size of the output layer (defaults to 2),
            ``'embedding_path'`` - ``.npy`` file with the embedding matrix (defaults to
            the ``vocab/word.npy`` file written earlier in this notebook).
    """

    def __init__(self, parameters):
        super().__init__()
        # Allow the embedding file to be overridden through the config instead of
        # being fixed inside the class; the default is the original location.
        embedding_path = parameters.get(
            'embedding_path',
            '/content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/vocab/word.npy')
        # Pre-trained vectors are kept fixed (trainable=False).
        self.embedding = tf.Variable(np.load(embedding_path),
                                     dtype=tf.float32, name='pretrained_embedding', trainable=False,)
        self.dropout = tf.keras.layers.Dropout(parameters['dropout_rate'])
        # The first two recurrent layers emit the full sequence so the next one can consume it;
        # the last one emits a single vector per example.
        self.rnn1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(parameters['rnn_units'], return_sequences=True))
        self.rnn2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(parameters['rnn_units'], return_sequences=True))
        self.rnn3 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(parameters['rnn_units'], return_sequences=False))
        self.fc = tf.keras.layers.Dense(2 * parameters['rnn_units'], tf.nn.elu)
        # Output size follows the configured number of labels rather than a literal 2.
        self.out_linear = tf.keras.layers.Dense(parameters.get('num_labels', 2))

    def call(self, inputs, training=False):
        """Return unnormalised class scores (logits) for a batch of token-id sequences.

        Args:
            inputs: integer tensor of word indices; cast to ``tf.int32`` if needed.
            training: whether dropout is active.
        """
        if inputs.dtype != tf.int32: inputs = tf.cast(inputs, tf.int32)
        x = tf.nn.embedding_lookup(self.embedding, inputs)
        x = self.dropout(x, training=training)
        x = self.rnn1(x)
        x = self.dropout(x, training=training)
        x = self.rnn2(x)
        x = self.dropout(x, training=training)
        x = self.rnn3(x)
        x = self.dropout(x, training=training)
        x = self.fc(x)
        x = self.out_linear(x)
        return x

Model Training

In [None]:
# Build the model, the learning-rate schedule (x0.95 every 1000 steps) and the optimizer.
model = Model(parameters)
decay_lr = tf.optimizers.schedules.ExponentialDecay(parameters['learning_rate'], 1000, 0.95)
optimizer = tf.optimizers.Adam(parameters['learning_rate'])
logger = logging.getLogger('tensorflow')
logger.setLevel(logging.INFO)

EPOCHS = 50
epoch, global_step, train = 0, 0, True
# Best batch accuracy logged so far. It has to exist before the first comparison in the
# loop; without this initialisation the cell raises NameError at step 0 on a fresh kernel.
best_acc = 0.0
while epoch < EPOCHS: # train loop start
    epoch += 1
    print("Now in epoch: ", epoch)
    for texts, labels in dataset(isTraining=True, parameters=parameters):
        # texts: (parameters['batch_size'], parameters['max_seq_len'])
        with tf.GradientTape() as tape:
            logits = model(texts, training=True) # corresponds to call(self, inputs, training)
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
            loss = tf.reduce_mean(loss)
        # `learning_rate` is the canonical attribute name (`lr` is only a legacy alias)
        optimizer.learning_rate.assign(decay_lr(global_step)) # assign new learning rate
        grads = tape.gradient(loss, model.trainable_variables) # compute gradient
        # clip the global gradient norm to guard against exploding gradients
        grads, _ = tf.clip_by_global_norm(grads, parameters['clip_norm'])
        optimizer.apply_gradients(zip(grads, model.trainable_variables)) # update weights
        # print out training information every 10 steps (accuracy of the current batch only)
        if global_step % 10 == 0:
            y_pred = tf.argmax(logits, axis=-1)
            batch_metric = tf.keras.metrics.Accuracy()
            batch_metric.update_state(y_true=labels, y_pred=y_pred)
            accuracy = batch_metric.result().numpy()
            if accuracy > best_acc: best_acc = accuracy
            logger.info("Step {} | Acc: {:.4f} | Loss: {:.4f} | LR: {:.6f}".format(global_step, accuracy, loss.numpy().item(), optimizer.learning_rate.numpy().item()))
        global_step += 1

EPOCH 1
Reading Data from  /content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/data/train.txt
INFO:tensorflow:Step 0 | Acc: 0.3906 | Loss: 0.7016 | LR: 0.001000
INFO:tensorflow:Step 10 | Acc: 0.5312 | Loss: 0.6854 | LR: 0.000999
INFO:tensorflow:Step 20 | Acc: 0.4844 | Loss: 0.6966 | LR: 0.000999
INFO:tensorflow:Step 30 | Acc: 0.6016 | Loss: 0.6815 | LR: 0.000998
INFO:tensorflow:Step 40 | Acc: 0.6562 | Loss: 0.5978 | LR: 0.000998
INFO:tensorflow:Step 50 | Acc: 0.6562 | Loss: 0.6292 | LR: 0.000997
INFO:tensorflow:Step 60 | Acc: 0.6562 | Loss: 0.6200 | LR: 0.000997
INFO:tensorflow:Step 70 | Acc: 0.6953 | Loss: 0.6291 | LR: 0.000996
INFO:tensorflow:Step 80 | Acc: 0.7031 | Loss: 0.5678 | LR: 0.000996
INFO:tensorflow:Step 90 | Acc: 0.6562 | Loss: 0.6437 | LR: 0.000995
INFO:tensorflow:Step 100 | Acc: 0.5312 | Loss: 0.6891 | LR: 0.000995
INFO:tensorflow:Step 110 | Acc: 0.5156 | Loss: 0.6939 | LR: 0.000994
INFO:tensorflow:Step 120 | Acc: 0.6250 | Loss: 0.6593 | LR: 0.000994
IN

Exception ignored in: <function IteratorResourceDeleter.__del__ at 0x7f8d453d7830>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 546, in __del__
    handle=self._handle, deleter=self._deleter)
  File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/gen_dataset_ops.py", line 1264, in delete_iterator
    _ctx, "DeleteIterator", name, handle, deleter)
KeyboardInterrupt: 


KeyboardInterrupt: ignored

In [None]:
# Evaluate on the test file: accumulate accuracy over every batch, then log the overall value.
test_metric = tf.keras.metrics.Accuracy()
for batch_texts, batch_labels in dataset(isTraining=False, parameters=parameters):
    batch_logits = model(batch_texts, training=False)  # dropout disabled
    test_metric.update_state(y_true=batch_labels, y_pred=tf.argmax(batch_logits, axis=-1))
acc = test_metric.result().numpy()
logger.info("Evaluation: Testing Accuracy: {:.3f}".format(acc))

Reading Data from  /content/drive/My Drive/Tensorflow Course/ch10-RNN-sentiment-analysis/data/test.txt
INFO:tensorflow:Evaluation: Testing Accuracy: 0.879


# Sentiment Analysis with CNN

import package

In [95]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Reshape, Embedding, Activation
from keras.layers import Dense, Dropout, Conv2D, Flatten, MaxPool2D, Input, concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


load in data

In [110]:
# hyper-parameters
vocab_size = 3000      # passed to load_data as num_words
max_seq_len = 300      # every review is padded / truncated to this many tokens
embedding_dim = 100

# Load the IMDB reviews; each sample is a list of word indices.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=vocab_size)
print(x_train[1])  # one raw review, e.g. [1, 194, 1153, 194, 2, 78, ...]

# Before padding every array is 1-D: (25000,)
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

# Pad / truncate the reviews to a common length.
x_train = pad_sequences(x_train, maxlen=max_seq_len)
x_test = pad_sequences(x_test, maxlen=max_seq_len)
for padded in (x_train, x_test):
    print(padded.shape)  # (25000, 300)

[1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 2, 2, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 2, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 2, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 2, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 2, 5, 163, 11, 2, 2, 4, 1153, 9, 194, 775, 7, 2, 2, 349, 2637, 148, 605, 2, 2, 15, 123, 125, 68, 2, 2, 15, 349, 165, 2, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 2, 228, 2, 5, 2, 656, 245, 2350, 5, 4, 2, 131, 152, 491, 18, 2, 32, 2, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]
(25000,)
(25000,)
(25000,)
(25000,)
(25000, 300)
(25000, 300)


Model Definition

In [99]:
# kernel heights: how many consecutive tokens each convolution looks at
filter_sizes = [3,4,5]

def convolutions(vocabulary_size=vocab_size, embedding_dimension=embedding_dim, max_seq_len=max_seq_len, filter_sizes=filter_sizes):
    """Build the multi-kernel convolution sub-model.

    The input is shaped ``(max_seq_len, embedding_dimension, 1)``. For every entry of
    ``filter_sizes`` a 64-filter convolution spanning the full embedding width is
    applied, followed by a max-pool that covers the whole remaining sequence axis.
    The pooled outputs of all branches are concatenated.

    ``vocabulary_size`` is accepted for signature symmetry but not used here.
    """
    inputs = Input(shape = (max_seq_len, embedding_dimension, 1))

    def pooled_branch(size):
        # 'valid' convolution leaves max_seq_len-size+1 rows; pool over all of them
        feature_map = Conv2D(filters=64, kernel_size=(size, embedding_dimension), strides=1, padding='valid', activation='relu')(inputs)
        return MaxPool2D(pool_size=(max_seq_len-size+1, 1), padding='valid')(feature_map)

    branches = [pooled_branch(size) for size in filter_sizes]
    return Model(inputs=inputs, outputs=concatenate(branches))

def cnn_nlp_model(vocabulary_size=vocab_size, embedding_dimension=embedding_dim, max_seq_len=max_seq_len, filter_sizes=filter_sizes):
    """Build, compile and summarise the CNN text classifier (Sequential, list form).

    Args:
        vocabulary_size: number of rows of the embedding table.
        embedding_dimension: width of each word vector.
        max_seq_len: length of the (padded) input sequences.
        filter_sizes: kernel heights forwarded to ``convolutions``.

    Returns:
        The compiled model (binary cross-entropy loss, Adam, accuracy metric) with a
        single sigmoid output unit.
    """
    model = Sequential([
        Embedding(input_dim=vocabulary_size, output_dim=embedding_dimension, input_length=max_seq_len),
        # append a channel axis so that Conv2D can consume the embedded sequence
        Reshape(target_shape=(max_seq_len, embedding_dimension, 1)),
        # Forward this function's own arguments. Passing the module-level vocab_size /
        # embedding_dim here would build a sub-model with the wrong input shape whenever
        # the caller asks for a non-default embedding_dimension.
        convolutions(vocabulary_size, embedding_dimension, max_seq_len, filter_sizes),
        Flatten(),
        Dense(10, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

Model Training

In [108]:
# Build the CNN classifier and train it, holding out the last 20% of the training set for validation.
model = cnn_nlp_model(
    vocabulary_size=vocab_size,
    embedding_dimension=embedding_dim,
    max_seq_len=max_seq_len,
    filter_sizes=filter_sizes,
)
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Second way to define model

In [None]:
# kernel heights used by the parallel convolution branches
filter_sizes = [3,4,5]

def convolutions(vocabulary_size=vocab_size, embedding_dimension=embedding_dim, max_seq_len=max_seq_len, filter_sizes=filter_sizes):
    """Return a functional sub-model with one conv + max-pool branch per kernel height.

    Input shape is ``(max_seq_len, embedding_dimension, 1)``; the output is the
    concatenation of every branch's pooled feature map. ``vocabulary_size`` is unused.
    """
    token_grid = Input(shape = (max_seq_len, embedding_dimension, 1))
    pooled_outputs = []
    for height in filter_sizes:
        conv_layer = Conv2D(filters=64, kernel_size=(height, embedding_dimension), strides=1, padding='valid', activation='relu')
        conv_out = conv_layer(token_grid)
        # the pool window spans all max_seq_len-height+1 rows left by the 'valid' convolution
        pool_layer = MaxPool2D(pool_size=(max_seq_len-height+1, 1), padding='valid')
        pooled_outputs.append(pool_layer(conv_out))
    merged = concatenate(pooled_outputs)
    return Model(inputs=token_grid, outputs=merged)

def cnn_nlp_model(vocabulary_size=vocab_size, embedding_dimension=embedding_dim, max_seq_len=max_seq_len, filter_sizes=filter_sizes):
    """Build, compile and summarise the CNN text classifier (Sequential, ``add`` form).

    Args:
        vocabulary_size: number of rows of the embedding table.
        embedding_dimension: width of each word vector.
        max_seq_len: length of the (padded) input sequences.
        filter_sizes: kernel heights forwarded to ``convolutions``.

    Returns:
        The compiled model (binary cross-entropy loss, Adam, accuracy metric) with a
        single sigmoid output unit.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocabulary_size, output_dim=embedding_dimension, input_length=max_seq_len))
    # append a channel axis so that Conv2D can consume the embedded sequence
    model.add(Reshape(target_shape=(max_seq_len, embedding_dimension, 1)))
    # Use this function's own arguments, not the module-level vocab_size / embedding_dim,
    # so that the sub-model's input shape always matches the Reshape layer above.
    model.add(convolutions(vocabulary_size, embedding_dimension, max_seq_len, filter_sizes))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

# Instantiate the second model variant and fit it (20% of the training data is used for validation).
model = cnn_nlp_model(
    vocabulary_size=vocab_size,
    embedding_dimension=embedding_dim,
    max_seq_len=max_seq_len,
    filter_sizes=filter_sizes,
)
history = model.fit(x_train, y_train,
                    batch_size=64,
                    epochs=5,
                    validation_split=0.2)

Third way to define model

In [116]:
class Model(tf.keras.Model):
    """CNN text classifier written as a subclassed Keras model.

    Three parallel convolution branches (one per kernel height) read the embedded
    sequence; each is max-pooled over the whole sequence axis, the results are
    concatenated and fed to a small dense head with a sigmoid output.

    Args:
        vocabulary_size: number of rows of the embedding table.
        embedding_dimension: width of each word vector.
        max_seq_len: length of the (padded) input sequences.
        filter_sizes: kernel heights; exactly the first three entries are used.
    """

    def __init__(self, vocabulary_size=vocab_size, embedding_dimension=embedding_dim, max_seq_len=max_seq_len, filter_sizes=filter_sizes):
        super().__init__()
        self.embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dimension, input_length=max_seq_len)
        # append a channel axis so that Conv2D can consume the embedded sequence
        self.reshape = Reshape(target_shape=(max_seq_len, embedding_dimension, 1))
        # Each pool window spans all max_seq_len-size+1 rows left by its 'valid' convolution,
        # so every conv layer must be paired with the pool built for the same kernel height.
        self.cnn1 = Conv2D(filters=64, kernel_size=(filter_sizes[0], embedding_dimension), strides=1, padding='valid', activation='relu')
        self.pool1 = MaxPool2D(pool_size=(max_seq_len-filter_sizes[0]+1, 1), padding='valid')
        self.cnn2 = Conv2D(filters=64, kernel_size=(filter_sizes[1], embedding_dimension), strides=1, padding='valid', activation='relu')
        self.pool2 = MaxPool2D(pool_size=(max_seq_len-filter_sizes[1]+1, 1), padding='valid')
        self.cnn3 = Conv2D(filters=64, kernel_size=(filter_sizes[2], embedding_dimension), strides=1, padding='valid', activation='relu')
        self.pool3 = MaxPool2D(pool_size=(max_seq_len-filter_sizes[2]+1, 1), padding='valid')
        self.flatten = Flatten()
        self.fc = Dense(10, activation='relu')
        self.dropout = Dropout(0.2)
        self.out_linear = Dense(1, activation='sigmoid')

    def call(self, inputs, training=False):
        """Return the sigmoid output for a batch of padded token-id sequences.

        Args:
            inputs: batch of integer word indices.
            training: whether dropout is active.
        """
        x = self.embedding(inputs)
        x = self.reshape(x)
        # One branch per kernel height. Each branch uses its OWN conv/pool pair;
        # reusing cnn1/pool1 for all three would just concatenate the same features
        # three times and leave cnn2/cnn3 untrained.
        x1 = self.pool1(self.cnn1(x))
        x2 = self.pool2(self.cnn2(x))
        x3 = self.pool3(self.cnn3(x))
        x = concatenate([x1,x2,x3], axis=-1)
        x = self.flatten(x)
        x = self.fc(x)
        # Dropout is a no-op unless training is true, so no explicit branch is needed.
        x = self.dropout(x, training=training)
        x = self.out_linear(x)
        return x

# Instantiate the subclassed CNN, compile it for binary classification and train it,
# keeping 20% of the training data aside for validation.
model = Model(
    vocabulary_size=vocab_size,
    embedding_dimension=embedding_dim,
    max_seq_len=max_seq_len,
    filter_sizes=filter_sizes,
)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
