In [None]:
# Movie sentiment analysis
%tensorflow_version 2.x

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.utils import plot_model

import matplotlib.pyplot as plt

In [None]:
# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
  """Read every review file of a directory into a DataFrame.

  Review files are expected to be named ``<id>_<rating>.txt``; the rating is
  parsed out of the file name. Directory entries that do not follow that
  naming scheme are skipped instead of raising ``AttributeError``.

  Args:
    directory: Path of a local directory (it is listed with ``os.listdir``).

  Returns:
    A pandas DataFrame with two columns: "sentence" (the file content) and
    "sentiment" (the rating digits from the file name, kept as a string).
  """
  # Raw string: "\d" inside a normal literal is an invalid escape sequence.
  name_pattern = re.compile(r"\d+_(\d+)\.txt")
  data = {"sentence": [], "sentiment": []}
  # Sorted so that the row order does not depend on the file system.
  for file_name in sorted(os.listdir(directory)):
    match = name_pattern.match(file_name)
    if match is None:
      # Not a review file (e.g. a stray hidden file): nothing to parse.
      continue
    with tf.io.gfile.GFile(os.path.join(directory, file_name), "r") as f:
      data["sentence"].append(f.read())
    data["sentiment"].append(match.group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory, random_state=None):
  """Load the "pos" and "neg" sub-folders of `directory` as one shuffled frame.

  Args:
    directory: Folder that contains a "pos" and a "neg" sub-directory.
    random_state: Optional seed forwarded to ``DataFrame.sample``. With the
      default ``None`` the shuffle differs between runs (the original
      behaviour); pass an int to get the same row order every time.

  Returns:
    A DataFrame holding the rows of both sub-folders plus a "polarity"
    column (1 for "pos", 0 for "neg"), shuffled and re-indexed from 0.
  """
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  combined = pd.concat([pos_df, neg_df], ignore_index=True)
  # frac=1 draws every row exactly once, i.e. a full shuffle.
  shuffled = combined.sample(frac=1, random_state=random_state)
  return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  """Fetch the IMDB review archive and return its training split.

  The archive is obtained through ``tf.keras.utils.get_file`` with
  ``extract=True``; the "aclImdb/train" folder located next to the returned
  path is then loaded with ``load_dataset``.

  Args:
    force_download: Accepted for interface compatibility; this function does
      not read it.

  Returns:
    The DataFrame produced by ``load_dataset`` for the training folder.
  """
  archive_path = tf.keras.utils.get_file(
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
      fname="aclImdb.tar.gz",
      extract=True)
  download_dir = os.path.dirname(archive_path)
  train_dir = os.path.join(download_dir, "aclImdb", "train")
  return load_dataset(train_dir)

In [None]:
# Build the training DataFrame (columns: sentence, sentiment, polarity).
train = download_and_load_datasets()
# Bare last expression so the notebook renders the frame.
train

In [None]:
# Pull the review texts and their 0/1 polarity labels out as NumPy arrays.
train_text = train["sentence"].to_numpy()
train_label = train["polarity"].to_numpy()

In [None]:
# Spot-check one review together with its label.
index = 2
print(train_text[index], train_label[index], sep="\n")

In [None]:
# Vocabulary cap: only the most frequent words are turned into ids.
# NOTE(review): per the Keras docs, Tokenizer(num_words=N) keeps the N-1 most
# common words (index 0 is reserved) — confirm for the installed version.
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
# Builds the word -> index table from the review texts.
tokenizer.fit_on_texts(train_text)

# word_index covers every unique token seen, not just the capped vocabulary.
word_index = tokenizer.word_index
print('Found %s unique tokens but will only consider the top frequent %s words' %(len(word_index), max_words))

In [None]:
# word_index has one entry per unique token (far more than max_words), so
# displaying it whole floods the notebook output. Preview the first entries.
list(word_index.items())[:20]

In [None]:
# Convert each review into a sequence of integer word ids taken from the
# tokenizer's word index.
sequences = tokenizer.texts_to_sequences(train_text)

# Show one review next to its encoded form.
index = 3
print(train_text[index], sequences[index], sep="\n")

In [None]:
# The network needs fixed-size input, so every review becomes exactly maxlen ids.
# NOTE(review): pad_sequences is called with its defaults; per the Keras docs
# those are padding='pre' and truncating='pre', i.e. short reviews are
# zero-padded at the front and long reviews keep their LAST maxlen tokens
# (not the first ones) — confirm for the installed version.
maxlen = 200
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(train_label)

# Sanity check of the resulting array shapes.
print('Shape of data tensor', data.shape)
print('Shape of labels tensor', labels.shape)

In [None]:
# X and Y are now in a format the neural network accepts:
# X is a fixed-length vector of word ids and Y is the matching label
# (i.e. the sentiment polarity).
index = 3
print(data[index])
# Fixed the "lable" typo in the printed output.
print("\n label:", labels[index])

In [None]:
training_samples = 16000
validation_samples = 4000
test_samples = 5000

# Slice boundaries for the three consecutive partitions.
train_end = training_samples
val_end = train_end + validation_samples
test_end = val_end + test_samples

# Fail loudly instead of silently producing short (or empty) partitions.
assert test_end <= len(data), (
    "requested %d samples but only %d are available" % (test_end, len(data)))

# Split the data into a training, a validation and a test set.
# The rows were already shuffled when the dataset was loaded.
x_train = data[:train_end]
y_train = labels[:train_end]
x_val = data[train_end:val_end]
y_val = labels[train_end:val_end]
x_test = data[val_end:test_end]
y_test = labels[val_end:test_end]

# Raw text / labels of the test partition, aligned row by row with x_test.
test_text = train_text[val_end:test_end]
test_label = train_label[val_end:test_end]

print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_val shape:", x_val.shape)
print("y_val shape:", y_val.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

In [None]:
# Baseline: feed the padded word-id vector straight into a single hidden Dense
# layer, the same kind of network used for MNIST digit recognition.
model = Sequential([
    Dense(32, activation='relu', input_shape=(maxlen,)),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Last expression of the cell: renders the architecture diagram.
plot_model(model, show_layer_names=True, show_shapes=True)

In [None]:
# Print the layer-by-layer summary of the baseline model.
model.summary()

In [None]:
# Train the baseline model, validating after every epoch.
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

# Plot the accuracy
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs = range(1, len(acc) + 1)

# Explicit Axes interface plus axis labels so the figure stands on its own.
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'r', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# As usual, the neural network learns the input/output association
# (i.e. it maps the encoded sentence to the sentiment).
# To do that mapping well it also has to understand the text, which at this
# point is merely encoded as numbers derived from word frequency.
# So the network has to learn two things at the same time: the input/output
# mapping and the language itself.
# Compare with image recognition, where pixel positions map naturally onto
# the input vector that represents the image.

# The key idea here is to use word2vec (pretrained vectors that capture the
# meaning of words to a certain extent) to take care of the second problem
# (text understanding), which helps the network learn the input/output mapping.

# Load a pre-trained Google News w2v sample
import nltk
nltk.download('word2vec_sample')

from nltk.data import find
import gensim

word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
w2v = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

# Each word is represented in a space of 300 dimensions:
embedding_dim = w2v.vector_size

# gensim >= 4 replaced KeyedVectors.vocab with key_to_index (accessing .vocab
# raises there); use whichever attribute this gensim version provides.
w2v_vocab = w2v.key_to_index if hasattr(w2v, "key_to_index") else w2v.vocab

print("No.of words in the vocab:", len(w2v_vocab))
print("Model dimensions:", embedding_dim)

In [None]:
# Look up the pretrained vector of a sample word (displayed as the cell output).
testWord = 'boy'
w2v[testWord]

In [None]:
# Build the embedding matrix from word2vec so it can be loaded into an
# Embedding layer. It must have shape (max_words, embedding_dim), where row i
# holds the word2vec vector of the word with index i in our reference word
# index (built during tokenization).
# Note that index 0 is not supposed to stand for any word or token -- it's a
# placeholder.

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
  # Only indices below max_words have a row in the matrix; test that first
  # because it is cheaper than the vocabulary lookup.
  if i >= max_words:
    continue
  # `word in w2v` works on gensim 3.x and 4.x alike, whereas `w2v.vocab`
  # raises on gensim >= 4. Words without a vector stay all-zeros.
  if word in w2v:
    embedding_matrix[i] = w2v[word]

print("embedding_matrix.shape:", embedding_matrix.shape)
print("max_words:", max_words)
print("embedding_dim:", embedding_dim)
print("maxlen:", maxlen)

In [None]:
# Check by eye that this row equals the w2v vector displayed earlier for testWord.
# The lookup goes through the tokenizer's index of the word.
wIndex = word_index[testWord]
embedding_matrix[wIndex]

In [None]:
# Put an Embedding layer (to be filled with the w2v vectors) in front of the
# network so the input is encoded more meaningfully, then flatten the embedded
# sequence into a single vector for the hidden Dense layer.

w2vModel = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    # Flatten the sequence of maxlen embedding vectors into one long vector.
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Last expression of the cell: renders the architecture diagram.
plot_model(w2vModel, show_layer_names=True, show_shapes=True)

In [None]:
w2vModel.summary()

# Expected numbers (with max_words = 10,000, embedding_dim = 300, maxlen = 200):
# Embedding params = max_words * embedding_dim = 10,000 * 300 = 3,000,000
# Flatten has no parameters; its output size = maxlen * embedding_dim = 200 * 300 = 60,000
# Hidden Dense params = flatten_output * units + bias = 60,000 * 32 + 32 = 1,920,032
# Output Dense params = hidden_units * 1 + bias = 32 * 1 + 1 = 33

In [None]:
# Fill the Embedding layer (layers[0]) with the pretrained embedding_matrix and
# freeze it so these weights are not trained.

w2vModel.layers[0].set_weights([embedding_matrix])
w2vModel.layers[0].trainable = False
# Print the summary again after freezing the layer.
w2vModel.summary()

In [None]:
# Compile the model after setting the Embedding layer to non-trainable
w2vModel.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

history = w2vModel.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

# Plot the accuracy
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs = range(1, len(acc) + 1)

# Explicit Axes interface plus axis labels so the figure stands on its own.
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'r', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Score the w2v model on the held-out test set.
score = w2vModel.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

# Keep the raw sigmoid outputs for inspecting individual reviews later.
ans = w2vModel.predict(x_test)

In [None]:
# Predict an individual movie review
p_index = 20

# ans holds sigmoid outputs in [0, 1]; threshold at 0.5 and cast to int so the
# prediction prints in the same 0/1 form as the true label (round() on a NumPy
# float returns a float or an int depending on the NumPy version).
predicted_label = int(ans[p_index][0] > 0.5)

print("Movie Review:", test_text[p_index])
print("Truth Sentiment:", test_label[p_index])
print("Predicted Sentiment:", predicted_label)

In [None]:
# Swap the Flatten + Dense head for an LSTM, which is better suited to sequences
# https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM
from tensorflow.keras.layers import LSTM

lstm = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    # The LSTM consumes the embedded sequence directly, so no Flatten is needed
    # (that was only required for the Dense layer).
    LSTM(32),
    Dense(1, activation='sigmoid'),
])

# Fill the Embedding layer with the pretrained embedding_matrix and freeze it.
lstm_embedding = lstm.layers[0]
lstm_embedding.set_weights([embedding_matrix])
lstm_embedding.trainable = False

lstm.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['accuracy'])

# Last expression of the cell: renders the architecture diagram.
plot_model(lstm, show_layer_names=True, show_shapes=True)


In [None]:
lstm.summary()
# Parameter count of the LSTM layer = g * [h * (h + i) + h], where
#   g = number of gates (4 for an LSTM)
#   h = number of LSTM hidden units
#   i = dimension of the input (feature) vector
# Here: 4 * (32 * (32 + 300) + 32) = 42,624

In [None]:
# Train the LSTM model, validating after every epoch.
history = lstm.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

# Plot the accuracy
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs = range(1, len(acc) + 1)

# Explicit Axes interface plus axis labels so the figure stands on its own.
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'r', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

In [None]:
# Suggested changes
# - Instead of using the Google News word2vec, train word2vec on the movie reviews themselves and use that embedding here
# - Preprocess the train_text such as removing the stop words
# - Try with different max_words and maxlen in the data processing
# - When padding the text sequence to fixed size vector, removing some portion from the middle 
#       because usually the first and last sentence are more important as far as sentiment is concerned
# - Tune the neural network hyper-parameters
# - Train with News Sentiment dataset 