# Sentiment analysis on IMDB movie reviews using GloVe word embeddings and deep LSTM network

This is a draft that only demonstrates how an IMDB movie review can be converted into a vectorized representation using 50-dimensional GloVe word embeddings.

# Imports

In [80]:
# Deep Learning
from keras.models import Model
from keras.layers import LSTM, Dropout, Input, Activation, Embedding, Dense
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

# Linear Algebra
import numpy as np

# Regex
import re

# File manipulation
import glob, os

# Load the IMDB dataset
The dataset can be downloaded here: http://ai.stanford.edu/~amaas/data/sentiment/

In [65]:
def load_dataset():
    '''
    Loads the training and testing examples.

    Returns:
        (X_train, Y_train), (X_test, Y_test), where each X is a sorted list of
        review file paths and each Y is a list of label strings aligned with it.
        A label is the part of the file name between the underscore and the
        extension, e.g. "0_9.txt" -> "9".
    '''

    def label_from_path(path):
        # basename/splitext use the platform's own path rules, so this works
        # with both "/" and "\\" separators (splitting on "\\" alone raised
        # IndexError for POSIX-style paths).
        stem = os.path.splitext(os.path.basename(path))[0]
        return stem.split("_")[1]

    # Load the training set

    pos_train = glob.glob("aclImdb_v1/aclImdb/train/pos/*.txt")
    neg_train = glob.glob("aclImdb_v1/aclImdb/train/neg/*.txt")

    # Sorting gives a deterministic order; "neg" paths sort before "pos" paths
    X_train = sorted(pos_train + neg_train)

    # Load the testing set

    pos_test = glob.glob("aclImdb_v1/aclImdb/test/pos/*.txt")
    neg_test = glob.glob("aclImdb_v1/aclImdb/test/neg/*.txt")

    X_test = sorted(pos_test + neg_test)

    # Generate the labels (same order as the sorted path lists)

    Y_train = [label_from_path(path) for path in X_train]
    Y_test = [label_from_path(path) for path in X_test]

    return (X_train, Y_train), (X_test, Y_test)

In [66]:
# Collect the review file paths (X_*) and their labels (Y_*) for both splits
(X_train, Y_train), (X_test, Y_test) = load_dataset()

# Load 50 dimensional GloVe word embeddings

GloVe word embeddings can be downloaded here: https://nlp.stanford.edu/projects/glove/

In [4]:
def read_glove_vecs(glove_file):
    '''
    Parses a GloVe text file (one "<word> <v1> <v2> ..." entry per line).

    Returns three lookups built from the file:
        word_to_index: word -> position in the alphabetically sorted vocabulary (1-based)
        index_to_word: the inverse of word_to_index
        word_to_vec_map: word -> embedding vector as a float64 numpy array
    '''
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding="utf8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            token = fields[0]
            word_to_vec_map[token] = np.array(fields[1:], dtype=np.float64)

    # Indices start at 1, so index 0 is never assigned to a word
    word_to_index = {}
    index_to_word = {}
    for position, token in enumerate(sorted(word_to_vec_map), start=1):
        word_to_index[token] = position
        index_to_word[position] = token

    return word_to_index, index_to_word, word_to_vec_map

In [5]:
# Location of the GloVe vectors file that is handed to read_glove_vecs
glove_path = "glove.6B.50d.txt"

In [78]:
# Build the word <-> index lookups and the word -> vector map from the GloVe file
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(glove_path)

# Word embedding Example

In [73]:
# Print the GloVe vector of a sample word, or a notice if the word is not in the vocabulary
word = "embedding"
try:
    vector = word_to_vec_map[word]
except KeyError:
    # Only a missing word is expected here; any other error should surface
    print(word, " not found")
else:
    print(word,'\n', vector)

embedding 
 [ 0.62345   0.032983  0.43996   0.44996   0.85634   0.10575   0.9867
 -1.1748    0.28233   0.11164   0.14791  -0.33504  -0.54567  -0.48938
 -0.30864   0.0542    0.51353   0.25094   0.90265  -0.44953  -0.19574
 -0.059456 -0.23541   0.47732   0.14565   0.71205   0.10384   0.38435
  0.28728  -0.62065   0.19764  -0.92376  -0.45941  -0.35899  -0.36896
 -0.022755  0.036052 -0.037406 -0.6725    0.96637   1.3847   -0.22727
 -0.21122   0.47012  -0.37961  -1.0339    0.93388   0.60006  -0.36329
 -0.078399]


# Data Cleaning Procedures

In [23]:
def remove_html(text):
    '''
    Strips every "<...>" tag from the input text and returns what is left.
    The tags are deleted outright (nothing is inserted in their place).
    '''
    # Non-greedy match: each tag is matched from "<" up to its nearest ">"
    return re.sub('<.*?>', '', text)

In [24]:
def separate_punctuation(text):
    '''
    Drops apostrophes and surrounds , . ! ? with spaces so that the later
    .split() call yields each punctuation mark as its own token
    '''
    replacements = str.maketrans({
        "'": "",
        ",": " , ",
        ".": " . ",
        "!": " ! ",
        "?": " ? ",
    })
    return text.translate(replacements)

In [35]:
def is_word_known(word):
    '''
    Determines whether the input word relates to a valid GloVe word embedding,
    i.e. whether it is a key of the module-level word_to_vec_map.

    Returns True if the word has an embedding, False otherwise.
    '''
    try:
        return word in word_to_vec_map
    except TypeError:
        # Unhashable input (e.g. a list) can never be a vocabulary word
        return False

In [27]:
def remove_unknown_words(text):
    '''
    Keeps only the tokens of the input sequence for which is_word_known
    reports a GloVe embedding; the original order is preserved.
    '''
    return list(filter(is_word_known, text))

In [28]:
def clean_example(review):
    '''
    Input: A movie review
    Output: A clean tokenized list of words and punctuation extracted from the movie review.

    Steps, in order: strip html tags, space out punctuation, lowercase,
    split on whitespace, drop tokens without a GloVe embedding.
    '''
    without_tags = remove_html(review)
    spaced = separate_punctuation(without_tags)
    tokens = spaced.lower().split()
    return remove_unknown_words(tokens)

# Find Longest Review

We do this so that we know the length to which all other examples must be padded.

In [54]:
def find_longest_review(X_train, X_test):
    '''
    Returns the amount of tokens in the longest review. (Tokens: words and punctuation)

    X_train, X_test: lists of review file paths. Each file is read and
    tokenized with clean_example; the largest token count is returned
    (0 if both lists are empty).
    '''
    max_len = 0
    for path in X_train + X_test:
        # Separate names for the path and the handle so neither shadows the other
        with open(path, 'r', encoding="utf8") as f:
            max_len = max(max_len, len(clean_example(f.read())))

    return max_len

In [None]:
# Token count of the longest review across the training and testing files
max_len = find_longest_review(X_train, X_test)

In [53]:
# Show the longest review length computed above
print(max_len)

2627


# Define the model

In [75]:
def glove_embedding_layer(word_to_vec_map, word_to_index):
    '''
    Returns a non-trainable Keras embedding layer populated with the GloVe
    word embeddings.

    word_to_vec_map: word -> embedding vector (numpy array)
    word_to_index: word -> row index in the embedding matrix
    Raises ValueError if word_to_vec_map is empty.
    '''
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty")

    # Define dimensions of embedding matrix.
    # One extra row: word indices produced by read_glove_vecs start at 1, so row 0 stays all-zero.
    vocab_len = len(word_to_index) + 1
    # Read the dimension from any vector instead of relying on a specific
    # word ("embedding") being part of the vocabulary
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Initialize empty matrix
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Populate embedding matrix
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # Build Embedding layer; the weights can only be set once the layer has been built
    embedding_layer = Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=False)
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [83]:
def lstm_network(input_shape, word_to_vec_map, word_to_index):
    '''
    Assembles the (uncompiled) Keras model: int32 index input -> GloVe
    embedding -> two 256-unit LSTM layers, each followed by 50% dropout ->
    a single-unit dense output without activation.
    '''
    # Word indices enter the network as int32 sequences
    sentence_indices = Input(shape=input_shape, dtype="int32")

    # Map the indices to their GloVe vectors
    glove_layer = glove_embedding_layer(word_to_vec_map, word_to_index)
    hidden = glove_layer(sentence_indices)

    # First LSTM emits the whole sequence for the next LSTM; the second one
    # emits only its final state
    for keep_sequence in (True, False):
        hidden = LSTM(256, return_sequences=keep_sequence)(hidden)
        hidden = Dropout(rate=0.5)(hidden)

    # Single linear output unit
    prediction = Dense(1)(hidden)

    return Model(inputs=sentence_indices, outputs=prediction)

In [85]:
# Build the network for inputs of max_len tokens (the name assigned by the
# find_longest_review cell; "maxLen" is never defined in this notebook)
model = lstm_network((max_len,), word_to_vec_map, word_to_index)
# NOTE(review): the single linear output is trained with MSE, so 'acc' is
# presumably not a meaningful metric here - confirm the intended target.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae', 'acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 2627)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 2627, 50)          20000050  
_________________________________________________________________
lstm_5 (LSTM)                (None, 2627, 256)         314368    
_________________________________________________________________
dropout_5 (Dropout)          (None, 2627, 256)         0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total para

# Generate Mini-Batches

In [74]:
def sentences_to_indices(X, word_to_index, max_len):
    '''
    Input: A list (or 1-D array) of reviews containing the complete text of a movie review
    Output: A float array of shape (len(X), max_len) holding, for each review,
        the indices into the GloVe embedding matrix of its tokens in order.
        Unused trailing positions stay 0; reviews with more than max_len
        tokens are truncated to their first max_len tokens.
    '''
    m = len(X)

    X_indices = np.zeros([m, max_len])

    for i in range(m):
        # Truncate so an over-long review cannot index past the row
        sentence_words = clean_example(X[i])[:max_len]

        for j, w in enumerate(sentence_words):
            X_indices[i, j] = word_to_index[w]

    return X_indices

In [110]:
def generator(filenames, labels, batch_size, word_to_index, max_len):
    '''
    Endlessly yields (batch_features, batch_labels) mini-batches.

    filenames: list of review file paths
    labels: list of labels aligned with filenames
    batch_size: number of examples per mini-batch (the last batch of a pass may be smaller)
    word_to_index, max_len: forwarded to sentences_to_indices

    After the last file has been served, the generator starts over from the
    first file, so it can feed any number of epochs.
    '''
    file_count = len(filenames)
    batch_count = 0

    while True:

        # Work out the slice for this batch before it is used
        beg = batch_count * batch_size
        if beg >= file_count:
            # All files have been served: wrap around for the next pass
            batch_count = 0
            beg = 0
        end = beg + batch_size

        # Generate mini-batch features
        batch_features = []
        for path in filenames[beg:end]:
            with open(path, 'r', encoding="utf8") as f:
                batch_features.append(f.read())

        batch_features = sentences_to_indices(np.array(batch_features), word_to_index, max_len)

        # Generate mini-batch labels
        batch_labels = np.array(labels[beg:end])

        # Prepare for next batch
        batch_count = batch_count + 1

        yield batch_features, batch_labels

# Train the Model

In [114]:
# Number of examples per mini-batch, and the number of training files
batch_size = 32
m = len(X_train)

In [None]:
# steps_per_epoch must be a whole number of batches; round up so the final,
# partial batch of each pass over the training files is included
model.fit_generator(generator(X_train, Y_train, batch_size, word_to_index, max_len), \
                    steps_per_epoch=int(np.ceil(m / batch_size)), epochs=50)

Epoch 1/50
 68/781 [=>............................] - ETA: 56:02:12 - loss: 1.4622 - mean_absolute_error: 1.0692 - acc: 0.1820

# Evaluate