We will be Developing a Deep Convolutional Neural Network to Perform Sentiment Analysis on IMDB Reviews

Imports Required

In [1]:
from string import punctuation
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from collections import Counter
import nltk
import re
from sklearn.model_selection import train_test_split
import numpy as np
import logging
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",level=logging.INFO)

Using TensorFlow backend.


Here are some helper functions to do the loading and preprocessing:

In [2]:
punctuation_pattern = "[" + punctuation + "]"
stopwords = nltk.corpus.stopwords.words("english")

def load_doc( filename ):
    """
    opens a file and returns the text inside
    """
    with open(filename, 'r') as fileHandle:
        text = fileHandle.read()
    return text

def process_docs( directory, vocab ):
    """
    loop over files in a folder, skip reviews that are supposed to make up the test set,
    load the file as document, clean the document text, then add cleaned document to documents.
    return documents
    Args:
        directory (str): name of the directory containign documents (each in separate file)
        vocab (Vocabulary): vocab to use to filter for words in the vocab
    Returns:
        documents (list): list of strings [documents]
    """
    documents = []
    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)
        doc = load_doc(full_path)
        tokens = vocab.clean_doc(doc)
        documents.append(tokens)
    return documents

We define a vocabulary. The more words we have, the larger the representation of the documents. Therefore, it is important to constrain the words to only those believed to be predictive. It is difficult to know beforehand which words to use, and it is often necessary to test different hypotheses. 

We develop a vocabulary as a `Counter`, a dictionary mapping of words to their counts. 

In [3]:
class Vocabulary(object):
    """
    A wrapper around a collections.Counter
    """
    def __init__(self, vocab=None, minimum_token_length=2):
        self._vocab_is_trained = False
        if vocab is not None:
            assert isinstance(vocab, Counter) \
                or isinstance(vocab, dict), "If passing a vocab, must be of dict or counter type"
            self._vocab = Counter(vocab)
            self._vocab_is_trained = True
        else:
            self._vocab = Counter()
        self._minimum_token_length = minimum_token_length
        
    def fit(self, directory):
        for filename in os.listdir(directory):
            # skip any reviews in test set
            if is_train and filename.startswith("cv9"):
                continue
            if not is_train and not filename.startswith("cv9"):
                continue
        full_path = os.path.join(directory, filename)
        self.add_tokens(full_path)
        self._vocab_is_trained = True
    
    def add_tokens(self, tokens):
        self._vocab.update(tokens)
        
    def get_words(self):
        return list(self._vocab.keys())
    
    def most_common(self,n=10):
        return self._vocab.most_common(n)
    
    def write_to_file(self, outfilename):
        with open(outfilename, "w") as outFileHandle:
            data = "\n".join(self.get_words())
            outFileHandle.write(data)
            
    def clean_doc( self, doc, minimum_token_length=2 ):
        """
        reads in a string and returns a list of tokens
        """
        tokens = doc.split()
        tokens = [re.sub(punctuation_pattern,"",token).lower() for token in tokens if token not in stopwords\
                 and len(token) >= minimum_token_length]
        tokens = " ".join(tokens)
        return tokens
    
    def __len__(self):
        return len(self._vocab.keys())

In [4]:
DATA_DIR = "data/small_imdb_movie_reviews/"
VOCAB_FILE = "vocab.txt"

if os.path.exists(os.path.join(DATA_DIR,VOCAB_FILE)):
    with open(os.path.join(DATA_DIR,VOCAB_FILE)) as file:
        tokens = file.readlines()
        vocab = Vocabulary()
        vocab.add_tokens(tokens)
else:
    vocab = Vocabulary()
    vocab.fit(os.path.join(DATA_DIR,"txt_sentoken/neg"))
    vocab.fit(os.path.join(DATA_DIR,"txt_sentoken/pos"))
len(vocab)

45157

Now we can save this vocabulary to a new file that we can later load and use to filter movie reviews before encoding them for modeling.

In [5]:
vocab.write_to_file(os.path.join(DATA_DIR,VOCAB_FILE))

Load the training reviews

In [6]:
positive_documents = process_docs("data/small_imdb_movie_reviews/txt_sentoken/pos/", vocab)
negative_documents = process_docs("data/small_imdb_movie_reviews/txt_sentoken/neg/", vocab)

all_documents = negative_documents + positive_documents

Now we need to create a tokenizer and fit it on our training documents:

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_documents)

Encode the documents as sequences of integers

In [8]:
encoded_docs = tokenizer.texts_to_sequences(all_documents)

Pad the sequences so they are all of the same length.

These define the input training features

In [9]:
max_length = max([len(document) for document in all_documents])
padded_sequences = pad_sequences(
    encoded_docs, maxlen=max_length, padding="post")

Define labels.

In [10]:
labels = np.array([0] * len(negative_documents) + [1] * len(positive_documents))

Split data into training and testing:

In [11]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, \
                                                    random_state=2019, stratify=labels)

Download the pre-trained stanford GloVe vectors from [here](https://nlp.stanford.edu/projects/glove/). 

Load the embedding from file and convert it into a matrix:

In [12]:
def load_embedding(filename):
    """
    takes in filename and returns a dictionary containing word to word vector
    mappings. This assumes the text file containing the embeddings has the word
    in the first column and then a space and then the vector
    
    Args:
        filename (str): the name of the file containing the raw embeddings
    Returns:
        embedding (dict): mapping of words to their numpy vectors
    """
    with open(filename, "r") as fileHandle:
        lines = fileHandle.readlines()
    embedding = {}
    for line in lines:
        cols = line.split()
        word = cols[0]
        vector = cols[1:]
        embedding[word] = np.array(vector, dtype=np.float32)
    return embedding

def convert_raw_embeddings_to_matrix(embedding, vocab):
    """
    Creates a matrix of the words contained in the vocab

    Args:
        embedding (dict):
        vocab (Vocabulary):
    """
    vocab_size = len(vocab) + 1
    weight_matrix = np.zeros((vocab_size, 100))
    for word, i in vocab.items():
        vector = embedding.get(word)  # a "None-safe" __getitem__
        if vector is not None:
            weight_matrix[i] = vector
        # else it stays zeroes
        if (i % 15000 == 0):
            logging.info("converted {} embeddings".format(i))
    return weight_matrix

In [13]:
%%time
raw_embedding = load_embedding("data/glove_wikipedia_embeddings/glove.6B.100d.txt")

In [14]:
%%time
embedding_vectors = convert_raw_embeddings_to_matrix(raw_embedding, tokenizer.word_index)

2019-08-22 13:52:19,162 : INFO : converted 1000 embeddings
2019-08-22 13:52:19,165 : INFO : converted 2000 embeddings
2019-08-22 13:52:19,169 : INFO : converted 3000 embeddings
2019-08-22 13:52:19,172 : INFO : converted 4000 embeddings
2019-08-22 13:52:19,176 : INFO : converted 5000 embeddings
2019-08-22 13:52:19,179 : INFO : converted 6000 embeddings
2019-08-22 13:52:19,183 : INFO : converted 7000 embeddings
2019-08-22 13:52:19,187 : INFO : converted 8000 embeddings
2019-08-22 13:52:19,189 : INFO : converted 9000 embeddings
2019-08-22 13:52:19,192 : INFO : converted 10000 embeddings
2019-08-22 13:52:19,196 : INFO : converted 11000 embeddings
2019-08-22 13:52:19,199 : INFO : converted 12000 embeddings
2019-08-22 13:52:19,203 : INFO : converted 13000 embeddings
2019-08-22 13:52:19,206 : INFO : converted 14000 embeddings
2019-08-22 13:52:19,210 : INFO : converted 15000 embeddings
2019-08-22 13:52:19,213 : INFO : converted 16000 embeddings
2019-08-22 13:52:19,217 : INFO : converted 17000 

Hello world


Create the embedding layer

In [15]:
vocab_size = len(tokenizer.word_index)+1
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], \
                           input_length=max_length, trainable=False)

Now we define the model:

In [17]:
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation="relu"))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(1, activation="sigmoid"))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10498, 100)        4748800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 10494, 128)        64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 5247, 128)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 671616)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 671617    
Total params: 5,484,545
Trainable params: 735,745
Non-trainable params: 4,748,800
_________________________________________________________________
None


Now compile the model

In [None]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

Now train the model:

In [None]:
model.fit(x_train, y_train, epochs=5, verbose=2)

Now we evaluate the model:

In [None]:
loss, acc = model.evaluate(x_test, y_test, verbose=1)