In [56]:
from string import punctuation
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from collections import Counter
import nltk
import re

Here are some helper functions to do the loading and preprocessing:

In [78]:
punctuation_pattern = "[" + punctuation + "]"
stopwords = nltk.corpus.stopwords.words("english")

def load_doc( filename ):
    """
    opens a file and returns the text inside
    """
    with open(filename, 'r') as fileHandle:
        text = fileHandle.read()
    return text

def clean_doc( doc, minimum_token_length=2 ):
    """
    reads in a string and returns a list of tokens
    """
    tokens = doc.split()
    tokens = [re.sub(punctuation_pattern,"",token).lower() for token in tokens if token not in stopwords\
             and len(token) >= minimum_token_length]
    return tokens

def process_docs( directory, vocab, is_train ):
    """
    takes in a directory and a vocab, and reads the documents into a signle data structure
    """
    documents = []
    for filename in os.listdir(directory):
        if is_train and filename.startswith("cv9"):
            continue
        if not is_train and not filename.startswith("cv9"):
            continue
        full_path = os.path.join(directory, filename)
        vocab.add_doc_to_vocab(full_path)

We define a vocabulary. The more words we have, the larger the representation of the documents. Therefore, it is important to constrain the words to only those believed to be predictive. It is difficult to know beforehand which words to use, and it is often necessary to test different hypotheses. 

We develop a vocabulary as a `Counter`, a dictionary mapping of words to their counts. 

In [97]:
class Vocabulary(object):
    """
    A wrapper around a collections.Counter
    """
    def __init__(self, vocab=None, minimum_token_length=2):
        self._vocab_is_trained = False
        if vocab is not None:
            assert isinstance(vocab, Counter) \
                or isinstance(vocab, dict), "If passing a vocab, must be of dict or counter type"
            self._vocab = Counter(vocab)
            self._vocab_is_trained = True
        else:
            self._vocab = Counter()
        self._minimum_token_length = minimum_token_length
    
    def fit_transform( self, filename ):
        """
        this function reads in the file and adds tokens to the vocab
        """
        doc = load_doc(filename)
        tokens = self.clean_doc_under_vocab(doc)
        self._vocab.update(tokens)
    
    def get_words(self):
        return list(self._vocab.keys())
    
    def most_common(self,n=10):
        return self._vocab.most_common(n)
    
    def write_to_file(self, outfilename):
        with open(outfilename, "w") as outFileHandle:
            data = "\n".join(self.get_words())
            outFileHandle.write(data)
            
    def clean_doc_under_vocab(self, doc):
        """
        reads in a string and returns a list of tokens
        """
        tokens = doc.split()
        tokens = [re.sub(punctuation_pattern,"",token).lower() for token in tokens if token not in stopwords\
                 and len(token) >= self._minimum_token_length and token in self.get_words()]
        return tokens

In [93]:
vocab = Vocabulary()
process_docs("data/small_imdb_movie_reviews/txt_sentoken/neg", vocab, True)
process_docs("data/small_imdb_movie_reviews/txt_sentoken/pos", vocab, True)

In [94]:
print("Length of the vocabulary: {}".format(len(vocab.get_words())))
TOPN = 10
most_common = vocab.most_common(TOPN)
for word, count in most_common:
    print(word, ": {} counts".format(count))

Length of the vocabulary: 45157
film : 7983 counts
one : 4946 counts
movie : 4826 counts
like : 3201 counts
even : 2262 counts
good : 2080 counts
time : 2041 counts
story : 1907 counts
films : 1873 counts
would : 1844 counts


Now we can save this vocabulary to a new file that we can later load and use to filter movie reviews before encoding them for modeling.

In [95]:
vocab.write_to_file("data/small_imdb_movie_reviews/vocab.txt")

Now we can begin training an embedding layer. You learn a word embedding by training a neural network on the classification problem. 

But before that, we need to load all of the training data movie reviews. We want each document to be a string for easy encoding as a sequence of integers.