# Text Classification with CNN

Acknowledgments: this notebook follows Denny Britz's excellent tutorial: http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/

**Dataset**
- We will use the **Movie Review** dataset. 
- http://www.cs.cornell.edu/people/pabo/movie-review-data/ 

The dataset contains 10,662 example review sentences, half positive and half negative. The dataset has a vocabulary of size around 20k. Note that since this data set is pretty small we’re likely to overfit with a powerful model. Also, the dataset doesn’t come with an official train/test split, so we simply use 10% of the data as a dev set. The original paper reported results for 10-fold cross-validation on the data.


The approach (from Denny's blog):

1. Load positive and negative sentences from the raw data files.
2. Clean the text data using the same code as the original paper.
3. Pad each sentence to the maximum sentence length, which turns out to be 59 in Denny's blog post (the preprocessing in this notebook yields 56; see the `X shape` output further down). We append special `<PAD>` tokens to all other sentences to bring them up to that length. Padding sentences to the same length is useful because it allows us to efficiently batch our data, since each example in a batch must be of the same length.
4. Build a vocabulary index and map each word to an integer between 0 and 18,765 (the vocabulary size reported in the blog post; this notebook finds 18,758 entries). Each sentence becomes a vector of integers.


**NOTE**  Most of the code is from Denny's original files.


In [1]:
import tensorflow as tf
import numpy as np

import os
import time
import datetime
from tensorflow.contrib import learn

# Show which TensorFlow version is installed (the captured output below shows the version used for this run).
print('TensorFlow version', tf.__version__)

TensorFlow version 0.11.0rc2


In [2]:
import matplotlib.pyplot as plt
%matplotlib inline 

%load_ext watermark

%watermark -a 'Atul Acharya' -u -d -v -p numpy,tensorflow,matplotlib

Atul Acharya 
last updated: 2016-12-06 

CPython 3.5.2
IPython 5.0.0

numpy 1.11.2
tensorflow 0.11.0rc2
matplotlib 1.5.1


In [3]:
## First, define a few helper funcs to prepare data

import numpy as np
import re
import itertools
from collections import Counter


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    The rewrites are applied in order: characters outside the allowed set
    become spaces, English contractions and punctuation are split off as
    separate space-delimited tokens, and runs of whitespace are collapsed.

    Args:
        string: raw sentence text.

    Returns:
        The cleaned sentence, stripped of surrounding whitespace and lower-cased.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    #
    # NOTE: the replacements for "(", ")" and "?" deliberately keep a literal
    # backslash in the output (e.g. "(" becomes "\("). That is what the
    # upstream implementation produces, and vocabularies built from it depend
    # on those tokens. They are spelled as raw strings here so that Python does
    # not warn about invalid escape sequences in ordinary string literals.
    rewrites = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in rewrites:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads Movie Review polarity data from files, cleans each sentence and generates labels.

    Args:
        positive_data_file: path to a text file with one positive example per line.
        negative_data_file: path to a text file with one negative example per line.

    Returns:
        A two-element list [x_text, y]: x_text is the list of cleaned
        sentences (positive examples first, then negative ones) and y is a
        numpy array of one-hot labels in the same order, [0, 1] for a
        positive example and [1, 0] for a negative one.
    """
    def _read_examples(path):
        # Context manager so the file handle is closed deterministically.
        with open(path, "r") as handle:
            return [line.strip() for line in handle]

    # Load data from files
    positive_examples = _read_examples(positive_data_file)
    negative_examples = _read_examples(negative_data_file)
    # Clean every sentence (clean_str also splits punctuation into tokens)
    x_text = [clean_str(sent) for sent in positive_examples + negative_examples]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    # Building the array from the joined list also works when only one of the
    # two files is empty.
    y = np.array(positive_labels + negative_labels)
    return [x_text, y]


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.

    Args:
        data: sequence of examples; it is converted to a numpy array once.
        batch_size: maximum number of examples per batch.
        num_epochs: number of full passes over the data.
        shuffle: when True, the examples are re-shuffled at the start of every epoch.

    Yields:
        Consecutive slices of the (optionally shuffled) data array. The last
        batch of an epoch may be shorter than batch_size; no empty batch is
        ever produced.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: the previous "int(n / batch_size) + 1" produced a
    # trailing empty batch whenever n was an exact multiple of batch_size.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            epoch_data = data[shuffle_indices]
        else:
            epoch_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield epoch_data[start_index:end_index]



### Define the CNN Model

The model looks similar to the one in the original paper.
http://d3kbpzbmcynnmx.cloudfront.net/wp-content/uploads/2015/11/Screen-Shot-2015-11-06-at-8.03.47-AM.png


In [4]:


class TextCNN(object):
    """
    A ConvNet for text classification.
    Uses an embedding layer, followed by a convolutional layer, then a max pool, and softmax.

    Args:
        sequence_length: number of token ids in each (padded) input sentence.
        num_classes: number of output classes (width of the one-hot labels).
        vocab_size: number of rows of the embedding matrix.
        embedding_size: dimensionality of each embedding vector.
        filter_sizes: filter heights, i.e. how many consecutive tokens each
            convolution filter covers (e.g. [3, 4, 5]).
        num_filters: number of filters per filter size.
        l2_reg_lambda: weight of the L2 penalty on the output layer's W and b.

    The constructor builds the graph in the current default graph and exposes
    these tensors as attributes: the placeholders input_x, input_y and
    dropout_keep_prob; the intermediate tensors embedded_chars,
    embedded_chars_expanded, h_pool, h_pool_flat and h_drop; and the outputs
    scores, predictions, loss and accuracy.
    """
    def __init__(
        self, sequence_length, num_classes, vocab_size,
        embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):

        ## placeholders for input / output
        # X: [None, Sequence Length]
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name='input_x')
        # Y: [None, Num Classes]
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

        # keep track of L2 regularization loss
        l2_loss = tf.constant(0.0)

        ## Embedding layer
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            # W: embedding matrix - we will learn this during training
            W = tf.Variable(tf.random_uniform([vocab_size, embedding_size],
                                             -1.0, 1.0),
                           name="W")

            # Result of the lookup:
            # a 3D tensor of shape [None, sequence_length, embedding_size]
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            # TensorFlow's conv2d operation expects a 4-dimensional tensor with
            # dimensions corresponding to batch, height, width and channel.
            # The result of our embedding doesn't contain the channel
            # dimension, so we add it manually, leaving us with a layer of
            # shape [None, sequence_length, embedding_size, 1].
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        ## Create a conv + max pool layer for each filter size
        # Note that filter_sizes will usually be [3,4,5], meaning filters will
        # operate on 3 words, 4 words, 5 words at a time.
        # Total num filters = num_filters * len(filter_sizes)
        pooled_outputs = []
        for filter_size in filter_sizes:
            with tf.name_scope("conv_maxpool_%s" % filter_size):
                # Conv layer: each filter spans filter_size words and the full embedding width
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                        self.embedded_chars_expanded,
                        W,
                        strides=[1,1,1,1],
                        padding='VALID',   # narrow conv, no padding
                        name='conv')
                # >> output shape: [batch_size, sequence_length - filter_size + 1, 1, num_filters]
                # Apply non-linearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
                # Maxpool over all positions of the sentence, keeping one value per filter
                pooled = tf.nn.max_pool(
                        h,
                        ksize=[1, sequence_length - filter_size + 1, 1, 1],
                        strides=[1,1,1,1],
                        padding='VALID',
                        name='pool')
                # >> output shape: [batch_size, 1, 1, num_filters]

                pooled_outputs.append(pooled)

        # Combine all the pooled features along the filter (last) dimension
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(3, pooled_outputs)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        # >> output shape: [batch_size, num_filters_total]

        ## Dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        ## Final (unnormalized) scores and predictions
        with tf.name_scope('output'):
            # NOTE(review): tf.get_variable is expected to ignore tf.name_scope,
            # so this variable would be named "W" rather than "output/W" and
            # building a second TextCNN in the same graph would clash - confirm
            # before relying on checkpoint variable names.
            W = tf.get_variable("W",
                               shape=[num_filters_total, num_classes],
                               initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            # Only the output layer's parameters are L2-regularized
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)

            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calc mean cross-entropy loss (plus the weighted L2 penalty)
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(self.scores,
                                                            self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        ## Accuracy: fraction of examples whose predicted class matches the label
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions,
                                          tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions,
                                                  tf.float32),
                                          name="accuracy")

        # Variable initialization is left to the caller, who typically creates
        # more variables (global step, optimizer state) after this constructor
        # returns; an initializer op built here would not cover them.
            
            

### Training

Now let's train the CNN. 

In [5]:
### Define the TF flags
# Each flag is registered with (name, default value, help text); the values are
# read back later through tf.flags.FLAGS.

# Parameters
# ==================================================

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", 0.1, "Fraction of the training data to use for validation (default: 0.1)")
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of word embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.1, "L2 regularization lambda (default: 0.1)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 50, "Number of training epochs (default: 50)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")



In [6]:
# Parse the flags defined above and echo every setting, sorted by flag name.
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for flag_name in sorted(FLAGS.__flags):
    flag_value = FLAGS.__flags[flag_name]
    print("{}={}".format(flag_name.upper(), flag_value))
print("")


Parameters:
ALLOW_SOFT_PLACEMENT=True
BATCH_SIZE=64
CHECKPOINT_EVERY=100
DEV_SAMPLE_PERCENTAGE=0.1
DROPOUT_KEEP_PROB=0.5
EMBEDDING_DIM=128
EVALUATE_EVERY=100
FILTER_SIZES=3,4,5
L2_REG_LAMBDA=0.1
LOG_DEVICE_PLACEMENT=False
NEGATIVE_DATA_FILE=./data/rt-polaritydata/rt-polarity.neg
NUM_EPOCHS=50
NUM_FILTERS=128
POSITIVE_DATA_FILE=./data/rt-polaritydata/rt-polarity.pos



In [7]:
## Load data
print('Loading data...')
x_text, y = load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

# Build vocabulary: every sentence is mapped to a fixed-length vector of word
# ids, as long as the longest sentence (measured in space-separated tokens).
max_document_length = max(len(sentence.split(" ")) for sentence in x_text)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
print('X shape:', x.shape)

## Shuffle before splitting: the loader returns all positive examples first,
## then all negative ones. The fixed seed keeps the split reproducible.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/dev set
# TODO: This is very crude, should use cross-validation
# The split point is computed from the front of the array. Slicing with a
# negative offset breaks when the dev set rounds down to zero examples: "-0"
# is 0, which would leave the training set empty and put all data in dev.
num_dev_samples = int(FLAGS.dev_sample_percentage * float(len(y)))
dev_sample_index = len(y) - num_dev_samples
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

print('x_train:', x_train.shape)
print('y_train:', y_train.shape)
print('x_dev  :', x_dev.shape)
print('y_dev  :', y_dev.shape)

Loading data...
X shape: (10662, 56)
Vocabulary Size: 18758
Train/Dev split: 9596/1066
x_train: (9596, 56)
y_train: (9596, 2)
x_dev  : (1066, 56)
y_dev  : (1066, 2)


In [8]:
### Begin training
# Builds the model and the training ops in a fresh graph, then runs the
# training loop: one optimizer step per batch, a dev-set evaluation every
# FLAGS.evaluate_every steps and a checkpoint every FLAGS.checkpoint_every steps.

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)

    # Using the session as a context manager makes it the default session and
    # closes it (releasing its resources) when the block exits.
    with tf.Session(config=session_conf) as sess:
        cnn = TextCNN(sequence_length=x_train.shape[1],
                      num_classes=y_train.shape[1],   # 2 output classes in this case
                      vocab_size=len(vocab_processor.vocabulary_),
                      embedding_size=FLAGS.embedding_dim,
                      filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                      num_filters=FLAGS.num_filters,
                      l2_reg_lambda=FLAGS.l2_reg_lambda)

        # Training procedure
        # global_step is incremented by apply_gradients on every training step
        global_step    = tf.Variable(0, name="global_step", trainable=False)
        optimizer      = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op       = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # keep track of gradients and sparsity
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
                sparsity_summary  = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.merge_summary(grad_summaries)

        # output dir for models and summaries (one directory per run, named by start time)
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.scalar_summary("loss", cnn.loss)
        acc_summary  = tf.scalar_summary("accuracy", cnn.accuracy)

        # Train Summaries - save
        train_summary_op     = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir    = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.train.SummaryWriter(train_summary_dir, sess.graph)

        # Dev summaries - save
        dev_summary_op     = tf.merge_summary([loss_summary, acc_summary])
        dev_summary_dir    = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.train.SummaryWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir    = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver()

        # Write vocabulary so that evaluation can map new text to the same ids
        vocab_processor.save(os.path.join(out_dir, "vocab"))


        ### NOW start the Training Session

        # init all vars (model, global step and optimizer state all exist by now)
        sess.run(tf.initialize_all_variables())

        # define train step
        def train_step(x_batch, y_batch):
            ''' A single train step: run the optimizer on one batch (with
                dropout enabled) and record the training summaries.
            '''
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }

            # one step run
            _, step, summaries = sess.run(
                [train_op, global_step, train_summary_op],
                feed_dict)
            train_summary_writer.add_summary(summaries, step)

        # validation/dev step
        def dev_step(x_batch, y_batch, writer=None):
            ''' Eval model on a validation/dev set: print loss and accuracy and,
                if a writer is given, record the dev summaries. No training op
                is run, so the weights are left untouched.
            '''
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0     # no dropout on Test/Validation!
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        ## Generate batches
        batches = batch_iter(list(zip(x_train, y_train)),
                            FLAGS.batch_size,
                            FLAGS.num_epochs)

        # train loop: for each batch
        try:
            for batch in batches:
                # each batch is a sequence of (x, y) pairs; split it into two tuples
                x_batch, y_batch = zip(*batch)
                # run one step on the batch
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)

                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
        finally:
            # Flush pending summary events to disk and release the event files,
            # even if training is interrupted.
            train_summary_writer.close()
            dev_summary_writer.close()

Writing to /Users/aa/Developer/DeepLearningExamples/text_classification/runs/1481089620


Evaluation:

Saved model checkpoint to /Users/aa/Developer/DeepLearningExamples/text_classification/runs/1481089620/checkpoints/model-100


Evaluation:

Saved model checkpoint to /Users/aa/Developer/DeepLearningExamples/text_classification/runs/1481089620/checkpoints/model-200


Evaluation:

Saved model checkpoint to /Users/aa/Developer/DeepLearningExamples/text_classification/runs/1481089620/checkpoints/model-300


Evaluation:

Saved model checkpoint to /Users/aa/Developer/DeepLearningExamples/text_classification/runs/1481089620/checkpoints/model-400


Evaluation:

Saved model checkpoint to /Users/aa/Developer/DeepLearningExamples/text_classification/runs/1481089620/checkpoints/model-500


Evaluation:

Saved model checkpoint to /Users/aa/Developer/DeepLearningExamples/text_classification/runs/1481089620/checkpoints/model-600


Evaluation:

Saved model checkpoint to /Users/aa/Developer/DeepLearnin

### Evaluate

Let's see how the trained model performs on a few new sentences. 

We will make up some sentences and label them by hand (0 = negative, 1 = positive). 


In [10]:
# Directory of the training run to evaluate, and its checkpoint sub-directory
RUN_DIR = 'runs/1481089620'
CHECKPOINT_DIR = "checkpoints"

# Hand-written test sentences and their labels (0 = negative, 1 = positive,
# matching the class indices produced by load_data_and_labels)
x_raw = ["something was off",
        "it was exciting but would have been better if the movie was 30 minutes shorter",
        "the acting was so-so but the plot was engaging",
        "wish i saw this three times"]
y_raw = [0, 0, 1, 1]


# map data into vocab
vocab_path = os.path.join(RUN_DIR, "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
# Apply the same cleaning that the training sentences went through, so that
# e.g. "so-so" is split the way it was when the vocabulary was built.
x_test = np.array(list(vocab_processor.transform([clean_str(sent) for sent in x_raw])))

In [None]:
## Evaluate
# Restores the latest checkpoint of RUN_DIR into a fresh graph, predicts a
# class for every row of x_test and reports the accuracy against the
# hand-assigned labels.

print("\nEvaluating...\n")
BATCH_SIZE = 64

# Labels for the test sentences (defined next to x_raw as y_raw)
y_test = np.array(y_raw)

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(os.path.join(RUN_DIR, CHECKPOINT_DIR))
if checkpoint_file is None:
    # Fail with a clear message instead of trying to import "None.meta"
    raise IOError("No checkpoint found in {}".format(os.path.join(RUN_DIR, CHECKPOINT_DIR)))

graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=True,   # FLAGS.allow_soft_placement,
      log_device_placement=False)  # FLAGS.log_device_placement)
    # The context manager makes the session the default one and closes it on exit
    with tf.Session(config=session_conf) as sess:
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Get the placeholders from the graph by name
        input_x = graph.get_operation_by_name("input_x").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        # Tensors we want to evaluate
        predictions = graph.get_operation_by_name("output/predictions").outputs[0]

        # Generate batches for one epoch, in order, so predictions line up with y_test
        batches = batch_iter(list(x_test), BATCH_SIZE, 1, shuffle=False)

        # Collect the predictions here
        all_predictions = []

        for x_test_batch in batches:
            # dropout is disabled (keep probability 1.0) at prediction time
            batch_predictions = sess.run(predictions, {input_x: x_test_batch, dropout_keep_prob: 1.0})
            all_predictions = np.concatenate([all_predictions, batch_predictions])

# Print accuracy if y_test is defined
if y_test is not None:
    correct_predictions = float(sum(all_predictions == y_test))
    print("Total number of test examples: {}".format(len(y_test)))
    print("Accuracy: {:g}".format(correct_predictions/float(len(y_test))))