First, pre-process the data. The code below is based heavily on the following sources:
1. [yoonkim's 'process_data.py'](https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py)
2. [dennybritz's 'data_helpers.py'](https://github.com/dennybritz/cnn-text-classification-tf/blob/master/data_helpers.py)

Note that this implementation trains its own word embeddings, in contrast to using the [word2vec](https://code.google.com/archive/p/word2vec/) model from Google.

In [1]:
import numpy as np
import re
import itertools
import tensorflow as tf
from collections import Counter

In [4]:
# Ordered (compiled pattern, replacement) rules used by clean_str.
# Order matters: disallowed characters are blanked first, contractions and
# punctuation are then split into their own tokens, and whitespace runs are
# collapsed last.
_CLEAN_STR_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"[^A-Za-z0-9(),!?'`]", " "),
        (r"'s", " 's"),
        (r"'ve", " 've"),
        (r"n't", " n't"),
        (r"'re", " 're"),
        (r"'d", " 'd"),
        (r"'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
)


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character outside ``A-Za-z0-9(),!?'`` and the backtick is replaced
    by a space, the contractions ``'s 've n't 're 'd 'll`` and the
    punctuation marks ``, ! ( ) ?`` are separated from neighbouring text by
    spaces, runs of whitespace are collapsed, and the result is stripped and
    lower-cased.

    Unlike the original, the replacement strings carry no backslashes, so
    parentheses and question marks come out as plain ``(``, ``)`` and ``?``
    tokens rather than ``\\(``, ``\\)`` and ``\\?``.

    Args:
        string: The raw sentence.

    Returns:
        The cleaned, lower-cased sentence with tokens separated by single
        spaces.
    """
    for pattern, replacement in _CLEAN_STR_RULES:
        string = pattern.sub(replacement, string)
    return string.strip().lower()

def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads MR polarity data from files, cleans each sentence and generates labels.

    Each line of a file is treated as one example. Lines are stripped of
    surrounding whitespace and passed through ``clean_str``.

    Args:
        positive_data_file: Path to the file holding one positive example per line.
        negative_data_file: Path to the file holding one negative example per line.

    Returns:
        A two-element list ``[x_text, y]``. ``x_text`` is the list of cleaned
        sentences (positives first, then negatives). ``y`` is an integer numpy
        array of shape ``(len(x_text), 2)`` with one-hot labels: ``[0, 1]``
        for positive and ``[1, 0]`` for negative examples.
    """
    def _read_stripped_lines(path):
        # The context manager closes the file handle even if reading fails.
        with open(path, "r") as handle:
            return [line.strip() for line in handle]

    positive_examples = _read_stripped_lines(positive_data_file)
    negative_examples = _read_stripped_lines(negative_data_file)

    # Clean every sentence; positives come first so they line up with the labels.
    x_text = [clean_str(sent) for sent in positive_examples + negative_examples]

    # Build all labels in one go and force an (n, 2) shape, so an empty
    # positive or negative file does not break the concatenation.
    labels = [[0, 1]] * len(positive_examples) + [[1, 0]] * len(negative_examples)
    y = np.array(labels, dtype=int).reshape(-1, 2)
    return [x_text, y]

In [9]:
class TextCNN(object):
    """
    A CNN for text classification: an embedding layer, followed by parallel
    convolution + max-pooling branches (one per filter size), dropout and a
    linear output layer.

    Constructor arguments:
    sequence_length : Length of every (padded) input sentence; the original
        notes mention padding with <Pad> to a length of 58.
    num_classes : Number of output classes (2 here: positive and negative).
    vocab_size : Total number of words in the vocabulary.
    embedding_size : Number of dimensions each vocabulary word is embedded in.
    filter_sizes : Iterable of filter heights, i.e. how many words each
        filter covers.
    num_filters : Number of filters per filter size.
    l2_reg_lambda : Weight of the L2 penalty on the output layer (default 0.0).

    Graph nodes exposed as attributes: input_x, input_y, dropout,
    embedded_chars, embedded_chars_expanded, h_pool, h_pool_flat, h_drop,
    scores, predictions, loss, accuracy.

    To understand better,
    http://d3kbpzbmcynnmx.cloudfront.net/wp-content/uploads/2015/11/Screen-Shot-2015-11-06-at-12.05.40-PM.png
    """
    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters,
                 l2_reg_lambda=0.0):
        # Word ids index rows of the embedding matrix, so they must be
        # integers: tf.nn.embedding_lookup rejects float ids.
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        # Fed to tf.nn.dropout as the keep probability, so that dropout can be
        # enabled during training only.
        self.dropout = tf.placeholder(tf.float32, name="dropout")
        l2_loss = tf.constant(0.0)

        # Build the embedding layer, which maps vocabulary words into a
        # lower-dimensional space.
        # tf.device pins this layer to the CPU.
        # tf.name_scope builds the top-level node 'embedding'.
        # For how 'Variable' differs from 'placeholder', see
        # https://stackoverflow.com/questions/36693740/whats-the-difference-between-tf-placeholder-and-tf-variable
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0), name="W")
            # embedding_lookup is just a way of indexing rows of W; the name
            # highlights that it is mostly seen in word-embedding settings.
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            # Add a trailing dimension to act as the channel dimension
            # expected by conv2d.
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # One convolution + max-pool branch per filter size.
        pooled_outputs = []
        for filter_size in filter_sizes:
            with tf.name_scope("conv_maxpool" + str(filter_size)):
                # conv2d input shape : [batch, in_height, in_width, in_channels]
                # conv2d filter shape : [filter_height, filter_width, in_channels, out_channels]
                # Each filter spans the whole embedding width.
                # Might prove useful : https://www.tensorflow.org/api_docs/python/tf/nn/conv2d
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                # truncated_normal re-draws samples more than two standard
                # deviations from the mean, avoiding extreme initial weights.
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                # padding="VALID" means a narrow convolution, hence
                # 'sequence_length - filter_size + 1' outputs along the
                # sentence axis.
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv")
                # Apply nonlinearity.
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Max-pool over every position of the convolution output;
                # ksize is the window size for each dimension of the input.
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all pooled features along the channel axis and flatten them
        # to shape [batch_size, num_filters_total].
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_outputs, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout. The keep probability is set to something like 0.5
        # during training and to 1 (dropout disabled) during evaluation.
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout)

        # Generate scores and predictions.
        with tf.name_scope("output"):
            # get_variable creates the variable "W", or retrieves it when
            # variable reuse is enabled (see also tf.AUTO_REUSE).
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            # tf.nn.xw_plus_b is a convenience wrapper for the xW + b matrix
            # multiplication.
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            # The argmax of the raw scores equals the argmax of their
            # softmax, so softmax is not applied here.
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss plus the weighted L2 penalty.
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss

        # Accuracy: fraction of examples whose predicted class matches the label.
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

![TensorBoard visualisation](http://www.wildml.com/wp-content/uploads/2015/12/Screen-Shot-2015-12-10-at-10.22.29-AM.png "As seen in TensorBoard")