In [1]:
# Characters level ConvNet paper: https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf
# Reference: https://github.com/mhjabreel/CharCnn_Keras
import pandas as pd
import numpy as np
import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Dense, Concatenate, ThresholdedReLU, MaxPooling1D, Flatten, Dropout, ReLU, Activation
from keras.layers import Convolution1D
from keras.layers import Embedding
from keras.callbacks import TensorBoard
from keras.optimizers import Adam, SGD
from keras.utils import multi_gpu_model

Using TensorFlow backend.


In [2]:
train_data = pd.read_csv('../data/no_tags_lower_train_data.csv', encoding='ISO-8859-1', keep_default_na=False)
test_data = pd.read_csv('../data/no_tags_lower_test_data.csv', encoding='ISO-8859-1', keep_default_na=False)
dev_data = pd.read_csv('../data/no_tags_lower_dev_data.csv', encoding='ISO-8859-1', keep_default_na=False)

In [3]:
alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"/\\|_@#$%^&*~`+-=<>()[]{}'
alphabet_size = len(alphabet)
alphabet_index = {alphabet[i]: i + 1 for i in range(alphabet_size)}
alphabet_index

{'!': 41,
 '"': 45,
 '#': 51,
 '$': 52,
 '%': 53,
 '&': 55,
 "'": 44,
 '(': 64,
 ')': 65,
 '*': 56,
 '+': 59,
 ',': 38,
 '-': 60,
 '.': 40,
 '/': 46,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 ':': 43,
 ';': 39,
 '<': 62,
 '=': 61,
 '>': 63,
 '?': 42,
 '@': 50,
 '[': 66,
 '\\': 47,
 ']': 67,
 '^': 54,
 '_': 49,
 '`': 58,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '{': 68,
 '|': 48,
 '}': 69,
 '~': 57}

In [4]:
#max_input_size = max(len(row['text']) for _, row in train_data.iterrows())
max_input_size = 374

In [5]:
def text_to_padding(df, alphabet_index, max_input_size):
    X = []
    for _, row in df.iterrows():
        str2idx = np.zeros(max_input_size, dtype='int64')
        for i, letter in enumerate(row['text'].lower()):
            if i == max_input_size:
                break
            str2idx[i] = alphabet_index.get(letter, 0)
        X.append(str2idx)
    return np.array(X)

In [6]:
X_train = text_to_padding(train_data, alphabet_index, max_input_size)
X_test = text_to_padding(test_data, alphabet_index, max_input_size)
X_dev = text_to_padding(dev_data, alphabet_index, max_input_size)

In [7]:
print('tranin: ' + str(X_train.shape))
print('test: ' + str(X_test.shape))
print('dev: ' + str(X_dev.shape))

tranin: (1024000, 374)
test: (320000, 374)
dev: (256000, 374)


In [8]:
Y_train = pd.get_dummies(train_data['sentiment']).values
Y_test = pd.get_dummies(test_data['sentiment']).values
Y_dev = pd.get_dummies(dev_data['sentiment']).values

In [9]:
print('tranin: ' + str(Y_train.shape))
print('test: ' + str(Y_test.shape))
print('dev: ' + str(Y_dev.shape))

tranin: (1024000, 2)
test: (320000, 2)
dev: (256000, 2)


In [14]:
class CharCNNZhang(object):
    """
    Class to implement the Character Level Convolutional Neural Network for Text Classification,
    as described in Zhang et al., 2015 (http://arxiv.org/abs/1509.01626)
    """
    def __init__(self, input_size, alphabet_size, embedding_size,
                 conv_layers, fully_connected_layers, num_of_classes,
                 threshold, dropout_p, learning_rate, loss='categorical_crossentropy'):
        """
        Initialization for the Character Level CNN model.
        Args:
            input_size (int): Size of input features
            alphabet_size (int): Size of alphabets to create embeddings for
            embedding_size (int): Size of embeddings
            conv_layers (list[list[int]]): List of Convolution layers for model
            fully_connected_layers (list[list[int]]): List of Fully Connected layers for model
            num_of_classes (int): Number of classes in data
            threshold (float): Threshold for Thresholded ReLU activation function
            dropout_p (float): Dropout Probability
            optimizer (str): Training optimizer
            loss (str): Loss function
        """
        self.input_size = input_size
        self.alphabet_size = alphabet_size
        self.embedding_size = embedding_size
        self.conv_layers = conv_layers
        self.fully_connected_layers = fully_connected_layers
        self.num_of_classes = num_of_classes
        self.threshold = threshold
        self.dropout_p = dropout_p
        self.optimizer = Adam(learning_rate)
        self.loss = loss
        self._build_model()  # builds self.model variable

    def _build_model(self):
        """
        Build and compile the Character Level CNN model
        Returns: None
        """
        # Input layer
        inputs = Input(shape=(self.input_size,), name='sent_input')
        # Embedding layers
        x = Embedding(self.alphabet_size + 1, self.embedding_size, input_length=self.input_size)(inputs)
        # Convolution layers
        for cl in self.conv_layers:
            x = Convolution1D(cl[0], cl[1])(x)
            x = ThresholdedReLU(self.threshold)(x)
            if cl[2] != -1:
                x = MaxPooling1D(cl[2])(x)
        x = Flatten()(x)
        # Fully connected layers
        for fl in self.fully_connected_layers:
            x = Dense(fl)(x)
            x = ThresholdedReLU(self.threshold)(x)
            x = Dropout(self.dropout_p)(x)
        # Output layer
        predictions = Dense(self.num_of_classes, activation='softmax')(x)
        # Build and compile model
        model = Model(inputs=inputs, outputs=predictions)
        # model = multi_gpu_model(model, 2, cpu_relocation=True)
        model.compile(optimizer=self.optimizer, loss=self.loss, metrics=['accuracy'])
        self.model = model
        print("CharCNNZhang model built: ")
        self.model.summary()


    def train(self, training_inputs, training_labels, validation_data, epochs, batch_size):
        """
        Training function
        Args:
            training_inputs (numpy.ndarray): Training set inputs
            training_labels (numpy.ndarray): Training set labels
            epochs (int): Number of training epochs
            batch_size (int): Batch size
            checkpoint_every (int): Interval for logging to Tensorboard
        Returns: None
        """
        # Start training
        print("Training CharCNNZhang model: ")
        return self.model.fit(training_inputs, training_labels,
                       validation_data=validation_data,
                       epochs=epochs,
                       batch_size=batch_size)

    def test(self, testing_inputs, testing_labels, batch_size):
        """
        Testing function
        Args:
            testing_inputs (numpy.ndarray): Testing set inputs
            testing_labels (numpy.ndarray): Testing set labels
            batch_size (int): Batch size
        Returns: None
        """
        # Evaluate inputs
        return self.model.evaluate(testing_inputs, testing_labels, batch_size=batch_size)
        # self.model.predict(testing_inputs, batch_size=batch_size, verbose=1)

In [19]:
embedding_size = 128
conv_layers = [[256, 7, 3], [256, 7, 3], [256, 3, -1], [256, 3, -1], [256, 3, -1], [256, 3, 3]]
fully_connected_layers = [1024, 1024]
num_of_classes = 2
threshold = 1e-6
dropout_p = 0.5
learning_rate = 0.0001

model = CharCNNZhang(max_input_size, alphabet_size, embedding_size,
                 conv_layers, fully_connected_layers, num_of_classes,
                 threshold, dropout_p, learning_rate)

CharCNNZhang model built: 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sent_input (InputLayer)      (None, 374)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 374, 128)          8960      
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 368, 256)          229632    
_________________________________________________________________
thresholded_re_lu_17 (Thresh (None, 368, 256)          0         
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 122, 256)          0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 116, 256)          459008    
_________________________________________________________________
thresholded_re_lu_18 (Thresh (None, 116, 256)    

In [17]:
epochs = 4
batch_size = 256

history = model.train(
    training_inputs=X_train,
    training_labels=Y_train,
    validation_data=[X_dev, Y_dev],
    epochs=epochs,
    batch_size=batch_size)

Training CharCNNZhang model: 
Train on 1024000 samples, validate on 256000 samples
Epoch 1/4
 - 763s - loss: 0.5103 - acc: 0.7413 - val_loss: 0.4513 - val_acc: 0.7889
Epoch 2/4
 - 720s - loss: 0.4238 - acc: 0.8057 - val_loss: 0.4304 - val_acc: 0.8035
Epoch 3/4
 - 720s - loss: 0.3911 - acc: 0.8248 - val_loss: 0.3978 - val_acc: 0.8200
Epoch 4/4
 - 718s - loss: 0.3658 - acc: 0.8386 - val_loss: 0.3972 - val_acc: 0.8228


In [18]:
score, acc = model.model.evaluate(X_test, Y_test, batch_size=batch_size)
print("score: %.8f" % (score))
print("acc: %.8f" % (acc))

score: 0.39518755
acc: 0.82347812


In [20]:
epochs = 5
batch_size = 256

history = model.train(
    training_inputs=X_train,
    training_labels=Y_train,
    validation_data=[X_dev, Y_dev],
    epochs=epochs,
    batch_size=batch_size)

Training CharCNNZhang model: 
Train on 1024000 samples, validate on 256000 samples
Epoch 1/5
 - 724s - loss: 0.5086 - acc: 0.7419 - val_loss: 0.4464 - val_acc: 0.7921
Epoch 2/5
 - 721s - loss: 0.4228 - acc: 0.8064 - val_loss: 0.4312 - val_acc: 0.7997
Epoch 3/5
 - 721s - loss: 0.3908 - acc: 0.8245 - val_loss: 0.4018 - val_acc: 0.8168
Epoch 4/5
 - 720s - loss: 0.3669 - acc: 0.8380 - val_loss: 0.4033 - val_acc: 0.8177
Epoch 5/5
 - 731s - loss: 0.3457 - acc: 0.8493 - val_loss: 0.3970 - val_acc: 0.8230


In [21]:
score, acc = model.model.evaluate(X_test, Y_test, batch_size=batch_size)
print("score: %.8f" % (score))
print("acc: %.8f" % (acc))

score: 0.39605225
acc: 0.82321563
