In [102]:
# Character-level ConvNet paper: https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf
# Reference: https://github.com/mhjabreel/CharCnn_Keras
import pandas as pd
import numpy as np
import tensorflow as tf

from keras.models import Model
from keras.layers import Input, Dense, Concatenate, ThresholdedReLU, MaxPooling1D, Flatten, Dropout, ReLU, Activation
from keras.layers import Convolution1D
from keras.layers import Embedding
from keras.callbacks import TensorBoard
from keras.optimizers import Adam, SGD
from keras.utils import multi_gpu_model

In [2]:
# Both files are read without a header row, so the column names are supplied here.
columns = ['sentiment','id','timestamp','type','user','text']
csv_options = dict(encoding='ISO-8859-1', header=None, names=columns)

train_data = pd.read_csv('../data/sentiment140_train.zip', **csv_options)
test_data = pd.read_csv('../data/sentiment140_test.zip', **csv_options)

# Only the tweet text and its label are needed downstream.
# The original training data are sorted by sentiment value, so shuffle them
# (with a fixed seed); the test data keep their original order.
kept_columns = ['text','sentiment']
train_data = train_data[kept_columns].sample(frac=1, random_state=40)
test_data = test_data[kept_columns]

In [3]:
# Keep only the test rows labelled 0 or 4, then show which labels remain.
has_kept_label = test_data['sentiment'].isin([0, 4])
test_data = test_data.loc[has_kept_label]
test_data['sentiment'].unique()

array([4, 0])

In [7]:
# Characters the model can represent. Note that '-' occurs twice in this string,
# so the later position wins in the lookup table and the earlier index is never used.
alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"/\\|_@#$%^&*~`+-=<>()[]{}'
alphabet_size = len(alphabet)
# Map every character to its 1-based position in the alphabet string.
alphabet_index = {char: position for position, char in enumerate(alphabet, start=1)}
alphabet_index

{'!': 41,
 '"': 45,
 '#': 51,
 '$': 52,
 '%': 53,
 '&': 55,
 "'": 44,
 '(': 64,
 ')': 65,
 '*': 56,
 '+': 59,
 ',': 38,
 '-': 60,
 '.': 40,
 '/': 46,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 ':': 43,
 ';': 39,
 '<': 62,
 '=': 61,
 '>': 63,
 '?': 42,
 '@': 50,
 '[': 66,
 '\\': 47,
 ']': 67,
 '^': 54,
 '_': 49,
 '`': 58,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '{': 68,
 '|': 48,
 '}': 69,
 '~': 57}

In [13]:
# Fixed length of each encoded text: the encoding cells stop after this many
# characters and leave the remaining positions of shorter texts as zeros.
max_input_size = 374

In [6]:
# Encode every training text as a fixed-length row of alphabet indices.
# Iterating the text column directly avoids DataFrame.iterrows(), which builds a
# Series object per row, and the matrix is allocated once instead of being copied
# from a list of per-row arrays. An empty frame yields shape (0, max_input_size).
train_texts = train_data['text']
X_train = np.zeros((len(train_texts), max_input_size), dtype='int64')
for row_idx, text in enumerate(train_texts):
    # Lower-case first (the alphabet has no upper-case letters), then truncate.
    # Characters missing from the alphabet are encoded as 0, like the padding.
    codes = [alphabet_index.get(letter, 0) for letter in text.lower()[:max_input_size]]
    X_train[row_idx, :len(codes)] = codes

X_train.shape

(1600000, 100)

In [7]:
# Encode every test text as a fixed-length row of alphabet indices.
# Iterating the text column directly avoids DataFrame.iterrows(), which builds a
# Series object per row, and the matrix is allocated once instead of being copied
# from a list of per-row arrays. An empty frame yields shape (0, max_input_size).
test_texts = test_data['text']
X_test = np.zeros((len(test_texts), max_input_size), dtype='int64')
for row_idx, text in enumerate(test_texts):
    # Lower-case first (the alphabet has no upper-case letters), then truncate.
    # Characters missing from the alphabet are encoded as 0, like the padding.
    codes = [alphabet_index.get(letter, 0) for letter in text.lower()[:max_input_size]]
    X_test[row_idx, :len(codes)] = codes

X_test.shape

(359, 100)

In [8]:
# One-hot encode the sentiment labels of each split (one column per distinct label).
Y_train, Y_test = (
    pd.get_dummies(frame['sentiment']).values
    for frame in (train_data, test_data)
)

In [22]:
# Persist the encoded arrays so later sessions can skip the encoding step.
for file_stem, array in (('x_train', X_train),
                         ('x_test', X_test),
                         ('y_train', Y_train),
                         ('y_test', Y_test)):
    np.save(file_stem, array)

In [2]:
# Restore the sequence length and the arrays written by the np.save cell.
max_input_size = 374
X_train, X_test, Y_train, Y_test = (
    np.load(path)
    for path in ('x_train.npy', 'x_test.npy', 'y_train.npy', 'y_test.npy')
)

In [3]:
# Sanity check: inputs and labels of each split must have the same number of rows.
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

(1600000, 374) (1600000, 2)
(359, 374) (359, 2)


In [120]:
class CharCNNZhang(object):
    """
    Class to implement the Character Level Convolutional Neural Network for Text Classification,
    as described in Zhang et al., 2015 (http://arxiv.org/abs/1509.01626)

    The network is built and compiled on construction and exposed as ``self.model``.
    """
    def __init__(self, input_size, alphabet_size, embedding_size,
                 conv_layers, fully_connected_layers, num_of_classes,
                 threshold, dropout_p, learning_rate, loss='categorical_crossentropy'):
        """
        Initialization for the Character Level CNN model.
        Args:
            input_size (int): Length of each encoded input sequence
            alphabet_size (int): Size of the alphabet to create embeddings for; the
                embedding table gets one extra row so that index 0 is valid too
            embedding_size (int): Size of embeddings
            conv_layers (list[list[int]]): One [filters, kernel_size, pool_size] entry per
                convolution layer; a pool_size of -1 means no pooling after that layer
            fully_connected_layers (list[int]): Number of units of each fully connected layer
            num_of_classes (int): Number of classes in data
            threshold (float): Threshold for Thresholded ReLU activation function
            dropout_p (float): Dropout Probability
            learning_rate (float): Learning rate passed to the Adam optimizer
            loss (str): Loss function
        """
        self.input_size = input_size
        self.alphabet_size = alphabet_size
        self.embedding_size = embedding_size
        self.conv_layers = conv_layers
        self.fully_connected_layers = fully_connected_layers
        self.num_of_classes = num_of_classes
        self.threshold = threshold
        self.dropout_p = dropout_p
        self.learning_rate = learning_rate
        self.optimizer = Adam(learning_rate)
        self.loss = loss
        self._build_model()  # builds self.model variable

    def _build_model(self):
        """
        Build and compile the Character Level CNN model, store it in ``self.model``
        and print its summary.
        Returns: None
        """
        # Input layer
        inputs = Input(shape=(self.input_size,), name='sent_input')
        # Embedding layer: alphabet_size + 1 rows, so indices 0..alphabet_size are valid
        x = Embedding(self.alphabet_size + 1, self.embedding_size, input_length=self.input_size)(inputs)
        # Convolution layers
        for cl in self.conv_layers:
            filters, kernel_size, pool_size = cl[0], cl[1], cl[2]
            x = Convolution1D(filters, kernel_size)(x)
            x = ThresholdedReLU(self.threshold)(x)
            # A pool size of -1 marks a convolution layer without pooling
            if pool_size != -1:
                x = MaxPooling1D(pool_size)(x)
        x = Flatten()(x)
        # Fully connected layers
        for units in self.fully_connected_layers:
            x = Dense(units)(x)
            x = ThresholdedReLU(self.threshold)(x)
            x = Dropout(self.dropout_p)(x)
        # Output layer
        predictions = Dense(self.num_of_classes, activation='softmax')(x)
        # Build and compile model
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=self.optimizer, loss=self.loss, metrics=['accuracy'])
        self.model = model
        print("CharCNNZhang model built: ")
        self.model.summary()

    def train(self, training_inputs, training_labels, validation_split,
              epochs, batch_size, checkpoint_every=100, callbacks=None):
        """
        Training function
        Args:
            training_inputs (numpy.ndarray): Training set inputs
            training_labels (numpy.ndarray): Training set labels
            validation_split (float): Fraction of the training data passed to
                ``fit`` as its ``validation_split``
            epochs (int): Number of training epochs
            batch_size (int): Batch size
            checkpoint_every (int): Currently unused; kept so existing callers
                that pass it keep working
            callbacks (list): Optional callbacks forwarded to ``fit``
        Returns: the value returned by ``self.model.fit``
        """
        # Previously a TensorBoard callback was constructed here but never handed to
        # fit(), so it had no effect. Callers who want logging pass it via `callbacks`.
        print("Training CharCNNZhang model: ")
        return self.model.fit(training_inputs, training_labels,
                              validation_split=validation_split,
                              epochs=epochs,
                              batch_size=batch_size,
                              verbose=2,
                              callbacks=callbacks)

    def test(self, testing_inputs, testing_labels, batch_size):
        """
        Testing function
        Args:
            testing_inputs (numpy.ndarray): Testing set inputs
            testing_labels (numpy.ndarray): Testing set labels
            batch_size (int): Batch size
        Returns: the value returned by ``self.model.evaluate``
        """
        # Evaluate inputs
        return self.model.evaluate(testing_inputs, testing_labels, batch_size=batch_size, verbose=2)

In [121]:
# Hyper-parameters of the character-level CNN
embedding_size = 128
# One [filters, kernel_size, pool_size] entry per convolution layer;
# a pool_size of -1 means the layer is not followed by max pooling.
conv_layers = [
    [256, 7, 3],
    [256, 7, 3],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, 3],
]
fully_connected_layers = [1024, 1024]
num_of_classes = 2
threshold = 1e-6
dropout_p = 0.5
learning_rate = 0.0001

# Constructing the object builds, compiles and prints a summary of the network.
model = CharCNNZhang(
    input_size=max_input_size,
    alphabet_size=alphabet_size,
    embedding_size=embedding_size,
    conv_layers=conv_layers,
    fully_connected_layers=fully_connected_layers,
    num_of_classes=num_of_classes,
    threshold=threshold,
    dropout_p=dropout_p,
    learning_rate=learning_rate,
)

CharCNNZhang model built: 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sent_input (InputLayer)      (None, 374)               0         
_________________________________________________________________
embedding_33 (Embedding)     (None, 374, 128)          8960      
_________________________________________________________________
conv1d_181 (Conv1D)          (None, 368, 256)          229632    
_________________________________________________________________
thresholded_re_lu_201 (Thres (None, 368, 256)          0         
_________________________________________________________________
max_pooling1d_88 (MaxPooling (None, 122, 256)          0         
_________________________________________________________________
conv1d_182 (Conv1D)          (None, 116, 256)          459008    
_________________________________________________________________
thresholded_re_lu_202 (Thres (None, 116, 256)    

In [122]:
# Training schedule
epochs = 4
batch_size = 256

# Fit on the training arrays, with 20% of them passed as the validation split.
history = model.train(X_train, Y_train,
                      validation_split=0.2,
                      epochs=epochs,
                      batch_size=batch_size)

Training CharCNNZhang model: 
Train on 1280000 samples, validate on 320000 samples
Epoch 1/4
 - 883s - loss: 0.5014 - acc: 0.7455 - val_loss: 0.4318 - val_acc: 0.8002
Epoch 2/4
 - 875s - loss: 0.4148 - acc: 0.8108 - val_loss: 0.4003 - val_acc: 0.8176
Epoch 3/4
 - 873s - loss: 0.3825 - acc: 0.8290 - val_loss: 0.3881 - val_acc: 0.8257
Epoch 4/4
 - 873s - loss: 0.3585 - acc: 0.8418 - val_loss: 0.3812 - val_acc: 0.8293


In [123]:
# Evaluate on the test arrays and report both returned values with 8 decimals.
score, acc = model.model.evaluate(X_test, Y_test, batch_size=batch_size)
for template, value in (("score: %.8f", score), ("acc: %.8f", acc)):
    print(template % value)

score: 0.42604588
acc: 0.80222842
