In [13]:
# Characters level ConvNet paper: https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf
import pandas as pd
import numpy as np
import tensorflow as tf
import cnn_models as models
import importlib

from keras.models import Model
from keras.layers import Input, Dense, Concatenate, ThresholdedReLU, MaxPooling1D, Flatten, Dropout, ReLU, Activation
from keras.layers import Convolution1D
from keras.layers import Embedding
from keras.callbacks import TensorBoard
from keras.optimizers import Adam, SGD
from keras.utils import multi_gpu_model

importlib.reload(models)

<module 'cnn_models' from '/home/jerrysong/w266-Final-project/jerry/cnn_models.py'>

In [2]:
train_data = pd.read_csv('../data/no_tags_lower_train_data.csv', encoding='ISO-8859-1', keep_default_na=False)
test_data = pd.read_csv('../data/no_tags_lower_test_data.csv', encoding='ISO-8859-1', keep_default_na=False)
dev_data = pd.read_csv('../data/no_tags_lower_dev_data.csv', encoding='ISO-8859-1', keep_default_na=False)

In [3]:
alphabet = 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:\'"/\\|_@#$%^&*~`+-=<>()[]{}'
alphabet_size = len(alphabet)
alphabet_index = {alphabet[i]: i + 1 for i in range(alphabet_size)}
alphabet_index

{'!': 41,
 '"': 45,
 '#': 51,
 '$': 52,
 '%': 53,
 '&': 55,
 "'": 44,
 '(': 64,
 ')': 65,
 '*': 56,
 '+': 59,
 ',': 38,
 '-': 60,
 '.': 40,
 '/': 46,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 ':': 43,
 ';': 39,
 '<': 62,
 '=': 61,
 '>': 63,
 '?': 42,
 '@': 50,
 '[': 66,
 '\\': 47,
 ']': 67,
 '^': 54,
 '_': 49,
 '`': 58,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '{': 68,
 '|': 48,
 '}': 69,
 '~': 57}

In [19]:
max_input_size = max(len(row['text']) for _, row in train_data.iterrows())
print('The max input size is: ' + str(max_input_size))

The max input size is: 1390


In [20]:
from statistics import mean 
average_input_size = mean(len(row['text']) for _, row in train_data.iterrows())
print('The average input size is: ' + str(average_input_size))

The average input size is: 68.3684013671875


In [4]:
input_length = 500

In [5]:
X_train = models.text_to_padding(train_data, alphabet_index, input_length)
X_test = models.text_to_padding(test_data, alphabet_index, input_length)
X_dev = models.text_to_padding(dev_data, alphabet_index, input_length)

In [6]:
print('tranin: ' + str(X_train.shape))
print('test: ' + str(X_test.shape))
print('dev: ' + str(X_dev.shape))

tranin: (1024000, 500)
test: (320000, 500)
dev: (256000, 500)


In [7]:
Y_train = pd.get_dummies(train_data['sentiment']).values
Y_test = pd.get_dummies(test_data['sentiment']).values
Y_dev = pd.get_dummies(dev_data['sentiment']).values

In [8]:
print('tranin: ' + str(Y_train.shape))
print('test: ' + str(Y_test.shape))
print('dev: ' + str(Y_dev.shape))

tranin: (1024000, 2)
test: (320000, 2)
dev: (256000, 2)


In [15]:
params = {
    'epochs': 4,
    'batch_size': 256,
    'alphabet_size': alphabet_size + 1, # All nonalphabet characters are seen as the same character
    'embedding_size': 128,
    'input_length': input_length,
    'filters': [256, 256, 256, 256, 256, 256],
    'kernal_size': [7, 7, 3, 3, 3, 3],
    'pool_size': [3, 3, None, None, None, 3],
    'fully_connected_dim': [1024, 1024],
    'dropout_rate': [0.5, 0.5],
    'loss': 'binary_crossentropy',
    'activation': 'sigmoid',
    'lr': 0.0001,
}

model = models.get_model(params)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 128)          8960      
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 494, 256)          229632    
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 164, 256)          0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 158, 256)          459008    
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 52, 256)           0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 50, 256)           196864    
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 48, 256)           196864    
__________

In [16]:
history = model.fit(
    X_train, 
    Y_train, 
    validation_data=[X_dev, Y_dev], 
    epochs=params['epochs'], 
    batch_size=params['batch_size'])

Train on 1024000 samples, validate on 256000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [18]:
score, acc = model.evaluate(X_test, Y_test, batch_size=params['batch_size'])
print("score: %.8f" % (score))
print("acc: %.8f" % (acc))

score: 0.39531848
acc: 0.82130313
