In [1]:
import tensorflow as tf
import numpy as np
import os
import time
import re
import datetime
from text_cnn import TextCNN
from tensorflow.contrib import learn

In [2]:
def clean_str(string):
    """Normalize a raw sentence into lower-cased, space-separated tokens.

    Processing order:
      1. Every character that is not an ASCII letter, a digit, or one of
         ( ) , ! ? ' ` is replaced by a space.
      2. The contraction suffixes 's, 've, n't, 're, 'd and 'll are split
         from the word they are attached to ("he's" -> "he 's").
      3. Each of , ! ( ) ? is surrounded by spaces so it becomes its own
         token.
      4. Runs of whitespace are collapsed to one space, the result is
         stripped and lower-cased.

    Args:
        string: The raw text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)

    # Applied one after another, in this fixed order.
    for suffix in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(suffix, " " + suffix)

    # A raw replacement template with a group reference emits the bare
    # punctuation mark. Non-raw templates such as " \( " are invalid string
    # escapes and make re.sub write a literal backslash into the output.
    string = re.sub(r"([,!()?])", r" \1 ", string)

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [3]:


# Locations of the two polarity files; each line is treated as one example.
positive_data_file = "./data/rt-polarity.pos"
negative_data_file = "./data/rt-polarity.neg"

# Read inside `with` blocks so both file handles are closed as soon as the
# lines have been consumed (a bare open(...).readlines() leaves them open).
# NOTE(review): no explicit encoding is given, so the platform default is
# used — confirm the encoding of the data files.
with open(positive_data_file, "r") as pos_file:
    positive_examples = [s.strip() for s in pos_file]
with open(negative_data_file, "r") as neg_file:
    negative_examples = [s.strip() for s in neg_file]

print(type(positive_examples))
print(len(positive_examples))
print(positive_examples[0],"\n")

print(type(negative_examples))
print(len(negative_examples))
print(negative_examples[0],"\n")

# Positives first, then negatives; the label order below relies on this.
x_text = positive_examples + negative_examples
x_text = [clean_str(sent) for sent in x_text]

print(type(x_text))
print(len(x_text))
print(x_text[0],"\n")

# Two-element label rows: [0, 1] for a positive example, [1, 0] for a negative one.
positive_labels = [[0, 1] for _ in positive_examples]
negative_labels = [[1, 0] for _ in negative_examples]

print(type(positive_labels))
print(len(positive_labels))
print(positive_labels[0],"\n")

print(type(negative_labels))
print(len(negative_labels))
print(negative_labels[0],"\n")

# Stack along axis 0 so row i of y is the label of x_text[i].
y = np.concatenate([positive_labels, negative_labels], 0)
print(type(y))
print(y.shape)
print(y[0],"\n")

<class 'list'>
5331
the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

<class 'list'>
5331
simplistic , silly and tedious . 

<class 'list'>
10662
the rock is destined to be the 21st century 's new conan and that he 's going to make a splash even greater than arnold schwarzenegger , jean claud van damme or steven segal 

<class 'list'>
5331
[0, 1] 

<class 'list'>
5331
[1, 0] 

<class 'numpy.ndarray'>
(10662, 2)
[0 1] 



In [4]:
# Token count (split on single spaces) of the longest cleaned sentence; it is
# handed to VocabularyProcessor as the document length.
max_document_length = max(len(sentence.split(" ")) for sentence in x_text)
print(max_document_length,"\n")

# Fit the vocabulary on all sentences and map each one to a row of word ids.
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
print(len(vocab_processor.vocabulary_),"\n")
print(x.shape)
print(x[0],"\n")

# Fixed seed so the permutation, and therefore the split, is repeatable.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# The last `dev_sample_percentage` of the shuffled rows form the dev set; the
# boundary is stored as a negative index counted from the end.
dev_sample_percentage = 0.1
dev_sample_index = -int(dev_sample_percentage * len(y))
print(dev_sample_index,"\n")
x_train = x_shuffled[:dev_sample_index]
x_dev = x_shuffled[dev_sample_index:]
y_train = y_shuffled[:dev_sample_index]
y_dev = y_shuffled[dev_sample_index:]

# Show the shape and the first row of every split.
for split_part in (x_train, x_dev, y_train, y_dev):
    print(split_part.shape)
    print(split_part[0],"\n")

56 

18758 

(10662, 56)
[ 1  2  3  4  5  6  1  7  8  9 10 11 12 13 14  9 15  5 16 17 18 19 20 21
 22 23 24 25 26 27 28 29 30  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0] 

-1066 

(9596, 56)
[4719   59  182   34  190  804    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0] 

(1066, 56)
[  292    84   523  1889    99   100   274    67    13 15402   121  4596
   600   722  1456  2279   944   207  8493   503   125 10507     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0] 

(9596, 2)
[1 0] 

(1066, 2)
[1 0] 



In [5]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Generate mini-batches over ``data`` for ``num_epochs`` passes.

    Args:
        data: Sequence of examples. It is converted to a NumPy array; when
            the examples are ragged (for instance ``(x_row, y_row)`` pairs
            whose two parts have different lengths) an object array is used.
        batch_size: Maximum number of examples per batch.
        num_epochs: Number of full passes over ``data``.
        shuffle: When true, the examples are re-permuted with
            ``np.random.permutation`` at the start of every epoch.

    Yields:
        Consecutive slices of the (optionally shuffled) array, each holding
        at most ``batch_size`` examples; the last slice of an epoch may be
        shorter. Nothing is yielded when ``data`` is empty.
    """
    try:
        data = np.array(data)
    except ValueError:
        # NumPy >= 1.24 refuses to build an array implicitly from ragged
        # rows; request an object array explicitly in that case.
        data = np.array(data, dtype=object)
    data_size = len(data)
    # Integer ceiling division: 0 batches for empty data, so no empty batch
    # is ever handed to the caller.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [6]:
# Build a TextCNN model, train it with Adam on mini-batches of the training
# split, and print loss/accuracy on the full dev split every `evaluate_every`
# optimizer steps.

# Session options: soft device placement on, device-placement logging off.
session_conf = tf.ConfigProto(
    allow_soft_placement=True, log_device_placement=False)

sess = tf.Session(config=session_conf)

# Hyperparameters for the model and the training loop.
embedding_dim = 128
filter_sizes = "3,4,5"  # comma-separated; parsed into a list of ints below
num_filters = 128
l2_reg_lambda = 0.0
dropout_keep_prob = 0.5  # keep probability fed during training steps only
batch_size = 64
num_epochs = 200
evaluate_every = 100
checkpoint_every = 100  # NOTE(review): not referenced anywhere in this cell

with sess.as_default():
    # Model dimensions are taken from the prepared data: sequence length and
    # class count from the array shapes, vocabulary size from the processor.
    cnn = TextCNN(
        sequence_length=x_train.shape[1],
        num_classes=y_train.shape[1],
        vocab_size=len(vocab_processor.vocabulary_),
        embedding_size=embedding_dim,
        filter_sizes=list(map(int, filter_sizes.split(","))),
        num_filters=num_filters,
        l2_reg_lambda=l2_reg_lambda)

    # Adam with learning rate 1e-3 minimising cnn.loss. global_step is passed
    # to apply_gradients and is read back after every training step to decide
    # when to evaluate.
    global_step = tf.Variable(0, trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    grads_and_vars = optimizer.compute_gradients(cnn.loss)
    train_op = optimizer.apply_gradients(
        grads_and_vars, global_step=global_step)

    sess.run(tf.global_variables_initializer())

    def train_step(x_batch, y_batch):
        """Run one optimizer update on a mini-batch.

        Feeds the batch together with the training keep probability
        `dropout_keep_prob`. The step, loss and accuracy are fetched but
        neither returned nor printed.
        """
        feed_dict = {
            cnn.input_x: x_batch,
            cnn.input_y: y_batch,
            cnn.dropout_keep_prob: dropout_keep_prob
        }
        _, step, loss, accuracy = sess.run(
            [train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)
        # Per-step logging is disabled; the two lines below are the
        # commented-out print of the values fetched above.
#         print("step:{}\t loss:{:g}\t acc:{:g}".format(step, loss,
#                                                   accuracy))

    def dev_step(x_batch, y_batch):
        """Evaluate the model on a batch and print step, loss and accuracy.

        The keep probability is fed as 1.0 and `train_op` is not run, so no
        optimizer update is applied.
        """
        feed_dict = {
            cnn.input_x: x_batch,
            cnn.input_y: y_batch,
            cnn.dropout_keep_prob: 1.0
        }
        step, loss, accuracy = sess.run([global_step, cnn.loss, cnn.accuracy],
                                        feed_dict)
        print("step:{}\t loss:{:g}\t acc:{:g}".format(step, loss, accuracy))
        
    # Generator of mini-batches of (x_row, y_row) pairs over all epochs.
    batches = batch_iter(list(zip(x_train, y_train)), batch_size, num_epochs)
    
    
    for batch in batches:
        # Split the batch of pairs back into parallel x and y sequences.
        x_batch, y_batch = zip(*batch)
        train_step(x_batch, y_batch)
        current_step = tf.train.global_step(sess, global_step)
        # Evaluate on the whole dev split every `evaluate_every` steps.
        if current_step % evaluate_every == 0:
            print("\nEvaluation:")
            dev_step(x_dev, y_dev)
            print("")



Evaluation:
step:100	 loss:0.963009	 acc:0.556285


Evaluation:
step:200	 loss:0.685209	 acc:0.602251


Evaluation:
step:300	 loss:0.642567	 acc:0.629456


Evaluation:
step:400	 loss:0.655216	 acc:0.616323


Evaluation:
step:500	 loss:0.643114	 acc:0.62758


Evaluation:
step:600	 loss:0.663851	 acc:0.616323


Evaluation:
step:700	 loss:0.61413	 acc:0.661351


Evaluation:
step:800	 loss:0.590864	 acc:0.684803


Evaluation:
step:900	 loss:0.58794	 acc:0.682927


Evaluation:
step:1000	 loss:0.575223	 acc:0.712008


Evaluation:
step:1100	 loss:0.581421	 acc:0.718574


Evaluation:
step:1200	 loss:0.574816	 acc:0.724203


Evaluation:
step:1300	 loss:0.581084	 acc:0.727955


Evaluation:
step:1400	 loss:0.600231	 acc:0.739212


Evaluation:
step:1500	 loss:0.612172	 acc:0.745779


Evaluation:
step:1600	 loss:0.641584	 acc:0.73546


Evaluation:
step:1700	 loss:0.691855	 acc:0.732645


Evaluation:
step:1800	 loss:0.719599	 acc:0.73546


Evaluation:
step:1900	 loss:0.711525	 acc:0.734522


Evalua

KeyboardInterrupt: 