In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import os
import re
import json
import tensorflow as tf
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
# Variables and data munging
train_positive_reviews = []
train_negative_reviews = []

test_positive_reviews = []
test_negative_reviews = []

for filename_ in os.listdir('../data/aclImdb/train/pos'):
    with open(os.path.join('../data/aclImdb/train/pos',filename_),'r') as f:
        sentiment = f.read()
        train_positive_reviews.append(sentiment)
    
for filename_ in os.listdir('../data/aclImdb/train/neg'):
    with open(os.path.join('../data/aclImdb/train/neg',filename_),'r') as f:
        sentiment = f.read()
        train_negative_reviews.append(sentiment)
    
for filename_ in os.listdir('../data/aclImdb/test/pos'):
    with open(os.path.join('../data/aclImdb/test/pos',filename_),'r') as f:
        sentiment = f.read()
        test_positive_reviews.append(sentiment)
    
for filename_ in os.listdir('../data/aclImdb/test/neg'):
    with open(os.path.join('../data/aclImdb/test/neg',filename_),'r') as f:
        sentiment = f.read()
        test_negative_reviews.append(sentiment)

train_reviews = train_positive_reviews + train_negative_reviews
test_reviews = test_positive_reviews + test_negative_reviews

train_labels = [0]*len(train_positive_reviews) + [1]*len(train_negative_reviews)
test_labels = [0]*len(test_positive_reviews) + [1]*len(test_negative_reviews)

print 'Number of training reviews: ', len(train_reviews)
print 'Number of testing reviews: ', len(test_reviews)
print 'Number of train labels: ', len(train_labels)
print 'Number of test labels: ', len(test_labels)

Number of training reviews:  25000
Number of testing reviews:  25000
Number of train labels:  25000
Number of test labels:  25000


In [3]:
# creating dataframes
reviews_train_df = pd.DataFrame({'reviews': train_reviews, 'sentiment':train_labels})
reviews_test_df = pd.DataFrame({'reviews': test_reviews, 'sentiment':test_labels})

reviews_train_df = reviews_train_df.sample(frac=1).reset_index(drop=True)
reviews_test_df = reviews_test_df.sample(frac=1).reset_index(drop=True)

# 1 for negative and 0 for positive
print reviews_train_df.head()
print reviews_test_df.head()

                                             reviews  sentiment
0  Kris Kristofferson, at his drugged-out peak in...          1
1  I went in not knowing anything about this movi...          1
2  I saw not so fabulous rating on IMDb, but I we...          1
3  I got this DVD from a friend, who got it from ...          1
4  An excellent performance by Alix Elias highlig...          1
                                             reviews  sentiment
0  Go to the video store and get the original. I ...          1
1  SPOILER!! Terrible camera work, horrible writi...          1
2  I went to see this movie mostly because it loo...          1
3  A young woman, Nicole Carrow (Jaimie Alexander...          1
4  This film is terribly bad. Kevin Spacey is a r...          1


In [4]:
reviews = train_reviews + test_reviews
labels = train_labels + test_labels

MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 500

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# one hot encoding
labels = np_utils.to_categorical(np.asarray(labels))

print data.shape
print labels.shape

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:25000]
y_train = labels[:25000]
x_val = data[25000:]
y_val = labels[25000:]



Found 124259 unique tokens.
(50000, 500)
(50000, 2)


In [5]:
embeddings_index = {}
f = open('../data/glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM), dtype='float32')
for word,i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [6]:
embedding_matrix.dtype

dtype('float32')

In [8]:
y_train.dtype

dtype('float64')

In [7]:
import tensorflow as tf

batchSize = 24
numClasses = 2
training_iters = 100000
learning_rate = 0.001
display_step = 20
tf.reset_default_graph()

In [9]:
input_data = tf.placeholder(tf.int32, [batchSize, MAX_SEQUENCE_LENGTH], name="input_x")
labels = tf.placeholder(tf.float32, [batchSize, numClasses], name="input_y")
data = tf.Variable(tf.zeros([batchSize, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM]), dtype=tf.float32)
data = tf.nn.embedding_lookup(embedding_matrix, input_data)

conv1 = tf.layers.conv1d(data,filters=32,kernel_size=(3),padding="same",activation=tf.nn.relu)
pool1 = tf.layers.max_pooling1d(inputs=conv1, pool_size=(2), strides=2)
pool1_flat = tf.reshape(pool1, [-1, 8000])
dense1 = tf.layers.dense(inputs=pool1_flat, units=250, activation=tf.nn.relu)
out = tf.layers.dense(inputs=dense1, units=2, activation=tf.nn.softmax)

In [10]:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
correct_pred = tf.equal(tf.argmax(out,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    step = 0
    # Keep training until reach max iterations
    while step * batchSize < training_iters:
        batch_xs, batch_ys = x_train[step*batchSize: ((step*batchSize) + batchSize)],y_train[
            step*batchSize: ((step*batchSize) + batchSize)]
        # Fit training using batch data
        sess.run(optimizer, feed_dict={input_data: batch_xs, labels: batch_ys})
        if step % display_step == 0:
            # Calculate batch accuracy
            acc = sess.run(accuracy, feed_dict={input_data: batch_xs, labels: batch_ys})
            # Calculate batch loss
            loss = sess.run(cost, feed_dict={input_data: batch_xs, labels: batch_ys})
            print "Iter " + str(step*batchSize) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss) + ", Training Accuracy= " + "{:.5f}".format(acc)
        step += 1
    print "Optimization Finished!"

Iter 0, Minibatch Loss= 0.449172, Training Accuracy= 0.91667
Iter 480, Minibatch Loss= 0.938261, Training Accuracy= 0.37500
Iter 960, Minibatch Loss= 1.016819, Training Accuracy= 0.29167
Iter 1440, Minibatch Loss= 0.438288, Training Accuracy= 0.87500
Iter 1920, Minibatch Loss= 0.814641, Training Accuracy= 0.50000
Iter 2400, Minibatch Loss= 0.854882, Training Accuracy= 0.45833
Iter 2880, Minibatch Loss= 0.688262, Training Accuracy= 0.62500
Iter 3360, Minibatch Loss= 0.771595, Training Accuracy= 0.54167
Iter 3840, Minibatch Loss= 0.896595, Training Accuracy= 0.41667
Iter 4320, Minibatch Loss= 0.771595, Training Accuracy= 0.54167
Iter 4800, Minibatch Loss= 0.896592, Training Accuracy= 0.41667
Iter 5280, Minibatch Loss= 0.729928, Training Accuracy= 0.58333
Iter 5760, Minibatch Loss= 0.771586, Training Accuracy= 0.54167
Iter 6240, Minibatch Loss= 0.938254, Training Accuracy= 0.37500
Iter 6720, Minibatch Loss= 0.771590, Training Accuracy= 0.54167
Iter 7200, Minibatch Loss= 0.813262, Training

ValueError: Cannot feed value of shape (16, 500) for Tensor u'input_x:0', which has shape '(24, 500)'