In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk, re, time
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

nltk.download('stopwords')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/ideis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
data = pd.read_csv("data/labeledTrainData.tsv", delimiter="\t")

data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
def clean_text(text, remove_stopwords=True):
    '''Clean the text, with the option to remove stopwords'''
    
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("russian"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-z]", " ", text)
    text = re.sub(r"   ", " ", text) # Remove any extra spaces
    text = re.sub(r"  ", " ", text)
    
    return(text)

data['review'] = data.apply(lambda x: clean_text(x['review']), axis=1)

In [4]:
def split_data(data, labels, train_test_split=0.8):
    data_size = len(data)
    test_size = int(data_size - round(data_size * train_test_split))
    
    print("\nTraining set:")
    X_train = data[test_size:]
    print("  X_train: {}".format(len(X_train)))
    y_train = labels[test_size:]
    print("  y_train: {}".format(len(y_train)))
    
    print("\nTesting set:")
    X_test = data[:test_size]
    print("  X_test: {}".format(len(X_test)))
    y_test = labels[:test_size]
    print("  y_test: {}".format(len(y_test)))

    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = split_data(data, data['sentiment'])


Training set:
  X_train: 20000
  y_train: 20000

Testing set:
  X_test: 5000
  y_test: 5000


In [5]:
num_words = 1000

all_reviews = data['review']
all_reviews.head()
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(all_reviews)

total_words = len(tokenizer.word_index)
print('{} words in a dictionary'.format(total_words))

X_train = tokenizer.texts_to_matrix(X_train['review'])
X_test = tokenizer.texts_to_matrix(X_test['review'])

73272 words in a dictionary


In [6]:
from keras.utils import to_categorical
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

print(X_train.shape[1])
print(y_train.shape[1])
print(X_test.shape)
print(y_test.shape)

1000
2
(5000, 1000)
(5000, 2)


In [17]:
# Hyperparameters
learning_rate = 0.05
epochs = 25
display_epoch = True

# Network Parameters
n_hidden_1 = 128
n_hidden_2 = 64
n_hidden_3 = 32
num_input = X_train.shape[1]
num_classes = y_train.shape[1]

X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)

In [18]:
# Store layers weight & bias
weights = {
    'w1': tf.Variable(tf.random_normal([num_input, n_hidden_1])),
    'w2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'w3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_hidden_3, num_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'out': tf.Variable(tf.random_normal([num_classes]))
}

In [19]:
def neural_net(X):
    h1 = tf.add(tf.matmul(X, weights['w1']), biases['b1'])
    a1 = tf.nn.relu(h1, name='a1')
#     a1 = tf.nn.dropout(a1, 0.2) 

    h2 = tf.add(tf.matmul(a1, weights['w2']), biases['b2'])
    a2 = tf.nn.relu(h2, name='a2')
#     a2 = tf.nn.dropout(a2, 0.2) 

    h3 = tf.add(tf.matmul(a2, weights['w3']), biases['b3'])
    a3 = tf.nn.relu(h3, name='a3')
    
    out_layer = tf.matmul(a3, weights['out']) + biases['out']
    return out_layer

In [22]:
# Construct model
logits = neural_net(X)

# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [23]:
with tf.Session() as sess:

    sess.run(init)

    for step in range(1, epochs+1):
        # Run optimization(backprop)
        sess.run(train_op, feed_dict={X: X_train, Y: y_train})
        if display_epoch:
            # Calculate loss and accuracy
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: X_train,
                                                                 Y: y_train})
            print("Step " + str(step) + ", Loss= " + \
                  "{:.4f}".format(loss) + ", Accuracy= " + \
                  "{:.3f}".format(acc))

    print("Optimization Finished!")
    print()
    print("Testing Accuracy:", \
        sess.run(accuracy, feed_dict={X: X_test,
                                      Y: y_test}))

Step 1, Loss= 897.7988, Accuracy= 0.505
Step 2, Loss= 265.3820, Accuracy= 0.505
Step 3, Loss= 286.9371, Accuracy= 0.534
Step 4, Loss= 119.0917, Accuracy= 0.577
Step 5, Loss= 182.0977, Accuracy= 0.590
Step 6, Loss= 122.3391, Accuracy= 0.606
Step 7, Loss= 67.2420, Accuracy= 0.619
Step 8, Loss= 96.3561, Accuracy= 0.625
Step 9, Loss= 81.1545, Accuracy= 0.628
Step 10, Loss= 48.3432, Accuracy= 0.639
Step 11, Loss= 71.3577, Accuracy= 0.654
Step 12, Loss= 59.4069, Accuracy= 0.665
Step 13, Loss= 34.0796, Accuracy= 0.678
Step 14, Loss= 38.9757, Accuracy= 0.692
Step 15, Loss= 39.3344, Accuracy= 0.702
Step 16, Loss= 26.6386, Accuracy= 0.718
Step 17, Loss= 22.6015, Accuracy= 0.726
Step 18, Loss= 27.0128, Accuracy= 0.735
Step 19, Loss= 25.1094, Accuracy= 0.744
Step 20, Loss= 18.9809, Accuracy= 0.743
Step 21, Loss= 16.5664, Accuracy= 0.740
Step 22, Loss= 17.1752, Accuracy= 0.746
Step 23, Loss= 16.7863, Accuracy= 0.757
Step 24, Loss= 15.0763, Accuracy= 0.759
Step 25, Loss= 13.4477, Accuracy= 0.752
Opt