In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk, re, time
from nltk.corpus import stopwords
from bs4 import BeautifulSoup 
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

# Make sure the NLTK stop-word corpus is present locally before it is used.
nltk.download('stopwords')

# Silence every Python warning raised from here on.
import warnings
warnings.simplefilter('ignore')

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/ideis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Read the labelled training reviews; the file is tab-separated.
data = pd.read_csv("data/labeledTrainData.tsv", sep="\t")

# HTML markup is not stripped here; that happens per review in clean_text().

In [25]:
def _english_stopwords():
    '''Return the NLTK English stop words as a frozenset, reading the corpus only once.'''
    if not hasattr(_english_stopwords, "cache"):
        _english_stopwords.cache = frozenset(stopwords.words("english"))
    return _english_stopwords.cache


def clean_text(text, remove_stopwords=True):
    '''Strip HTML from a review and reduce it to lowercase alphabetic words.

    Args:
        text: Raw review text, possibly containing HTML markup.
        remove_stopwords: When True, drop NLTK English stop words.

    Returns:
        A string made only of the characters a-z and single spaces, with no
        leading or trailing whitespace.
    '''
    # Name the parser explicitly so the result does not depend on which
    # optional parser happens to be installed on the machine.
    text = BeautifulSoup(text, "html.parser").get_text()
    words = text.lower().split()

    # Optionally, remove stop words.
    # NOTE: matching happens on whitespace-delimited tokens *before*
    # punctuation is stripped, so a token such as "the," is not removed.
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    text = " ".join(words)

    # Everything that is not a lowercase letter becomes a space ...
    text = re.sub(r"[^a-z]", " ", text)
    # ... and every run of spaces, whatever its length, collapses to one.
    text = re.sub(r" +", " ", text).strip()

    return text

# Clean every review in place, column-wise (stop words removed by default).
data['review'] = data['review'].apply(clean_text)

In [26]:
# Show the review stored under index label 0 as a sanity check of the cleaning.
data.loc[0, 'review']

'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate workin

In [None]:
def split_data(data, labels, train_test_split=0.8):
    """Split `data` and `labels` into aligned training and testing portions.

    The split is positional and unshuffled: the leading items form the test
    set and all remaining items form the training set. The size of each
    returned piece is printed.

    Args:
        data: Sliceable collection of samples (must support len() and [a:b]).
        labels: Sliceable collection of targets with the same length as `data`.
        train_test_split: Fraction of the items assigned to the training set;
            must lie in [0, 1]. (The name shadows the scikit-learn function
            imported at the top of the file, but only inside this function;
            it is kept so existing keyword callers continue to work.)

    Returns:
        Tuple (X_train, y_train, X_test, y_test).

    Raises:
        ValueError: If `train_test_split` is outside [0, 1] or if `data` and
            `labels` differ in length.
    """
    if not 0.0 <= train_test_split <= 1.0:
        raise ValueError(
            "train_test_split must be between 0 and 1, got {!r}".format(train_test_split))

    data_size = len(data)
    if len(labels) != data_size:
        raise ValueError(
            "data and labels must have the same length ({} != {})".format(
                data_size, len(labels)))

    # Number of leading items held out for testing.
    test_size = int(data_size - round(data_size * train_test_split))

    X_train, y_train = data[test_size:], labels[test_size:]
    X_test, y_test = data[:test_size], labels[:test_size]

    print("\nTraining set:")
    print("  X_train: {}".format(len(X_train)))
    print("  y_train: {}".format(len(y_train)))

    print("\nTesting set:")
    print("  X_test: {}".format(len(X_test)))
    print("  y_test: {}".format(len(y_test)))

    return X_train, y_train, X_test, y_test

# Split the frame and its 'sentiment' column with the default training fraction.
X_train, y_train, X_test, y_test = split_data(data, labels=data['sentiment'])

In [None]:
num_words = 1000

# Fit the vocabulary on the training reviews only, so that nothing from the
# held-out test reviews influences which words become features.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train['review'])

# word_index is the tokenizer's full word-to-index mapping built while fitting.
total_words = len(tokenizer.word_index)
print('{} words in a dictionary'.format(total_words))

# Encode every review as one row of a matrix.
# NOTE: this rebinds X_train / X_test from DataFrames to matrices, so the
# split cell must be re-run before this cell can be executed a second time.
X_train = tokenizer.texts_to_matrix(X_train['review'])
X_test = tokenizer.texts_to_matrix(X_test['review'])


In [None]:
from keras.utils import to_categorical

# Turn the sentiment labels into two-column categorical (one-hot) arrays.
y_train, y_test = (
    to_categorical(labels, num_classes=2) for labels in (y_train, y_test)
)

# Print the shapes of the model inputs and targets, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
# Hyperparameters
learning_rate = 0.05   # step size passed to the optimizer
epochs = 25            # number of training steps run by the session loop
display_epoch = True   # print training loss/accuracy after every step

# Network Parameters
n_hidden_1 = 128
n_hidden_2 = 64
n_hidden_3 = 32
num_input = X_train.shape[1]    # feature count, taken from the encoded reviews
num_classes = y_train.shape[1]  # number of columns in the encoded labels

# Graph inputs. The batch dimension stays open (None), while the feature and
# class dimensions are fixed so a mis-shaped feed is rejected at the
# placeholder instead of surfacing later inside a matmul.
X = tf.placeholder(tf.float32, shape=[None, num_input])
Y = tf.placeholder(tf.float32, shape=[None, num_classes])

In [None]:
# Trainable parameters, keyed by layer: three hidden layers plus the output
# layer, each initialised from tf.random_normal with its default arguments.
weights = dict(
    w1=tf.Variable(tf.random_normal(shape=[num_input, n_hidden_1])),
    w2=tf.Variable(tf.random_normal(shape=[n_hidden_1, n_hidden_2])),
    w3=tf.Variable(tf.random_normal(shape=[n_hidden_2, n_hidden_3])),
    out=tf.Variable(tf.random_normal(shape=[n_hidden_3, num_classes])),
)
biases = dict(
    b1=tf.Variable(tf.random_normal(shape=[n_hidden_1])),
    b2=tf.Variable(tf.random_normal(shape=[n_hidden_2])),
    b3=tf.Variable(tf.random_normal(shape=[n_hidden_3])),
    out=tf.Variable(tf.random_normal(shape=[num_classes])),
)

In [None]:
def neural_net(X):
    """Build the feed-forward graph: three ReLU hidden layers and a linear output.

    Reads the module-level `weights` and `biases` dictionaries.

    Args:
        X: Input tensor fed to the first hidden layer.

    Returns:
        The output-layer tensor (raw logits; no activation is applied).
    """
    activation = X
    hidden_layers = (
        ('w1', 'b1', 'a1'),
        ('w2', 'b2', 'a2'),
        ('w3', 'b3', 'a3'),
    )
    for weight_key, bias_key, op_name in hidden_layers:
        pre_activation = tf.add(tf.matmul(activation, weights[weight_key]),
                                biases[bias_key])
        activation = tf.nn.relu(pre_activation, name=op_name)

    return tf.matmul(activation, weights['out']) + biases['out']

In [None]:
# Construct model
logits = neural_net(X)

# Define loss and optimizer.
# The targets are one-hot rows over mutually exclusive classes and predictions
# are taken with argmax, so softmax cross-entropy is the matching loss.
# Sigmoid cross-entropy would instead score every output unit as an
# independent binary problem.
loss_op = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model: a row is correct when the largest logit sits at the same
# index as the largest target entry.
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [None]:
with tf.Session() as sess:
    sess.run(init)

    # Both feeds hold the complete arrays, so every step below is one
    # full-batch update followed by a full-batch evaluation.
    train_feed = {X: X_train, Y: y_train}
    test_feed = {X: X_test, Y: y_test}

    for step in range(1, epochs + 1):
        # Run optimization (backprop)
        sess.run(train_op, feed_dict=train_feed)
        if not display_epoch:
            continue

        # Re-evaluate loss and accuracy on the training data after the update.
        loss, acc = sess.run([loss_op, accuracy], feed_dict=train_feed)
        print("Step {}, Loss= {:.4f}, Accuracy= {:.3f}".format(step, loss, acc))

    print("Optimization Finished!")
    print()
    print("Testing Accuracy:", sess.run(accuracy, feed_dict=test_feed))