# Senticheck

_A simple sentiment classifier for the IMDb review dataset_

I roughly followed the approach taken [here](http://bit.ly/2O4PNEd).

In [None]:
%load_ext autoreload
%autoreload 2

In [119]:
import numpy as np
import pandas as pd
import re
import string
from utils import *

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

### Data Preprocessing

In [None]:
train_df = makeDF("./aclImdb/train")
test_df = makeDF("./aclImdb/test")

In [4]:
train_df.head()

Unnamed: 0,string,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,pos
1,Homelessness (or Houselessness as George Carli...,pos
2,Brilliant over-acting by Lesley Ann Warren. Be...,pos
3,This is easily the most underrated film inn th...,pos
4,This is not the typical Mel Brooks film. It wa...,pos


In [50]:
test_df.head()

Unnamed: 0,string,sentiment
0,I went and saw this movie last night after bei...,pos
1,Actor turned director Bill Paxton follows up h...,pos
2,As a recreational golfer with some knowledge o...,pos
3,"I saw this film in a sneak preview, and it is ...",pos
4,Bill Paxton has taken the true story of the 19...,pos


We first make a function to clean unwanted characters and numbers from the strings.

In [121]:
def clean_string(sample):
    cleaner = re.compile('<.*?>')
    sample = re.sub(r'\d+', '', sample)
    sample = re.sub(cleaner, '', sample)
    sample = re.sub("'", '', sample)
    sample = re.sub(r'\W+', ' ', sample)
    sample = sample.replace('_', '')
    sample = sample.lower() # make lowercase
    return sample

Now we need to process the input data.

In [122]:
# Clean samples
X_tr = train_df['string'].apply(lambda x: clean_string(x)).values
X_te = test_df['string'].apply(lambda x: clean_string(x)).values

# Create tokenizer
vocab_size = 2000
tokenizer = Tokenizer(num_words=vocab_size, split=' ') 
tokenizer.fit_on_texts(X_tr)

# Integer-encode
X_tr = np.array(tokenizer.texts_to_sequences(X_tr))
X_te = np.array(tokenizer.texts_to_sequences(X_te))

Ideally, we'd take the full reviews for training, however, this would take far too long on my machine. I've chosen to take the first 32 words instead.

In [123]:
# Padding
# maxlen = max([max([len(x) for x in X_tr]),max([len(x) for x in X_te])])
maxlen = 32 # only take the first 32 words for the sake of speed
X_train = pad_sequences(X_tr, maxlen=maxlen)
X_test = pad_sequences(X_te, maxlen=maxlen)

Now we shall convert the pos/neg column into a binary token 0/1.

In [124]:
Y_train = pd.get_dummies(train_df['sentiment']).values
Y_test = pd.get_dummies(test_df['sentiment']).values

In [125]:
print('X_train shape:', X_train.shape, '\n', 'X_test shape:', X_test.shape)
print('Y_train shape:', Y_train.shape, '\n', 'Y_test shape:', Y_test.shape)

X_train shape: (25000, 32) 
 X_test shape: (25000, 32)
Y_train shape: (25000, 2) 
 Y_test shape: (25000, 2)


The train/test split has already been carried out but we need to now split the test data into the validation/test data.

In [126]:
X_test, X_val, Y_test, Y_val = train_test_split(X_test, Y_test, test_size=0.5, random_state=1)

# Keras

### Model

In [128]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(vocab_size, embed_dim,input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.6))
model.add(LSTM(lstm_out, dropout=0.6, recurrent_dropout=0.6))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 32, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


### Training

In [129]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 5, batch_size=batch_size, verbose = 1, validation_data=(X_val, Y_val))

Train on 25000 samples, validate on 12500 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x128e81128>

### Validation

In [130]:
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("accuracy: %.2f" % (acc))

score: 0.45
accuracy: 0.79


[This](https://blog.paralleldots.com/data-science/breakthrough-research-papers-and-models-for-sentiment-analysis/) article runs through some of the accuracies scored on the IMDb benchmark. Given the size of reviews and amount of training, we haven't performed too poorly.

# Tensorflow

Let's try running a similar model but with lower-level Tensorflow. Roughly following the code from [here](https://bit.ly/2DOGvYh).

In [10]:
import tensorflow as tf

### Data

In [94]:
class SimpleDataIterator():
    def __init__(self, data):
        self.data = data
        self.size = data[0][0].shape[0]
        self.epochs = 0
        self.shuffle()

    def shuffle(self):
        s = np.arange(self.data[0].shape[0])
        np.random.shuffle(s)
        self.data = [self.data[0][s],self.data[1][s]]
        self.cursor = 0

    def next_batch(self, n):
        if self.cursor+n-1 > self.size:
            self.epochs += 1
            self.shuffle()
        X_batch = self.data[0][self.cursor:self.cursor+n]
        Y_batch = self.data[1][self.cursor:self.cursor+n]
        self.cursor += n
        return X_batch, Y_batch, np.array(n*[X_batch.shape[1]])

In [95]:
# Convert Y from one-hot encoded to integer encoded
train = [X_train, np.argmax(Y_train, axis=1)] 
val = [X_val, np.argmax(Y_val, axis=1)]
test = [X_test, np.argmax(Y_test, axis=1)]

### Model

In [113]:
def reset_graph():
    if 'sess' in globals() and sess:
        sess.close()
    tf.reset_default_graph()

def build_graph(
    vocab_size = vocab_size,
    state_size = 64,
    batch_size = 256,
    num_classes = 2):

    reset_graph()

    # Placeholders
    x = tf.placeholder(tf.int32, shape=[batch_size, None]) # [batch_size, num_steps]
    seqlen = tf.placeholder(tf.int32, shape=[batch_size])
    y = tf.placeholder(tf.int32, shape=[batch_size])
    keep_prob = tf.placeholder(tf.float32,[])

    # Embedding layer
    embeddings = tf.get_variable('embedding_matrix', shape=[vocab_size, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings, x)

    # RNN
    cell = tf.nn.rnn_cell.GRUCell(state_size)
    init_state = tf.get_variable('init_state', [1, state_size],
                                 initializer=tf.constant_initializer(0.0))
    init_state = tf.tile(init_state, [batch_size, 1])
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, sequence_length=seqlen,
                                                 initial_state=init_state)

    # Add dropout, as the model otherwise quickly overfits
    rnn_outputs = tf.nn.dropout(rnn_outputs, keep_prob)

    idx = tf.range(batch_size)*tf.shape(rnn_outputs)[1] + (seqlen - 1)
    last_rnn_output = tf.gather(tf.reshape(rnn_outputs, [-1, state_size]), idx)

    # Softmax layer
    with tf.variable_scope('softmax'):
        W = tf.get_variable('W', [state_size, num_classes])
        b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
    logits = tf.matmul(last_rnn_output, W) + b
    preds = tf.nn.softmax(logits)
    correct = tf.equal(tf.cast(tf.argmax(preds,1),tf.int32), y)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss)

    return {
        'x': x,
        'seqlen': seqlen,
        'y': y,
        'dropout': keep_prob,
        'loss': loss,
        'ts': train_step,
        'preds': preds,
        'accuracy': accuracy
    }

In [109]:
def train_graph(graph, batch_size = 256, num_epochs = 10, iterator = SimpleDataIterator):
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        tr = iterator(train)
        te = iterator(val)
        g = graph

        step, accuracy = 0, 0
        tr_losses, te_losses = [], []
        current_epoch = 0
        while current_epoch < num_epochs:
            step += 1
            batch = tr.next_batch(batch_size)
            feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2], g['dropout']: 0.2}
            accuracy_, _ = sess.run([g['accuracy'], g['ts']], feed_dict=feed)
            accuracy += accuracy_

            if tr.epochs > current_epoch:
                current_epoch += 1
                tr_losses.append(accuracy / step)
                step, accuracy = 0, 0

                #eval test set
                te_epoch = te.epochs
                while te.epochs == te_epoch:
                    step += 1
                    batch = te.next_batch(batch_size)
                    feed = {g['x']: batch[0], g['y']: batch[1], g['seqlen']: batch[2], g['dropout']: 0.2}
                    accuracy_ = sess.run([g['accuracy']], feed_dict=feed)[0]
                    accuracy += accuracy_

                te_losses.append(accuracy / step)
                step, accuracy = 0,0
                print("Accuracy after epoch", current_epoch, " - tr:", tr_losses[-1], "- te:", te_losses[-1])

    return tr_losses, te_losses

In [None]:
g = build_graph()
tr_losses, te_losses = train_graph(g)