# Senticheck

#### A simple sentiment classifier for the IMDb review dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import re
import string
from utils import *

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


### Data Preprocessing

In [2]:
from keras.datasets import imdb

Using TensorFlow backend.


In [4]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=None,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=3)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [None]:
train_df = makeDF("./aclImdb/train")
test_df = makeDF("./aclImdb/test")

In [4]:
train_df.head()

Unnamed: 0,string,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,pos
1,Homelessness (or Houselessness as George Carli...,pos
2,Brilliant over-acting by Lesley Ann Warren. Be...,pos
3,This is easily the most underrated film inn th...,pos
4,This is not the typical Mel Brooks film. It wa...,pos


In [50]:
test_df.head()

Unnamed: 0,string,sentiment
0,I went and saw this movie last night after bei...,pos
1,Actor turned director Bill Paxton follows up h...,pos
2,As a recreational golfer with some knowledge o...,pos
3,"I saw this film in a sneak preview, and it is ...",pos
4,Bill Paxton has taken the true story of the 19...,pos


We first make a function to clean unwanted characters and numbers from the strings.

In [None]:
def clean_string(sample):
    cleaner = re.compile('<.*?>')
    sample = re.sub(r'\d+', '', sample)
    sample = re.sub(cleaner, '', sample)
    sample = re.sub("'", '', sample)
    sample = re.sub(r'\W+', ' ', sample)
    sample = sample.replace('_', '')
    sample = sample.lower() # make lowercase
    return sample

First we need to process the input data.

In [69]:
# Clean samples
X_train = train_df['string'].apply(lambda x: clean_string(x)).values
X_test = test_df['string'].apply(lambda x: clean_string(x)).values

# Create tokenizer
tokenizer = Tokenizer(num_words=2000, split=' ') 
tokenizer.fit_on_texts(X_train)

# Integer-encode / pad train and test data
X_train = np.array(tokenizer.texts_to_sequences(X_train))
X_train = pad_sequences(X_train)
X_test = np.array(tokenizer.texts_to_sequences(X_test))
X_test = pad_sequences(X_test)

X_train shape: (25000, 1691) 
 X_test shape: (25000, 1864)


Now we shall convert the pos/neg column into a binary token 0/1.

In [88]:
Y_train = np.array(pd.Series(np.searchsorted(['neg', 'pos'], train_df.sentiment), train_df.index))
Y_test = np.array(pd.Series(np.searchsorted(['neg', 'pos'], test_df.sentiment), train_df.index))

In [97]:
print('X_train shape:', X_train.shape, '\n', 'X_test shape:', X_test.shape)
print('Y_train shape:', Y_train.shape, '\n', 'Y_test shape:', Y_test.shape)

X_train shape: (25000, 1691) 
 X_test shape: (25000, 1864)
Y_train shape: (25000,) 
 Y_test shape: (25000,)


In [71]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X_train.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1691, 128)         256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 1691, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None
