# Glove Bidirectional-LSTM Text Classification Simple Deep Learning Model

### Import Module

In [15]:
import numpy as np
import pandas as pd
from collections import defaultdict
import re

from bs4 import BeautifulSoup

import sys
import os
import codecs

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model
from keras.models import Sequential

from keras import backend as K
from keras.engine.topology import Layer, InputSpec

### member variable

In [2]:
max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 1000
batch_size = 50
epoch_num = 6
embedding_dim = 100
validation_split = 0.2

### Cleansing Function

In [3]:
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\\", "", string)    
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

### Data Load

In [4]:
data_train = pd.read_csv('./embedding_data/imdb/labeledTrainData.tsv', sep='\t')
print(data_train.shape)

(25000, 3)


### Text Data Pre-processing

In [5]:
texts = []
labels = []

for idx in range(data_train.review.shape[0]):
    text = BeautifulSoup(data_train.review[idx])
    texts.append(clean_str(text.get_text()))
    labels.append(data_train.sentiment[idx])



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [6]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 81501 unique tokens.


In [7]:
data = pad_sequences(sequences, maxlen=maxlen)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (25000, 1000)
Shape of label tensor: (25000, 2)


In [8]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(validation_split * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Traing and validation set number of positive and negative reviews')
print(y_train.sum(axis=0))
print(y_val.sum(axis=0))

Traing and validation set number of positive and negative reviews
[10003.  9997.]
[2497. 2503.]


### Glove Word Embedding

In [12]:
GLOVE_FILE_PATH = ".\embedding_data\glove\glove.6B.100d.txt"
embeddings_index = {}
f = codecs.open(GLOVE_FILE_PATH,'r','utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [13]:
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=True)

print('Word Embedding Layer Initialized!')

Word Embedding Layer Initialized!


### LSTM Model Define

In [28]:
print('Build model...')
sequence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

model = Sequential()
model.add(embedding_layer)
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(2, activation='softmax'))


'''
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
preds = Dense(2, activation='softmax')(l_lstm)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])
'''
print("model fitting - Bidirectional LSTM")
model.summary()

Build model...
model fitting - Bidirectional LSTM
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 100)         8150200   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               234496    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 514       
Total params: 8,385,210
Trainable params: 8,385,210
Non-trainable params: 0
_________________________________________________________________


### Model Compile

In [29]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

### Model Train

In [None]:
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epoch_num,
          validation_data=(x_val, y_val))
score, acc = model.evaluate(x_val, y_val,
                            batch_size=batch_size)

Train...
Train on 20000 samples, validate on 5000 samples
Epoch 1/6
 4450/20000 [=====>........................] - ETA: 3:16:01 - loss: 0.6782 - acc: 0.5728

In [None]:
score, acc = model.evaluate(x_val, y_val,
                            batch_size=batch_size)

In [None]:
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=epoch_num, batch_size=batch_size)
score, acc = model.evaluate(x_val, y_val,
                            batch_size=batch_size)

### Accuracy Check 

In [None]:
print('Test score:', score)
print('Test accuracy:', acc)