In [1]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
import zipfile
import sys
import json
import random
import pickle
import gzip
import shutil

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Bidirectional, Softmax
from tensorflow.keras.layers import Concatenate, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.initializers import Constant

from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints
from sklearn.metrics import roc_auc_score

In [2]:
# Remote archives this notebook works from: the GloVe vectors zip and the
# gzipped Amazon Electronics reviews file.
glove_url = "http://nlp.stanford.edu/data/glove.840B.300d.zip"
amz_url = (
    "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/"
    "reviews_Electronics_5.json.gz"
)

In [3]:
def download_file(url, chunk_size=8192):
    """Stream the resource at ``url`` into the current working directory.

    The local file name is the last path component of the URL (query string
    and fragment are ignored). Data is first written to ``<name>.part`` and
    only renamed to ``<name>`` once the transfer finished, so an interrupted
    download never leaves a truncated file under the final name.

    Args:
        url: Address of the file to fetch.
        chunk_size: Number of bytes copied per read; keeps memory use flat
            for multi-gigabyte archives.

    Returns:
        The local file name the data was saved under.

    Raises:
        ValueError: If no file name can be derived from ``url``.
        urllib.error.URLError: If the request fails (``HTTPError`` for HTTP
            error statuses).
    """
    # Imported locally: the previous implementation relied on `requests`,
    # which is not imported in the notebook's visible import cell, so the
    # function failed with NameError. The standard library is sufficient here.
    import urllib.parse
    import urllib.request

    local_filename = urllib.parse.urlsplit(url).path.split('/')[-1]
    if not local_filename:
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    partial_filename = local_filename + '.part'
    with urllib.request.urlopen(url) as response, open(partial_filename, 'wb') as f:
        shutil.copyfileobj(response, f, chunk_size)
    # Publish the file under its final name only after a complete transfer.
    os.replace(partial_filename, local_filename)
    return local_filename

In [4]:
# Local file names the rest of the notebook reads from.
glove_filename = 'glove.840B.300d.zip'
amz_filename = 'Electronics_5.json'

# Nothing is downloaded automatically (Amazon file ~500 MB, GloVe file ~2.1 GB).
# To fetch both archives and unpack the reviews, uncomment and run once:
#
# amz_file = download_file(amz_url)          # saved as 'reviews_Electronics_5.json.gz'
# glove_filename = download_file(glove_url)
# with gzip.open(amz_file, 'rb') as f_in:
#     with open(amz_filename, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

In [5]:
# --- Reproducibility -------------------------------------------------------
# Seed the two global RNGs that decide which reviews are sampled
# (random.sample) and how samples are shuffled before the train/test split
# (np.random). Without this, every run trains and tests on different data.
# NOTE(review): TensorFlow's own RNG (weight init, dropout) is not seeded here.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# --- Text / embedding sizes ------------------------------------------------
MAX_SEQUENCE_LENGTH = 200   # length every review is padded/truncated to
MAX_NUM_WORDS = 5000        # vocabulary cap given to the Tokenizer
EMBEDDING_DIM = 300         # width of one embedding vector

# --- Data split ------------------------------------------------------------
# Validation split is taken care of with the validation_split parameter of
# model.fit()
# VALIDATION_SPLIT = 0.25
TEST_SPLIT = 0.25

# --- Network sizes (number of neurons) -------------------------------------
num_lstm = 100
num_dense = 200

# --- Dropout rates ---------------------------------------------------------
lstm_dropout_rate = 0.2
dense_dropout_rate = 0.2

In [6]:
%%time
########################################
## Build (or reload) the word -> vector lookup.
########################################
# The parsed vectors are cached in 'embedding_index.pickle'; when that file
# exists it is loaded instead of re-reading the zip archive.
# NOTE: unpickling can execute arbitrary code -- only load a cache file that
# you produced yourself.
if not os.path.isfile('./embedding_index.pickle'):
    print('Indexing word vectors')

    embedding_index = {}

    with zipfile.ZipFile(glove_filename) as archive:
        # Only the first member of the archive is read.
        first_member = archive.namelist()[0]
        with archive.open(first_member) as vectors_file:
            for raw_line in vectors_file:
                # Lines are bytes: first field is the token, the rest its coefficients.
                fields = raw_line.split()
                token = fields[0].decode('utf-8')
                embedding_index[token] = np.asarray(fields[1:], "float32")
    with open('embedding_index.pickle', 'wb') as cache:
        pickle.dump(embedding_index, cache, protocol=pickle.HIGHEST_PROTOCOL)
else:
    print('Importing word vectors')
    with open('embedding_index.pickle', 'rb') as cache:
        embedding_index = pickle.load(cache)

print('Indexed the word vectors')
print('Found %s word vectors.' % len(embedding_index))


Importing word vectors
Indexed the word vectors
Found 2196016 word vectors.
Wall time: 9.51 s


In [7]:
%%time
# Read the reviews file one JSON object per line and bucket the review texts
# by star rating: 1-2 -> negative, 3 -> skipped, anything else -> positive.
amz_reviews_pos = []
amz_reviews_neg = []
with open(amz_filename, 'r', encoding='utf-8') as reviews_file:
    for raw_review in reviews_file:
        review = json.loads(raw_review)
        stars = int(review['overall'])
        if stars == 3:
            continue
        bucket = amz_reviews_neg if stars in (1, 2) else amz_reviews_pos
        bucket.append(review['reviewText'])

Wall time: 21.6 s


In [8]:
# Draw a balanced random subset: NUM_NEG negative and NUM_POS positive reviews.
# (Original note: this is said to correspond to 25% of a 40000-sample dataset.)
NUM_POS = 5000
NUM_NEG = 5000
# Negatives are drawn before positives; the two lists replace the full ones.
amz_reviews_neg = random.sample(amz_reviews_neg, k=NUM_NEG)
amz_reviews_pos = random.sample(amz_reviews_pos, k=NUM_POS)
# Label 0 for each negative review followed by label 1 for each positive one.
labels = np.repeat([0, 1], [NUM_NEG, NUM_POS])

In [9]:
%%time
# Turn the review texts into fixed-length integer sequences.
# Negatives come first, positives second -- the same order `labels` was built in.
all_reviews = amz_reviews_neg + amz_reviews_pos

# filters='' and lower=False: no characters are stripped and case is kept.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, filters='', lower=False)
tokenizer.fit_on_texts(all_reviews)
sequences = tokenizer.texts_to_sequences(all_reviews)

# Every sequence is brought to exactly MAX_SEQUENCE_LENGTH entries.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print(f'Found {len(tokenizer.word_index)} unique tokens.')
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 79164 unique tokens.
Shape of data tensor: (10000, 200)
Shape of label tensor: (10000,)
Wall time: 1.79 s


In [10]:
# Shuffle the samples, then hold out the last TEST_SPLIT fraction as the test
# set. No validation set is cut here: model.fit() carves one out of the
# training data through its validation_split argument.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

num_test_samples = int(TEST_SPLIT * data.shape[0])
# Slice on a positive boundary. With negative indices a test size of 0 would
# make data[:-0] an empty training set and data[-0:] a "test" set holding
# every sample.
num_train_samples = data.shape[0] - num_test_samples

x_train, x_test = data[:num_train_samples], data[num_train_samples:]
y_train, y_test = labels[:num_train_samples], labels[num_train_samples:]

print('Train data shape:', x_train.shape)
print('Test data shape:', x_test.shape)

Train data shape: (7500, 200)
Test data shape: (2500, 200)


In [11]:
%%time
# Assemble the initial weights for the Embedding layer: row i holds the
# pre-trained vector of the token whose tokenizer index is i.
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
# Start from zeros so tokens missing from embedding_index keep an all-zero row.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word_index.items():
    if row < MAX_NUM_WORDS:  # indices at or above the vocabulary cap get no row
        pretrained = embedding_index.get(token)
        if pretrained is not None:
            embedding_matrix[row] = pretrained

Wall time: 27 ms


In [12]:
# Custom attention layer, taken from a seq2seq example.
class attention(tf.keras.Model):
    """Additive attention of a sequence against a single state vector.

    Each time step of ``features`` is scored as
    ``V(tanh(W1(features) + W2(hidden)))``; the scores are soft-maxed over
    axis 1 and used to form a weighted sum of ``features``.

    Args:
        units: Output size of the two projection layers ``W1`` and ``W2``.
        name: Name given to this Keras model/layer.
    """

    def __init__(self, units, name):
        super().__init__(name=name)
        self.W1 = tf.keras.layers.Dense(units)   # projects the sequence
        self.W2 = tf.keras.layers.Dense(units)   # projects the state vector
        self.V = tf.keras.layers.Dense(1)        # one score per time step

    def call(self, features, hidden):
        """Return ``(context_vector, attention_weights)``.

        Assumes ``features`` is (batch, time, dim) and ``hidden`` is
        (batch, dim) -- TODO confirm against callers.
        """
        # Insert an axis at position 1 so the state broadcasts over axis 1 of features.
        query = tf.expand_dims(hidden, 1)
        energies = self.V(tf.nn.tanh(self.W1(features) + self.W2(query)))
        attention_weights = tf.nn.softmax(energies, axis=1)
        # Weighted sum of the features along axis 1.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

In [13]:
# Keras model: frozen pre-trained embeddings -> two stacked bidirectional
# LSTMs -> attention over the second LSTM's output sequence -> sigmoid unit.
input_text = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='input')

embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False,  # keep the pre-trained vectors fixed
                            name='embedding_layer')

embedded_sequence = embedding_layer(input_text)


def _bi_lstm(name):
    """Build one bidirectional LSTM that returns its sequence and final states."""
    return Bidirectional(LSTM(num_lstm,
                              # Rates come from the config cell instead of a
                              # repeated literal 0.2.
                              dropout=lstm_dropout_rate,
                              recurrent_dropout=lstm_dropout_rate,
                              return_sequences=True,
                              return_state=True,
                              recurrent_initializer='glorot_uniform'),
                         name=name)


# With return_state=True the layer yields
# [sequence, forward_h, forward_c, backward_h, backward_c].
first_seq, first_fh, first_fc, first_bh, first_bc = _bi_lstm("bi_lstm_f")(embedded_sequence)

# The original code passed the first layer's whole output list as the input of
# the second layer; Keras then uses the entries after the first as the initial
# state. That is kept, but spelled out explicitly.
# NOTE(review): confirm that seeding the second layer with the first layer's
# final states is intended.
lstm, forward_h, forward_c, backward_h, backward_c = _bi_lstm("bi_lstm_b")(
    first_seq,
    initial_state=[first_fh, first_fc, first_bh, first_bc])

state_h = Concatenate(name="concatenate")([forward_h, backward_h])
# Attention projection width taken from the config cell (num_dense == 200,
# the value that was previously hard-coded here).
context_vector, attention_weights = attention(num_dense, name="attention_layer")(lstm, state_h)
output = Dense(1, activation='sigmoid', name="dense_layer")(context_vector)

model = Model(inputs=input_text, outputs=output)

# #########################
# ## compile the model.
# #########################
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summary() prints by itself and returns None; wrapping it in print() added a
# stray "None" line to the output.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_layer (Embedding)     (None, 200, 300)     1500000     input[0][0]                      
__________________________________________________________________________________________________
bi_lstm_f (Bidirectional)       [(None, 200, 200), ( 320800      embedding_layer[0][0]            
__________________________________________________________________________________________________
bi_lstm_b (Bidirectional)       [(None, 200, 200), ( 240800      bi_lstm_f[0][0]                  
                                                                 bi_lstm_f[0][1]              

In [14]:
# Stop training once val_loss has not improved for `patience` epochs.
# restore_best_weights=True rolls the model back to the epoch with the best
# val_loss; without it (and with no checkpoint callback in use) the model
# would be left with the weights of the last, worse epochs.
early_stopping = EarlyStopping(monitor='val_loss',
                               min_delta=0,
                               patience=2,
                               verbose=0,
                               mode='auto',
                               restore_best_weights=True)

# validation_split holds out the last 25% of x_train/y_train for validation.
# x_test / y_test are not touched in this cell.
history = model.fit(x=x_train,
                    y=y_train,
                    epochs=10,
                    batch_size=200,
                    validation_split=.25,
                    verbose=1,
                    callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
